In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import re

  from pandas.core import (


# 数据加载

In [2]:
# 训练数据
economy_df = pd.read_csv('经济train.csv')
military_df = pd.read_csv('军事train.csv')
tech_df = pd.read_csv('科技train.csv')
society_df = pd.read_csv('社会train.csv')
sports_df = pd.read_csv('体育train.csv')
culture_df = pd.read_csv('文化train.csv')
politics_df = pd.read_csv('政治train.csv')

In [3]:
# 标签数据
label_data = pd.read_csv('labeldata.csv')

# 数据清洗

In [4]:
# 清理文本数据的函数
def clean_text(text):
    text = text.lower()  # 转小写
    text = re.sub(r'\d+', '', text)  # 去掉数字
    text = re.sub(r'[^\w\s]', '', text)  # 去掉标点符号
    return text

In [6]:
# 对所有训练集的 'Content' 列进行清洗
for df in [economy_df, military_df, tech_df, society_df, sports_df, culture_df, politics_df]:
    df['Content'] = df['Content'].fillna('').astype(str)  # 将 NaN 转换为空字符串，并确保是字符串类型
    df['Content'] = df['Content'].apply(clean_text)

In [7]:
# 对所有训练集的 'Content' 列进行清洗
for df in [economy_df, military_df, tech_df, society_df, sports_df, culture_df, politics_df]:
    df['Content'] = df['Content'].apply(clean_text)

# 合并标签数据和训练数据

In [8]:
label_data = label_data[label_data['train_predict'] == 'train']
merged_data = label_data.merge(
    pd.concat([economy_df, military_df, tech_df, society_df, sports_df, culture_df, politics_df]),
    left_on='data_row', right_index=True
)

# 特征提取：使用 TF-IDF 向量化

In [9]:
tfidf = TfidfVectorizer(max_features=10000)  # 可以调整特征数
X = tfidf.fit_transform(merged_data['Content'])

In [11]:
print(merged_data.columns)

Index(['data_row', '信息的对华情感', '信息的私人情感（不指向CHN）', '作者是机构的可能性', 'train_predict',
       'Area', 'KeyWords', 'Name', 'Verified', 'Language', 'Content',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14',
       'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18',
       'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22',
       'Unnamed: 23', 'Unnamed: 24'],
      dtype='object')


In [12]:
# 标签：对华情感和私人情感
y_wang = merged_data['信息的对华情感']  # 对华情感标签
y_private = merged_data['信息的私人情感（不指向CHN）']  # 私人情感标签

# 数据拆分：分割训练集和验证集

In [13]:
X_train, X_val, y_train_wang, y_val_wang = train_test_split(X, y_wang, test_size=0.2, random_state=42)
X_train, X_val, y_train_private, y_val_private = train_test_split(X, y_private, test_size=0.2, random_state=42)

# 模型训练

In [15]:
from sklearn.preprocessing import LabelEncoder

# 创建 LabelEncoder 对象
label_encoder = LabelEncoder()

# 对对华情感标签进行编码
y_train_wang_encoded = label_encoder.fit_transform(y_train_wang)  # 训练集
y_val_wang_encoded = label_encoder.transform(y_val_wang)  # 验证集

# 对私人情感标签进行编码
y_train_private_encoded = label_encoder.fit_transform(y_train_private)
y_val_private_encoded = label_encoder.transform(y_val_private)

# 训练模型
model_wang = XGBClassifier(eval_metric='mlogloss')
model_wang.fit(X_train, y_train_wang_encoded)  # 训练时使用编码后的标签


# 模型评估

In [19]:
# 对华情感分类模型训练
model_wang = XGBClassifier(eval_metric='mlogloss')

# 确保训练时使用了编码后的标签
model_wang.fit(X_train, y_train_wang_encoded)  # 训练时使用编码后的标签

# 对验证集进行预测
y_pred_wang = model_wang.predict(X_val)

# 输出分类报告
print("对华情感分类报告:")
print(classification_report(y_val_wang_encoded, y_pred_wang))  # 使用编码后的标签进行评估

对华情感分类报告:
              precision    recall  f1-score   support

           0       0.37      0.41      0.39       100
           1       0.42      0.47      0.44       137
           2       0.42      0.34      0.38       141

    accuracy                           0.40       378
   macro avg       0.40      0.41      0.40       378
weighted avg       0.41      0.40      0.40       378



In [21]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# 确保为私人情感创建并训练模型
model_private = XGBClassifier(eval_metric='mlogloss')  # 创建私人情感模型
model_private.fit(X_train, y_train_private_encoded)  # 使用编码后的私人情感标签训练模型

# 对验证集进行预测
y_pred_private = model_private.predict(X_val)

# 输出分类报告
print("私人情感分类报告:")
print(classification_report(y_val_private_encoded, y_pred_private))  # 使用编码后的标签进行评估

私人情感分类报告:
              precision    recall  f1-score   support

           0       0.40      0.60      0.48       129
           1       0.37      0.26      0.31       130
           2       0.38      0.30      0.34       119

    accuracy                           0.39       378
   macro avg       0.39      0.39      0.38       378
weighted avg       0.39      0.39      0.38       378



# 对预测集进行预测

In [23]:
# 如果文件是用制表符分隔的
predict_data = pd.read_csv('标签的数据无打码.txt', sep='\t', header=None)

# 如果文件是用其他分隔符（如空格、逗号等）分隔的，可以尝试指定适当的分隔符
# predict_data = pd.read_csv('标签的数据无打码.txt', sep=',', header=None)  # 如果是逗号分隔

# 如果你只是想跳过有问题的行，也可以加上 error_bad_lines=False 来忽略它们
# predict_data = pd.read_csv('标签的数据无打码.txt', sep='\t', header=None, error_bad_lines=False)

# 处理文本数据
predict_data['Content'] = predict_data[0].apply(clean_text)

# 转换特征
X_predict = tfidf.transform(predict_data['Content'])


In [24]:
# 预测对华情感
pred_wang = model_wang.predict(X_predict)

In [25]:
# 预测私人情感
pred_private = model_private.predict(X_predict)

In [26]:
# 输出预测结果
predict_data['对华情感预测'] = pred_wang
predict_data['私人情感预测'] = pred_private

In [27]:
# 保存预测结果
predict_data.to_csv('predict_results.csv', index=False)

print("预测结果已保存到 'predict_results.csv'")

预测结果已保存到 'predict_results.csv'
