# 数据加载与合并

In [1]:
import pandas as pd
import os

In [2]:
# 定义训练集路径
data_files = [
    "经济train.csv",
    "军事train.csv",
    "科技train.csv",
    "社会train.csv",
    "体育train.csv",
    "文化train.csv",
    "政治train.csv"
]

In [3]:
# 读取并合并所有训练数据
train_data = pd.DataFrame()

In [4]:
for file in data_files:
    temp_df = pd.read_csv(file)
    temp_df['Category'] = file.split('train')[0]  # 给每个数据集加上标签列（如：经济，政治等）
    train_data = pd.concat([train_data, temp_df])

In [5]:
# 检查数据
print(train_data.head())

  Area               KeyWords             Name Verified Language  \
0   经济  high-speed rail china  XIE Yongjun 解勇军        f       en   
1   经济  high-speed rail china  XIE Yongjun 解勇军        f       en   
2   经济  high-speed rail china  XIE Yongjun 解勇军        f       en   
3   经济  high-speed rail china  XIE Yongjun 解勇军        f       en   
4   经济  high-speed rail china  XIE Yongjun 解勇军        f       en   

                                           Content Unnamed: 6 Unnamed: 7  \
0  High-speed rail here and there in China (37/46)        NaN        NaN   
1  High-speed rail here and there in China (36/46)        NaN        NaN   
2  High-speed rail here and there in China (35/46)        NaN        NaN   
3  High-speed rail here and there in China (34/46)        NaN        NaN   
4  High-speed rail here and there in China (33/46)        NaN        NaN   

  Unnamed: 8 Unnamed: 9  ... Unnamed: 15 Unnamed: 16 Unnamed: 17 Unnamed: 18  \
0        NaN        NaN  ...         NaN         NaN  

# 数据预处理

In [6]:
import string

In [7]:
# 数据清理
train_data.dropna(subset=['Content'], inplace=True)  # 去掉Content为空的行

In [8]:
# 文本预处理函数
def preprocess_text(text):
    text = text.lower()  # 转小写
    text = text.translate(str.maketrans('', '', string.punctuation))  # 去除标点符号
    return text

In [9]:
# 预处理训练集的文本
train_data['Processed_Content'] = train_data['Content'].apply(preprocess_text)

In [10]:
# 查看处理后的数据
print(train_data[['Category', 'Processed_Content']].head())

  Category                            Processed_Content
0       经济  highspeed rail here and there in china 3746
1       经济  highspeed rail here and there in china 3646
2       经济  highspeed rail here and there in china 3546
3       经济  highspeed rail here and there in china 3446
4       经济  highspeed rail here and there in china 3346


# 特征提取（TF-IDF）

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# 创建TF-IDF向量化器
vectorizer = TfidfVectorizer(max_features=5000)

In [14]:
# 训练TF-IDF模型并转化训练数据
X_train = vectorizer.fit_transform(train_data['Processed_Content'])

In [15]:
# 查看特征维度
print(X_train.shape)

(79470, 5000)


# 标签编码

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
# 初始化LabelEncoder
label_encoder = LabelEncoder()

In [18]:
# 对类别标签进行编码
y_train = label_encoder.fit_transform(train_data['Category'])

In [19]:
# 查看编码后的标签
print(y_train[:10])

[6 6 6 6 6 6 6 6 6 6]


# 训练支持向量机模型

In [20]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [21]:
# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [22]:
# 初始化支持向量机模型（使用线性内核）
svm_model = SVC(kernel='linear', random_state=42)

In [23]:
# 训练模型
svm_model.fit(X_train, y_train)

In [24]:
# 在验证集上进行预测
y_pred = svm_model.predict(X_val)

In [25]:
# 输出分类报告
print(classification_report(y_val, y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

          体育       0.97      0.88      0.92      2037
          军事       0.97      0.84      0.90       668
          政治       0.87      0.84      0.86      1969
          文化       0.83      0.98      0.90      5789
          社会       0.93      0.80      0.86      1358
          科技       0.93      0.77      0.85      2014
          经济       0.90      0.86      0.88      2059

    accuracy                           0.89     15894
   macro avg       0.92      0.85      0.88     15894
weighted avg       0.89      0.89      0.88     15894



# 预测新数据

In [26]:
# 加载预测集数据
with open('predict.txt', 'r', encoding='utf-8') as f:
    predict_data = f.read()

In [27]:
# 预处理预测集数据
predict_data_processed = preprocess_text(predict_data)

In [28]:
# 将预测数据转化为TF-IDF特征
X_predict = vectorizer.transform([predict_data_processed])

In [29]:
# 使用训练好的SVM模型进行预测
predicted_category = svm_model.predict(X_predict)

In [30]:
# 将预测的类别标签转换为对应的类别名称
predicted_category_name = label_encoder.inverse_transform(predicted_category)

In [31]:
print("预测类别:", predicted_category_name[0])

预测类别: 军事
