# 数据加载与预处理

In [1]:
import pandas as pd
import os
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

  from pandas.core import (


In [2]:
# Paths of the training CSV files: one file per news category, each named
# "<category>train.csv" so the category can later be recovered from the name.
data_files = [
    f"{category}train.csv"
    for category in ("经济", "军事", "科技", "社会", "体育", "文化", "政治")
]

In [3]:
# Initialise the combined training DataFrame; the loading loop in the next cell fills it in.
train_data = pd.DataFrame()

In [4]:
# Read every category file, tag its rows with the category name, and combine
# them with a single concat. Collecting the frames first (instead of
# concatenating onto train_data inside the loop) avoids re-copying all earlier
# rows on every iteration and makes the cell idempotent: re-running it no
# longer appends a second copy of the data.
category_frames = []
for file in data_files:
    temp_df = pd.read_csv(file)
    # The part of the file name before 'train' is the label, e.g. '经济train.csv' -> '经济'.
    temp_df['Category'] = file.split('train')[0]
    category_frames.append(temp_df)

# ignore_index=True yields a unique 0..n-1 index instead of repeating each
# file's own row numbers. pd.concat rejects an empty list, so fall back to an
# empty frame when there are no files.
train_data = pd.concat(category_frames, ignore_index=True) if category_frames else pd.DataFrame()

In [5]:
# Data cleaning: keep only the rows that actually have a Content value.
train_data = train_data.dropna(subset=['Content'])

In [6]:
# Text preprocessing
# Translation table that deletes every character in string.punctuation. It is
# built once here rather than inside preprocess_text, which is applied to
# every row of the training data.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise a text string before tokenisation.

    Args:
        text: The raw string to clean.

    Returns:
        The lowercased text with every character of ``string.punctuation``
        removed. Punctuation is deleted rather than replaced by a space, so
        "high-speed" becomes "highspeed"; characters outside
        ``string.punctuation`` (e.g. full-width punctuation) are left as is.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)

In [7]:
# Clean every training text and store the result in a new column.
raw_content = train_data['Content']
train_data['Processed_Content'] = raw_content.map(preprocess_text)

In [8]:
# Preview the category label and cleaned text of the first rows
print(train_data[['Category', 'Processed_Content']].head())

  Category                            Processed_Content
0       经济  highspeed rail here and there in china 3746
1       经济  highspeed rail here and there in china 3646
2       经济  highspeed rail here and there in china 3546
3       经济  highspeed rail here and there in china 3446
4       经济  highspeed rail here and there in china 3346


# 文本向量化与填充

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [10]:
# Create the Tokenizer and fit its vocabulary on the cleaned training text
tokenizer = Tokenizer(num_words=10000)  # cap on the vocabulary size
tokenizer.fit_on_texts(train_data['Processed_Content'])

In [11]:
# Convert each cleaned text into a sequence of integer word indices
X = tokenizer.texts_to_sequences(train_data['Processed_Content'])

In [12]:
# Pad (or truncate) every sequence so they all share the same length
# NOTE(review): pad_sequences is called with its default padding/truncating
# modes; check the Keras docs for which end of a long text is kept — TODO confirm
max_sequence_length = 500  # fixed sequence length fed to the model
X = pad_sequences(X, maxlen=max_sequence_length)

In [13]:
# Inspect the shape of the padded feature matrix: (number of samples, sequence length)
print(X.shape)

(79470, 500)


# 标签编码

In [14]:
# Encode the string category names as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_data['Category'])

In [15]:
# Show the first ten encoded labels
print(y[:10])

[6 6 6 6 6 6 6 6 6 6]


# 构建LSTM模型

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [17]:
# LSTM model architecture: start from an empty Sequential container
model = Sequential()

In [18]:
# Embedding layer: maps each word index of a padded sequence to a dense
# 128-dimensional vector.
# input_dim is read from the tokenizer instead of repeating the literal 10000,
# so the embedding table size stays in sync with the tokenizer's vocabulary cap
# if that cap is ever changed.
# NOTE(review): newer Keras releases deprecate `input_length`; confirm the
# installed version before removing it.
model.add(Embedding(input_dim=tokenizer.num_words, output_dim=128, input_length=max_sequence_length))



In [19]:
# LSTM layer: 128 units, with dropout on both the inputs and the recurrent
# state to reduce overfitting
# NOTE(review): a non-zero recurrent_dropout reportedly prevents TensorFlow from
# using its fused cuDNN LSTM kernel on GPU — confirm against the Keras LSTM
# docs if training speed matters
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

In [20]:
# Classification head: a hidden dense layer, dropout against overfitting, then
# one softmax unit per category for multi-class output.
num_categories = len(label_encoder.classes_)
for head_layer in (
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_categories, activation='softmax'),
):
    model.add(head_layer)

In [21]:
# Compile the model. The sparse variant of categorical cross-entropy is used
# because the labels are integer class ids rather than one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [22]:
# Print a summary of the model architecture
model.summary()

# 训练模型

In [23]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train for 5 epochs in mini-batches of 64, passing the validation split so it
# is scored alongside training.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_val, y_val),
)

Epoch 1/5
[1m994/994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2154s[0m 2s/step - accuracy: 0.6413 - loss: 1.0104 - val_accuracy: 0.8847 - val_loss: 0.3663
Epoch 2/5
[1m994/994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1811s[0m 2s/step - accuracy: 0.8985 - loss: 0.3355 - val_accuracy: 0.8875 - val_loss: 0.3617
Epoch 3/5
[1m994/994[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1785s[0m 2s/step - accuracy: 0.9123 - loss: 0.2837 - val_accuracy: 0.8844 - val_loss: 0.3945
Epoch 4/5
[1m819/994[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m5:07[0m 2s/step - accuracy: 0.9242 - loss: 0.2409

In [None]:
# Evaluate the model on the validation split and report its accuracy
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy:.4f}")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Detailed evaluation on the validation split (X_val, y_val).
# First the loss and accuracy tracked by Keras itself.
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

# Class probabilities from the softmax output layer: one row per sample, one
# column per category. Computed once and reused for every metric below.
y_pred = model.predict(X_val)

# The predicted class is the column with the highest probability in each row.
# Thresholding at 0.5 only suits a single sigmoid output; applied to the
# softmax matrix it yields a 2-D indicator array, which the sklearn metrics
# below reject when y_val is a 1-D vector of integer class ids.
y_pred_classes = np.argmax(y_pred, axis=1)

# Precision, averaged over classes weighted by their support
precision = precision_score(y_val, y_pred_classes, average='weighted')
print(f"Precision: {precision:.4f}")

# Recall, averaged over classes weighted by their support
recall = recall_score(y_val, y_pred_classes, average='weighted')
print(f"Recall: {recall:.4f}")

# F1 score, averaged over classes weighted by their support
f1 = f1_score(y_val, y_pred_classes, average='weighted')
print(f"F1-Score: {f1:.4f}")

# ROC AUC: a single curve for two classes, one-vs-rest averaging otherwise.
num_classes = y_pred.shape[1]
if num_classes == 2:
    # Probability of the second (positive) class
    y_pred_prob = y_pred[:, 1]

    roc_auc = roc_auc_score(y_val, y_pred_prob)
    print(f"ROC AUC: {roc_auc:.4f}")

    # Points of the ROC curve
    fpr, tpr, thresholds = roc_curve(y_val, y_pred_prob)

    # Plot the ROC curve against the chance diagonal
    plt.figure(figsize=(6, 6))
    plt.plot(fpr, tpr, color='b', label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')
    plt.show()
else:
    # Multi-class case: score each class against the rest using the full
    # probability matrix. `labels` tells sklearn which class id each column of
    # y_pred corresponds to.
    roc_auc = roc_auc_score(
        y_val,
        y_pred,
        multi_class='ovr',
        average='weighted',
        labels=np.arange(num_classes),
    )
    print(f"ROC AUC (one-vs-rest, weighted): {roc_auc:.4f}")


# 预测新数据

In [None]:
# Read the whole text to classify from predict.txt (UTF-8) into one string.
with open('predict.txt', 'r', encoding='utf-8') as predict_file:
    predict_data = predict_file.read()

In [None]:
# Clean the text with the same preprocessing used for the training data
predict_data_processed = preprocess_text(predict_data)

In [None]:
# Convert the text into a word-index sequence with the fitted tokenizer
# (wrapped in a list because texts_to_sequences is given a batch of one text)
X_predict = tokenizer.texts_to_sequences([predict_data_processed])

In [None]:
# Pad the sequence to the same fixed length used for the training data
X_predict = pad_sequences(X_predict, maxlen=max_sequence_length)

In [None]:
# Run the trained model on the padded sequence, then reduce each row of the
# prediction to the index of its largest entry.
predicted_category = model.predict(X_predict)
predicted_category_label = predicted_category.argmax(axis=1)

In [None]:
# Map the predicted integer label back to its category name
predicted_category_name = label_encoder.inverse_transform(predicted_category_label)

In [None]:
# Print the predicted category name for the single input text
print("预测类别:", predicted_category_name[0])