# 安装所需的库

In [1]:
pip install transformers datasets torch scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --upgrade typing_extensions

Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install --upgrade accelerate



In [5]:
!pip uninstall keras -y

Found existing installation: keras 3.8.0
Uninstalling keras-3.8.0:
  Successfully uninstalled keras-3.8.0


In [6]:
!pip install tf-keras

Collecting tf-keras
  Obtaining dependency information for tf-keras from https://files.pythonhosted.org/packages/8a/ed/e08afca471299b04a34cd548e64e89d0153eda0e6cf9b715356777e24774/tf_keras-2.18.0-py3-none-any.whl.metadata
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.5.0 (from tensorflow<2.19,>=2.18->tf-keras)
  Obtaining dependency information for keras>=3.5.0 from https://files.pythonhosted.org/packages/fe/cf/aea9087c4d7fafe956a0cc0ff6c3327d10fb8442cda50f992a2186921fa0/keras-3.8.0-py3-none-any.whl.metadata
  Using cached keras-3.8.0-py3-none-any.whl.metadata (5.8 kB)
Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached keras-3.8.0-py3-none-any.whl (1.3 MB)
Installing collected packages: keras, tf-keras
Successfully installed keras-3.8.0 tf-keras-2.18.0


# 数据加载和预处理

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

  from pandas.core import (


In [2]:
# Training CSV files to load, one file per news category.
files = [
    '经济train.csv',
    '军事train.csv',
    '科技train.csv',
    '社会train.csv',
    '体育train.csv',
    '文化train.csv',
    '政治train.csv',
]
# Accumulator for the per-file DataFrames.
df_list = list()

In [3]:
# Read every training file and tag each row with its category label.
# The list is rebuilt from scratch in a single expression so this cell is
# idempotent: re-running it no longer appends a second copy of every file.
df_list = [
    # The part of the file name before 'train.csv' is used as the label.
    pd.read_csv(file).assign(Label=file.split('train.csv')[0])
    for file in files
]

In [4]:
# Stack all per-file frames into one DataFrame with a fresh 0..n-1 index.
df = pd.concat(objs=df_list, ignore_index=True)

In [5]:
# Keep only the text column and the label column.
# .copy() gives an independent frame, so the later assignment to df['Label']
# does not raise pandas' SettingWithCopyWarning.
df = df[['Content', 'Label']].copy()

In [6]:
# Encode the category names as integer class ids; `le` keeps the mapping
# so predictions can be converted back to names later.
le = LabelEncoder()
le.fit(df['Label'])
df['Label'] = le.transform(df['Label'])

In [7]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Content,Label
0,High-speed rail here and there in China (37/46),6
1,High-speed rail here and there in China (36/46),6
2,High-speed rail here and there in China (35/46),6
3,High-speed rail here and there in China (34/46),6
4,High-speed rail here and there in China (33/46),6


# BERT模型微调

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

In [9]:
# Dataset wrapper that tokenizes raw texts on access.
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer. It is invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` and must return a
            mapping holding ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every encoded sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail early instead of hitting an IndexError in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the encoded sample at position ``item``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = self.texts[item]
        label = self.labels[item]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch dimension; flatten drops it
        # so the default collate function can stack samples into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [10]:
# Load the tokenizer that matches the pretrained BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [11]:
# Hold out 10% of the samples for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df['Content'].values,
    df['Label'].values,
    test_size=0.1,
    random_state=42,
)

In [12]:
# Wrap the training and validation splits in TextDataset objects (sequences capped at 128 tokens).
train_dataset = TextDataset(texts=X_train, labels=y_train, tokenizer=tokenizer, max_len=128)
val_dataset = TextDataset(texts=X_val, labels=y_val, tokenizer=tokenizer, max_len=128)

In [13]:
# Load pretrained BERT with a classification head sized to the number of encoded classes.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
    num_labels=len(le.classes_),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Training configuration.
# The evaluation-schedule argument is called `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. Pick whichever field the
# installed TrainingArguments dataclass declares so this cell runs on both.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in TrainingArguments.__dataclass_fields__
    else 'evaluation_strategy'
)
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # training batch size per device
    per_device_eval_batch_size=16,   # evaluation batch size per device
    warmup_steps=500,                # learning-rate warmup steps
    weight_decay=0.01,               # weight decay
    logging_dir='./logs',            # log directory
    logging_steps=10,                # log every 10 steps
    **{eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)



In [15]:
# Assemble the Trainer from the model, its training configuration and both data splits.
trainer = Trainer(
    args=training_args,            # training configuration
    model=model,                   # BERT classifier to fine-tune
    eval_dataset=val_dataset,      # validation split
    train_dataset=train_dataset,   # training split
)

In [None]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss


# 保存并加载模型

In [24]:
# Save the fine-tuned model and the tokenizer to ./bert_model.
# The class-id <-> category-name mapping is written into the model config first,
# so the saved checkpoint carries its own label names instead of depending on
# `le` still being alive in the kernel.
model.config.id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
model.config.label2id = {name: idx for idx, name in model.config.id2label.items()}
model.save_pretrained('./bert_model')
tokenizer.save_pretrained('./bert_model')

('./bert_model/tokenizer_config.json',
 './bert_model/special_tokens_map.json',
 './bert_model/vocab.txt',
 './bert_model/added_tokens.json')

In [25]:
# Reload the fine-tuned model and tokenizer from the saved directory.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='./bert_model',
)
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='./bert_model',
)

# 使用训练好的模型进行预测

In [26]:
# Load the prediction set: one sample per line of predict.txt.
# The context manager closes the file handle, and the encoding is stated
# explicitly instead of depending on the platform default.
# NOTE(review): assumes predict.txt is UTF-8 encoded; adjust if it was saved differently.
with open('predict.txt', 'r', encoding='utf-8') as predict_file:
    # Drop only the line terminator so there is still exactly one entry per input line.
    predict_text = [line.rstrip('\n') for line in predict_file]

In [27]:
# Wrap the prediction texts in a TextDataset. The class requires labels,
# so a placeholder 0 is supplied for every sample.
predict_dataset = TextDataset(
    texts=predict_text,
    labels=[0] * len(predict_text),
    tokenizer=tokenizer,
    max_len=128,
)

In [28]:
# Put the model into evaluation mode and create the list that will collect predicted class ids.
model.eval()
predictions = list()

In [29]:
# Classify the prediction samples one at a time.
# `predictions` is reset here so re-running this cell does not append a second
# copy of the results, and inputs are moved to the model's device so the loop
# also works when the model is on a GPU.
predictions = []
device = model.device
with torch.no_grad():  # inference only, no gradients needed
    for item in predict_dataset:
        input_ids = item['input_ids'].unsqueeze(0).to(device)  # add batch dimension
        attention_mask = item['attention_mask'].unsqueeze(0).to(device)
        output = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = output.logits
        # Index of the highest-scoring class for this single sample.
        predicted_class = torch.argmax(logits, dim=1).item()
        predictions.append(predicted_class)

In [30]:
# Map the predicted class ids back to the original category names and print them.
predicted_labels = le.inverse_transform(y=predictions)
print(predicted_labels)

['文化' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '科技' '经济' '经济' '经济' '经济'
 '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济'
 '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济'
 '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济'
 '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济'
 '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '经济' '军事' '军事' '军事'
 '军事' '军事' '军事' '军事' '军事' '军事' '军事' '军事' '军事' '军事' '军事' '军事' '军事' '军事'
 '军事' '军事' '文化' '军事' '政治' '军事' '军事' '军事' '军事' '文化' '军事' '军事' '军事' '军事'
 '科技' '军事' '军事' '政治' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技'
 '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技'
 '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技'
 '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '社会' '科技' '科技' '科技'
 '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技' '科技'
 '科技' '科技' '科技' '科技' '科技' '科技' '社会' '社会' '社会' '社会' '社会' '社会' '社会' '社会'
 '社会' 