In [None]:
import pandas as pd

# 1. 定义文件与类别的映射关系
file_to_label = [
    'weibo_with_categories_partial.csv',
    'weibo_with_categories_partial2.csv',
    'weibo_with_categories_partial3.csv',
    'weibo_with_categories_partial4.csv'
]

# 2. 加载并标注数据
dfs = []
for file in file_to_label:
    df = pd.read_csv(file)
    dfs.append(df)

# 3. 合并数据并打乱
full_df = pd.concat(dfs, ignore_index=True).sample(frac=1)
print(f"总样本数: {len(full_df)}")
print(f"类别分布:\n{full_df['话题分类'].value_counts()}")

# 4. 保存整合数据（可选）
full_df.to_csv("labeled_weibo_data.csv", index=False)

总样本数: 400
类别分布:
话题分类
3    143
0    140
2     64
1     53
Name: count, dtype: int64


In [None]:
import re
import jieba

# 1. 清洗微博文本
def clean_weibo(text):
    # 保留话题和@用户，去除其他特殊符号
    text = re.sub(r"(?<![@#\w])[^\w\s#@]+", "", str(text))
    # 去除URL
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    return text.strip()

# 2. 添加清洗后的列
full_df['cleaned_text'] = full_df['微博正文'].apply(clean_weibo)

# 3. 检查空值
full_df = full_df.dropna(subset=['cleaned_text'])

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
import os
os.environ["WANDB_DISABLED"] = "true"  # 添加到代码开头
# 1. 划分训练集/验证集
train_df, val_df = train_test_split(
    full_df,
    test_size=0.2,
    stratify=full_df['话题分类'],
    random_state=42
)

# 2. 初始化Tokenizer（适配微博内容）
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
tokenizer.add_tokens(['#', '@'])  # 添加微博特殊符号

# 3. 创建优化后的Dataset类
class WeiboDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.texts)

# 4. 创建DataLoader
train_dataset = WeiboDataset(
    train_df['cleaned_text'].tolist(),
    train_df['话题分类'].tolist(),
    tokenizer
)
val_dataset = WeiboDataset(
    val_df['cleaned_text'].tolist(),
    val_df['话题分类'].tolist(),
    tokenizer
)

# 5. 加载模型（适配新token）
model = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=4
)
model.resize_token_embeddings(len(tokenizer))  # 调整嵌入层

# 6. 训练参数
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=3e-5,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=400,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# 7. 开始训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m120090416[0m ([33m120090416-the-chinese-university-of-hong-kong[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


TrainOutput(global_step=50, training_loss=0.7988160705566406, metrics={'train_runtime': 2479.4311, 'train_samples_per_second': 0.645, 'train_steps_per_second': 0.02, 'total_flos': 105246312038400.0, 'train_loss': 0.7988160705566406, 'epoch': 5.0})

In [10]:
 from sklearn.metrics import classification_report

# 1. 验证集预测
predictions = trainer.predict(val_dataset)
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1)

# 2. 分类报告
print(classification_report(
    val_df['话题分类'],
    preds,
    target_names=["皮肤", "赛事", "活动", "其他"],
    digits=4
))

# 3. 错误样本分析
val_df['pred'] = preds
error_samples = val_df[val_df['话题分类'] != val_df['pred']]
error_samples[['cleaned_text', '话题分类', 'pred']].to_csv("error_samples.csv", index=False)

              precision    recall  f1-score   support

          皮肤     0.7931    0.8214    0.8070        28
          赛事     0.7778    0.7000    0.7368        10
          活动     0.4615    0.4615    0.4615        13
          其他     0.8621    0.8621    0.8621        29

    accuracy                         0.7625        80
   macro avg     0.7236    0.7113    0.7169        80
weighted avg     0.7623    0.7625    0.7621        80



In [9]:
from transformers import pipeline
import pandas as pd

# 1. 保存最佳模型
trainer.save_model("./best_weibo_classifier")

# 2. 创建预测管道
classifier = pipeline(
    "text-classification",
    model="./best_weibo_classifier",
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# 3. 批量预测函数
def predict_weibo(texts, batch_size=32):
    results = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        preds = classifier(batch)
        results.extend([{
            'text': batch[j],
            'label': int(pred['label'].split('_')[-1]),
            'score': pred['score']
        } for j, pred in enumerate(preds)])
    return pd.DataFrame(results)

# 4. 使用示例
new_data = pd.read_csv("new_weibos.csv")
predictions = predict_weibo(new_data['content'].tolist())
predictions.to_csv("classified_results.csv", index=False)

Device set to use cpu


FileNotFoundError: [Errno 2] No such file or directory: 'new_weibos.csv'