In [None]:
import os

# Set the proxy environment variables for both HTTP and HTTPS in one step.
os.environ.update({
    'http_proxy': 'http://100.64.0.2:11080',
    'https_proxy': 'http://100.64.0.2:11080',
})

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import BertTokenizer
import math

In [None]:
import argparse
# Tokenizer loaded from the 'bert-base-chinese' vocabulary; its vocab size
# becomes the default of --vocab_size below.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

parser = argparse.ArgumentParser(description="Bert Training")

# Model architecture options.
parser.add_argument("--num_heads", default=8, type=int, help="注意力头数")
parser.add_argument("--num_layers", default=8, type=int, help="Transformer 层数")
parser.add_argument("--d_model", default=768, type=int, help="模型维度")
parser.add_argument("--dropout", default=0.1, type=float, help="Dropout 概率")
parser.add_argument("--d_ff", default=1024, type=int, help="前馈网络维度")
parser.add_argument("--max_len", default=100, type=int, help="输入序列最大长度")

# Data and optimisation options.
parser.add_argument(
    "--data_path",
    default="weibo_senti_100k.csv",
    type=str,
    help="数据集路径",
)
parser.add_argument("--batch_size", default=600, type=int, help="批大小")
parser.add_argument("--epoch", default=10, type=int, help="训练轮数")
parser.add_argument("--lr", default=1e-5, type=float, help="学习率")

# Vocabulary size and logging frequency.
parser.add_argument(
    "--vocab_size",
    default=tokenizer.vocab_size,
    type=int,
    help="词汇表大小",
)
parser.add_argument("--print_freq", default=1, type=int, help="打印损失频率")

In [None]:
class MutiheadAttn(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values by three linear
    layers, split into heads of size ``d_model // num_heads``, attended
    per head and concatenated again. No output projection is applied after
    the heads are concatenated.

    Args:
        num_heads: Number of attention heads; must divide ``d_model``.
        d_model: Size of the last dimension of the input and of the output.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, num_heads, d_model, dropout=0.1):
        super(MutiheadAttn, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.q = nn.Linear(d_model, d_model)
        self.k = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``[batch_size, seq_len, d_model]``.
            mask: Optional tensor of shape ``[batch_size, seq_len]`` whose
                non-zero / ``True`` entries mark positions that may be
                attended to. Positions marked zero / ``False`` (e.g. padding)
                receive no attention weight as keys. ``None`` (the default)
                attends to every position, which is the original behaviour.

        Returns:
            Tensor of shape ``[batch_size, seq_len, d_model]``.
        """
        batch_size, seq_len = x.size(0), x.size(1)

        def split_heads(t):
            # [batch, seq, d_model] -> [batch, heads, seq, head_dim]
            return t.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        q = split_heads(self.q(x))
        k = split_heads(self.k(x))
        v = split_heads(self.v(x))

        # Scaled dot-product scores: [batch, heads, seq, seq]
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            # Broadcast the key mask over heads and query positions. The most
            # negative finite value is used instead of -inf so that a row whose
            # keys are all masked still yields finite weights rather than NaN.
            key_mask = mask.to(torch.bool)[:, None, None, :]
            scores = scores.masked_fill(~key_mask, torch.finfo(scores.dtype).min)

        weights = F.softmax(scores, dim=-1)
        weights = self.dropout(weights)

        context = torch.matmul(weights, v)
        # Merge the heads back: [batch, heads, seq, head_dim] -> [batch, seq, d_model]
        output = context.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        return output

In [None]:
class FeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Map ``x`` (last dimension ``d_model``) through the block and return the result."""
        hidden = torch.relu(self.linear1(x))
        return self.linear2(self.dropout(hidden))

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    Even feature indices hold ``sin(pos * freq)`` and odd indices hold
    ``cos(pos * freq)``. The table is stored as a buffer named ``pe`` so it
    moves with the module but is not a trainable parameter.

    Args:
        d_model: Feature size of the embeddings (odd values are supported).
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model

        # Positional encoding table: [max_len, d_model]
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()  # [max_len, 1]
        # One frequency per sin/cos pair: ceil(d_model / 2) entries.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # the cosine half must be trimmed to d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pe = pe.unsqueeze(0)  # add a leading batch dimension: [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``seq_len`` positions.

        Args:
            x: Tensor of shape ``[batch_size, seq_len, d_model]``.

        Raises:
            ValueError: If ``seq_len`` exceeds the number of precomputed positions.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            # Without this check the addition either fails with an opaque
            # broadcasting error or, when max_len == 1, silently broadcasts
            # position 0 to every token.
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum supported length {max_len}"
            )
        return x + self.pe[:, :seq_len, :]

In [None]:
class Bert(nn.Module):
    """Transformer-encoder text classifier with two output classes.

    Pipeline: token embedding -> positional encoding -> ``num_layers``
    encoder layers (self-attention and feed-forward, each with a residual
    connection followed by layer normalisation) -> projection to
    ``vocab_size`` -> mean over the sequence -> linear layer to 2 logits.

    Args:
        vocab_size: Number of token ids in the embedding table.
        d_model: Embedding / hidden size.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        d_ff: Hidden size of each feed-forward block.
        max_len: Maximum sequence length passed to the positional encoding.
        dropout: Dropout probability passed to attention and feed-forward blocks.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, max_len=100, dropout=0.1):
        super(Bert, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, d_model)
        self.position_encoding = PositionalEncoding(d_model, max_len)

        # Each encoder layer is an (attention, feed-forward) pair.
        self.encoder_layers = nn.ModuleList([
            nn.ModuleList([MutiheadAttn(num_heads, d_model, dropout), FeedForward(d_model, d_ff, dropout)])
            for _ in range(num_layers)
        ])

        # NOTE(review): a single LayerNorm is shared by every sub-layer of every
        # encoder layer. Kept as-is so existing state_dict keys stay valid.
        self.layer_norm = nn.LayerNorm(d_model)
        self.output_layer = nn.Linear(d_model, vocab_size)
        self.final = nn.Linear(vocab_size, 2)
        # Not used by forward(); applied only by predict_proba().
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return unnormalised class scores (logits) of shape ``[batch_size, 2]``.

        Logits are returned rather than probabilities because
        ``nn.CrossEntropyLoss`` applies log-softmax itself; feeding it
        already-normalised probabilities squashes the gradients. ``argmax``
        over the result gives the same prediction either way. Use
        :meth:`predict_proba` when probabilities are needed.

        Args:
            x: Long tensor of token ids, shape ``[batch_size, seq_len]``.
        """
        x = self.embeddings(x)         # [batch_size, seq_len, d_model]
        x = self.position_encoding(x)  # add positional information

        for attn, ff in self.encoder_layers:
            # Each sub-layer is applied exactly once and added to its own
            # input, so the residual path carries the unmodified signal.
            x = self.layer_norm(x + attn(x))
            x = self.layer_norm(x + ff(x))

        x = self.output_layer(x)  # [batch_size, seq_len, vocab_size]
        x = x.mean(dim=1)         # [batch_size, vocab_size]
        return self.final(x)      # [batch_size, 2]

    def predict_proba(self, x):
        """Return class probabilities of shape ``[batch_size, 2]`` (softmax of the logits)."""
        return self.softmax(self.forward(x))

In [None]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd

In [None]:
class moodtxtDataset(Dataset):
    """Text-classification dataset read from a CSV with ``review`` and ``label`` columns.

    Each item is a pair ``(token_ids, label)`` of ``torch.long`` tensors,
    where ``token_ids`` is the review encoded, truncated and padded to
    ``max_len`` tokens.

    Args:
        file_path: Path of the CSV file.
        max_len: Length every encoded review is padded / truncated to.
        tokenizer: Object with a ``BertTokenizer``-compatible ``encode``
            method. When ``None``, the 'bert-base-chinese' tokenizer is loaded.
    """

    def __init__(self, file_path, max_len=100, tokenizer=None):
        self.data = pd.read_csv(file_path)
        # Empty CSV cells are read as NaN; turn them into empty strings so the
        # tokenizer always receives text.
        self.texts = self.data['review'].fillna('').astype(str).tolist()
        self.labels = self.data['label'].tolist()
        self.max_len = max_len
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        self.tokenizer = tokenizer

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` tensors for row ``idx``."""
        token_ids = self.tokenizer.encode(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        text = torch.tensor(token_ids, dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return text, label

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd

def _classification_metrics(labels, preds):
    """Return macro-averaged ``(precision, recall, f1)``; all 0.0 for empty input."""
    if len(labels) == 0:
        return 0.0, 0.0, 0.0
    return (
        precision_score(labels, preds, average='macro', zero_division=0),
        recall_score(labels, preds, average='macro', zero_division=0),
        f1_score(labels, preds, average='macro', zero_division=0),
    )


def _evaluate(model, loader, criterion, device):
    """Score ``model`` on ``loader`` without gradients.

    Returns ``(mean_loss, accuracy, precision, recall, f1)`` where loss and
    accuracy are averaged over the samples actually seen.
    """
    model.eval()
    total_loss, correct, seen = 0.0, 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for texts, labels in loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)

            total_loss += criterion(outputs, labels).item() * labels.size(0)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            seen += labels.size(0)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    denom = max(seen, 1)
    precision, recall, f1 = _classification_metrics(all_labels, all_preds)
    return total_loss / denom, correct / denom, precision, recall, f1


def train(args):
    """Train the classifier, evaluate it after every epoch and save the results.

    Reads the dataset at ``args.data_path``, splits it 80/20 with a fixed
    seed, trains for ``args.epoch`` epochs with Adam and cosine learning-rate
    annealing, writes ``model_epoch_<n>.pth`` after each epoch and finally
    writes ``epoch_training_metrics.csv`` and ``batch_training_metrics.csv``.

    Args:
        args: Namespace providing ``data_path``, ``batch_size``, ``epoch``,
            ``lr``, ``print_freq``, ``vocab_size``, ``d_model``, ``num_heads``,
            ``num_layers``, ``d_ff``, ``max_len`` and ``dropout``.

    Raises:
        ValueError: If the training split is smaller than one batch, so that
            ``drop_last=True`` leaves nothing to train on.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    use_cuda = device.type == "cuda"

    # Build the full dataset and show which labels occur.
    full_dataset = moodtxtDataset(args.data_path)
    print(set(full_dataset.data['label']))

    # 80/20 train/test split with a fixed seed for reproducibility.
    train_size = int(0.8 * len(full_dataset))
    test_size = len(full_dataset) - train_size
    train_dataset, test_dataset = random_split(
        full_dataset,
        [train_size, test_size],
        generator=torch.Generator().manual_seed(42)
    )

    # Never request more worker processes than the machine has CPUs, and only
    # pin host memory when the batches are actually copied to a GPU.
    cpu_count = os.cpu_count() or 1
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True,
                              num_workers=min(48, cpu_count), pin_memory=use_cuda, drop_last=True)
    # Evaluation metrics do not depend on sample order, so no shuffling is needed.
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False,
                             num_workers=min(16, cpu_count), pin_memory=use_cuda)

    if len(train_loader) == 0:
        raise ValueError(
            f"training split has {len(train_dataset)} samples, fewer than one batch of "
            f"{args.batch_size}; reduce --batch_size"
        )

    model = Bert(args.vocab_size, args.d_model, args.num_heads,
                 args.num_layers, args.d_ff, args.max_len, args.dropout).to(device)

    # Data-parallel over the GPUs that actually exist (at most four, as before).
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        model = torch.nn.DataParallel(model, device_ids=list(range(min(4, gpu_count))))

    # nn.CrossEntropyLoss applies log-softmax internally, i.e. it expects the
    # model to return unnormalised scores.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = CosineAnnealingLR(optimizer, T_max=args.epoch, eta_min=1e-6)

    epoch_columns = ["Epoch", "Train Loss", "Test Loss",
                     "Train Acc", "Test Acc", "Train F1", "Test F1",
                     "Train Precision", "Test Precision",
                     "Train Recall", "Test Recall", "Learning Rate"]
    batch_columns = ["Epoch", "Batch", "Loss", "Acc",
                     "Precision", "Recall", "F1", "Learning Rate"]
    # Records are collected in plain lists and turned into DataFrames once at
    # the end; concatenating a DataFrame per batch is quadratic.
    epoch_records, batch_records = [], []

    for epoch in range(args.epoch):
        # ======== Training phase ========
        model.train()
        # Learning rate in effect for this epoch (read before scheduler.step()).
        current_lr = optimizer.param_groups[0]['lr']
        epoch_train_loss, total_correct, total_seen = 0.0, 0, 0
        all_preds, all_labels = [], []

        for batch_idx, (texts, labels) in enumerate(train_loader):
            texts, labels = texts.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            # Per-batch metrics.
            batch_loss = loss.item()
            batch_size = texts.size(0)
            preds = torch.argmax(outputs, dim=1)
            batch_correct = (preds == labels).sum().item()
            batch_acc = batch_correct / batch_size

            labels_np = labels.cpu().numpy()
            preds_np = preds.cpu().numpy()
            batch_precision, batch_recall, batch_f1 = _classification_metrics(labels_np, preds_np)

            batch_records.append({
                "Epoch": epoch+1,
                "Batch": batch_idx+1,
                "Loss": batch_loss,
                "Acc": batch_acc,
                "Precision": batch_precision,
                "Recall": batch_recall,
                "F1": batch_f1,
                "Learning Rate": current_lr
            })

            # Accumulate epoch totals.
            total_correct += batch_correct
            total_seen += batch_size
            epoch_train_loss += batch_loss * batch_size
            all_preds.extend(preds_np)
            all_labels.extend(labels_np)

            if (batch_idx + 1) % args.print_freq == 0:
                print(f"Epoch [{epoch+1}/{args.epoch}] | Batch [{batch_idx+1}/{len(train_loader)}] | "
                      f"Loss: {batch_loss:.4f} | Acc: {batch_acc:.2%}")

        # drop_last=True skips the final partial batch, so average over the
        # samples that were actually processed, not over len(train_dataset).
        train_acc = total_correct / total_seen
        avg_train_loss = epoch_train_loss / total_seen
        train_precision, train_recall, train_f1 = _classification_metrics(all_labels, all_preds)

        # ======== Evaluation phase ========
        avg_test_loss, test_acc, test_precision, test_recall, test_f1 = _evaluate(
            model, test_loader, criterion, device
        )

        # Advance the learning-rate schedule for the next epoch.
        scheduler.step()

        # ======== Record results ========
        epoch_records.append({
            "Epoch": epoch+1,
            "Train Loss": avg_train_loss,
            "Test Loss": avg_test_loss,
            "Train Acc": train_acc,
            "Test Acc": test_acc,
            "Train F1": train_f1,
            "Test F1": test_f1,
            "Train Precision": train_precision,
            "Test Precision": test_precision,
            "Train Recall": train_recall,
            "Test Recall": test_recall,
            "Learning Rate": current_lr
        })

        print(f"\nEpoch {epoch+1} Summary:")
        print(f"Train Loss: {avg_train_loss:.4f} | Acc: {train_acc:.2%} | "
              f"Precision: {train_precision:.4f} | Recall: {train_recall:.4f} | F1: {train_f1:.4f}")
        print(f"Test  Loss: {avg_test_loss:.4f} | Acc: {test_acc:.2%} | "
              f"Precision: {test_precision:.4f} | Recall: {test_recall:.4f} | F1: {test_f1:.4f}")
        print(f"Learning Rate: {current_lr:.2e}\n")

        # Save the unwrapped weights so the checkpoint loads without DataParallel.
        torch.save(model.module.state_dict() if hasattr(model, 'module') else model.state_dict(),
                   f"model_epoch_{epoch+1}.pth")

    # Persist the collected metrics.
    epoch_result_df = pd.DataFrame(epoch_records, columns=epoch_columns)
    batch_result_df = pd.DataFrame(batch_records, columns=batch_columns)
    epoch_result_df.to_csv('epoch_training_metrics.csv', index=False)
    batch_result_df.to_csv('batch_training_metrics.csv', index=False)

In [None]:
# parse_known_args() returns the parsed namespace plus the list of arguments it
# did not recognise, instead of exiting on them.
# NOTE(review): presumably used because the notebook kernel puts its own
# entries in sys.argv — confirm.
args, unknown_args = parser.parse_known_args()

In [None]:
# Run training with the parsed configuration.
train(args)

In [None]:
def predict(text, weights_path='model_epoch_2.pth', max_length=100):
    """Classify a single text with a saved checkpoint.

    A fresh model is built from the global ``args``, the weights at
    ``weights_path`` are loaded onto the CPU, and the text is encoded with
    the global ``tokenizer``. The raw model output is printed.

    Args:
        text: The string to classify.
        weights_path: Path of the state_dict file to load.
        max_length: Length the encoded text is padded / truncated to.

    Returns:
        Tensor of shape ``[1]`` holding the index of the highest-scoring class.
    """
    model = Bert(args.vocab_size, args.d_model, args.num_heads,
                 args.num_layers, args.d_ff, args.max_len, args.dropout)

    # map_location='cpu' lets a checkpoint saved from GPU tensors load on a
    # machine without CUDA; the model built above lives on the CPU anyway.
    # NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
    state_dict = torch.load(weights_path, map_location='cpu')
    model.load_state_dict(state_dict)

    # Evaluation mode disables dropout.
    model.eval()

    with torch.no_grad():
        token_ids = tokenizer.encode(text,
                                     add_special_tokens=True,
                                     max_length=max_length,
                                     padding='max_length',
                                     truncation=True)
        batch = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)  # [1, max_length]

        output = model(batch)
        print(output)
        preds = torch.argmax(output, dim=-1)

    return preds


predict('')