In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import numpy as np
import os
import random
import time
import datetime

class TextDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(sentences, truncation=True, padding='max_length', max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def set_seed(seed_value=42):
    """设置随机种子，确保结果可复现"""
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def load_data(file_path, tokenizer, max_length=128):
    """加载并处理数据，不使用pandas"""
    try:
        sentences = []
        labels = []
        
        with open(file_path, 'r', encoding='utf-8') as f:
            # 跳过标题行
            next(f)
            
            for line in f:
                line = line.strip()
                if not line:
                    continue
                    
                # 假设CSV格式为: sentence,label
                parts = line.split(',')
                if len(parts) >= 2:
                    sentence = ','.join(parts[:-1])  # 处理句子中可能包含的逗号
                    label = int(parts[-1])
                    sentences.append(sentence)
                    labels.append(label)
        
        dataset = TextDataset(sentences, labels, tokenizer, max_length)
        return dataset
    except Exception as e:
        print(f"数据加载错误: {e}")
        return None

def create_data_loader(dataset, batch_size, sampler_type='random'):
    """创建数据加载器"""
    if sampler_type == 'random':
        sampler = torch.utils.data.RandomSampler(dataset)
    else:
        sampler = torch.utils.data.SequentialSampler(dataset)
    
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)

def calculate_accuracy(preds, labels):
    """计算准确率，不使用sklearn"""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def calculate_metrics(preds, labels):
    """计算精确率、召回率和F1分数，不使用sklearn"""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    tp = np.sum((pred_flat == 1) & (labels_flat == 1))
    tn = np.sum((pred_flat == 0) & (labels_flat == 0))
    fp = np.sum((pred_flat == 1) & (labels_flat == 0))
    fn = np.sum((pred_flat == 0) & (labels_flat == 1))
    
    # 避免除零错误
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'accuracy': (tp + tn) / len(labels_flat),
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def calculate_metrics_multiclass(preds, labels):
    """计算多分类任务的指标"""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    # 简单的准确率计算
    accuracy = np.sum(pred_flat == labels_flat) / len(labels_flat)
    
    # 类别统计
    classes = np.unique(np.concatenate([pred_flat, labels_flat]))
    print(f"发现的类别: {classes}")
    
    return {
        'accuracy': accuracy,
        'precision': 0,  # 多分类下需要更复杂的计算
        'recall': 0,     # 同上
        'f1': 0          # 同上
    }

def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, device, epochs, save_path):
    """训练模型"""
    total_t0 = time.time()
    best_val_loss = float('inf')
    
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    
    for epoch_i in range(0, epochs):
        print(f'======== Epoch {epoch_i + 1} / {epochs} ========')
        print('Training...')
        
        t0 = time.time()
        total_train_loss = 0
        
        model.train()
        
        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print(f'  Batch {step} of {len(train_dataloader)}. Elapsed: {elapsed}')
            
            b_input_ids = batch['input_ids'].to(device)
            b_input_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)
            
            model.zero_grad()
            
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask, 
                            labels=b_labels)
            
            loss = outputs.loss
            total_train_loss += loss.item()
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        
        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)
        
        print(f"  Average training loss: {avg_train_loss:.4f}")
        print(f"  Training epoch took: {training_time}")
        
        print("\nRunning Validation...")
        t0 = time.time()
        
        model.eval()
        
        total_eval_loss = 0
        all_preds = []
        all_labels = []
        
        for batch in val_dataloader:
            b_input_ids = batch['input_ids'].to(device)
            b_input_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)
            
            with torch.no_grad():
                outputs = model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_input_mask,
                                labels=b_labels)
            
            loss = outputs.loss
            total_eval_loss += loss.item()
            
            logits = outputs.logits
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            
            all_preds.append(logits)
            all_labels.append(label_ids)
        
        all_preds = np.concatenate(all_preds, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)
        
        avg_val_loss = total_eval_loss / len(val_dataloader)
        metrics = calculate_metrics(all_preds, all_labels)
        validation_time = format_time(time.time() - t0)
        
        print(f"  Accuracy: {metrics['accuracy']:.4f}")
        print(f"  Validation Loss: {avg_val_loss:.4f}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1-Score: {metrics['f1']:.4f}")
        print(f"  Validation took: {validation_time}")
        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            model_path = os.path.join(save_path, 'best_model2.pt')
            torch.save(model.state_dict(), model_path)
            print(f"  Best model saved at: {model_path}")
    
    print(f"\nTraining complete! Total training took {format_time(time.time()-total_t0)}")
    return best_val_loss

def format_time(elapsed):
    """将时间格式化为 hh:mm:ss"""
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

def inspect_dataset(file_path):
    """检查数据集中的标签分布"""
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        next(f)  # 跳过标题行
        for line in f:
            parts = line.strip().split(',')
            if len(parts) >= 2:
                label = int(parts[-1])
                labels.append(label)
    
    unique_labels = np.unique(labels)
    counts = {label: labels.count(label) for label in unique_labels}
    
    print(f"文件 {file_path} 中的标签分布:")
    print(f"唯一标签值: {unique_labels}")
    print(f"标签计数: {counts}")
    
    return unique_labels, counts

def main():
    # 设置参数
    SEED = 42
    BATCH_SIZE = 16
    LEARNING_RATE = 2e-5
    EPSILON = 1e-8
    EPOCHS = 4
    MAX_LENGTH = 128
    SAVE_PATH = 'model_output'
    TRAIN_FILE = 'training.csv'
    VAL_FILE = 'validation.csv'
    
    set_seed(SEED)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=6,
        output_attentions=False,
        output_hidden_states=False,
        ignore_mismatched_sizes=True
    )
    
    model.to(device)
    
    print("Loading data...")
    train_dataset = load_data(TRAIN_FILE, tokenizer, MAX_LENGTH)
    val_dataset = load_data(VAL_FILE, tokenizer, MAX_LENGTH)
    
    if train_dataset is None or val_dataset is None:
        print("数据加载失败，程序退出")
        return
    
    train_labels, train_counts = inspect_dataset(TRAIN_FILE)
    val_labels, val_counts = inspect_dataset(VAL_FILE)

    # 确保模型配置与数据匹配
    num_labels = max(max(train_labels), max(val_labels)) + 1
    print(f"检测到的最大标签值: {num_labels-1}，设置num_labels={num_labels}")

    train_dataloader = create_data_loader(train_dataset, BATCH_SIZE, 'random')
    val_dataloader = create_data_loader(val_dataset, BATCH_SIZE, 'sequential')
    
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPSILON)
    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    
    print("Starting training...")
    best_val_loss = train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, 
                                device, EPOCHS, SAVE_PATH)
    
    print(f"Training completed with best validation loss: {best_val_loss:.4f}")

if __name__ == "__main__":
    main()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert-base-uncased and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu
Loading data...
文件 training.csv 中的标签分布:
唯一标签值: [0 1 2 3 4 5]
标签计数: {np.int64(0): 4665, np.int64(1): 5362, np.int64(2): 1304, np.int64(3): 2159, np.int64(4): 1937, np.int64(5): 572}
文件 validation.csv 中的标签分布:
唯一标签值: [0 1 2 3 4 5]
标签计数: {np.int64(0): 549, np.int64(1): 704, np.int64(2): 178, np.int64(3): 275, np.int64(4): 212, np.int64(5): 81}
检测到的最大标签值: 5，设置num_labels=6
Starting training...
Training...
  Batch 40 of 1000. Elapsed: 0:00:53
  Batch 80 of 1000. Elapsed: 0:01:42
  Batch 120 of 1000. Elapsed: 0:02:30
  Batch 160 of 1000. Elapsed: 0:03:19
  Batch 200 of 1000. Elapsed: 0:04:08
  Batch 240 of 1000. Elapsed: 0:04:58
  Batch 280 of 1000. Elapsed: 0:05:47
  Batch 320 of 1000. Elapsed: 0:06:36
  Batch 360 of 1000. Elapsed: 0:07:25
  Batch 400 of 1000. Elapsed: 0:08:14
  Batch 440 of 1000. Elapsed: 0:09:06
  Batch 480 of 1000. Elapsed: 0:09:54
  Batch 520 of 1000. Elapsed: 0:10:44
  Batch 560 of 1000. Elapsed: 0:11:32
  Batch 600 of 1000. Elapsed: 0:12:20
  Batch 640

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import os
from torch.utils.data import DataLoader  # 添加这一行

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(sentences, truncation=True, padding='max_length', max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def load_data(file_path, tokenizer, max_length=128):
    """加载并处理数据，不使用pandas"""
    try:
        sentences = []
        labels = []
        
        with open(file_path, 'r', encoding='utf-8') as f:
            # 跳过标题行
            next(f)
            
            for line in f:
                line = line.strip()
                if not line:
                    continue
                    
                # 假设CSV格式为: sentence,label
                parts = line.split(',')
                if len(parts) >= 2:
                    sentence = ','.join(parts[:-1])  # 处理句子中可能包含的逗号
                    label = int(parts[-1])
                    sentences.append(sentence)
                    labels.append(label)
        
        dataset = TextDataset(sentences, labels, tokenizer, max_length)
        return dataset
    except Exception as e:
        print(f"数据加载错误: {e}")
        return None

def create_data_loader(dataset, batch_size, sampler_type='random'):
    """创建数据加载器"""
    if sampler_type == 'random':
        sampler = torch.utils.data.RandomSampler(dataset)
    else:
        sampler = torch.utils.data.SequentialSampler(dataset)
    
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)

def calculate_metrics(preds, labels):
    """计算评估指标，不使用sklearn"""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    tp = np.sum((pred_flat == 1) & (labels_flat == 1))
    tn = np.sum((pred_flat == 0) & (labels_flat == 0))
    fp = np.sum((pred_flat == 1) & (labels_flat == 0))
    fn = np.sum((pred_flat == 0) & (labels_flat == 1))
    
    # 避免除零错误
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'accuracy': (tp + tn) / len(labels_flat),
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def evaluate_model(model, dataloader, device):
    """评估模型性能"""
    model.eval()
    
    all_preds = []
    all_labels = []
    
    for batch in dataloader:
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        
        with torch.no_grad():
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        all_preds.append(logits)
        all_labels.append(label_ids)
    
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    
    return calculate_metrics(all_preds, all_labels)

def predict_text(model, tokenizer, text, device, max_length=128):
    """预测单个文本的情感"""
    model.eval()
    
    inputs = tokenizer(text, truncation=True, padding='max_length', 
                      max_length=max_length, return_tensors='pt')
    
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=1)
    confidence, predicted_class = torch.max(probs, dim=1)
    
    predicted_class = predicted_class.item()
    confidence = confidence.item()
    probs = probs.cpu().numpy()[0]
    
    return {
        'predicted_class': predicted_class,
        'confidence': confidence,
        'probabilities': probs
    }

def main():
    # 设置参数
    MODEL_PATH = 'model_output/best_model2.pt'
    TEST_FILE = 'test.csv'
    MAX_LENGTH = 128
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=6,
        output_attentions=False,
        output_hidden_states=False,
        ignore_mismatched_sizes=True
    )
    
    if os.path.exists(MODEL_PATH):
        model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
        print(f"模型权重已加载: {MODEL_PATH}")
    else:
        print(f"模型文件不存在: {MODEL_PATH}")
        return
    
    model.to(device)
    
    print("Evaluating model...")
    test_dataset = load_data(TEST_FILE, tokenizer, MAX_LENGTH)
    
    if test_dataset is not None:
        test_dataloader = create_data_loader(test_dataset, batch_size=16, sampler_type='sequential')
        metrics = evaluate_model(model, test_dataloader, device)
        
        print("\nEvaluation Results:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"F1-Score: {metrics['f1']:.4f}")
    
    print("\nExample Predictions:")
    test_texts = [
        "This is a great product!",
        "I did not like the service at all.",
        "The food was delicious and the staff were friendly.",
        "I will never come back to this place again."
    ]
    
    for text in test_texts:
        result = predict_text(model, tokenizer, text, device, MAX_LENGTH)
        print(f"\nText: {text}")
        print(f"Predicted Class: {result['predicted_class']}")
        print(f"Confidence: {result['confidence']:.4f}")
        print(f"Probabilities: {result['probabilities']}")

if __name__ == "__main__":
    main()    

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu
模型权重已加载: model_output/best_model2.pt
Evaluating model...

Evaluation Results:
Accuracy: 0.6033
Precision: 0.9969
Recall: 1.0000
F1-Score: 0.9984

Example Predictions:

Text: This is a great product!
Predicted Class: 1
Confidence: 0.9974
Probabilities: [1.6023201e-04 9.9740666e-01 1.8177849e-03 1.9936159e-04 1.0628299e-04
 3.0962826e-04]

Text: I did not like the service at all.
Predicted Class: 3
Confidence: 0.5788
Probabilities: [0.08676189 0.2030707  0.10872705 0.5788078  0.01574844 0.00688421]

Text: The food was delicious and the staff were friendly.
Predicted Class: 1
Confidence: 0.9942
Probabilities: [1.3832860e-04 9.9415404e-01 5.0942395e-03 2.8245529e-04 9.3238341e-05
 2.3778724e-04]

Text: I will never come back to this place again.
Predicted Class: 0
Confidence: 0.8965
Probabilities: [0.8965042  0.01780757 0.00754229 0.06316152 0.0139906  0.00099382]
