# 下载bert-base-chinese

In [14]:
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from transformers import AutoTokenizer, AutoModel

# 设置模型保存目录
save_directory = "./bert-base-chinese-local"

# 检查模型是否已存在
if os.path.exists(os.path.join(save_directory, "tokenizer_config.json")) and \
   os.path.exists(os.path.join(save_directory, "config.json")):
    print(f"本地模型已存在于 {save_directory}，直接加载")
    tokenizer = AutoTokenizer.from_pretrained(save_directory, local_files_only=True)
    model = AutoModel.from_pretrained(save_directory, local_files_only=True)
    print("模型已从本地成功加载")
else:
    print(f"警告：本地模型目录 {save_directory} 不完整，请确保已正确下载")

本地模型已存在于 ./bert-base-chinese-local，直接加载
模型已从本地成功加载


# 1

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import sys

# 修改: 添加项目根目录到Python路径 (适应本地环境)
project_root = os.path.dirname(os.path.abspath('__file__'))
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added {project_root} to system path")

# 打印当前工作目录和Python路径，用于调试
print(f"Current working directory: {os.getcwd()}")
print(f"Python path: {sys.path}")

# 检查src目录是否存在
src_path = os.path.join(project_root, 'src')
if os.path.exists(src_path):
    print(f"src directory found: {src_path}")
else:
    print(f"Warning: src directory not found at {src_path}")

# 显示src/models目录的内容
models_path = os.path.join(src_path, 'models')
if os.path.exists(models_path):
    print(f"Models directory found: {models_path}")
    print("Files in models directory:")
    for f in os.listdir(models_path):
        print(f"  - {f}")
else:
    print(f"Warning: models directory not found at {models_path}")

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Added /root/Code/Multi-task Project to system path
Current working directory: /root/Code/Multi-task Project
Python path: ['/root/miniconda3/envs/multi/lib/python310.zip', '/root/miniconda3/envs/multi/lib/python3.10', '/root/miniconda3/envs/multi/lib/python3.10/lib-dynload', '', '/root/miniconda3/envs/multi/lib/python3.10/site-packages', '/root/Code/Multi-task Project']
src directory found: /root/Code/Multi-task Project/src
Models directory found: /root/Code/Multi-task Project/src/models
Files in models directory:
  - core_seq2seq.py
  - __init__.py
  - __pycache__


# 2

In [9]:
import json
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from IPython.display import display, HTML
import sys

# 添加项目根目录到Python路径
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

# 导入我们自己的Seq2SeqTransformer模型
from src.models.core_seq2seq import Seq2SeqTransformer, PositionalEncoding

# 设置随机种子以保证可重复性
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# 检测设备
def get_device():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"使用GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU内存总量: {torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024:.2f} GB")
        print(f"可用GPU数量: {torch.cuda.device_count()}")
    else:
        device = torch.device("cpu")
        print("使用CPU")
    
    return device

# 流式加载JSON数据并随机抽样
def load_json_data_with_sampling(file_path, sample_percentage=10, max_samples=None, task_types=None):
    sample_prob = sample_percentage / 100
    sampled_data = []
    line_count = 0
    
    print(f"Reading file and sampling {sample_percentage}% of data...")
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            line_count += 1
            
            if random.random() <= sample_prob:
                try:
                    item = json.loads(line.strip())
                    
                    if task_types is not None and item.get('type') not in task_types:
                        continue
                        
                    sampled_data.append(item)
                    
                    if max_samples and len(sampled_data) >= max_samples:
                        break
                        
                except json.JSONDecodeError:
                    continue
    
    print(f"Total lines read: {line_count}")
    print(f"Sampled data size: {len(sampled_data)}")
    
    return sampled_data

# 自定义数据集类 - 修改以适应Seq2SeqTransformer模型
class MultiTaskDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512, pad_idx=0):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_idx = pad_idx
        
        # 收集所有可能的标签
        self.all_labels = set()
        for item in data:
            if 'answer_choices' in item and item['answer_choices']:
                for choice in item['answer_choices']:
                    self.all_labels.add(choice)
        self.label_map = {label: i for i, label in enumerate(sorted(self.all_labels))}
        print(f"Found {len(self.label_map)} unique labels")
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        try:
            item = self.data[idx]
            input_text = item['input']
            target_text = item['target']
            task_type = item['type']
            
            # 针对不同任务类型的处理
            if task_type == 'nli':
                parts = input_text.split('，')
                if len(parts) > 1:
                    premise = parts[0].strip()
                    hypothesis_parts = parts[1:]
                    hypothesis = '，'.join(hypothesis_parts).split('？')[0].strip() if '？' in parts[-1] else '，'.join(hypothesis_parts).strip()
                    input_text = f"{premise} [SEP] {hypothesis}"
            
            # 数据增强 - 对于分类任务可以添加一些随机噪声或替换
            if task_type == 'classify' and random.random() < 0.2:
                words = list(input_text)
                # 随机替换、删除或插入字符
                for i in range(min(5, len(words))):
                    pos = random.randint(0, len(words) - 1)
                    if random.random() < 0.5:  # 替换
                        words[pos] = random.choice("的地得了吗呢啊哦嗯呀哈")
                    else:  # 删除
                        if len(words) > 10:  # 确保不会删除太多
                            words.pop(pos)
                            break
                input_text = ''.join(words)
            
            # 使用更有效的tokenizer处理方式
            encoding = self.tokenizer(
                input_text, 
                padding='max_length',
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt',
                return_token_type_ids=True  # 特别对NLI任务有用
            )
            
            # 处理目标文本为ID
            answer_choices = item.get('answer_choices', [])
            
            # 处理标签
            if task_type in ['classify', 'nli'] and answer_choices:
                try:
                    if target_text in self.label_map:
                        label_idx = self.label_map[target_text]
                    elif target_text in answer_choices:
                        label_idx = answer_choices.index(target_text)
                    else:
                        label_idx = 0
                    label = torch.tensor(label_idx, dtype=torch.long)
                except ValueError:
                    label = torch.tensor(0, dtype=torch.long)
            else:
                label = torch.tensor(-100, dtype=torch.long)
            
            for key in encoding:
                encoding[key] = encoding[key].squeeze(0)
            
            # 为BERT模型准备必要的数据
            return {
                'input_ids': encoding['input_ids'],
                'attention_mask': encoding['attention_mask'],
                'token_type_ids': encoding.get('token_type_ids', None),
                'src_padding_mask': (encoding['input_ids'] == self.pad_idx),
                'label': label,
                'task_type': task_type,
                'target_text': target_text,
                'answer_choices': answer_choices
            }
        except Exception as e:
            print(f"Error processing item at index {idx}: {e}")
            # 返回一个默认项，避免批处理失败
            return None

def custom_collate_fn(batch):
    """处理不同大小的批次数据"""
    # 过滤掉None值
    batch = [item for item in batch if item is not None]
    
    if len(batch) == 0:
        return {}
    
    # 收集所有键
    keys = batch[0].keys()
    
    result = {}
    for key in keys:
        if key == 'task_type' or key == 'target_text' or key == 'answer_choices':
            # 对于字符串或列表类型的字段，直接添加到结果中
            result[key] = [item[key] for item in batch]
        elif isinstance(batch[0][key], torch.Tensor):
            # 检查所有项目的形状是否一致
            shapes = [item[key].shape for item in batch]
            if all(shape == shapes[0] for shape in shapes):
                # 如果所有形状一致，则进行常规的堆叠
                result[key] = torch.stack([item[key] for item in batch])
            else:
                # 如果形状不一致，则进行填充或其他处理
                # 这里我们使用最大长度进行填充
                max_len = max(shape[0] for shape in shapes)
                padded_tensors = []
                for item in batch:
                    tensor = item[key]
                    if tensor.shape[0] < max_len:
                        padding = torch.zeros(max_len - tensor.shape[0], *tensor.shape[1:], 
                                             dtype=tensor.dtype, device=tensor.device)
                        tensor = torch.cat([tensor, padding], dim=0)
                    padded_tensors.append(tensor)
                result[key] = torch.stack(padded_tensors)
    
    return result

# 数据加载器创建函数
def create_dataloaders(train_data, val_data, tokenizer, batch_size=8):
    train_dataset = MultiTaskDataset(train_data, tokenizer)
    val_dataset = MultiTaskDataset(val_data, tokenizer)
    
    # 使用num_workers加速数据加载
    num_workers = min(4, os.cpu_count())
    
    train_loader = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        num_workers=num_workers,
        pin_memory=True,  # GPU加速
        collate_fn=custom_collate_fn  # 使用自定义的collate函数
    )
    
    val_loader = DataLoader(
        val_dataset, 
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=True,  # GPU加速
        collate_fn=custom_collate_fn  # 使用自定义的collate函数
    )
    
    return train_loader, val_loader, train_dataset.label_map

# 使用预训练的BERT替代自定义编码器
class MultitaskBertModel(nn.Module):
    def __init__(self, num_labels=2):
        super(MultitaskBertModel, self).__init__()
        
        # 加载预训练的BERT模型
        self.bert = AutoModel.from_pretrained("./bert-base-chinese-local")
        
        # 任务特定的头部
        self.classify_head = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.nli_head = nn.Linear(self.bert.config.hidden_size, num_labels)
        
        # 初始化权重
        nn.init.xavier_uniform_(self.classify_head.weight)
        nn.init.xavier_uniform_(self.nli_head.weight)
    
    def forward(self, input_ids, attention_mask=None, token_type_ids=None, task_type=None):
        # 使用BERT提取特征
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        # 获取[CLS]的表示用于分类
        pooled_output = outputs.pooler_output
        
        if isinstance(task_type, list):
            # 批处理多个样本时使用
            results = []
            for i, t in enumerate(task_type):
                if t == 'classify':
                    results.append(self.classify_head(pooled_output[i:i+1]))
                elif t == 'nli':
                    results.append(self.nli_head(pooled_output[i:i+1]))
                else:
                    results.append(self.classify_head(pooled_output[i:i+1]))
            return torch.cat(results, dim=0)
        else:
            # 单个样本或同类型批次处理
            if task_type == 'classify':
                return self.classify_head(pooled_output)
            elif task_type == 'nli':
                return self.nli_head(pooled_output)
            else:
                return self.classify_head(pooled_output)

# 训练函数 - 优化以适用于GPU
def train(model, train_loader, val_loader, optimizer, scheduler, device, best_model_path, num_epochs=3, eval_steps=100):
    best_val_score = 0
    global_step = 0
    scaler = torch.cuda.amp.GradScaler()  # 使用混合精度训练
    patience = 3  # 早停的耐心参数
    early_stopping_counter = 0  # 早停计数器
    
    # 用于Kaggle输出的结果记录
    results_history = []
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        
        for batch_idx, batch in enumerate(progress_bar):
            # 检查批次是否为空
            if not batch or len(batch) == 0:
                print("跳过空批次")
                continue
                
            # 检查输入IDs是否存在
            if 'input_ids' not in batch:
                print(f"批次 {batch_idx} 中没有 input_ids，跳过")
                continue
        
        for batch_idx, batch in enumerate(progress_bar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch.get('attention_mask', None)
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)
            src_padding_mask = batch.get('src_padding_mask', None)
            if src_padding_mask is not None:
                src_padding_mask = src_padding_mask.to(device)
            labels = batch['label'].to(device)
            task_types = batch['task_type']
            
            optimizer.zero_grad()
            
            # 仅处理有效样本
            valid_indices = (labels != -100).nonzero(as_tuple=True)[0]
            
            if len(valid_indices) > 0:
                batch_input_ids = input_ids[valid_indices]
                batch_attention_mask = attention_mask[valid_indices] if attention_mask is not None else None
                batch_src_padding_mask = src_padding_mask[valid_indices] if src_padding_mask is not None else None
                batch_labels = labels[valid_indices]
                batch_task_types = [task_types[i] for i in valid_indices]
                
                # 使用混合精度训练
                with torch.cuda.amp.autocast():
                    # 获取token_type_ids并移至设备
                    token_type_ids = batch.get('token_type_ids', None)
                    if token_type_ids is not None:
                        # 先将索引移到与token_type_ids相同的设备上 (CPU)
                        cpu_valid_indices = valid_indices.cpu()
                        # 使用CPU上的索引切片CPU上的张量
                        token_type_ids = token_type_ids[cpu_valid_indices]
                        # 然后将结果移到目标设备
                        token_type_ids = token_type_ids.to(device)
                        
                    outputs = model(
                        input_ids=batch_input_ids,
                        attention_mask=batch_attention_mask,
                        token_type_ids=token_type_ids,
                        task_type=batch_task_types
                    )
                    
                    # 为不同任务设置权重
                    task_weights = {
                        'classify': 1.0,
                        'nli': 1.5  # 如果NLI任务较难，可以给它更高的权重
                    }
                    
                    # 计算每个样本的任务权重
                    sample_weights = torch.tensor([task_weights.get(t, 1.0) for t in batch_task_types], 
                                            device=device)
                    
                    loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
                    losses = loss_fn(outputs, batch_labels)
                    # 应用任务权重
                    loss = (losses * sample_weights).mean()
                
                # 使用混合精度训练优化反向传播
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
                
                scheduler.step()
                
                total_loss += loss.item()
                global_step += 1
            
            progress_bar.set_postfix({"Loss": f"{total_loss/(batch_idx+1):.4f}"})
            
            if global_step % eval_steps == 0:                
                val_results = evaluate(model, val_loader, device)
                results_history.append({
                    'step': global_step,
                    'epoch': epoch+1,
                    'val_results': val_results
                })
                
                # 在Kaggle中显示美观的验证结果
                display(HTML(f"""
                <h4>Validation Results at Step {global_step}</h4>
                <ul>
                    <li>Overall Accuracy: {val_results.get('overall_accuracy', 0):.4f}</li>
                    <li>Classify Accuracy: {val_results.get('classify_accuracy', 0):.4f}</li>
                    <li>NLI Accuracy: {val_results.get('nli_accuracy', 0):.4f}</li>
                </ul>
                """))
                
                model.train()
                
                val_score = sum(val_results.values()) / len(val_results) if val_results else 0
                if val_score > best_val_score:
                    best_val_score = val_score
                    torch.save(model.state_dict(), best_model_path)
                    print(f"New best model saved with score: {best_val_score:.4f} at {best_model_path}")
        
        # 每个epoch结束后的评估
        val_results = evaluate(model, val_loader, device)
        results_history.append({
            'epoch': epoch+1,
            'val_results': val_results
        })
        
        display(HTML(f"""
        <h3>End of Epoch {epoch+1} Validation Results</h3>
        <ul>
            <li>Overall Accuracy: {val_results.get('overall_accuracy', 0):.4f}</li>
            <li>Classify Accuracy: {val_results.get('classify_accuracy', 0):.4f}</li>
            <li>NLI Accuracy: {val_results.get('nli_accuracy', 0):.4f}</li>
        </ul>
        """))
        
        val_score = sum(val_results.values()) / len(val_results) if val_results else 0
        if val_score > best_val_score:
            best_val_score = val_score
            torch.save(model.state_dict(), best_model_path)
            print(f"New best model saved with score: {best_val_score:.4f} at {best_model_path}")
            
        # GPU内存回收
        torch.cuda.empty_cache()
    
    return results_history

# 评估函数保持不变
def evaluate(model, dataloader, device):
    model.eval()
    
    all_preds = []
    all_labels = []
    all_task_types = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch.get('attention_mask', None)
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)
            src_padding_mask = batch.get('src_padding_mask', None)
            if src_padding_mask is not None:
                src_padding_mask = src_padding_mask.to(device)
            labels = batch['label'].to(device)
            task_types = batch['task_type']
            
            valid_indices = (labels != -100).nonzero(as_tuple=True)[0]
            
            if len(valid_indices) > 0:
                batch_input_ids = input_ids[valid_indices]
                batch_attention_mask = attention_mask[valid_indices] if attention_mask is not None else None
                batch_src_padding_mask = src_padding_mask[valid_indices] if src_padding_mask is not None else None
                batch_labels = labels[valid_indices]
                batch_task_types = [task_types[i] for i in valid_indices]
                
                # 使用混合精度进行推理
                with torch.cuda.amp.autocast():
                    # 获取token_type_ids并移至设备
                    token_type_ids = batch.get('token_type_ids', None)
                    if token_type_ids is not None:
                        # 先将索引移到与token_type_ids相同的设备上 (CPU)
                        cpu_valid_indices = valid_indices.cpu()
                        # 使用CPU上的索引切片CPU上的张量
                        token_type_ids = token_type_ids[cpu_valid_indices]
                        # 然后将结果移到目标设备
                        token_type_ids = token_type_ids.to(device)
                        
                    outputs = model(
                        input_ids=batch_input_ids,
                        attention_mask=batch_attention_mask,
                        token_type_ids=token_type_ids,
                        task_type=batch_task_types
                    )
                
                preds = torch.argmax(outputs, dim=-1)
                
                # 转移到CPU以进行后续处理
                preds = preds.cpu().numpy()
                batch_labels = batch_labels.cpu().numpy()
                
                all_preds.extend(preds)
                all_labels.extend(batch_labels)
                all_task_types.extend(batch_task_types)
    
    results = {}
    if all_preds:
        overall_acc = accuracy_score(all_labels, all_preds)
        results['overall_accuracy'] = overall_acc
        
        # 计算F1分数
        f1 = f1_score(all_labels, all_preds, average='weighted')
        results['overall_f1'] = f1
    
    # 计算任务特定指标
    task_metrics = {}
    for task_type in set(all_task_types):
        task_indices = [i for i, t in enumerate(all_task_types) if t == task_type]
        if task_indices:
            task_preds = [all_preds[i] for i in task_indices]
            task_labels = [all_labels[i] for i in task_indices]
            task_acc = accuracy_score(task_labels, task_preds)
            task_f1 = f1_score(task_labels, task_preds, average='weighted')
            
            results[f'{task_type}_accuracy'] = task_acc
            results[f'{task_type}_f1'] = task_f1
            
            task_metrics[task_type] = {
                'accuracy': task_acc,
                'f1': task_f1,
                'support': len(task_indices)
            }
    
    # 在Kaggle中显示详细结果表格
    if task_metrics:
        metrics_df = pd.DataFrame.from_dict(task_metrics, orient='index')
        display(HTML("<h4>Detailed Task Metrics</h4>"))
        display(metrics_df)
    
    return results

def main():
    # 参数设置
    # 检测运行环境并设置相应的路径
    is_kaggle = os.path.exists('/kaggle') and '/kaggle/working' in os.getcwd()

    # 确保使用正确的项目根目录
    project_root = os.path.dirname(os.path.abspath('__file__'))
    possible_paths = [
        os.path.dirname(os.path.abspath('__file__')),  # 当前文件所在目录
        os.getcwd(),  # 当前工作目录
        '/root/Code/Multi-task Project'  # 从错误消息推断的完整路径
    ]

    for path in possible_paths:
        if os.path.exists(path):
            project_root = path
            break

    print(f"Using project root: {project_root}")

    # 定义best_model_path变量
    if is_kaggle:
        model_path = '/kaggle/working/best_multitask_transformer.pt'
        best_model_path = model_path  # 添加这行定义best_model_path
        output_dir = '/kaggle/working/outputs'
        data_file = '/kaggle/input/multitask-data/data.jsonl'  # 根据实际路径调整
    else:
        # 本地环境 - 尝试多种可能的模型路径
        model_paths = [
            os.path.join(project_root, "outputs", "transformer_model", "best_multitask_transformer.pt"),
            os.path.join(project_root, "test_outputs", "multitask_model.pth"),
            os.path.join(project_root, "best_multitask_transformer.pt"),  # 检查根目录
            "./best_multitask_transformer.pt"  # 检查当前目录
        ]
        
        model_path = None
        for path in model_paths:
            if os.path.exists(path):
                model_path = path
                break
        
        if model_path is None:
            # 如果所有路径都不存在，使用默认路径但创建一个新模型
            model_path = os.path.join(project_root, "outputs", "transformer_model", "best_multitask_transformer.pt")
            print(f"警告: 没有找到现有模型，将创建一个新模型实例")
        
        best_model_path = model_path  # 添加这行定义best_model_path
            
        # 设置本地环境的输出目录和数据文件路径
        output_dir = os.path.join(project_root, "outputs", "transformer_model")
        data_file = os.path.join(project_root, "data/raw", "pCLUE_train_3.json") 

    print(f"使用模型路径: {model_path}")
    
    sample_percentage = 80
    max_samples = 45000
    batch_size = 32  # 增加批次大小
    epochs = 8  # 增加训练轮数
    lr = 5e-6  # 降低学习率
    weight_decay = 0.01  # 添加权重衰减
    eval_steps = 200
    seed = 42
    task_types = ["classify", "nli"]
    
    # 确保输出目录存在
    os.makedirs(output_dir, exist_ok=True)
    
    # 检查数据文件是否存在
    if not os.path.exists(data_file):
        raise FileNotFoundError(f"数据文件 '{data_file}' 不存在，请检查路径")
    
    set_seed(seed)
    
    # 获取设备
    device = get_device()
    
    # 显示开始信息
    display(HTML("<h2>Starting Multitask Transformer Training with Shared Encoder</h2>"))
    display(HTML(f"""
    <ul>
        <li>Device: {device}</li>
        <li>Data File: {data_file}</li>
        <li>Output Directory: {output_dir}</li>
        <li>Sample Percentage: {sample_percentage}%</li>
        <li>Max Samples: {max_samples}</li>
        <li>Batch Size: {batch_size}</li>
        <li>Epochs: {epochs}</li>
        <li>Learning Rate: {lr}</li>
    </ul>
    """))
    
    # 加载数据
    print("Loading and sampling data...")
    all_data = load_json_data_with_sampling(
        data_file,
        sample_percentage=sample_percentage,
        max_samples=max_samples,
        task_types=task_types
    )
    
    # 确保每个样本都有答案选项
    for item in all_data:
        if 'answer_choices' not in item or not item['answer_choices']:
            item['answer_choices'] = ["是的", "不是"] if item['type'] == 'nli' else ["选项A", "选项B"]
    
    # 分割数据集
    train_size = int(0.8 * len(all_data))
    random.shuffle(all_data)
    train_data = all_data[:train_size]
    val_data = all_data[train_size:]
    
    display(HTML(f"""
    <h4>Dataset Info</h4>
    <ul>
        <li>Total Samples: {len(all_data)}</li>
        <li>Training Samples: {len(train_data)}</li>
        <li>Validation Samples: {len(val_data)}</li>
    </ul>
    """))
    
    local_model_path = "./bert-base-chinese-local"
    if os.path.exists(os.path.join(local_model_path, "tokenizer_config.json")):
        print(f"从本地加载分词器: {local_model_path}")
        tokenizer = AutoTokenizer.from_pretrained(local_model_path, local_files_only=True)
    else:
        print(f"警告：本地分词器未找到，尝试降级方案")
        # 尝试使用相对路径
        alternative_paths = [
            os.path.join(project_root, "bert-base-chinese-local"),
            "/root/Code/Multi-task Project/bert-base-chinese-local"
        ]
        found_model = False
        for path in alternative_paths:
            if os.path.exists(os.path.join(path, "tokenizer_config.json")):
                print(f"从路径加载分词器: {path}")
                tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=True)
                found_model = True
                break
        
        if not found_model:
            raise ValueError("无法找到本地模型文件，请确认bert-base-chinese-local目录路径")
    
    # 创建数据加载器
    train_loader, val_loader, label_map = create_dataloaders(train_data, val_data, tokenizer, batch_size)
    
    # 计算标签数量和词汇表大小
    num_labels = len(label_map)
    vocab_size = tokenizer.vocab_size
    print(f"Vocabulary size: {vocab_size}")
    print(f"Using {num_labels} labels for classification")
    
    # 创建模型 - 使用我们的共享编码器MultitaskTransformer
    model = MultitaskBertModel(
        num_labels=num_labels
    )
    model.to(device)
    
    # 优化器和学习率调度器
    # 优化器和学习率调度器
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=total_steps // 5,  # 更多的预热步骤
        num_training_steps=total_steps
    )
    
    # 训练模型
    results_history = train(
        model, 
        train_loader, 
        val_loader, 
        optimizer, 
        scheduler, 
        device,
        best_model_path,  # 添加这个参数
        num_epochs=epochs, 
        eval_steps=eval_steps
    )
    
    # 最终评估
    if os.path.exists(best_model_path):
        print(f"加载最佳模型用于最终评估: {best_model_path}")
        model.load_state_dict(torch.load(best_model_path))
        model.to(device)
    else:
        print(f"警告：无法找到最佳模型文件 {best_model_path}，使用当前模型进行评估")
    
    final_results = evaluate(model, val_loader, device)
    
    display(HTML("<h2>Final Evaluation Results</h2>"))
    display(HTML(f"""
    <ul>
        <li>Overall Accuracy: {final_results.get('overall_accuracy', 0):.4f}</li>
        <li>Classify Accuracy: {final_results.get('classify_accuracy', 0):.4f}</li>
        <li>NLI Accuracy: {final_results.get('nli_accuracy', 0):.4f}</li>
        <li>Overall F1 Score: {final_results.get('overall_f1', 0):.4f}</li>
    </ul>
    """))
    
    # 保存标签映射
    label_map_inv = {v: k for k, v in label_map.items()}
    with open(os.path.join(output_dir, "label_map.json"), "w", encoding="utf-8") as f:
        json.dump(label_map_inv, f, ensure_ascii=False, indent=2)
    print("Label map saved for inference")
    
    # 保存模型配置信息
    config_info = {
        "vocab_size": vocab_size,
        "num_labels": num_labels,
        "task_types": task_types,
        "training_params": {
            "batch_size": batch_size,
            "epochs": epochs,
            "learning_rate": lr
        },
        "best_results": final_results
    }
    
    with open(os.path.join(output_dir, "model_config.json"), "w", encoding="utf-8") as f:
        json.dump(config_info, f, ensure_ascii=False, indent=2)
    print("Model configuration saved")

if __name__ == "__main__":
    main()

                


Using project root: /root/Code/Multi-task Project
使用模型路径: /root/Code/Multi-task Project/test_outputs/multitask_model.pth
使用GPU: NVIDIA GeForce RTX 4090
GPU内存总量: 23.65 GB
可用GPU数量: 1


Loading and sampling data...
Reading file and sampling 80% of data...


71884it [00:00, 73909.58it/s] 

Total lines read: 71885
Sampled data size: 45000





从本地加载分词器: ./bert-base-chinese-local
Found 139 unique labels
Found 139 unique labels
Vocabulary size: 21128
Using 139 labels for classification


  scaler = torch.cuda.amp.GradScaler()  # 使用混合精度训练
Epoch 1/8:   0%|          | 0/1125 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just 

Unnamed: 0,accuracy,f1,support
classify,0.209804,0.147402,5100
nli,0.00359,0.004098,3900


New best model saved with score: 0.0999 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.297647,0.20615,5100
nli,0.304359,0.206931,3900


New best model saved with score: 0.2571 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.343333,0.271776,5100
nli,0.313077,0.276802,3900


New best model saved with score: 0.3016 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.4,0.309727,5100
nli,0.33641,0.201629,3900


New best model saved with score: 0.3193 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.409412,0.307336,5100
nli,0.340769,0.250256,3900


New best model saved with score: 0.3290 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.418235,0.3282,5100
nli,0.339744,0.253041,3900


New best model saved with score: 0.3355 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


Epoch 2/8:   0%|          | 0/1125 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used

Unnamed: 0,accuracy,f1,support
classify,0.425294,0.34412,5100
nli,0.342051,0.266825,3900


New best model saved with score: 0.3468 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.517647,0.445059,5100
nli,0.343333,0.271763,3900


New best model saved with score: 0.3981 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.543333,0.479279,5100
nli,0.353333,0.257214,3900


New best model saved with score: 0.4184 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.561765,0.490952,5100
nli,0.345641,0.211202,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.572941,0.508185,5100
nli,0.355128,0.227196,3900


New best model saved with score: 0.4267 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.583922,0.52086,5100
nli,0.392051,0.316439,3900


New best model saved with score: 0.4586 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.574706,0.512669,5100
nli,0.433077,0.35115,3900


New best model saved with score: 0.4743 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


Epoch 3/8:   0%|          | 0/1125 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used

Unnamed: 0,accuracy,f1,support
classify,0.589804,0.528421,5100
nli,0.449487,0.415329,3900


New best model saved with score: 0.5016 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.589608,0.531436,5100
nli,0.514103,0.49031,3900


New best model saved with score: 0.5351 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.593333,0.551347,5100
nli,0.545128,0.538722,3900


New best model saved with score: 0.5585 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.601765,0.542774,5100
nli,0.556154,0.537504,3900


New best model saved with score: 0.5610 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.598235,0.544035,5100
nli,0.602308,0.594506,3900


New best model saved with score: 0.5865 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.601569,0.548366,5100
nli,0.611538,0.598852,3900


New best model saved with score: 0.5913 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


Epoch 4/8:   0%|          | 0/1125 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used

Unnamed: 0,accuracy,f1,support
classify,0.599412,0.543776,5100
nli,0.628462,0.621704,3900


New best model saved with score: 0.5997 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.596471,0.53682,5100
nli,0.624103,0.616921,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.60098,0.542192,5100
nli,0.608205,0.597903,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.598235,0.544215,5100
nli,0.652051,0.647189,3900


New best model saved with score: 0.6115 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.602353,0.547462,5100
nli,0.666923,0.667592,3900


New best model saved with score: 0.6210 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.604902,0.556669,5100
nli,0.666923,0.667655,3900


New best model saved with score: 0.6241 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.60902,0.551506,5100
nli,0.676667,0.676158,3900


New best model saved with score: 0.6285 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


Epoch 5/8:   0%|          | 0/1125 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used

Unnamed: 0,accuracy,f1,support
classify,0.610588,0.572502,5100
nli,0.654103,0.645732,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.606078,0.555209,5100
nli,0.676154,0.676298,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.603529,0.553261,5100
nli,0.674103,0.672359,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.607059,0.553378,5100
nli,0.677436,0.67704,3900


New best model saved with score: 0.6290 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.609216,0.555003,5100
nli,0.682821,0.682345,3900


New best model saved with score: 0.6326 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.609608,0.558493,5100
nli,0.680256,0.679858,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.611176,0.576956,5100
nli,0.684615,0.684895,3900


New best model saved with score: 0.6385 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


Epoch 6/8:   0%|          | 0/1125 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used

Unnamed: 0,accuracy,f1,support
classify,0.612745,0.56378,5100
nli,0.683077,0.68375,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.611765,0.561671,5100
nli,0.686154,0.686787,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.611961,0.566418,5100
nli,0.688462,0.688428,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.614706,0.562048,5100
nli,0.689744,0.689966,3900


New best model saved with score: 0.6391 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.612549,0.560946,5100
nli,0.691282,0.691357,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.610196,0.573484,5100
nli,0.689231,0.687849,3900


New best model saved with score: 0.6393 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


Epoch 7/8:   0%|          | 0/1125 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used

Unnamed: 0,accuracy,f1,support
classify,0.61451,0.570529,5100
nli,0.693846,0.694471,3900


New best model saved with score: 0.6428 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.610784,0.56325,5100
nli,0.694872,0.695146,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.615686,0.564347,5100
nli,0.69359,0.693504,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.617843,0.569016,5100
nli,0.694615,0.69487,3900


New best model saved with score: 0.6441 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.612353,0.56548,5100
nli,0.684615,0.682077,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.612745,0.567091,5100
nli,0.696154,0.695808,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.615882,0.575354,5100
nli,0.694615,0.695442,3900


New best model saved with score: 0.6448 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


Epoch 8/8:   0%|          | 0/1125 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used

Unnamed: 0,accuracy,f1,support
classify,0.616471,0.568442,5100
nli,0.697692,0.698321,3900


New best model saved with score: 0.6449 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.613922,0.563962,5100
nli,0.69641,0.696723,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.618824,0.57084,5100
nli,0.697179,0.697395,3900


New best model saved with score: 0.6459 at /root/Code/Multi-task Project/test_outputs/multitask_model.pth


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.615098,0.564044,5100
nli,0.696923,0.69683,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.61549,0.565885,5100
nli,0.699487,0.699847,3900


  with torch.cuda.amp.autocast():
Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after

Unnamed: 0,accuracy,f1,support
classify,0.61549,0.566198,5100
nli,0.700513,0.701016,3900


Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used

Unnamed: 0,accuracy,f1,support
classify,0.615686,0.566411,5100
nli,0.700513,0.701016,3900


加载最佳模型用于最终评估: /root/Code/Multi-task Project/test_outputs/multitask_model.pth


Evaluating:   0%|          | 0/282 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used

Unnamed: 0,accuracy,f1,support
classify,0.619412,0.571874,5100
nli,0.697179,0.697395,3900


Label map saved for inference
Model configuration saved


# 3

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer
import json
import sys
import os

# 添加项目根目录到Python路径
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

# 导入我们自己的Seq2SeqTransformer模型
from src.models.core_seq2seq import Seq2SeqTransformer

# 修改后的基于共享编码器的多任务模型
class MultitaskTransformer(nn.Module):
    def __init__(self, vocab_size, num_labels=2, d_model=512, nhead=8, 
                 num_encoder_layers=6, num_decoder_layers=2, 
                 dim_feedforward=2048, dropout=0.1, max_seq_length=512, 
                 pad_idx=0):
        super(MultitaskTransformer, self).__init__()
        
        # 实例化Seq2SeqTransformer作为共享编码器
        self.seq2seq = Seq2SeqTransformer(
            vocab_size=vocab_size,
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            max_seq_length=max_seq_length,
            pad_idx=pad_idx
        )
        
        # 任务特定的头部
        self.classify_head = nn.Linear(d_model, num_labels)
        self.nli_head = nn.Linear(d_model, num_labels)
        
        # 初始化权重
        nn.init.xavier_uniform_(self.classify_head.weight)
        nn.init.xavier_uniform_(self.nli_head.weight)
    
    def forward(self, input_ids, attention_mask=None, src_padding_mask=None, task_type=None):
        # 使用共享编码器提取特征 - 只使用encoder部分
        memory = self.seq2seq.encode(src=input_ids)
        
        # 获取[CLS]位置的表示，用于分类（对应于第一个token）
        pooled_output = memory[:, 0]
        
        if isinstance(task_type, list):
            # 批处理多个样本时使用
            results = []
            for i, t in enumerate(task_type):
                if t == 'classify':
                    results.append(self.classify_head(pooled_output[i:i+1]))
                elif t == 'nli':
                    results.append(self.nli_head(pooled_output[i:i+1]))
                else:
                    results.append(self.classify_head(pooled_output[i:i+1]))
            return torch.cat(results, dim=0)
        else:
            # 单个样本或同类型批次处理
            if task_type == 'classify':
                return self.classify_head(pooled_output)
            elif task_type == 'nli':
                return self.nli_head(pooled_output)
            else:
                return self.classify_head(pooled_output)

# Function to load the saved model
def load_model(model_path, vocab_size=21128, num_labels=139):
    # 创建模型实例
    model = MultitaskTransformer(
        vocab_size=vocab_size,
        num_labels=num_labels,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        num_decoder_layers=2,
        dim_feedforward=2048,
        dropout=0.1,
        max_seq_length=512
    )
    
    # 尝试加载现有模型权重
    if os.path.exists(model_path):
        print(f"加载现有模型权重: {model_path}")
        model.load_state_dict(torch.load(model_path))
    else:
        print(f"模型文件不存在: {model_path}，使用随机初始化的模型")
        
    model.to(device)
    model.eval()
    return model

# Function to load the tokenizer
def load_tokenizer(model_name="bert-base-chinese"):
    # 优先从本地加载
    local_model_path = "./bert-base-chinese-local"
    if os.path.exists(os.path.join(local_model_path, "tokenizer_config.json")):
        print(f"从本地加载分词器: {local_model_path}")
        return AutoTokenizer.from_pretrained(local_model_path, local_files_only=True)
    else:
        print(f"本地分词器未找到，尝试在线加载: {model_name}")
        return AutoTokenizer.from_pretrained(model_name)

# Function to process the input and make a prediction
def predict(model, tokenizer, input_data, device):
    # 准备输入
    input_text = input_data['input']
    answer_choices = input_data['answer_choices']
    task_type = input_data.get('type', 'classify')
    
    # 对输入进行tokenize
    encoding = tokenizer(input_text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

    # 将张量移至正确的设备
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    src_padding_mask = (input_ids == tokenizer.pad_token_id).to(device)
    
    # 进行预测
    with torch.no_grad():
        output = model(input_ids=input_ids, 
                      attention_mask=attention_mask,
                      src_padding_mask=src_padding_mask,
                      task_type=[task_type])
    
    # 获取预测的标签索引
    predicted_label_idx = torch.argmax(output, dim=-1).item()

    # 确保预测的索引在有效范围内
    if predicted_label_idx >= len(answer_choices):
        predicted_label_idx = len(answer_choices) - 1
    
    # 获取预测的标签
    predicted_label = answer_choices[predicted_label_idx]
    
    return predicted_label

# 加载模型和tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 检测运行环境并设置相应的路径
# 检测运行环境并设置相应的路径
is_kaggle = os.path.exists('/kaggle') and '/kaggle/working' in os.getcwd()

if is_kaggle:
    model_path = '/kaggle/working/best_multitask_transformer.pt'
else:
    # 本地环境 - 使用相对或绝对路径
    model_path = os.path.join(project_root, "test_outputs", "multitask_model.pth")
    # 如果模型不存在，尝试其他可能的位置
    if not os.path.exists(model_path):
        model_path = os.path.join(project_root, "outputs", "transformer_model", "best_multitask_transformer.pt")
    
print(f"Loading model from: {model_path}")
print(f"Checking if model file exists: {os.path.exists(model_path)}")

local_model_path = "./bert-base-chinese-local"
if os.path.exists(os.path.join(local_model_path, "tokenizer_config.json")):
    print(f"从本地加载分词器: {local_model_path}")
    tokenizer = AutoTokenizer.from_pretrained(local_model_path, local_files_only=True)
else:
    print("本地分词器未找到，尝试从在线获取")
    model_name = "bert-base-chinese"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

# 估计词汇表大小
vocab_size = tokenizer.vocab_size
num_labels = 139  # 从训练中获取的标签数量

# 加载模型
model = load_model(model_path, vocab_size, num_labels)

# 定义输入数据
input_data = {
    "input": '这是关于哪方面的新闻： 故事,文化,娱乐,体育,财经,房产,汽车,教育,科技,军事,旅游,国际,股票,农业,游戏?投票事件过后，王者荣耀今后的日子该怎样走？',
    "target": "123",
    "answer_choices": ["故事", "文化", "娱乐", "体育", "财经", "房产", "汽车", "教育", "科技", "军事", "旅游", "国际", "股票", "农业", "游戏"],
    "type": "classify"
}

# 预测
predicted_label = predict(model, tokenizer, input_data, device)
print(f"预测标签: {predicted_label}")


Loading model from: /root/Code/outputs/transformer_model/best_multitask_transformer.pt
Checking if model file exists: False
从本地加载分词器: ./bert-base-chinese-local
模型文件不存在: /root/Code/outputs/transformer_model/best_multitask_transformer.pt，使用随机初始化的模型
预测标签: 游戏


  output = torch._nested_tensor_from_mask(
