Processing data

In [1]:
import pandas as pd
import csv
from io import StringIO

# NOTE(review): dataset_path is a placeholder and is not referenced anywhere
# else in the visible notebook; the CSV below is read from its own hard-coded path.
dataset_path = '/kaggle/input/your-dataset-folder/'
# Load train.csv into a DataFrame.
df = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')

# Print the raw 'prompt' value of the first row to inspect how it is serialized.
print(df.iloc[0]['prompt'])

#"xxx, yyy", "xxx" -> xxx,yyy xxx     
def merge_quoted_strings(s):
    """Flatten a comma-separated, optionally double-quoted string into one line.

    The text is parsed with the CSV reader (double-quote as quote character,
    whitespace after a delimiter skipped). Every parsed field is trimmed,
    empty fields are discarded, and the rest are joined with single spaces.

    Example: '"xxx, yyy", "xxx"' -> 'xxx, yyy xxx'

    Args:
        s: The raw string to parse.

    Returns:
        The surviving fields joined by one space ('' when there are none).
    """
    records = csv.reader(StringIO(s.strip()), quotechar='"', skipinitialspace=True)
    # Trim every field of every parsed record, lazily.
    trimmed = (field.strip() for record in records for field in record)
    return " ".join(piece for piece in trimmed if piece)

def preprocess(s):
    """Turn one serialized conversation cell into a single plain-text string.

    The raw cells are JSON arrays of strings (e.g. ``["turn 1","turn 2"]``), so
    they are decoded with the JSON parser. This handles escaped quotes, commas
    inside a turn, newlines and unicode escapes correctly, which the previous
    CSV-based parsing did not. Each turn is trimmed, empty or null turns are
    dropped, and the rest are joined with single spaces. Markdown bold markers
    (``**``) are removed from the result.

    Values that are not a valid JSON array fall back to the original
    best-effort path (bracket stripping + CSV-style merge + manual unescaping).
    The fallback strips only brackets that are actually present, instead of
    blindly cutting one leading and two trailing characters.

    Args:
        s: Raw cell value (a string).

    Returns:
        The cleaned text.
    """
    import json  # local import keeps this function self-contained

    try:
        turns = json.loads(s)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        turns = None

    if isinstance(turns, list):
        pieces = (str(turn).strip() for turn in turns if turn is not None)
        text = " ".join(piece for piece in pieces if piece)
        # JSON may contain unpaired surrogate escapes, which cannot be encoded
        # later on (e.g. by a tokenizer); replace them instead of crashing.
        text = text.encode("utf-8", "replace").decode("utf-8")
        return text.replace("**", "")

    # Legacy path for anything that is not a JSON array.
    raw = s.strip()
    if raw.startswith("[") and raw.endswith("]"):
        raw = raw[1:-1]
    text = merge_quoted_strings(raw)
    text = text.replace("\\n", "\n")
    text = text.replace("**", "")
    text = text.replace('\\"', '"')
    return text
    
# Run the same clean-up routine over each of the three text columns.
for text_column in ('prompt', 'response_a', 'response_b'):
    df[text_column] = df[text_column].apply(preprocess)

# Combine the winner indicator columns into one integer label:
# 1 * winner_model_a + 2 * winner_model_b + 0 * winner_tie.
outcome_flags = df[['winner_model_a', 'winner_model_b', 'winner_tie']]
df['labels'] = outcome_flags.dot([1, 2, 0])



["Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer."]


Install dependencies

In [2]:
!pip install bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.2


Model configuration

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    TrainingArguments, 
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import pandas as pd
from sklearn.metrics import accuracy_score


# Tunable parameters for the experiment.
# NOTE(review): MODEL_NAME, BATCH_SIZE, the LORA_* values and EPOCHS are not
# referenced by the visible cells below (the training cell hard-codes its own
# checkpoint, batch sizes and epoch count) — confirm whether they are still needed.
MODEL_NAME = "roberta-base"  # test model
MAX_LENGTH = 2048  # token length passed to the tokenizer in ComparisonDataset (inputs are long)
BATCH_SIZE = 4    
# LoRA hyper-parameters; presumably intended for the LoraConfig imported above.
LORA_R = 8
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
EPOCHS = 3

# 4-bit quantization settings to save memory: NF4 quantization type, double
# quantization enabled, bfloat16 compute dtype.
# NOTE(review): bnb_config is not passed to any model load in the visible cells.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

Dataset

In [4]:
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from torch.utils.data import random_split

# Authenticate with the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that no
# credential is ever stored in the notebook source. If the variable is not set,
# login() is called without a token and falls back to its own prompt.
import os

login(token=os.environ.get("HF_TOKEN"))

class ComparisonDataset(Dataset):
    """Dataset that turns each comparison row into tokenized model inputs.

    Every item packs the prompt and both responses into one marker-delimited
    string, tokenizes it to a fixed length and returns it together with the
    row's integer class label.

    Args:
        df: DataFrame providing the 'prompt', 'response_a', 'response_b' and
            'labels' columns.
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword arguments
            and must return a mapping with 'input_ids' and 'attention_mask'.
        max_length: Fixed token length of every item. When ``None`` (default)
            the module-level ``MAX_LENGTH`` is looked up at access time.
    """

    # Accepted class ids and their meaning; used to reject unexpected labels
    # before any tokenization work is done.
    LABEL_NAMES = {0: "tie", 1: "winner_model_a", 2: "winner_model_b"}

    def __init__(self, df, tokenizer, max_length=None):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the model inputs for the row at position ``idx``.

        Returns:
            dict with 1-D 'input_ids' and 'attention_mask' tensors and a scalar
            long tensor 'labels'.

        Raises:
            KeyError: if the row's label is not one of ``LABEL_NAMES``.
        """
        row = self.df.iloc[idx]

        label = int(row['labels'])
        if label not in self.LABEL_NAMES:
            raise KeyError(label)

        limit = MAX_LENGTH if self.max_length is None else self.max_length

        # NOTE(review): the three texts are concatenated before truncation, so
        # with right-side truncation a long prompt/response_a may crowd out
        # response_b — confirm this is acceptable for the task.
        text = f"[INST]{row['prompt']}[A1]{row['response_a']}[A2]{row['response_b']}[/INST]"
        encoding = self.tokenizer(
            text,
            max_length=limit,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # The tokenizer returns tensors with a leading batch axis; flatten them
        # so that the DataLoader can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }



Training

In [None]:
from torch import nn
from tqdm import tqdm

from transformers import (
    LongformerForMaskedLM,
    LongformerTokenizer,
    LongformerForSequenceClassification,
    LongformerTokenizerFast,
)

# Load the pretrained Longformer checkpoint with a 3-class classification head,
# together with the matching fast tokenizer.
model = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096',
    num_labels=3,
    attention_window=512,
    gradient_checkpointing=False,
)
tokenizer = LongformerTokenizerFast.from_pretrained(
    'allenai/longformer-base-4096',
    max_length=2048,
)

# Wrap the cleaned DataFrame so that rows are tokenized on access.
full_dataset = ComparisonDataset(df, tokenizer)
# Build one item up front; the result is discarded, so this only checks that
# item construction runs without raising.
full_dataset[0]
# 60% / 20% split sizes; the test split takes whatever remains so the three
# sizes always add up to the dataset size.
dataset_size = len(full_dataset)
train_size = int(0.6 * dataset_size)
val_size = int(0.2 * dataset_size)
test_size = dataset_size - train_size - val_size

# Fixed seed so the random split is the same on every run.
generator = torch.Generator().manual_seed(42)

train_dataset, val_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, val_size, test_size],
    generator=generator
)


from torch.optim import AdamW
import torch.nn.functional as F

def compute_loss(outputs, labels):
    """Return the cross-entropy loss between class scores and target labels.

    Args:
        outputs: Class scores (logits) produced by the model.
        labels: Target class indices.

    Returns:
        The loss tensor computed by ``F.cross_entropy`` with default settings.
    """
    loss = F.cross_entropy(input=outputs, target=labels)
    return loss

# Optimize every model parameter with AdamW at a learning rate of 1e-5.
optimizer = AdamW(model.parameters(), lr=1e-5)


def train_model(train_loader, val_loader, model, optimizer, num_epochs=3,
                save_path='longformer_classifier.pth'):
    """Train ``model`` with mixed precision and validate after every epoch.

    For each epoch the function runs one pass over ``train_loader`` (forward
    pass under autocast, scaled backward pass, optimizer step), prints the mean
    training loss, then evaluates on ``val_loader`` and prints accuracy and
    macro-averaged F1. After the last epoch the model's state dict is saved.

    Args:
        train_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        val_loader: Same batch format, used for validation.
        model: Model whose forward output exposes ``.logits``.
        optimizer: Optimizer bound to the model's parameters.
        num_epochs: Number of passes over ``train_loader``.
        save_path: File the final ``state_dict`` is written to.

    Returns:
        None. Progress is printed; the weights are written to ``save_path``.
    """
    # Only accuracy_score is imported at file level, so bring f1_score in here.
    from sklearn.metrics import f1_score

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Mixed precision is only used on CUDA. The scaler is created once so that
    # its adapted loss scale carries over between epochs.
    use_amp = device.type == 'cuda'
    scaler = torch.amp.GradScaler('cuda', enabled=use_amp)

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print("-" * 10)

        # Training phase
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = compute_loss(outputs.logits, labels)

            # Scale the loss before backward and let the scaler drive the
            # optimizer step.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Training Loss: {avg_train_loss}")

        # Validation phase
        model.eval()
        val_preds = []
        val_labels = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)

                # argmax must run on the logits tensor, not on the output object.
                preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
                val_preds.extend(preds)
                val_labels.extend(labels.cpu().numpy())

        val_acc = accuracy_score(val_labels, val_preds)
        # There are three classes, so the default binary averaging does not
        # apply; macro averaging weights every class equally.
        val_f1 = f1_score(val_labels, val_preds, average='macro')
        print(f"Validation Accuracy: {val_acc}")
        print(f"Validation F1 Score: {val_f1}")

    torch.save(model.state_dict(), save_path)
    print("Training complete!")

# Training batches: reshuffled on iteration, loaded by 4 worker processes.
# NOTE(review): the batch size is hard-coded here rather than taken from BATCH_SIZE.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=4
)

# Validation batches.
# NOTE(review): shuffle=True is not needed for evaluation — confirm it is intended.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=2,
    shuffle=True,
    num_workers=4
)


# Assumes train_loader and val_loader have already been defined.
train_model(train_loader, val_loader, model, optimizer, num_epochs=3)

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  scaler = torch.cuda.amp.GradScaler()


Epoch 1/3
----------


  with torch.cuda.amp.autocast():
Initializing global attention on CLS token...
  0%|          | 31/8622 [00:53<4:01:31,  1.69s/it]