# finetune

We fine-tune the BERT model and add a classification layer on top of its `last_hidden_state` output.

In [15]:
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
import os
import numpy as np
import pandas as pd
import re

## data

In [16]:
# locate the train/test TSV files under ./data and read them as tab-separated tables
data_dir = os.path.join(os.getcwd(), 'data')
train_file, test_file = (
    os.path.join(data_dir, name) for name in ('train.tsv', 'test.tsv')
)

# keep_default_na=False stops pandas from turning phrase text into NaN
train_data, test_data = (
    pd.read_csv(path, sep='\t', keep_default_na=False)
    for path in (train_file, test_file)
)

In [None]:
# add a new column "Sentence" to the data
# which stands for the whole sentence of the Phrase 
def add_attr_Sentence(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Add a "Sentence" column holding the full sentence of each phrase.

    Rows are scanned in order. Within every run of consecutive rows that
    share the same "SentenceId", "Sentence" is set to the "Phrase" of the
    first row of that run.

    The lookup is purely positional, so it works for any index (for example
    a shuffled or filtered frame), and it is vectorized instead of writing
    one cell at a time.

    Args:
        dataframe: table with "SentenceId" and "Phrase" columns.

    Returns:
        The same ``dataframe`` object, modified in place with the new column.
    """
    sentence_ids = dataframe["SentenceId"].to_numpy()
    row_count = len(sentence_ids)

    # a row starts a new run when its SentenceId differs from the previous row's
    starts_run = np.ones(row_count, dtype=bool)
    starts_run[1:] = sentence_ids[1:] != sentence_ids[:-1]

    # for every row, the position of the most recent run start at or before it
    run_start_pos = np.maximum.accumulate(
        np.where(starts_run, np.arange(row_count), 0)
    )

    dataframe["Sentence"] = dataframe["Phrase"].to_numpy()[run_start_pos]
    return dataframe

# # train_data = add_attr_Sentence(train_data)
# test_data = add_attr_Sentence(test_data)
# # save the data to tsv files
# # train_data.to_csv(train_file, sep='\t', index=False)
# test_data.to_csv(test_file, sep='\t', index=False)
    

In [19]:
# preview the first rows of the loaded test table
test_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentence
0,156061,8545,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...
2,156063,8545,An,An intermittently pleasing but mostly routine ...
3,156064,8545,intermittently pleasing but mostly routine effort,An intermittently pleasing but mostly routine ...
4,156065,8545,intermittently pleasing but mostly routine,An intermittently pleasing but mostly routine ...


In [None]:
# longest phrase in each split, counted in whitespace-separated words
# (this is a word count, not a tokenizer token count)
tuple(
    frame['Phrase'].map(lambda text: len(text.split())).max()
    for frame in (train_data, test_data)
)

(52, 56)

In [4]:
# create a DataLoader
from torch.utils.data import Dataset, DataLoader


class FinetuneDataset(Dataset):
    """Sentence/phrase pair dataset for fine-tuning a classifier.

    Each item encodes the full sentence and the phrase as a text pair via
    ``tokenizer(sentence, phrase, ...)``, padded/truncated to ``max_length``.

    Args:
        data: table with "Phrase", "Sentence" and "Sentiment" columns.
        tokenizer: callable returning a mapping with "input_ids" and
            "attention_mask" tensors whose first dimension has size 1.
        max_length: fixed sequence length every item is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=200):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for the row at position ``idx``."""
        # Samplers hand out positions 0..len-1, so look rows up positionally;
        # label-based .loc only works when the index happens to be 0..len-1.
        phrase = self.data["Phrase"].iloc[idx]
        sentence = self.data["Sentence"].iloc[idx]
        label = self.data["Sentiment"].iloc[idx]

        inputs = self.tokenizer(
            sentence,
            phrase,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        # drop only the leading size-1 dimension; a bare squeeze() would also
        # collapse the sequence dimension when max_length == 1
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        return input_ids, attention_mask, label

## training

In [None]:
# download the pretrained BERT tokenizer and encoder, using ./models as the cache directory
# (named model_cache_dir so the builtin dir() is not shadowed)
model_cache_dir = os.path.join(os.getcwd(), "models")
bert_base_model = "bert-base-uncased"
tokenizer = transformers.BertTokenizer.from_pretrained(
    bert_base_model, cache_dir=model_cache_dir
)
bert_model = transformers.BertModel.from_pretrained(
    bert_base_model, cache_dir=model_cache_dir
)

2025-02-17 08:56:29.226112: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-17 08:56:29.238998: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739782589.254860  647393 cuda_dnn.cc:8179] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739782589.259783  647393 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-17 08:56:29.277855: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [6]:
from transformers import PreTrainedModel

# add a classification layer after the bert model
class BertClassifier(PreTrainedModel):
    """BERT encoder followed by a single linear classification layer.

    Args:
        bert_model: encoder module; its ``config`` is reused for this model and
            ``config.hidden_size`` sets the classifier's input width.
        num_classes: number of output logits.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__(bert_model.config)
        self.bert = bert_model
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch."""
        encoder_outputs = self.bert(input_ids, attention_mask)
        # first encoder output, sliced at position 0 along dim 1
        # (the '[CLS]' token in the tokenized example shown in this notebook)
        first_token_state = encoder_outputs[0][:, 0, :]
        return self.classifier(first_token_state)

In [7]:
# training method
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

def _macro_scores(label_batches, pred_batches):
    """Return ``(accuracy, macro_f1)`` computed over all collected batches.

    Returns ``(nan, nan)`` when no batch was collected (empty loader).
    """
    if not label_batches:
        return float("nan"), float("nan")
    y_true = torch.cat(label_batches).numpy()
    y_pred = torch.cat(pred_batches).numpy()
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average="macro"),
    )


def _evaluate(model, loader, device):
    """Run ``model`` over ``loader`` without gradients; return ``(accuracy, macro_f1)``."""
    model.eval()
    label_batches, pred_batches = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(loader):
            logits = model(input_ids.to(device), attention_mask.to(device))
            label_batches.append(labels.cpu())
            pred_batches.append(logits.argmax(dim=1).cpu())
    return _macro_scores(label_batches, pred_batches)


def train(model, train_loader, test_loader, optimizer, criterion, device, epoch_num=2,
          test_only=False):
    """Train ``model`` and report metrics after every epoch.

    Args:
        model: callable as ``model(input_ids, attention_mask)`` returning logits.
        train_loader: yields ``(input_ids, attention_mask, labels)`` batches.
        test_loader: batches of the same form, used for validation.
        optimizer: optimizer over the model parameters.
        criterion: loss called as ``criterion(logits, labels)``.
        device: device the model and batches are moved to.
        epoch_num: number of training epochs.
        test_only: when True, skip training and only evaluate on ``test_loader``.

    Per epoch this prints the summed batch loss, the training accuracy and
    macro-F1 over every training sample seen in the epoch, and the validation
    accuracy and macro-F1 over the whole validation set.
    """
    model.to(device)

    if test_only:
        val_accuracy, val_f1 = _evaluate(model, test_loader, device)
        print(f'Validation Accuracy: {val_accuracy:.4f}')
        print(f'Validation F1 Score: {val_f1:.4f}')
        return

    for epoch in range(epoch_num):
        # re-enter training mode each epoch: evaluation below switches the
        # model to eval mode, which would otherwise persist into later epochs
        model.train()
        total_loss = 0
        label_batches, pred_batches = [], []
        for input_ids, attention_mask, labels in tqdm(train_loader):
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            # keep predictions for every batch so metrics cover the full epoch
            label_batches.append(labels.detach().cpu())
            pred_batches.append(logits.detach().argmax(dim=1).cpu())

        train_accuracy, train_f1 = _macro_scores(label_batches, pred_batches)
        print(f'Epoch {epoch + 1}/{epoch_num}, Loss: {total_loss:.4f}')
        print(f'Training Accuracy: {train_accuracy:.4f}')
        print(f'Training F1 Score: {train_f1:.4f}')

        # evaluate on the validation set
        val_accuracy, val_f1 = _evaluate(model, test_loader, device)
        print(f'Validation Accuracy: {val_accuracy:.4f}')
        print(f'Validation F1 Score: {val_f1:.4f}')
    
    

In [None]:
# split train_data into train and test
from sklearn.model_selection import train_test_split

# hold out 20% of the labelled data for validation; both parts get a fresh 0..n-1 index
trainset_data, testset_data = (
    part.reset_index(drop=True)
    for part in train_test_split(train_data, test_size=0.2, random_state=7777, shuffle=True)
)

# wrap each part in a dataset, then in a DataLoader (only the training loader shuffles)
trainset_dataset, testset_dataset = (
    FinetuneDataset(part, tokenizer) for part in (trainset_data, testset_data)
)

trainset_loader = DataLoader(trainset_dataset, batch_size=8, shuffle=True)
testset_loader = DataLoader(testset_dataset, batch_size=8, shuffle=False)

In [9]:
# show how the first training example is tokenized
ids, *_ = trainset_dataset[0]
tokens = tokenizer.convert_ids_to_tokens(ids.tolist())
print(tokens)


['[CLS]', 'a', 'loud', ',', 'low', '-', 'budget', 'and', 'tired', 'formula', 'film', 'that', 'arrives', 'cloak', '##ed', 'in', 'the', 'eu', '##ph', '##emi', '##sm', '`', 'urban', 'drama', '.', "'", '[SEP]', 'loud', ',', 'low', '-', 'budget', 'and', 'tired', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

In [None]:
model = BertClassifier(bert_model, num_classes=5)

# NOTE: LoRA is not applied in this cell; both the encoder and the classifier
# parameters are handed to the optimizer, with a 10x smaller learning rate
# for the encoder than for the new classification layer.
param_groups = [
    {'params': model.bert.parameters(), 'lr': 5e-6},
    {'params': model.classifier.parameters(), 'lr': 5e-5},
]
optimizer = torch.optim.AdamW(param_groups)
criterion = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epoch_num = 1

train(
    model,
    trainset_loader,
    testset_loader,
    optimizer,
    criterion,
    device,
    epoch_num=epoch_num,
)


 47%|████▋     | 7391/15606 [30:45<33:46,  4.05it/s]  

## save and upload

In [26]:
# save the model
# both the model and the tokenizer are written into ./models/lab1_finetuned_bert
save_dir = os.path.join(os.getcwd(), 'models', 'lab1_finetuned_bert')
os.makedirs(save_dir, exist_ok=True)
model.save_pretrained(save_dir)
# last expression of the cell: its return value is what the notebook displays
tokenizer.save_pretrained(save_dir)

('/notebooks/learningFile/courses/NLP/assignment/lab1 homework/lab1 homework/models/lab1_finetuned_bert/tokenizer_config.json',
 '/notebooks/learningFile/courses/NLP/assignment/lab1 homework/lab1 homework/models/lab1_finetuned_bert/special_tokens_map.json',
 '/notebooks/learningFile/courses/NLP/assignment/lab1 homework/lab1 homework/models/lab1_finetuned_bert/vocab.txt',
 '/notebooks/learningFile/courses/NLP/assignment/lab1 homework/lab1 homework/models/lab1_finetuned_bert/added_tokens.json')

# CoT

We call the DeepSeek API with a suitable prompt to perform the sentiment analysis.

We compare the accuracy obtained with two kinds of prompt.

**user prompt:**
1. **zero-shot prompt**:(we provide no example and ask the problems directly)
> You are a sentiment analysis expert.
> Your task is to classify movie review phrase which is truncated from a whole movie review sentence. please classify the sentiment of phrase into one of five sentiment categories: negative (0), somewhat negative (1), neutral (2), somewhat positive (3), or positive (4). You should carefully consider the intensity and context of the phrases to determine the most appropriate label.Please show your chain of thinking step by step, and print the result at the end of output with format: Label: \\label{number}, Sentiment: \\sentiment{text}.

2. **few-shot prompt**:(we provide some examples and ask the problems)

> Task: Classify the sentiment of the truncated phrase (not the full review) into one of five categories:
> 0: Negative
> 1: Somewhat Negative
> 2: Neutral
> 3: Somewhat Positive
> 4: Positive
> 
> Format:
> \\label{{number}}, Sentiment: \\sentiment{{text}}
> 
> Examples:
> 
> Review: "The movie was a complete disaster. The plot was incoherent, and the acting was terrible."
> Truncated Phrase: "complete disaster"
> Output:
> \\label{0}, Sentiment: \\sentiment{negative}
> 
> Review: "The film had a few good moments, but overall it was a bit dull."
> Truncated Phrase: "a bit dull"
> Output:
> \\label{1}, Sentiment: \\sentiment{somewhat negative}
> 
> Review: "The cinematography was decent, but the story lacked depth."
> Truncated Phrase: "decent"
> Output:
> \\label{2}, Sentiment: \\sentiment{neutral}
> 
> Review: "The acting was surprisingly good, though the script could have been better."
> Truncated Phrase: "surprisingly good"
> Output:
> \\label{3}, Sentiment: \\sentiment{somewhat positive}
> 
> Review: "This movie was an absolute masterpiece. The performances were outstanding."
> Truncated Phrase: "absolute masterpiece"
> Output:
> \\label{4}, Sentiment: \\sentiment{positive}
> 
> Instructions:
> Focus only on the truncated phrase, not the full review.
> Ignore context outside the phrase (e.g., sarcasm, hyperbole, or mixed sentiments).
> Use the labels and sentiment text exactly as formatted above.
> Think step by step, and answer based on your thinking.



In [None]:
# make the helper modules under ./code importable
sys.path.append(os.path.join(os.getcwd(), 'code'))

# read the training copy and the test set from ./data as tab-separated tables
data_dir = os.path.join(os.getcwd(), 'data')
train_file, test_file = (
    os.path.join(data_dir, name) for name in ('train_cp.tsv', 'test.tsv')
)

train_data, test_data = (
    pd.read_csv(path, sep='\t', keep_default_na=False)
    for path in (train_file, test_file)
)

In [3]:
# zero shot response
from gemini_call_api2 import GeminiAPI

deepseek_key_file = os.path.join(os.getcwd(), 'api_key', 'deepseek.api.key')
# read the key inside a context manager so the file handle is closed, and
# strip surrounding whitespace (e.g. a trailing newline) from the key
with open(deepseek_key_file, 'r') as key_file:
    api_key = key_file.read().strip()
client = GeminiAPI(api_key)

In [4]:
# generate the query
sys_msg = r"You are a sentiment analysis expert."
phrases = test_data['Phrase'].tolist()
# each phrase is paired with the full sentence of its own row
sentences = test_data['Sentence'].tolist()
# true_labels = test_data['Sentiment'].tolist()

# instruction shared by every query; only the sentence/phrase lines differ
task_instruction = (
    "Your task is to classify movie review phrase which is truncated from a whole movie review sentence. "
    "please classify the sentiment of phrase into one of five sentiment categories: "
    "negative (0), somewhat negative (1), neutral (2), somewhat positive (3), or positive (4). "
    "You should carefully consider the intensity and context of the phrases to determine the most appropriate label."
    "Please show your chain of thinking step by step, and print the result at the end of output with format: "
    "Label: \\label{number}, Sentiment: \\sentiment{text}."
)
user_msgs = [
    task_instruction
    + f"\n The whole movie review sentence is: \"{sentence}\""
    + f"\n The phrase to be classified is: \"{phrase}\""
    for sentence, phrase in zip(sentences, phrases)
]
sys_msgs = [sys_msg] * len(user_msgs)


In [5]:
# call the llm api, and save the responses
import re
from save_labels import Save

def seperate(responses):
    """Extract the predicted label from each LLM response.

    A label is written as ``\\label{d}`` with ``d`` in 0-4. The prompt asks
    for the result at the end of the output, so when a response contains
    several such markers (for example inside its reasoning) the last one
    is used.

    Args:
        responses: sequence of response texts; entries that are not
            strings (e.g. ``None``) are treated as having no label.

    Returns:
        ``(labels, responses)`` where ``labels[i]`` is the integer label of
        ``responses[i]``, or -1 when no valid label was found, and
        ``responses`` is the input object unchanged.
    """
    # compiled once, outside the loop; only the five valid classes match
    label_pattern = re.compile(r'\\label\{([0-4])\}')
    labels = []
    for response in responses:
        found = label_pattern.findall(response) if isinstance(response, str) else []
        labels.append(int(found[-1]) if found else -1)
    return labels, responses

def save_labels(labels, col_name, path, df):
    """Attach ``labels`` to ``df`` as column ``col_name`` and save it to ``path``.

    Delegates to the ``Save`` helper: ``add_labels`` followed by ``save_df``.
    """
    writer = Save(df=df)
    writer.add_labels(col_name, labels)
    writer.save_df(path)


In [12]:
# test
# split the user_msgs and sys_msgs into batches
from tqdm import tqdm

responses = []
batch_size = 500
for i in tqdm(range(0, len(test_data), batch_size)):
    responses += await client.batch_chat(
        user_msgs[i : i + batch_size],
        sys_msgs[i : i + batch_size],
        max_concurrency=50,
        model="deepseek-v3",
        # model='deepseek-r1'
        # model='deepseek-reasoner'
    )

responses.__len__()

 35%|███▍      | 46/133 [8:20:41<14:18:29, 592.07s/it]

# upload answer

In [28]:
# reload the test table from disk and preview its first rows
test_data = pd.read_csv(test_file, sep='\t', keep_default_na=False)
test_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentence,Label2,Label3,Label5,Label4
0,156061,8545,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...,1,1,1,1
1,156062,8545,An intermittently pleasing but mostly routine ...,An intermittently pleasing but mostly routine ...,1,1,1,2
2,156063,8545,An,An intermittently pleasing but mostly routine ...,2,2,2,2
3,156064,8545,intermittently pleasing but mostly routine effort,An intermittently pleasing but mostly routine ...,1,1,1,1
4,156065,8545,intermittently pleasing but mostly routine,An intermittently pleasing but mostly routine ...,1,1,1,1


In [31]:
# distribution of the values in the Label3 column (the output below includes -1 entries)
test_data['Label3'].value_counts()

Label3
 2    26691
 0    11124
 4    10548
 1    10505
 3     7384
-1       40
Name: count, dtype: int64

In [47]:
# generate submit.csv
# keep columns: PhraseId, Sentiment
# compute the average value of sentiment

def average(col_list: list[str], data=None, submit_file=None):
    """Average several label columns into one "Sentiment" column and save it.

    For every row, the values of ``col_list`` that are not -1 are averaged
    and rounded to the nearest integer (ties go to the even integer, as with
    the builtin ``round``). Rows where every column is -1 get the value 2.

    Args:
        col_list: names of the label columns to average.
        data: table with "PhraseId" and the ``col_list`` columns; defaults
            to the module-level ``test_data``.
        submit_file: path of the CSV to write; defaults to ``submit6.csv``
            inside the module-level ``data_dir``.

    Returns:
        A DataFrame with the columns "PhraseId" and "Sentiment" (int), which
        is also written to ``submit_file`` without the index.
    """
    if data is None:
        data = test_data
    if submit_file is None:
        submit_file = os.path.join(data_dir, 'submit6.csv')

    labels = data[col_list]
    # mask the -1 entries so they do not take part in the row mean
    mean_label = labels.where(labels != -1).mean(axis=1)
    # rows without any usable label have a NaN mean; they fall back to 2
    sentiment = mean_label.round().fillna(2).astype(int)

    submit_data = pd.DataFrame({
        'PhraseId': data['PhraseId'],
        'Sentiment': sentiment,
    })
    submit_data.to_csv(submit_file, index=False)
    return submit_data