# BERT fine-tuning

### Loading data

In [11]:
import pandas as pd
import os

# Load the sentiment scores
excel_path = r"C:/Users/MCOB PHD 14/Dropbox/Charlie's Dissertation/Beige Books/manual_sentiment.csv"
sentiment_data = pd.read_csv(excel_path)

# create labels
    # -.2 to -2: Mixed
    # <-.2: Negative
    # >.2: Positive
    

def label_sentiment(score):
    if score <= -0.3:
        return 0
    elif score <= 0.2:
        return 1
    else:
        return 2  
    
sentiment_data['label'] = sentiment_data['human_sentiment'].apply(label_sentiment)
    
# Define path where text files are stored
text_files_dir = r"C:/Users/MCOB PHD 14/Dropbox/Charlie's Dissertation/Beige Books/selected_chunks2"

# Load the tokenized text files
text_data = {}
for filename in os.listdir(text_files_dir):
    if filename.endswith('.txt'):
        file_path = os.path.join(text_files_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text_data[filename] = file.read()

# Combine the two
# Create a DataFrame from text_data
text_df = pd.DataFrame(list(text_data.items()), columns=['file_names', 'text'])

# Join the sentiment data with the text data based on 'file_names'
combined_data = pd.merge(sentiment_data, text_df, on='file_names')

# Preview the combined data
print(combined_data.head(50))


    Document                file_names  human_sentiment scorer  label  \
0          1   1970_at (7)_chunk_1.txt             -0.9     CS      0   
1          2   1970_bo (4)_chunk_2.txt              0.2     CS      1   
2          3   1970_ch (1)_chunk_4.txt             -0.5     CS      0   
3          4   1970_ch (5)_chunk_2.txt             -0.7     CS      0   
4          5   1970_ch (7)_chunk_2.txt             -0.5     CS      0   
5          6   1970_cl (6)_chunk_1.txt              0.3     CS      2   
6          7   1970_da (2)_chunk_4.txt              0.4     CS      2   
7          8   1970_kc (3)_chunk_1.txt             -0.8     CS      0   
8          9   1970_kc (5)_chunk_3.txt             -0.5     CS      0   
9         10   1970_mn (3)_chunk_2.txt              0.4     CS      2   
10        11   1970_ns (1)_chunk_2.txt             -0.8     CS      0   
11        12   1970_ns (2)_chunk_3.txt             -0.6     CS      0   
12        13   1970_ny (1)_chunk_3.txt             

### Tokenization

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data from the 'text' column in combined_data
def tokenize_texts(texts, max_length=512):
    tokenized_texts = []
    for text in texts:
        tokenized_input = tokenizer(
            text, 
            padding='max_length',  # Pad to max_length
            truncation=True,       # Truncate if longer than max_length
            max_length=max_length, # Maximum length for input
            return_tensors='pt'    # Return PyTorch tensors
        )
        tokenized_texts.append({
            'input_ids': tokenized_input['input_ids'].squeeze(), 
            'attention_mask': tokenized_input['attention_mask'].squeeze()
        })
    return tokenized_texts

# Tokenize the texts from the 'text' column in combined_data
texts = combined_data['text'].tolist()  # Extract the texts
tokenized_texts = tokenize_texts(texts)

# Extract the labels from the 'label' column in combined_data
labels = combined_data['label'].tolist()

# Create a custom Dataset class using tokenized texts and labels
class SentimentDataset(Dataset):
    def __init__(self, tokenized_texts, labels):
        self.tokenized_texts = tokenized_texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids = self.tokenized_texts[idx]['input_ids']
        attention_mask = self.tokenized_texts[idx]['attention_mask']
        label = torch.tensor(self.labels[idx])
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': label}

# Create the dataset using the tokenized texts and labels
dataset = SentimentDataset(tokenized_texts, labels)

# Create a DataLoader to batch the data
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Example: Iterate over the DataLoader and print a batch
for batch in dataloader:
    print(batch['input_ids'])
    print(batch['attention_mask'])
    print(batch['labels'])
    break  # Remove break to iterate through the entire dataset




tensor([[ 101, 6713, 1010,  ...,    0,    0,    0],
        [ 101, 2254, 2654,  ...,    0,    0,    0],
        [ 101, 1011, 1011,  ...,    0,    0,    0],
        ...,
        [ 101, 2096, 1996,  ...,    0,    0,    0],
        [ 101, 2152, 1011,  ...,    0,    0,    0],
        [ 101, 2195, 5501,  ...,    0,    0,    0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([2, 2, 2, 2, 1, 1, 2, 0])


### Fine-tuning model

Warning: This takes about 70 minutes

In [13]:
# Define the training arguments
from transformers import Trainer, TrainingArguments, BertForSequenceClassification

# Load the BERT model with a classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Define training arguments (I just used all the default values)
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    warmup_steps=500,                # number of warmup steps
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    eval_strategy="no",              # No evaluation during training
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=dataset,          # Use the full dataset for training
)

# Fine-tune the model on the full dataset
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 375/375 [1:12:54<00:00, 11.67s/it]

{'train_runtime': 4374.405, 'train_samples_per_second': 0.686, 'train_steps_per_second': 0.086, 'train_loss': 1.0100785319010417, 'epoch': 3.0}





TrainOutput(global_step=375, training_loss=1.0100785319010417, metrics={'train_runtime': 4374.405, 'train_samples_per_second': 0.686, 'train_steps_per_second': 0.086, 'total_flos': 789354427392000.0, 'train_loss': 1.0100785319010417, 'epoch': 3.0})

### Save the model for future use

In [17]:
# Save the fine-tuned model and tokenizer
model.save_pretrained('./BeigeBERT_three')
tokenizer.save_pretrained('./BeigeBERT_three')

# The model and tokenizer can be loaded later for inference


('./BeigeBERT_three\\tokenizer_config.json',
 './BeigeBERT_three\\special_tokens_map.json',
 './BeigeBERT_three\\vocab.txt',
 './BeigeBERT_three\\added_tokens.json')

### Assess performance

Note I have not split into testing/training
i.e. this data is being tested on the same data it was trained on

Results are promising, showing good balance between classes and consistent performance across classes

In [15]:
# Performance assessment

from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Get predictions from the model
# The predict function returns predictions, actual labels, and other info
predictions, true_labels, _ = trainer.predict(dataset)

# Convert predictions to label indices
predicted_labels = np.argmax(predictions, axis=1)

# Print a classification report
report = classification_report(true_labels, predicted_labels, target_names=["Negative", "Mixed", "Positive"])
print(report)

# Calculate and print the accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f'Accuracy is: {accuracy:.4f}')


100%|██████████| 125/125 [06:59<00:00,  3.36s/it]

              precision    recall  f1-score   support

    Negative       0.74      0.87      0.80       198
     Neutral       0.81      0.79      0.80       413
    Positive       0.91      0.86      0.89       389

    accuracy                           0.83      1000
   macro avg       0.82      0.84      0.83      1000
weighted avg       0.84      0.83      0.83      1000

Accuracy on training data: 0.8320



