# Libraries and paths

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizerFast
import torch
from torch.utils.data import Dataset#, DataLoader
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
#import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report
from transformers import BertTokenizerFast, BertForSequenceClassification, pipeline
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Preparation

In [2]:
# Reading in the data
df = pd.read_csv('McDonald_s_Reviews.csv', encoding='latin-1')
df = df[['review', 'rating']]
df['len'] = df['review'].str.len()
# Drop all reviews above 200 characters
df = df[df['len']<=200]
df = df.drop(columns='len')

In [3]:
# Encode the sentiment labels
label_encoder = LabelEncoder()
df['rating'] = label_encoder.fit_transform(df['rating'])

# Split the dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(df['review'], df['rating'], test_size=0.3, random_state=42)

# Initialize the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-tiny')

# Tokenize the text
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)

# Create a PyTorch dataset
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Ensure labels are in the correct format (convert Pandas Series to a list)
        self.labels = labels.tolist()

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TweetDataset(train_encodings, train_labels)
test_dataset = TweetDataset(test_encodings, test_labels)

# Model Training

In [4]:
# Load TinyBERT model
model = BertForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=len(label_encoder.classes_))

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [5]:
# Train the model
trainer.train()

  0%|          | 0/118000 [00:00<?, ?it/s]

{'loss': 1.5865, 'grad_norm': 3.8054561614990234, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 1.5875, 'grad_norm': 2.533926248550415, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}
{'loss': 1.579, 'grad_norm': 2.704298973083496, 'learning_rate': 3e-06, 'epoch': 0.03}
{'loss': 1.5752, 'grad_norm': 1.8904556035995483, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.03}
{'loss': 1.5661, 'grad_norm': 3.441650629043579, 'learning_rate': 5e-06, 'epoch': 0.04}
{'loss': 1.5623, 'grad_norm': 2.9754953384399414, 'learning_rate': 6e-06, 'epoch': 0.05}
{'loss': 1.5728, 'grad_norm': 2.411702871322632, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.06}
{'loss': 1.5817, 'grad_norm': 3.430481195449829, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.07}
{'loss': 1.5411, 'grad_norm': 2.40678071975708, 'learning_rate': 9e-06, 'epoch': 0.08}
{'loss': 1.5573, 'grad_norm': 2.414332628250122, 'learning_rate': 1e-05, 'epoch': 0.08}
{'loss': 1.537, 'grad_norm': 2.944384

TrainOutput(global_step=118000, training_loss=0.31626530540570363, metrics={'train_runtime': 1835.8037, 'train_samples_per_second': 1027.997, 'train_steps_per_second': 64.277, 'train_loss': 0.31626530540570363, 'epoch': 100.0})

In [6]:
# Evaluate the model
trainer.evaluate()

  0%|          | 0/127 [00:00<?, ?it/s]

{'eval_loss': 1.9718717336654663,
 'eval_runtime': 0.9733,
 'eval_samples_per_second': 8311.117,
 'eval_steps_per_second': 130.487,
 'epoch': 100.0}

# Classification report

In [7]:
# Prepare model and data for evaluation
model.eval()
predictions = []
actuals = []

# Iterate over the test dataset
for item in test_dataset:
    input_ids = item['input_ids'].unsqueeze(0).to(device)
    attention_mask = item['attention_mask'].unsqueeze(0).to(device)
    labels = item['labels'].unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # Move logits to CPU and use softmax for probabilities
    logits = outputs.logits.cpu()
    probabilities = torch.softmax(logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1)
    
    predictions.extend(predicted_label.numpy())
    actuals.extend(labels.cpu().numpy())

# Transform labels back to original encoding
predicted_labels = label_encoder.inverse_transform(predictions)
actual_labels = label_encoder.inverse_transform(actuals)

# Generate classification report
report = classification_report(actual_labels, predicted_labels, target_names=label_encoder.classes_)
print(report)

              precision    recall  f1-score   support

      1 star       0.74      0.75      0.75      1731
     2 stars       0.45      0.43      0.44       703
     3 stars       0.54      0.56      0.55      1225
     4 stars       0.59      0.52      0.55      1613
     5 stars       0.75      0.78      0.76      2817

    accuracy                           0.66      8089
   macro avg       0.61      0.61      0.61      8089
weighted avg       0.66      0.66      0.66      8089



# Prediction of a random test sample

In [22]:
# Ensure model is in evaluation mode
model.eval()

# Randomly select an index for a test item
random_idx = np.random.randint(len(test_dataset))
test_item = test_dataset[random_idx]

# Retrieve the original text and label using the selected index
original_text = test_texts.iloc[random_idx]
original_label = test_labels.iloc[random_idx]

# Move the input tensors to the same device as the model
input_ids = test_item['input_ids'].unsqueeze(0).to(device)
attention_mask = test_item['attention_mask'].unsqueeze(0).to(device)

# Perform the prediction, ensuring no gradient calculations
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# Apply softmax to the logits to get probabilities
predictions = torch.softmax(outputs.logits, dim=1).cpu() # Move predictions to CPU for further operations

# Decode the predicted label
predicted_label = label_encoder.inverse_transform([torch.argmax(predictions).item()])

# Decode the original label to its string representation
original_label_decoded = label_encoder.inverse_transform([original_label])[0] 

# Print the original text and label
print(f'Original text: {original_text}')
print(f'Original label: {original_label_decoded}')

# Print the predicted label and the probability distribution
print(f'Predicted label: {predicted_label[0]}')

# Print labels with their corresponding prediction probabilities
label_names = label_encoder.inverse_transform(range(len(label_encoder.classes_)))
probabilities = predictions.numpy().flatten()
for label, probability in zip(label_names, probabilities):
    print(f'{label}: {probability:.4f}')

Original text: Rude employees
Original label: 1 star
Predicted label: 1 star
1 star: 0.2157
2 stars: 0.2152
3 stars: 0.2010
4 stars: 0.1766
5 stars: 0.1914


# Comparison to non fine-tuned model

In [9]:
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

# Load tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-tiny')
model = BertForSequenceClassification.from_pretrained('prajjwal1/bert-tiny', num_labels=5)

# Convert labels to numeric
label_dict = {'1 star': 0, '2 stars': 1, '3 stars': 2, '4 stars': 3, '5 stars': 4}
#df_test['numeric_labels'] = df_test['rating'].map(label_dict)
df_test['numeric_labels'] = df_test['rating']

max_seq_length = 512  # Define the maximum sequence length for BERT-based models
test_encodings = tokenizer(df_test['review'].tolist(), truncation=True, padding=True, max_length=max_seq_length)


# Convert to torch dataset
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

test_dataset = TweetDataset(test_encodings, df_test['numeric_labels'].tolist())

# Load sentiment analysis pipeline
sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, return_all_scores=True)

# Perform predictions
test_preds = []
for tweet in df_test['review']:
    preds = sentiment_analysis(tweet)
    # Convert model output to single label prediction
    pred_label = max(preds[0], key=lambda x: x['score'])['label']
    test_preds.append(int(pred_label[-1]))

# Generate classification report
print(classification_report(df_test['numeric_labels'], test_preds, target_names=label_dict.keys()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


              precision    recall  f1-score   support

      1 star       0.26      0.85      0.39      1731
     2 stars       0.03      0.10      0.05       703
     3 stars       0.04      0.01      0.01      1225
     4 stars       0.20      0.01      0.02      1613
     5 stars       0.58      0.00      0.01      2817

    accuracy                           0.19      8089
   macro avg       0.22      0.19      0.10      8089
weighted avg       0.31      0.19      0.10      8089

