## <center> Sentiment Analysis Of Movie Reviews (distilbert-base-uncased-finetuned-sst-2-english) </center>

In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, get_scheduler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from accelerate import Accelerator
import datetime
from datetime import datetime
import os




#### Load Data

In [2]:
import pandas as pd

root_path = "."
df = pd.read_csv(f'{root_path}/data/rating_auto_label_sentiment_two_classes.csv')

# drop unused columns
df = df [['review_text','sentiment']]
df.head(2)
df.shape

(10468, 2)

In [3]:
# Drop rows with NaN values in any column
df = df.dropna()
df.shape

(10462, 2)

#### Split the Data into Train, Test, and Eval Sets

In [4]:
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_idx, temp_idx in splitter.split(df, df['sentiment']):
    train_df = df.iloc[train_idx]
    temp_df = df.iloc[temp_idx]

# Split temp_df into test and eval (50% each)
test_df, eval_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['sentiment'], random_state=42)

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}, Eval size: {len(eval_df)}")

Train size: 8369, Test size: 1046, Eval size: 1047


### Initialize tokenizer

In [5]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

#### Data Preparation (Tokenization)

In [6]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import Dataset
import torch
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Define the SentimentDataset
class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row['review_text']
        label = 1 if row['sentiment'] == 'POSITIVE' else 0

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Custom Trainer to handle class weights
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Compute custom loss with class weights
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Define Metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Create datasets
train_dataset = SentimentDataset(train_df, tokenizer)
test_dataset = SentimentDataset(test_df, tokenizer)
eval_dataset = SentimentDataset(eval_df, tokenizer)

# Handle Class Imbalance with Weighted Loss
class_counts = df['sentiment'].value_counts().to_dict()
total_samples = len(df)
weights = {
    0: total_samples / class_counts['NEGATIVE'],
    1: total_samples / class_counts['POSITIVE']
}

class_weights = torch.tensor([weights[0], weights[1]], dtype=torch.float)

In [7]:
# Initialize Model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", num_labels=2)
model.config.problem_type = "single_label_classification"

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./model_checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    use_cpu=True,  # Force training on CPU
    disable_tqdm=False,  # Enable progress bar
    report_to="none",  # Disable wandb logging
)
# Initialize Trainer with Early Stopping Callback
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

best_model = ''
best_accuracy = 0
best_y_test_pred = None

# Train the Model
trainer.train()

# Evaluate the Best Model on the Evaluation set
eval_results = trainer.evaluate(eval_dataset)
print("Evaluation Results:", eval_results)

# Evaluate the Best Model on the Test Set
test_results = trainer.evaluate(test_dataset)
print("Test Results:", test_results)

  trainer = WeightedTrainer(


  0%|          | 0/1572 [00:00<?, ?it/s]

{'loss': 0.2336, 'grad_norm': 23.36346435546875, 'learning_rate': 1.3638676844783715e-05, 'epoch': 0.95}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 0.255489706993103, 'eval_accuracy': 0.9159503342884432, 'eval_f1': 0.9346210995542348, 'eval_precision': 0.9024390243902439, 'eval_recall': 0.9691833590138675, 'eval_runtime': 307.9387, 'eval_samples_per_second': 3.4, 'eval_steps_per_second': 0.214, 'epoch': 1.0}
{'loss': 0.1258, 'grad_norm': 11.411917686462402, 'learning_rate': 7.27735368956743e-06, 'epoch': 1.91}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 0.41435420513153076, 'eval_accuracy': 0.9121298949379179, 'eval_f1': 0.9315476190476191, 'eval_precision': 0.9007194244604316, 'eval_recall': 0.9645608628659477, 'eval_runtime': 278.3089, 'eval_samples_per_second': 3.762, 'eval_steps_per_second': 0.237, 'epoch': 2.0}
{'loss': 0.0538, 'grad_norm': 5.007336139678955, 'learning_rate': 9.160305343511451e-07, 'epoch': 2.86}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 0.41497743129730225, 'eval_accuracy': 0.9216809933142311, 'eval_f1': 0.9374045801526718, 'eval_precision': 0.9288956127080181, 'eval_recall': 0.9460708782742681, 'eval_runtime': 294.6742, 'eval_samples_per_second': 3.553, 'eval_steps_per_second': 0.224, 'epoch': 3.0}
{'train_runtime': 19400.7361, 'train_samples_per_second': 1.294, 'train_steps_per_second': 0.081, 'train_loss': 0.13358036009713287, 'epoch': 3.0}


  0%|          | 0/66 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.255489706993103, 'eval_accuracy': 0.9159503342884432, 'eval_f1': 0.9346210995542348, 'eval_precision': 0.9024390243902439, 'eval_recall': 0.9691833590138675, 'eval_runtime': 291.8124, 'eval_samples_per_second': 3.588, 'eval_steps_per_second': 0.226, 'epoch': 3.0}


  0%|          | 0/66 [00:00<?, ?it/s]

Test Results: {'eval_loss': 0.23828238248825073, 'eval_accuracy': 0.9196940726577438, 'eval_f1': 0.9379615952732644, 'eval_precision': 0.900709219858156, 'eval_recall': 0.9784283513097073, 'eval_runtime': 306.824, 'eval_samples_per_second': 3.409, 'eval_steps_per_second': 0.215, 'epoch': 3.0}


In [26]:
from datetime import datetime

# Save Test results
df_result = pd.DataFrame(columns=['model', 'task_no', 'vectorizer', 'ngram', 'max_iter', 'C', 'gamma', 'tree_param', 'n_estimator', 'lrate', 'batch_size', 'num_epoch', 'weight_decay', 'test_accuracy', 'wall_time', 'run_time'])
model_no = 1
filename="output/result_DB.csv"

# Print result
task_no = str(model_no)
model_algo = 'distilbert'
print(f"{model_algo} - {task_no}, text_preprocess: {True}, vectorizer: {'WordPiece'}, lrate: {training_args.learning_rate}, batch:{training_args.per_device_train_batch_size}, num_epoch:{training_args.per_device_train_batch_size}, weight_decay:{training_args.weight_decay}")

wall_time=0
test_accuracy=0

for key, value in test_results.items():
    if key=='eval_accuracy':
        test_accuracy = value
        print(f"Test Accuracy: {test_accuracy}\n")
    if key == 'train_runtime' or key == 'eval_runtime':
        wall_time = wall_time + float(value)
model_no +=1

# Record result to dataframe, to be exported to csv
columns=['model', 'task_no', 'vectorizer', 'ngram', 'max_iter', 'C', 'gamma', 'tree_param', 'n_estimator', 'lrate', 'batch_size', 'num_epoch', 'weight_decay', 'test_accuracy', 'wall_time', 'run_time']
new_row = [model_algo, task_no, 'wordpiece', 0, 0, 0, '', '', 0, training_args.learning_rate, training_args.per_device_train_batch_size, training_args.num_train_epochs, training_args.weight_decay, test_accuracy, wall_time, datetime.now().strftime("%Y-%m-%d %H:%M:%S")]
df_result.loc[len(df_result)] = new_row

new_row_df = pd.DataFrame([new_row], columns=df_result.columns)
new_row_df.to_csv(filename, index=False, mode='a', header=not os.path.exists(filename))

# Check for the best model
if test_accuracy > best_accuracy:
    best_model = f"{model_algo} - {task_no}, text_preprocess: {True}, vectorizer: {'WordPiece'}, lrate: {training_args.learning_rate}, batch:{training_args.per_device_train_batch_size}, num_epoch:{training_args.num_train_epochs}, weight_decay:{training_args.weight_decay}"
    best_accuracy = test_accuracy

distilbert - 1, text_preprocess: True, vectorizer: WordPiece, lrate: 2e-05, batch:16, num_epoch:16, weight_decay:0.01
Test Accuracy: 0.9196940726577438



#### Save the Trained Model

In [28]:
import zipfile
import os
import shutil
import IPython.display as display
model_name = "distilbert"
output_folder = f"./output/{model_name}"
model.save_pretrained(output_folder)
tokenizer.save_pretrained(output_folder)

# Create a zip file from the final_model folder and save it to the output folder
with zipfile.ZipFile(f"./output/{model_name}.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(output_folder):
        for file in files:
            file_path = os.path.join(root, file)
            zipf.write(file_path, os.path.relpath(file_path, output_folder))

print("Model has been zipped and saved to output foder successfully.")

Model has been zipped and saved to output foder successfully.


### Inference

In [29]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
import torch
import torch.nn.functional as F  # To use softmax

# Path to the saved model and tokenizer
model_path = './output/distilbert'

# Load the model and tokenizer from the saved path
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

# Set the model to evaluation mode (disable dropout, etc.)
model.eval()

def preprocess_text(text, tokenizer, max_length=256):
    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    return encoding

def predict_sentiment(text, model, tokenizer):
    # Preprocess the input text
    encoding = preprocess_text(text, tokenizer)
    
    # Perform inference
    with torch.no_grad():
        outputs = model(**encoding)
        logits = outputs.logits  # Get the raw prediction scores
        
        # Apply softmax to get probabilities
        probabilities = F.softmax(logits, dim=-1)  # Apply softmax along the last dimension
        predicted_class = torch.argmax(probabilities, dim=-1)  # Get the predicted class (0 for negative, 1 for positive)
    
    # Map the predicted class to sentiment label
    sentiment = 'POSITIVE' if predicted_class.item() == 1 else 'NEGATIVE'
    
    # Return sentiment and probabilities for each class
    positive_prob = probabilities[0][1].item()  # Probability for POSITIVE class
    negative_prob = probabilities[0][0].item()  # Probability for NEGATIVE class
    
    return sentiment, positive_prob, negative_prob

In [18]:
# Sample sentences (positive and negative examples)
sample_sentences = [
    "I absolutely love this movie! It was amazing.",
    "This movie was terrible, I hated every second of it.", 
    "while this movie is not intended for everyone, it is good for someone has no brain", 
    "let's watch it only when it is free to watch, i will not pay for it",
    'A worthy contender for the Animated film of 2024', 
    'No plot at all. But if you are looking for a good laugh. You will not find that either.'
]

# Perform inference on each sample sentence
for sentence in sample_sentences:
    sentiment, positive_prob, negative_prob = predict_sentiment(sentence, model, tokenizer)
    print(f"Sentence: {sentence}\nPredicted Sentiment: {sentiment}")
    print(f"POSITIVE: {positive_prob:.4f}, NEGATIVE: {negative_prob:.4f}\n")

Sentence: I absolutely love this movie! It was amazing.
Predicted Sentiment: POSITIVE
POSITIVE: 0.9998, NEGATIVE: 0.0002

Sentence: This movie was terrible, I hated every second of it.
Predicted Sentiment: NEGATIVE
POSITIVE: 0.0005, NEGATIVE: 0.9995

Sentence: while this movie is not intended for everyone, it is good for someone has no brain
Predicted Sentiment: NEGATIVE
POSITIVE: 0.1357, NEGATIVE: 0.8643

Sentence: let's watch it only when it is free to watch, i will not pay for it
Predicted Sentiment: NEGATIVE
POSITIVE: 0.0044, NEGATIVE: 0.9956

Sentence: A worthy contender for the Animated film of 2024
Predicted Sentiment: POSITIVE
POSITIVE: 0.9997, NEGATIVE: 0.0003

Sentence: No plot at all. But if you are looking for a good laugh. You will not find that either.
Predicted Sentiment: NEGATIVE
POSITIVE: 0.0021, NEGATIVE: 0.9979

