In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, get_scheduler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from accelerate import Accelerator




#### Import Dataset

In [2]:
import pandas as pd

# Project root; the labelled reviews CSV lives under <root>/data/.
root_path = "."
csv_path = f'{root_path}/data/rating_auto_label_sentiment_two_classes.csv'

# Read the CSV and keep only the two columns used by the rest of the notebook.
df = pd.read_csv(csv_path).loc[:, ['review_text', 'sentiment']]
# NOTE: head(2) is not the last expression of the cell, so only the shape is rendered.
df.head(2)
df.shape

(10468, 2)

In [3]:
# Discard every row that has a missing value in at least one column.
df = df.dropna(axis=0, how='any')
df.shape

(10462, 2)

#### Step 1: Split the Data into Train, Test, and Eval Sets
We can use sklearn to split the data.

In [4]:
# One stratified shuffle split: 80% of the rows go to train, 20% are held out.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, temp_idx = next(splitter.split(df, df['sentiment']))
train_df = df.iloc[train_idx]
temp_df = df.iloc[temp_idx]

# Halve the held-out rows into test and eval, again stratified by sentiment.
test_df, eval_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['sentiment'],
    random_state=42,
)

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}, Eval size: {len(eval_df)}")

Train size: 8369, Test size: 1046, Eval size: 1047


### Initialize the tokenizer

In [5]:
# Load the tokenizer published with the SST-2 fine-tuned DistilBERT checkpoint.
# The same checkpoint name is used for the model's from_pretrained call later on.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")



#### Data Preparation (Tokenization)
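We tokenize the text data and convert it into PyTorch tensors. The same cell then builds the class-weighted model, fine-tunes it, and evaluates it on the eval and test sets.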

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import Dataset
import torch
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Define the SentimentDataset
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        df: DataFrame with a ``review_text`` column and a ``sentiment`` column
            whose values are ``'POSITIVE'`` or ``'NEGATIVE'``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``; it
            must return a mapping with ``input_ids`` and ``attention_mask``
            tensors that carry a leading batch dimension of size 1.
        max_length: Length every sequence is truncated / padded to.
    """

    # Class ids used as training targets (index 1 is the positive class).
    LABEL_TO_ID = {'NEGATIVE': 0, 'POSITIVE': 1}

    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the review at positional index ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer
            output with its leading batch dimension removed) and ``labels``
            (0-d ``torch.long`` tensor).

        Raises:
            ValueError: if the row's sentiment is not a known label. Failing
                loudly avoids silently training unknown labels as NEGATIVE.
        """
        row = self.df.iloc[idx]
        text = row['review_text']
        sentiment = row['sentiment']
        if sentiment not in self.LABEL_TO_ID:
            raise ValueError(
                f"Unexpected sentiment label {sentiment!r} at position {idx}; "
                f"expected one of {sorted(self.LABEL_TO_ID)}"
            )
        label = self.LABEL_TO_ID[sentiment]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Drop ONLY the leading batch dimension. A bare .squeeze() would also
        # collapse the sequence dimension whenever it happens to have size 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create datasets: wrap each split in a SentimentDataset sharing one tokenizer.
train_dataset, test_dataset, eval_dataset = (
    SentimentDataset(split_df, tokenizer)
    for split_df in (train_df, test_df, eval_df)
)

# Handle Class Imbalance with Weighted Loss
# Inverse-frequency weights, estimated from the TRAINING split only so that no
# statistics from the eval/test rows feed into the training loss.
class_counts = train_df['sentiment'].value_counts().to_dict()
total_samples = len(train_df)
weights = {
    0: total_samples / class_counts['NEGATIVE'],
    1: total_samples / class_counts['POSITIVE']
}
# Position i holds the weight of class id i (0 = NEGATIVE, 1 = POSITIVE).
class_weights = torch.tensor([weights[0], weights[1]], dtype=torch.float)

# Initialize Model
# Start from the SST-2 fine-tuned DistilBERT checkpoint, requesting a two-label head.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",
    num_labels=2
)
# Record the task type explicitly in the model config.
model.config.problem_type = "single_label_classification"

# Define Training Arguments
training_args = TrainingArguments(
    # --- output locations ---
    output_dir="./model_checkpoints",
    logging_dir="./logs",
    # --- optimisation hyper-parameters ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- evaluate and checkpoint once per epoch, keeping at most 3 checkpoints ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # --- runtime / reporting ---
    use_cpu=True,  # Force training on CPU
    disable_tqdm=False,  # Enable progress bar
    report_to="none",  # Disable wandb logging
)

# Define Metrics
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, one row per sample).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``;
        precision/recall/F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    # The predicted class is the column with the highest score in each row.
    y_pred = np.argmax(pred.predictions, axis=1)
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

# Custom Trainer to handle class weights
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run; called with every entry of ``inputs`` except
                ``labels``.
            inputs: Mapping of batch tensors that includes a ``labels`` entry.
                It is not modified.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            **kwargs: Extra arguments passed by the Trainer; accepted and ignored.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.get("labels")
        # Forward pass WITHOUT the labels: passing them would make the model
        # compute its own unweighted loss, which is then thrown away.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # Weighted cross-entropy; the weights follow the logits' device so this
        # does not depend on the (possibly wrapped) model exposing `.device`.
        loss = torch.nn.functional.cross_entropy(
            logits, labels, weight=class_weights.to(logits.device)
        )
        return (loss, outputs) if return_outputs else loss

# Initialize Trainer with Early Stopping Callback
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks=[early_stopping],
)

# Train the Model
trainer.train()

# Evaluate the Best Model on the Evaluation set
eval_results = trainer.evaluate(eval_dataset=eval_dataset)
print("Evaluation Results:", eval_results)

# Evaluate the Best Model on the Test Set
# (the returned metric keys keep the default "eval_" prefix)
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Results:", test_results)

  0%|          | 0/1572 [00:00<?, ?it/s]

{'loss': 0.2204, 'grad_norm': 8.525217056274414, 'learning_rate': 1.3638676844783715e-05, 'epoch': 0.95}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 0.19172503054141998, 'eval_accuracy': 0.9312320916905444, 'eval_f1': 0.9451219512195121, 'eval_precision': 0.9351432880844646, 'eval_recall': 0.9553158705701078, 'eval_runtime': 165.314, 'eval_samples_per_second': 6.333, 'eval_steps_per_second': 0.399, 'epoch': 1.0}
{'loss': 0.1078, 'grad_norm': 54.804473876953125, 'learning_rate': 7.27735368956743e-06, 'epoch': 1.91}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 0.34871822595596313, 'eval_accuracy': 0.9340974212034384, 'eval_f1': 0.9478458049886621, 'eval_precision': 0.93026706231454, 'eval_recall': 0.9661016949152542, 'eval_runtime': 138.1068, 'eval_samples_per_second': 7.581, 'eval_steps_per_second': 0.478, 'epoch': 2.0}
{'loss': 0.0516, 'grad_norm': 0.1546747237443924, 'learning_rate': 9.160305343511451e-07, 'epoch': 2.86}


  0%|          | 0/66 [00:00<?, ?it/s]

{'eval_loss': 0.3410106301307678, 'eval_accuracy': 0.9379178605539638, 'eval_f1': 0.9503437738731857, 'eval_precision': 0.9424242424242424, 'eval_recall': 0.9583975346687211, 'eval_runtime': 76.8551, 'eval_samples_per_second': 13.623, 'eval_steps_per_second': 0.859, 'epoch': 3.0}
{'train_runtime': 14348.7688, 'train_samples_per_second': 1.75, 'train_steps_per_second': 0.11, 'train_loss': 0.12268722739838461, 'epoch': 3.0}


  0%|          | 0/66 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.19172503054141998, 'eval_accuracy': 0.9312320916905444, 'eval_f1': 0.9451219512195121, 'eval_precision': 0.9351432880844646, 'eval_recall': 0.9553158705701078, 'eval_runtime': 86.1908, 'eval_samples_per_second': 12.147, 'eval_steps_per_second': 0.766, 'epoch': 3.0}


  0%|          | 0/66 [00:00<?, ?it/s]

Test Results: {'eval_loss': 0.17893357574939728, 'eval_accuracy': 0.9388145315487572, 'eval_f1': 0.9511450381679389, 'eval_precision': 0.9425113464447806, 'eval_recall': 0.9599383667180277, 'eval_runtime': 100.0609, 'eval_samples_per_second': 10.454, 'eval_steps_per_second': 0.66, 'epoch': 3.0}


#### Save the Trained Model

In [None]:
import zipfile
import os
import shutil
import IPython.display as display

# Write the fine-tuned model and its tokenizer files into one folder.
model_name = "distilbert-sst-2"
output_folder = f"./output/{model_name}"
model.save_pretrained(output_folder)
tokenizer.save_pretrained(output_folder)

# Zip the saved model folder next to it. Archive member names are stored
# relative to output_folder, so the zip unpacks without the ./output/ prefix.
with zipfile.ZipFile(f"./output/{model_name}.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(output_folder):
        for file in files:
            file_path = os.path.join(root, file)
            zipf.write(file_path, os.path.relpath(file_path, output_folder))

print("Model has been zipped and saved to output folder successfully.")

Model has been zipped and saved to output foder successfully.


In [None]:
import zipfile
import os
import shutil
import IPython.display as display

# Second copy of the model, saved under a generic folder name.
model_name = "fine_tuned_model"
output_folder = f"./output/{model_name}"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_folder)

# Archive the saved folder; member names are relative to output_folder.
archive_path = f"./output/{model_name}.zip"
with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
    for current_dir, _subdirs, members in os.walk(output_folder):
        for member in members:
            member_path = os.path.join(current_dir, member)
            archive.write(member_path, os.path.relpath(member_path, output_folder))

print("Model has been zipped and saved to output folder successfully.")

Model has been zipped and saved to output foder successfully.


In [None]:
# Save Test results
# Records this run's test accuracy in output/result_DB.csv (appending a row).
# Every value is taken from objects created earlier in the notebook
# (test_results, training_args, trainer).
import os
from datetime import datetime

df_result = pd.DataFrame(columns=['model', 'task_no', 'vectorizer', 'ngram', 'max_iter', 'C', 'gamma', 'tree_param', 'n_estimator', 'lrate', 'test_accuracy', 'wall_time','run_time'])
model_no = 1
filename="output/result_DB.csv"

task_no = str(model_no)
# Use a dedicated name for the label: rebinding `model` to a string would
# clobber the trained model object.
model_label = 'Distill'
vectorizer_name = 'WordPiece'
test_accuracy = test_results['eval_accuracy']
# Training wall-clock seconds as logged by the Trainer when train() finished;
# None if no such log entry is found.
wall_time = next(
    (entry['train_runtime'] for entry in reversed(trainer.state.log_history) if 'train_runtime' in entry),
    None,
)

# Print result
print(
    f"{model_label} - {task_no}, text_preprocess: {True}, vectorizer: {vectorizer_name}, "
    f"lrate: {training_args.learning_rate}, batch:{training_args.per_device_train_batch_size}, "
    f"num_epoch:{training_args.num_train_epochs}, weight_decay:{training_args.weight_decay}"
)
print(f"Test Accuracy: {test_accuracy}\n")
model_no +=1

# Record result to dataframe, to be exported to csv
# columns=['model', 'task_no', 'vectorizer', 'ngram', 'max_iter', 'C', 'gamma', 'tree_param', 'n_estimator', 'lrate', 'test_accuracy', 'wall_time','run_time']
# ngram / max_iter / C / gamma / tree_param have no value for this model and are left blank.
new_row = [model_label, task_no, vectorizer_name, '', '', '', '', '', 0, training_args.learning_rate, test_accuracy, wall_time, datetime.now().strftime("%Y-%m-%d %H:%M:%S")]
df_result.loc[len(df_result)] = new_row

# Append to the CSV, writing the header only when the file is first created.
os.makedirs(os.path.dirname(filename), exist_ok=True)
new_row_df = pd.DataFrame([new_row], columns=df_result.columns)
new_row_df.to_csv(filename, index=False, mode='a', header=not os.path.exists(filename))

# Check for the best model
# Start the tracker on first execution; later executions reuse the stored best value.
best_accuracy = globals().get('best_accuracy', float('-inf'))
if test_accuracy > best_accuracy:
    best_model = (
        f"{model_label} - {task_no}, text_preprocess: {True}, vectorizer: {vectorizer_name}, "
        f"lrate: {training_args.learning_rate}, batch:{training_args.per_device_train_batch_size}, "
        f"num_epoch:{training_args.num_train_epochs}, weight_decay:{training_args.weight_decay}"
    )
    best_accuracy = test_accuracy
    # NOTE(review): the visible notebook never computes per-sample test
    # predictions (y_test_pred), so no best predictions are stored here.

print(f"eval_accuracy: {test_results['eval_accuracy']}")

### Inference

In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
import torch
import torch.nn.functional as F  # To use softmax

# Folder holding the saved model weights and tokenizer files
model_path = './output/distilbert-sst-2'

# Restore the tokenizer, then the model. Module.eval() returns the module
# itself, so loading and switching to evaluation mode (dropout disabled, etc.)
# can be chained.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path).eval()

def preprocess_text(text, tokenizer, max_length=128):
    """Tokenize ``text`` into fixed-length PyTorch tensors.

    Args:
        text: Input passed to the tokenizer as its first positional argument.
        tokenizer: Callable tokenizer.
        max_length: Length the sequence is truncated / padded to.

    Returns:
        Whatever ``tokenizer`` returns for the given options.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(text, **tokenizer_options)

def predict_sentiment(text, model, tokenizer):
    """Classify ``text`` and report the probability of each class.

    Args:
        text: Text to classify.
        model: Classifier invoked as ``model(**encoding)``; its result must
            expose a ``logits`` attribute.
        tokenizer: Tokenizer forwarded to ``preprocess_text``.

    Returns:
        Tuple ``(sentiment, positive_prob, negative_prob)``. ``sentiment`` is
        ``'POSITIVE'`` when class index 1 has the highest probability and
        ``'NEGATIVE'`` otherwise; the probabilities are read from the first
        row of the softmax output (index 1 = positive, index 0 = negative).
    """
    model_inputs = preprocess_text(text, tokenizer)

    # Inference only: no gradients are tracked.
    with torch.no_grad():
        class_probs = F.softmax(model(**model_inputs).logits, dim=-1)
        winner = torch.argmax(class_probs, dim=-1)

    if winner.item() == 1:
        sentiment = 'POSITIVE'
    else:
        sentiment = 'NEGATIVE'

    first_row = class_probs[0]
    return sentiment, first_row[1].item(), first_row[0].item()

In [10]:
# Sample sentences (positive and negative examples)
sample_sentences = [
    "I absolutely love this movie! It was amazing.",
    "This movie was terrible, I hated every second of it.",
    "while this movie is not intended for everyone, it is good for someone has no brain",
    "let's watch it only when it is free to watch, i will not pay for it",
    'A worthy contender for the Animated film of 2024',
    'No plot at all. But if you are looking for a good laugh. You will not find that either.',
]

# Classify each sample and print the predicted label with both class probabilities.
for sentence in sample_sentences:
    label, p_pos, p_neg = predict_sentiment(sentence, model, tokenizer)
    print(f"Sentence: {sentence}\nPredicted Sentiment: {label}")
    print(f"POSITIVE: {p_pos:.4f}, NEGATIVE: {p_neg:.4f}\n")

Sentence: I absolutely love this movie! It was amazing.
Predicted Sentiment: POSITIVE
POSITIVE: 0.9998, NEGATIVE: 0.0002

Sentence: This movie was terrible, I hated every second of it.
Predicted Sentiment: NEGATIVE
POSITIVE: 0.0004, NEGATIVE: 0.9996

Sentence: while this movie is not intended for everyone, it is good for someone has no brain
Predicted Sentiment: NEGATIVE
POSITIVE: 0.0683, NEGATIVE: 0.9317

Sentence: let's watch it only when it is free to watch, i will not pay for it
Predicted Sentiment: NEGATIVE
POSITIVE: 0.0671, NEGATIVE: 0.9329

Sentence: A worthy contender for the Animated film of 2024
Predicted Sentiment: POSITIVE
POSITIVE: 0.9996, NEGATIVE: 0.0004

Sentence: No plot at all. But if you are looking for a good laugh. You will not find that either.
Predicted Sentiment: NEGATIVE
POSITIVE: 0.0017, NEGATIVE: 0.9983

