In [4]:
# Install required libraries
# !pip install pandas torch scikit-learn transformers[torch] datasets

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load the cleaned dataset
try:
    df = pd.read_csv('cleaned_amazon_reviews.csv')
    # Drop any rows that might still have missing values in the cleaned text
    df.dropna(subset=['cleaned_review_text'], inplace=True)
    print("Cleaned dataset loaded successfully.")
    print(df.head())
except FileNotFoundError:
    print("Error: 'cleaned_amazon_reviews.csv' not found. Please run the Part 1 script first.")

Cleaned dataset loaded successfully.
                   Rating                                        Review Text  \
0  Rated 1 out of 5 stars  I registered on the website, tried to order a ...   
1  Rated 1 out of 5 stars  Had multiple orders one turned up and driver h...   
2  Rated 1 out of 5 stars  I informed these reprobates that I WOULD NOT B...   
3  Rated 1 out of 5 stars  I have bought from Amazon before and no proble...   
4  Rated 1 out of 5 stars  If I could give a lower rate I would! I cancel...   

                                 cleaned_review_text  
0  registered website tried order laptop entered ...  
1  multiple order one turned driver phone door nu...  
2  informed reprobate would going visit sick rela...  
3  bought amazon problem happy service price amaz...  
4  could give lower rate would cancelled amazon p...  


In [6]:
import re

def map_rating_to_sentiment(rating):
    # Extract the digit from the string using regex
    match = re.search(r'\d+', str(rating))
    if match:
        rating = int(match.group(0))
    else:
        # Handle cases where no digit is found (e.g., missing or invalid rating)
        return None # Or some other indicator for missing sentiment

    if rating <= 2:
        return 0  # Negative
    elif rating == 3:
        return 1  # Neutral
    else:
        return 2  # Positive

# Apply the function to create a 'sentiment' column
df['sentiment'] = df['Rating'].apply(map_rating_to_sentiment)

# Drop rows where sentiment could not be determined (if any)
df.dropna(subset=['sentiment'], inplace=True)


# Let's see the distribution of sentiments
print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())

# Prepare the final dataframe for the model
model_df = df[['cleaned_review_text', 'sentiment']].copy()
model_df.rename(columns={'cleaned_review_text': 'text', 'sentiment': 'label'}, inplace=True)

print("\nData prepared for modeling:")
print(model_df.head())


Sentiment Distribution:
sentiment
0    14350
2     5820
1      885
Name: count, dtype: int64

Data prepared for modeling:
                                                text  label
0  registered website tried order laptop entered ...      0
1  multiple order one turned driver phone door nu...      0
2  informed reprobate would going visit sick rela...      0
3  bought amazon problem happy service price amaz...      0
4  could give lower rate would cancelled amazon p...      0


In [7]:
# Split the data into training and validation sets
train_df, val_df = train_test_split(model_df, test_size=0.2, random_state=42, stratify=model_df['label'])

# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load tokenizer
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# Create a tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=256)

# Apply the tokenizer to our datasets
print("\nTokenizing datasets...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
print("Tokenization complete.")


Tokenizing datasets...


Map:   0%|          | 0/16844 [00:00<?, ? examples/s]

Map:   0%|          | 0/4211 [00:00<?, ? examples/s]

Tokenization complete.


In [9]:
# Load the pre-trained model
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3 # We have 3 labels: Negative, Neutral, Positive
)

# Define a function to compute metrics for evaluation [cite: 35]
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Directory to save the model
    num_train_epochs=3,              # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
    eval_strategy="epoch",     # Evaluate at the end of each epoch (Changed from evaluation_strategy)
    save_strategy="epoch",           # Save checkpoint at the end of each epoch
    load_best_model_at_end=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
)

# Start training!
print("\nStarting model training...")
trainer.train()
print("Training finished.")

# Evaluate the final model on the validation set
print("\nEvaluating the final model...")
evaluation_results = trainer.evaluate()
print(evaluation_results)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting model training...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msharma-anadi007[0m ([33msharma-anadi007-parul-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3388,0.352884,0.893612,0.876343,0.863819,0.893612
2,0.2283,0.317186,0.907385,0.893171,0.892005,0.907385
3,0.102,0.36994,0.902161,0.897103,0.893063,0.902161


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training finished.

Evaluating the final model...


{'eval_loss': 0.31718575954437256, 'eval_accuracy': 0.9073854191403468, 'eval_f1': 0.8931709108935395, 'eval_precision': 0.8920054192830885, 'eval_recall': 0.9073854191403468, 'eval_runtime': 30.4317, 'eval_samples_per_second': 138.376, 'eval_steps_per_second': 2.169, 'epoch': 3.0}


In [11]:
import joblib

model.save_pretrained('./sentiment_model_files')
tokenizer.save_pretrained('./sentiment_model_files')

print("\nModel and tokenizer saved in './sentiment_model_files/'")


Model and tokenizer saved in './sentiment_model_files/'
