In [27]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

In [39]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from tqdm import tqdm


In [43]:
# Load the raw reviews, derive a binary sentiment label, draw a class-balanced
# sample and split it into train / validation sets.
SEED = 42  # one seed for every sampling / shuffling step so the split is repeatable

df = pd.read_csv("amazon_reviews.csv")  # Replace with your actual file name
df = df.dropna(subset=["Score", "Text"])
# Drop neutral (3-star) reviews. The explicit .copy() makes the filtered frame
# independent, so adding a column below cannot raise SettingWithCopyWarning.
df = df[df["Score"] != 3].copy()
# 1 = positive (score >= 4), 0 = negative (score <= 2); vectorised instead of a per-row lambda.
df["label"] = (df["Score"] >= 4).astype("int64")

# Balanced sample (at most 10k per class).
# reindex() guarantees both classes are present in the counts, so a missing class
# is reported explicitly instead of surfacing as a KeyError.
class_counts = df["label"].value_counts().reindex([0, 1], fill_value=0)
min_class = min(10000, int(class_counts.min()))
if min_class == 0:
    raise ValueError(
        "Both sentiment classes must be present after cleaning; "
        f"got class counts {class_counts.to_dict()}"
    )

df_pos = df[df["label"] == 1].sample(n=min_class, random_state=SEED)
df_neg = df[df["label"] == 0].sample(n=min_class, random_state=SEED)
# The shuffle must be seeded too: train_test_split picks rows by position, so an
# unseeded shuffle here would change the split on every run despite random_state below.
df_balanced = (
    pd.concat([df_pos, df_neg])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

# stratify keeps the 50/50 class ratio in both the training and validation parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_balanced["Text"],
    df_balanced["label"],
    test_size=0.2,
    random_state=SEED,
    stratify=df_balanced["label"],
)


In [45]:
# Tokenize the training and validation texts with the DistilBERT fast tokenizer.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def _encode(texts):
    """Tokenize a pandas Series of texts, truncating to at most 128 tokens and padding."""
    return tokenizer(texts.tolist(), truncation=True, padding=True, max_length=128)


train_encodings = _encode(train_texts)
val_encodings = _encode(val_texts)

In [47]:
class ReviewDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: mapping that provides 'input_ids' and 'attention_mask',
            each indexable by sample position.
        labels: sequence of labels, indexable by sample position.
    """

    # Encoding fields copied into every sample, in output order.
    _FEATURE_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at position `idx` as a dict of tensors."""
        sample = {
            key: torch.tensor(self.encodings[key][idx])
            for key in self._FEATURE_KEYS
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a ReviewDataset.
train_dataset = ReviewDataset(encodings=train_encodings, labels=train_labels.tolist())
val_dataset = ReviewDataset(encodings=val_encodings, labels=val_labels.tolist())

# Training batches are small and reshuffled; validation batches are larger and keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

In [49]:
# Fine-tune DistilBERT (2-class sequence classification head) on the training loader.

# Seed torch's global RNG before the model is built: the newly initialised classifier
# head, dropout and the DataLoader's shuffle order all draw from it. Without this the
# run is not repeatable even though the data split is seeded.
# NOTE(review): some GPU kernels may still be nondeterministic - confirm if exact
# reproducibility on CUDA is required.
torch.manual_seed(42)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optim = AdamW(model.parameters(), lr=2e-5)
epochs = 1

for epoch in range(epochs):
    # Re-enter training mode at the start of every epoch so the loop stays correct
    # if evaluation (model.eval()) is ever interleaved between epochs.
    model.train()
    # Set the description up front so the bar is labelled from the first step.
    loop = tqdm(train_loader, desc=f"Epoch {epoch}", leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

        # Shows the loss of the most recent batch only (not a running average).
        loop.set_postfix(loss=loss.item())


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 0: 100%|██████████████████████████████████████████████████████| 2000/2000 [1:01:47<00:00,  1.85s/it, loss=0.0197]


In [51]:
# Score the fine-tuned model on the validation loader.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():  # inference only, no gradients needed
    for batch in val_loader:
        # Move every tensor of the batch (input_ids, attention_mask, labels) to the device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        logits = model(batch['input_ids'], attention_mask=batch['attention_mask']).logits
        preds = logits.argmax(dim=1)  # predicted class = index of the largest logit

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
# Precision / recall / F1 are averaged over classes, weighted by class support.
p, r, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

print(f"✅ Accuracy: {acc:.4f}")
print(f"✅ Precision: {p:.4f}")
print(f"✅ Recall: {r:.4f}")
print(f"✅ F1 Score: {f1:.4f}")

✅ Accuracy: 0.9210
✅ Precision: 0.9238
✅ Recall: 0.9210
✅ F1 Score: 0.9207


In [55]:
import joblib

# Save model weights (state_dict only: tensors, not the model object itself)
torch.save(model.state_dict(), "distilbert_sentiment.pt")

# Save the tokenizer in the library's own portable format. Unlike a pickle of the
# Python object, this directory can be reloaded with
# DistilBertTokenizerFast.from_pretrained("distilbert_sentiment_tokenizer")
# and does not execute arbitrary code on load.
tokenizer.save_pretrained("distilbert_sentiment_tokenizer")

# Also keep the joblib (pickle-based) dump so anything that already loads
# "tokenizer.joblib" keeps working. Only load this file from a trusted source,
# and with the same library versions that wrote it.
joblib.dump(tokenizer, "tokenizer.joblib")

['tokenizer.joblib']

In [57]:
import torch
import pickle

# Save model weights
torch.save(model.state_dict(), "distilbert_sentiment.pt")

# Record where the saved artifacts live and what is needed to rebuild the model.
# The bundle holds only plain strings / ints; it is a manifest, not the model itself.
model_bundle = {
    "model_name": "distilbert-base-uncased",   # base checkpoint the weights were fine-tuned from
    "model_path": "distilbert_sentiment.pt",   # state_dict written above
    "tokenizer_path": "tokenizer.joblib",      # tokenizer dump written by the previous save cell
    "num_labels": 2,                           # size of the classification head used in training
}

with open("model_bundle.pkl", "wb") as f:
    pickle.dump(model_bundle, f)
