In [None]:
# ==========================================
# Amazon Review Classifier - 1000 Samples
# DistilBERT + Focal Loss + 5 Epochs
# Auto ZIP Export for Streamlit
# ==========================================

# 1. Install dependencies
# NOTE: the leading `!` is IPython/Colab shell syntax, so this line only works
# inside a notebook cell, not in a plain Python script.
# NOTE(review): joblib is imported below but not listed here; presumably it is
# pulled in as a scikit-learn dependency — confirm.
!pip install transformers datasets scikit-learn pandas torch evaluate -q

# 2. Imports
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.utils.class_weight import compute_class_weight
import evaluate
import joblib
import shutil, os
from google.colab import files

# 3. Load dataset
# Column names are defined first so every later step refers to them by name.
text_column = "review_text"
label_column = "review_sentiment"

df = pd.read_csv("amazon_reviews_1000_updated.csv")  # Your generated dataset
# Keep only the three sentiment classes the classifier is trained on.
df = df[df[label_column].isin(["positive", "neutral", "negative"])]
# Rows without review text cannot be tokenized, so drop them before balancing.
df = df.dropna(subset=[text_column])
if df.empty:
    raise ValueError(
        "No rows with a positive/neutral/negative sentiment and non-empty "
        "review text were found in the dataset."
    )

# Balance to ~1000 samples (equal per class): draw the same number of rows
# (at most 333) from every sentiment.
samples_per_class = min(333, df[label_column].value_counts().min())
# Each group is sampled explicitly rather than through `groupby(...).apply(...)`:
# applying over the grouping column is deprecated in recent pandas, and once the
# grouping column is excluded from the applied frame the label column would be
# lost. Groups are visited in sorted key order and each uses the same seed, so
# the selected rows match the previous apply-based code.
df_balanced = pd.concat(
    [
        group.sample(samples_per_class, random_state=42)
        for _, group in df.groupby(label_column)
    ],
    ignore_index=True,
)

# 4. Encode labels
# Fit the encoder on the sentiment strings, store the integer ids in a new
# "label" column, and persist the encoder for later reuse.
label_encoder = LabelEncoder().fit(df_balanced[label_column])
df_balanced["label"] = label_encoder.transform(df_balanced[label_column])
joblib.dump(label_encoder, "label_encoder.pkl")

# 5. Train-test split
# 80/20 split, stratified on the encoded label, with a fixed seed.
split_frames = train_test_split(
    df_balanced,
    test_size=0.2,
    random_state=42,
    stratify=df_balanced["label"],
)
train_df, test_df = split_frames

# 6. Convert to Hugging Face Dataset
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# 7. Tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    """Tokenize a batch of reviews with the module-level tokenizer.

    Args:
        example: A batch passed by ``Dataset.map(..., batched=True)``; the
            entry under ``text_column`` holds the review texts.

    Returns:
        The tokenizer output for the batch, truncated but not padded.
    """
    # Padding is intentionally not applied here. The Trainer below is given the
    # tokenizer, so its default collator pads each batch to that batch's
    # longest sequence; padding every review to the model maximum up front
    # only adds wasted computation on pad tokens.
    return tokenizer(example[text_column], truncation=True)

# 8. Tokenize both splits, then remove unnecessary columns
# The raw text, the string label and the pandas index column are dropped once
# the tokenized features exist.
columns_to_drop = [label_column, text_column, "__index_level_0__"]

train_dataset = (
    train_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(columns_to_drop)
)
test_dataset = (
    test_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(columns_to_drop)
)

# 9. Compute class weights
# "balanced" weights are computed from the training labels only and stored as
# a float tensor.
train_labels = train_df["label"]
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels,
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float)

# 10. Load model
# The classification head gets one output per class known to the label encoder.
num_classes = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)

# 11. Define Focal Loss
class FocalLoss(torch.nn.Module):
    """Multi-class focal loss: ``alpha_t * (1 - p_t) ** gamma * -log(p_t)``.

    ``p_t`` is the softmax probability the model assigns to the true class.

    Args:
        alpha: Optional per-class weights (tensor or sequence with one entry
            per class). ``None`` disables class weighting.
        gamma: Focusing parameter; ``0`` reduces the loss to (weighted)
            cross-entropy.
        reduction: ``'mean'`` or ``'sum'`` over the batch; any other value
            returns the per-sample losses.
    """

    def __init__(self, alpha=None, gamma=2, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for ``inputs`` (logits) against ``targets`` (class ids)."""
        # The cross-entropy is computed WITHOUT class weights so that
        # exp(-ce) is the true-class probability p_t. Folding the weights in
        # here would turn it into p_t ** alpha and distort the focusing term.
        ce_loss = torch.nn.functional.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.alpha is not None:
            # Align the weights with the logits' device and dtype so the loss
            # works wherever the model currently lives.
            alpha = torch.as_tensor(
                self.alpha, dtype=focal_loss.dtype, device=inputs.device
            )
            # Ignored targets (negative ids such as -100) already contribute a
            # zero loss; clamping only keeps the weight lookup in range.
            focal_loss = alpha[targets.clamp(min=0)] * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

focal_loss_fn = FocalLoss(alpha=class_weights, gamma=2)


def compute_loss_with_focal(model, inputs, return_outputs=False):
    """Run the model on a batch and score its logits with the focal loss.

    Args:
        model: The sequence-classification model to call.
        inputs: Batch mapping; its "labels" entry is held back from the model
            call and used as the loss target.
        return_outputs: When true, return ``(loss, outputs)`` instead of the
            loss alone.

    Returns:
        The focal loss, optionally paired with the model outputs.
    """
    labels = inputs.get("labels")
    outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
    logits = outputs.get("logits")
    # The class weights are created before the Trainer places the model on its
    # training device, so align them with the logits on first use.
    if focal_loss_fn.alpha is not None and focal_loss_fn.alpha.device != logits.device:
        focal_loss_fn.alpha = focal_loss_fn.alpha.to(logits.device)
    loss = focal_loss_fn(logits, labels)
    return (loss, outputs) if return_outputs else loss


class FocalLossTrainer(Trainer):
    """Trainer that optimizes the focal loss instead of the model's own loss.

    The Trainer computes the loss through its own ``compute_loss`` method, so
    the override has to live on the Trainer; assigning a ``compute_loss``
    attribute on the model is never consulted and leaves the focal loss unused.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # `num_items_in_batch` is accepted because recent Trainer versions pass
        # it; it is not needed since the focal loss averages over the batch.
        return compute_loss_with_focal(model, inputs, return_outputs=return_outputs)


# 12. Metrics
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair of ``(logits, labels)``; predictions are the argmax of
            the logits over the last axis.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    }


# 13. Training args
training_args = TrainingArguments(
    output_dir="./results_amazon_1000",
    eval_strategy="epoch",
    save_strategy="no",
    # Log once per epoch so the training loss is reported next to the
    # validation loss instead of "No log" on short runs.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs_amazon_1000",
    load_best_model_at_end=False,
    report_to="none"
)

# 14. Trainer
trainer = FocalLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# 15. Train
trainer.train()

# 16. Evaluate
# Run a final evaluation pass and print every returned metric with four
# decimal places.
eval_results = trainer.evaluate()
print("\n📊 Evaluation Results:")
for metric_name, metric_value in eval_results.items():
    print("{}: {:.4f}".format(metric_name, metric_value))

# 17. Save model
# Model weights/config and tokenizer files go into the same directory.
model_dir = "fine-tuned-amazon-model-1000"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# 18. ZIP export for Streamlit
package_dir = "streamlit_amazon_model_package"

# Rebuild the staging directory from scratch so files from an earlier run
# cannot leak into the archive.
shutil.rmtree(package_dir, ignore_errors=True)
os.makedirs(package_dir, exist_ok=True)
shutil.copytree(model_dir, os.path.join(package_dir, model_dir))
shutil.copy("label_encoder.pkl", os.path.join(package_dir, "label_encoder.pkl"))

# make_archive returns the path of the archive it wrote. Using that value for
# the download keeps the two from drifting apart, which could happen when the
# archive base name and the download name were hard-coded separately.
zip_filename = shutil.make_archive(package_dir, 'zip', package_dir)

files.download(zip_filename)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

  .apply(lambda x: x.sample(samples_per_class, random_state=42))
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.778915,0.913793,0.913337
2,No log,0.207933,1.0,1.0
3,No log,0.051499,1.0,1.0
4,No log,0.029785,1.0,1.0
5,No log,0.026218,1.0,1.0



📊 Evaluation Results:
eval_loss: 0.0262
eval_accuracy: 1.0000
eval_f1: 1.0000
eval_runtime: 0.8497
eval_samples_per_second: 68.2600
eval_steps_per_second: 9.4150
epoch: 5.0000


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>