# 🍽️ Sentiment Analysis Fine-tuning for Restaurant Reviews

This notebook fine-tunes DistilBERT for sentiment analysis on Zomato restaurant reviews.

**Steps:**
1. Load and preprocess Zomato reviews
2. Create balanced dataset with proper labels
3. Fine-tune DistilBERT
4. Save model to `models/sentiment/`

## 1. Install Dependencies

In [None]:
# !pip install transformers datasets evaluate accelerate pandas numpy tqdm

## 2. Imports

In [None]:
from datasets import Dataset, DatasetDict

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

import evaluate
import torch
import numpy as np
import pandas as pd
import ast
import re
from tqdm import tqdm
from pathlib import Path

## 3. Configuration

In [None]:
# --- Filesystem locations ---
RAW_DATA_PATH = "data/raw/zomato 2.csv"            # raw Zomato CSV
MODEL_SAVE_PATH = "models/sentiment/final_model"   # where the fine-tuned model is written

# --- Base checkpoint to fine-tune ---
MODEL_CHECKPOINT = "distilbert-base-uncased"

# --- Training hyper-parameters ---
MAX_LENGTH = 256       # token limit passed to the tokenizer
BATCH_SIZE = 16        # per-device batch size (training and evaluation)
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
SAMPLE_SIZE = 10000    # Number of restaurants to sample

# --- Label maps (binary classification like movie reviews) ---
id2label = {0: "Negative", 1: "Positive"}
# Inverse of id2label, derived so the two maps cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

print(f"Model checkpoint: {MODEL_CHECKPOINT}")
print(f"Save path: {MODEL_SAVE_PATH}")

## 4. Load and Preprocess Zomato Reviews

The `reviews_list` column contains a list of tuples: `[("Rated X.X", "review text"), ...]`

We need to:
1. Parse the string representation to actual list
2. Extract rating and review text
3. Convert rating to binary label: ≥4 → Positive, <3 → Negative (skip neutral ratings from 3 up to, but not including, 4)

In [None]:
# Load raw data: one row per restaurant, read from RAW_DATA_PATH.
print("Loading raw data...")
df = pd.read_csv(RAW_DATA_PATH)
print(f"Total restaurants: {len(df):,}")

# Sample for faster processing. A falsy SAMPLE_SIZE (None or 0) disables
# sampling; the fixed random_state keeps the sample reproducible.
if SAMPLE_SIZE and SAMPLE_SIZE < len(df):
    df = df.sample(n=SAMPLE_SIZE, random_state=42)
    print(f"Sampled: {len(df):,} restaurants")

In [None]:
def parse_reviews(reviews_str):
    """
    Parse one ``reviews_list`` cell into ``(rating, text)`` pairs.

    The cell is expected to hold the string form of a list of 2-tuples,
    e.g. ``[("Rated 4.0", "review text"), ...]``.

    Args:
        reviews_str: Raw cell value (normally a ``str``; NaN when missing).

    Returns:
        list[tuple[float, str]]: One ``(rating, cleaned_text)`` pair per usable
        review. The list is empty when the cell is missing, is ``'[]'``, or
        cannot be parsed as a list/tuple literal. Entries that are malformed,
        carry no numeric rating, or whose cleaned text is shorter than 20
        characters are skipped without discarding the remaining entries.
    """
    if pd.isna(reviews_str) or reviews_str == '[]':
        return []

    try:
        # literal_eval only accepts Python literals, so arbitrary CSV text
        # cannot execute code here.
        reviews = ast.literal_eval(str(reviews_str))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Failure modes literal_eval documents for malformed input.
        return []

    if not isinstance(reviews, (list, tuple)):
        return []

    parsed = []
    for entry in reviews:
        # A single broken entry must not throw away its valid siblings.
        try:
            rating_str, review_text = entry
        except (TypeError, ValueError):
            continue

        # Extract numeric rating from "Rated 4.0"
        rating_match = re.search(r'(\d+\.?\d*)', str(rating_str))
        if not rating_match:
            continue
        rating = float(rating_match.group(1))

        # Clean review text - remove the "RATED\n" prefix in both its real
        # newline and its escaped (backslash + n) forms.
        clean_text = str(review_text).replace('RATED\n', '').replace('RATED\\n', '').strip()

        # Skip empty or very short reviews
        if len(clean_text) < 20:
            continue

        parsed.append((rating, clean_text))

    return parsed

# Test parsing: run the parser on the first restaurant's reviews cell.
sample_reviews = df['reviews_list'].iloc[0]
parsed = parse_reviews(sample_reviews)
print(f"Sample parsed reviews: {len(parsed)} reviews")
if parsed:
    # Show the rating and the first 100 characters of the first parsed review.
    print(f"First review: Rating={parsed[0][0]}, Text={parsed[0][1][:100]}...")

In [None]:
def extract_all_reviews(df):
    """
    Flatten every restaurant's reviews into one labeled DataFrame.

    Binary labels: rating >= 4 -> Positive (1), rating < 3 -> Negative (0).
    Ratings with 3 <= rating < 4 are treated as neutral and skipped to get a
    cleaner training signal.

    Args:
        df: DataFrame with a ``reviews_list`` column in the format accepted by
            ``parse_reviews``.

    Returns:
        pandas.DataFrame with columns ``text`` (review text truncated to 1000
        characters) and ``label`` (0 or 1). Both columns exist even when no
        review qualifies.
    """
    all_reviews = []

    print("Extracting reviews from dataset...")
    # Only the reviews column is read, so iterate it directly rather than
    # building a Series per row with iterrows().
    for reviews_str in tqdm(df['reviews_list'], total=len(df)):
        for rating, text in parse_reviews(reviews_str):
            # Binary labeling (skip the neutral band for a clearer signal)
            if rating >= 4:
                label = 1  # Positive
            elif rating < 3:
                label = 0  # Negative
            else:
                continue  # Skip neutral (3 <= rating < 4)

            # Truncate very long reviews
            all_reviews.append({'text': text[:1000], 'label': label})

    # Explicit columns keep the schema stable for an empty result, so callers
    # can index ['label'] without a KeyError.
    return pd.DataFrame(all_reviews, columns=['text', 'label'])

# Build the labeled review table, then report its size and class balance.
reviews_df = extract_all_reviews(df)
print(f"\nTotal reviews extracted: {len(reviews_df):,}")
print("Class distribution:")
print(f"  Positive (1): {reviews_df['label'].eq(1).sum():,}")
print(f"  Negative (0): {reviews_df['label'].eq(0).sum():,}")

In [None]:
# Balance the dataset by undersampling the majority class
def balance_dataset(df):
    """
    Undersample the larger class so both labels have the same row count.

    Args:
        df: DataFrame with a ``label`` column holding 1 (positive) or 0 (negative).

    Returns:
        Shuffled DataFrame with a fresh 0..n-1 index and an equal number of
        rows per label. Sampling and shuffling both use ``random_state=42``.
    """
    by_label = {value: df[df['label'] == value] for value in (1, 0)}

    min_samples = min(len(by_label[1]), len(by_label[0]))
    print(f"Balancing to {min_samples:,} samples per class")

    # Positive rows first, then negative, so the concatenation order (and
    # therefore the seeded shuffle) is fixed.
    sampled_parts = [
        by_label[value].sample(n=min_samples, random_state=42)
        for value in (1, 0)
    ]

    shuffled = pd.concat(sampled_parts).sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

# Build the class-balanced table that the train/validation split is drawn from.
balanced_df = balance_dataset(reviews_df)
print(f"\nBalanced dataset size: {len(balanced_df):,}")

In [None]:
# Sanity check: show the first 200 characters of one review per class.
print("Sample positive review:")
pos_sample = balanced_df.loc[balanced_df['label'] == 1].iloc[0]
print(f"  {pos_sample['text'][:200]}...")

print("\nSample negative review:")
neg_sample = balanced_df.loc[balanced_df['label'] == 0].iloc[0]
print(f"  {neg_sample['text'][:200]}...")

## 5. Create HuggingFace Dataset

In [None]:
# Split into train and validation
from sklearn.model_selection import train_test_split

# Stratified 80/20 split so both splits keep the balanced label ratio;
# random_state makes the split reproducible.
train_df, val_df = train_test_split(
    balanced_df,
    test_size=0.2,
    random_state=42,
    stratify=balanced_df['label']
)

print(f"Training samples: {len(train_df):,}")
print(f"Validation samples: {len(val_df):,}")

# Create HuggingFace datasets.
# preserve_index=False: the split leaves a shuffled, non-default index that
# from_pandas would otherwise store as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df[['text', 'label']], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[['text', 'label']], preserve_index=False)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

# Trailing expression: display the DatasetDict summary as the cell output.
dataset

In [None]:
# Fraction of training rows labeled 1 (the mean of a 0/1 column).
np.mean(dataset['train']['label'])

## 6. Load Model and Tokenizer

In [None]:
# Load model for binary classification: start from MODEL_CHECKPOINT and
# request a 2-label sequence-classification model, passing the label maps
# defined in the configuration cell.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT, 
    num_labels=2, 
    id2label=id2label, 
    label2id=label2id
)

In [None]:
# Trailing expression: display the model's repr as the cell output.
model

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
# NOTE(review): add_prefix_space is normally an option of byte-level BPE
# tokenizers (GPT-2/RoBERTa style); it presumably has no effect for this
# DistilBERT checkpoint - confirm and consider dropping it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, add_prefix_space=True)

# Add pad token if it doesn't exist, then resize the model's token embeddings
# to the new tokenizer length so the added token has an embedding row.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

print(f"Vocab size: {len(tokenizer)}")

## 7. Tokenize Dataset

In [None]:
def tokenize_function(examples):
    """
    Tokenize a batch of reviews for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the review strings.

    Returns:
        The tokenizer output for the batch, truncated to ``MAX_LENGTH`` tokens
        and left unpadded (plain Python lists).
    """
    # Truncate from the left, i.e. keep the end of over-long reviews.
    # NOTE: this mutates the shared tokenizer, so later tokenizer calls in the
    # notebook truncate from the left as well.
    tokenizer.truncation_side = "left"

    # No return_tensors here: sequences in a batch have different lengths until
    # the data collator pads them, and requesting NumPy tensors would force a
    # ragged array, which some numpy/transformers combinations reject.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LENGTH,
    )

In [None]:
# Apply tokenize_function to every split in batches; the trailing expression
# displays the resulting dataset as the cell output.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

In [None]:
# Data collator for padding
# Padding collator built on the same tokenizer; passed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 8. Evaluation Metrics

In [None]:
# Accuracy metric from the `evaluate` library, loaded once and reused.
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    """
    Compute accuracy for one evaluation pass.

    Args:
        p: ``(predictions, labels)`` pair; predictions hold one score per class
           and are reduced with argmax over axis 1.

    Returns:
        dict: ``{"accuracy": <result of accuracy.compute(...)>}``.
    """
    scores, references = p
    predicted_ids = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted_ids, references=references)

    # NOTE(review): accuracy.compute() appears to return its own
    # {"accuracy": value} dict, which makes this result nested. The evaluation
    # and save cells read eval_results['eval_accuracy']['accuracy'] and rely on
    # that shape, but a nested value is presumably not comparable when the
    # Trainer tracks metric_for_best_model="accuracy" - confirm and flatten
    # this together with those readers.
    return {"accuracy": result}

## 9. Test Untrained Model (Baseline)

In [None]:
# Test with sample texts
text_list = [
    "The food was absolutely delicious! Best restaurant ever.",
    "Terrible experience. Food was cold and service was rude.",
    "Amazing ambiance and the pasta was incredible.",
    "Worst biryani I've ever had. Never coming back.",
    "Loved the desserts! Will definitely visit again."
]

print("Untrained model predictions:")
for text in text_list:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=-1).item()
    
    print(f"  {id2label[prediction]}: {text[:50]}...")

## 10. Training

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir=MODEL_SAVE_PATH + "_checkpoints",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_steps=50,
    warmup_ratio=0.1,
    fp16=torch.cuda.is_available(),
    report_to="none",  # Disable wandb
)

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Train the model
print("Starting training...")
trainer.train()

## 11. Evaluate Model

In [None]:
# Evaluate on validation set
eval_results = trainer.evaluate()

# The accuracy entry may be a plain number or the {"accuracy": value} dict
# produced by the `evaluate` metric; accept both shapes.
eval_accuracy = eval_results['eval_accuracy']
if isinstance(eval_accuracy, dict):
    eval_accuracy = eval_accuracy['accuracy']

print("\nValidation Results:")
print(f"  Accuracy: {eval_accuracy:.4f}")
print(f"  Loss: {eval_results['eval_loss']:.4f}")

## 12. Test Trained Model

In [None]:
# Test with sample texts
print("Trained model predictions:")
for text in text_list:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    
    # Move to same device as model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        prediction = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][prediction].item()
    
    print(f"  {id2label[prediction]} ({confidence:.2%}): {text[:50]}...")

In [None]:
# Test with actual Zomato reviews
zomato_test_reviews = [
    "The biryani was amazing! Perfect spices and tender meat. Must visit for biryani lovers.",
    "Pathetic service. Waited 45 minutes for cold food. Staff was extremely rude.",
    "Great ambiance for a romantic dinner. The pasta was creamy and delicious.",
    "Overpriced and underwhelming. The pizza was soggy and tasteless.",
    "Loved the live music and the cocktails. Perfect weekend hangout spot!"
]

print("\nZomato review predictions:")
for text in zomato_test_reviews:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        prediction = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][prediction].item()
    
    print(f"  {id2label[prediction]} ({confidence:.2%}): {text[:60]}...")

## 13. Save Model

In [None]:
# Create output directory
save_dir = Path(MODEL_SAVE_PATH)
save_dir.mkdir(parents=True, exist_ok=True)

# Save the model and tokenizer
print(f"Saving model to: {MODEL_SAVE_PATH}")
trainer.save_model(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

# Save training info
import json

# The accuracy entry may be a plain number or the {"accuracy": value} dict
# produced by the `evaluate` metric; accept both shapes.
eval_accuracy = eval_results['eval_accuracy']
if isinstance(eval_accuracy, dict):
    eval_accuracy = eval_accuracy['accuracy']

training_info = {
    "base_model": MODEL_CHECKPOINT,
    "num_epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    "max_length": MAX_LENGTH,
    "train_samples": len(train_df),
    "val_samples": len(val_df),
    # float() guarantees a JSON-serializable value even for NumPy scalars.
    "eval_accuracy": float(eval_accuracy),
    # json.dump writes the integer keys of id2label as strings.
    "id2label": id2label,
    "label2id": label2id
}

with open(save_dir / "training_info.json", "w", encoding="utf-8") as f:
    json.dump(training_info, f, indent=2)

print("✅ Model saved successfully!")

## 14. Verify Saved Model

In [None]:
# Load and verify the saved model
print("Loading saved model for verification...")

loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_SAVE_PATH)
loaded_model = AutoModelForSequenceClassification.from_pretrained(MODEL_SAVE_PATH)
loaded_model.eval()  # inference mode (dropout off) for the check below

# Test prediction
test_text = "This restaurant has the best food I've ever tasted!"
inputs = loaded_tokenizer(test_text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)

with torch.no_grad():
    outputs = loaded_model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()

# Read the label from the reloaded model's own config so this check also
# covers the persisted id2label mapping, not just the in-notebook dict.
print(f"✅ Loaded model prediction: {loaded_model.config.id2label[prediction]}")
print(f"   Test text: {test_text}")

---

## ✅ Training Complete!

The sentiment model has been saved to `models/sentiment/final_model`

You can now use this model in the restaurant recommendation agents.