# **Movie Review Sentiment Analysis**

In [None]:
import os
# Switch off the Weights & Biases integration for this session via its environment flag.
os.environ.update({"WANDB_DISABLED": "true"})


In [None]:
# STEP 1 — Install dependencies
!pip install --upgrade transformers datasets accelerate evaluate
!pip install --upgrade transformers==4.44.2 accelerate==0.34.2

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import evaluate

# STEP 2 — Load your Kaggle IMDB dataset
# If using Google Colab, first upload the CSV file or mount Google Drive
DATA_PATH = "/content/IMDB Dataset.csv"  # update with your actual path
LABEL_MAP = {'negative': 0, 'positive': 1}

df = pd.read_csv(DATA_PATH)

# Map labels to integers.
# Series.map() turns every value that is not a key of LABEL_MAP into NaN, which
# would silently produce float/NaN labels and only blow up later during
# training, so fail right here with the offending values instead.
df['label'] = df['sentiment'].map(LABEL_MAP)
unmapped_rows = df['label'].isna()
if unmapped_rows.any():
    bad_values = df.loc[unmapped_rows, 'sentiment'].unique().tolist()
    raise ValueError(f"Unexpected sentiment values in {DATA_PATH!r}: {bad_values}")
df = df[['review', 'label']]

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Train-Test split (80/20, fixed seed so the split is reproducible)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']

# STEP 3 — Tokenizer
# The checkpoint name is kept in `model_name` because the model itself is
# loaded from the same checkpoint further down.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def tokenize(batch, max_length=256):
    """Tokenize the ``"review"`` texts of a batch with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``"review"`` entry holding the text(s) to encode;
            this is what ``Dataset.map(tokenize, batched=True)`` passes in.
        max_length: Length that every sequence is truncated and padded to.
            Defaults to 256, the value this notebook was written with.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        batch["review"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Run the tokenizer over both splits in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Expose only the model inputs and the label, as torch tensors.
for _split in (train_dataset, test_dataset):
    _split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])




Collecting transformers
  Downloading transformers-4.55.2-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvi

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# STEP 4 — Load Pretrained Model
# Two output classes: 0 = negative, 1 = positive (matches the label mapping used for the data).
num_sentiment_classes = 2
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_sentiment_classes,
)

# STEP 5 — Evaluation function
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``; each example's predicted
    class is the index of its largest logit along the last axis.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = torch.tensor(raw_scores).argmax(dim=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

# STEP 6 — Evaluate BEFORE Fine-tuning
# No `args` are passed here, so this Trainer runs with its own default settings.
baseline_setup = {
    "model": model,
    "eval_dataset": test_dataset,
    "tokenizer": tokenizer,
    "compute_metrics": compute_metrics,
}
trainer_before = Trainer(**baseline_setup)
print("📊 Accuracy BEFORE fine-tuning:")
baseline_metrics = trainer_before.evaluate()
print(baseline_metrics)

# STEP 7 — Training arguments
# NOTE: W&B logging is switched off through the WANDB_DISABLED environment
# variable set at the top of the notebook; the library reports that variable
# as deprecated in favour of the `report_to` argument.
training_args = TrainingArguments(
    output_dir="./sentiment_model",
    # `eval_strategy` is the current name of this option; the old spelling
    # `evaluation_strategy` is deprecated and rejected by newer releases.
    eval_strategy="epoch",       # evaluate at the end of every epoch
    save_strategy="epoch",       # checkpoint at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


📊 Accuracy BEFORE fine-tuning:


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


{'eval_loss': 0.699142575263977, 'eval_model_preparation_time': 0.0142, 'eval_accuracy': 0.5065, 'eval_runtime': 145.2095, 'eval_samples_per_second': 68.866, 'eval_steps_per_second': 8.608}


In [None]:
# STEP 8 — Fine-tuning
finetune_setup = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**finetune_setup)

trainer.train()

# STEP 9 — Evaluate AFTER Fine-tuning
print("📊 Accuracy AFTER fine-tuning:")
final_metrics = trainer.evaluate()
print(final_metrics)



Epoch,Training Loss,Validation Loss,Accuracy
1,0.2055,0.236513,0.9271
2,0.1529,0.227554,0.9412
3,0.1119,0.265711,0.9419


📊 Accuracy AFTER fine-tuning:


{'eval_loss': 0.26571083068847656, 'eval_accuracy': 0.9419, 'eval_runtime': 141.999, 'eval_samples_per_second': 70.423, 'eval_steps_per_second': 4.401, 'epoch': 3.0}


In [None]:
# STEP 10 — Save Model
# Model weights first, then the tokenizer files, both into the same folder.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("./fine_tuned_sentiment")

# STEP 11 — Prediction Function
def predict_sentiment(text):
    """Classify a review as ``"Positive"`` or ``"Negative"``.

    Args:
        text: Review text to classify.

    Returns:
        ``"Positive"`` when the highest-scoring class index is 1,
        otherwise ``"Negative"``.
    """
    # Put the model in inference mode explicitly so the prediction does not
    # depend on whether the model was last left in training or evaluation mode.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256)
    # Move input tensors to the same device as the model
    device = model.device
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    return "Positive" if prediction == 1 else "Negative"

# STEP 12 — Test User Input
while True:
    try:
        user_text = input("Enter a review (or 'quit' to stop): ")
    except (EOFError, KeyboardInterrupt):
        # No interactive input available, or the user interrupted the prompt:
        # leave the loop instead of surfacing a traceback.
        break
    command = user_text.strip().lower()
    if command == "quit":
        # Surrounding whitespace no longer prevents the exit command from matching.
        break
    if not command:
        # Nothing was typed; ask again rather than classifying an empty review.
        continue
    print("Predicted Sentiment:", predict_sentiment(user_text))

Enter a review (or 'quit' to stop): movie was good
Predicted Sentiment: Positive
Enter a review (or 'quit' to stop): i loved it
Predicted Sentiment: Positive
Enter a review (or 'quit' to stop): stop
Predicted Sentiment: Negative
Enter a review (or 'quit' to stop): quit


In [None]:
import shutil
from google.colab import files

# Pack the saved model folder into fine_tuned_sentiment.zip in the working directory
shutil.make_archive(
    base_name="fine_tuned_sentiment",
    format='zip',
    root_dir="fine_tuned_sentiment",
)

# Hand the archive to the browser for download
files.download(filename="fine_tuned_sentiment.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload the fine-tuned artifacts from the saved folder.
# This rebinds the notebook-level `model` and `tokenizer` names.
model_path = "fine_tuned_sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)



In [9]:
# Prediction function
def predict_sentiment(text):
    """Predict the sentiment class of a review along with class probabilities.

    Args:
        text: Review text to classify.

    Returns:
        A ``(predicted_class, probs)`` tuple: the index of the most likely
        class as an int, and the softmax probabilities as a 2-D NumPy array
        (one row for the input text).
    """
    # Explicit inference mode, so the result does not depend on prior model state.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # The inputs must live on the same device as the model; without this the
    # forward pass fails whenever the model is not on the CPU.
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=1).item()
    # .cpu() is a no-op for CPU tensors and required before .numpy() otherwise.
    return predicted_class, probs.cpu().numpy()

# Quick check of the reloaded model on a single sentence
text = "The movie is hit"
label, probabilities = predict_sentiment(text)
print(f"Predicted Label: {label}, Probabilities: {probabilities}")
# Class index 1 is the positive class; anything else is reported as negative.
verdict = "Positive review" if label == 1 else "Negative review"
print(verdict)


Predicted Label: 1, Probabilities: [[0.00253933 0.9974606 ]]
Positive review


In [12]:
# # --- Professional Sentiment UI (ipywidgets, styled) ---
# import torch
# import ipywidgets as widgets
# from IPython.display import display, clear_output

# # Styled header
# header = widgets.HTML(
#     """
#     <div style="
#         background: linear-gradient(135deg, #1e3a8a, #3b82f6);
#         color: white;
#         padding: 14px 18px;
#         border-radius: 14px 14px 0 0;
#         font-size: 20px;
#         font-weight: 600;
#         box-shadow: 0 2px 6px rgba(0,0,0,0.2);
#         ">
#         🎬 IMDB Sentiment Analyzer — RoBERTa-base
#     </div>
#     """
# )

# # Text input
# review_input = widgets.Textarea(
#     placeholder="✍️ Type or paste a movie review here...",
#     layout=widgets.Layout(width="100%", height="140px", border="1px solid #cbd5e1", border_radius="10px"),
# )

# # Predict button
# predict_btn = widgets.Button(
#     description="🔎 Analyze Sentiment",
#     tooltip="Run model",
#     layout=widgets.Layout(width="220px", height="42px")
# )
# try:
#     predict_btn.style.button_color = "#2563eb"  # Tailwind blue-600
# except Exception:
#     pass

# # Output area with styling
# out = widgets.Output()

# # Prediction function
# def predict_with_probs(text: str):
#     inputs = tokenizer(
#         text,
#         return_tensors="pt",
#         truncation=True,
#         padding="max_length",
#         max_length=256
#     )
#     device = model.device
#     inputs = {k: v.to(device) for k, v in inputs.items()}
#     with torch.no_grad():
#         logits = model(**inputs).logits
#         probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
#         pred_idx = int(probs.argmax())
#     label = "Positive" if pred_idx == 1 else "Negative"
#     return label, probs

# # Button click
# def on_click_predict(_):
#     with out:
#         clear_output()
#         text = review_input.value.strip()
#         if not text:
#             display(widgets.HTML("<div style='color:#ef4444;font-weight:500;'>⚠️ Please enter a review first.</div>"))
#             return
#         try:
#             label, probs = predict_with_probs(text)
#             color = "#16a34a" if label == "Positive" else "#dc2626"  # green vs red
#             display(widgets.HTML(
#                 f"""
#                 <div style="padding:12px; border-radius:10px; background:#f8fafc;
#                             box-shadow:0 1px 4px rgba(0,0,0,0.1); font-size:16px;">
#                     <b style="color:{color}; font-size:18px;">Prediction: {label}</b><br>
#                     <span style="color:#334155;">Confidence →
#                         Negative: {probs[0]:.3f} | Positive: {probs[1]:.3f}
#                     </span>
#                 </div>
#                 """
#             ))
#         except Exception as e:
#             display(widgets.HTML(f"<div style='color:#ef4444;'>❌ Oops: {e}</div>"))

# predict_btn.on_click(on_click_predict)

# # Card layout (overall styling)
# card = widgets.VBox(
#     [
#         header,
#         widgets.HTML('<div style="color:#1e3a8a;font-weight:600;margin:6px 0;">📝 Enter Review</div>'),
#         review_input,
#         widgets.HBox([predict_btn]),
#         widgets.HTML('<div style="height:12px;"></div>'),
#         out
#     ],
#     layout=widgets.Layout(
#         border="1px solid #dbeafe",
#         padding="0px",
#         width="100%",
#         border_radius="14px",
#         box_shadow="0 4px 10px rgba(0,0,0,0.1)"
#     )
# )

# display(card)


VBox(children=(HTML(value='\n    <div style="\n        background: linear-gradient(135deg, #1e3a8a, #3b82f6);\…