This project aims to develop an AI-powered sentiment analysis model for movie reviews, classifying them as positive or negative using a pre-trained transformer model. The goal is to automate review analysis, helping audiences and industry professionals quickly gauge public opinion on films in real time. 🎬✨

Input: I didn't like the movie! It was bad.

Output: Predicted Sentiment: Negative

In [1]:
!pip install transformers datasets accelerate -q

In [2]:
import torch
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from torch.nn import functional as F
import os

In [3]:
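# W&B logging is turned off through report_to="none" in TrainingArguments below.
# Environment-variable alternative (left disabled) to avoid the W&B API key request:
# os.environ["WANDB_DISABLED"] = "true"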

In [4]:
# Prefer the GPU when PyTorch can see a CUDA device; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cpu


In [5]:
# Read each split from its local Parquet file and wrap it as a Hugging Face
# Dataset, one split at a time.
train_df = pd.read_parquet(r"data/train.parquet")
train_dataset = Dataset.from_pandas(train_df)

test_df = pd.read_parquet(r"data/test.parquet")
test_dataset = Dataset.from_pandas(test_df)

# Bundle both splits so they can be processed together under their split names.
dataset = DatasetDict(
    {
        "train": train_dataset,
        "test": test_dataset,
    }
)

In [6]:
# Checkpoint to fine-tune: the uncased DistilBERT base model.
model_name = "distilbert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
# Preprocessing function
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of ``examples`` with the global tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping with a ``"text"`` key holding the text(s) to encode.

    Returns:
        The tokenizer's output for ``examples["text"]``.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

In [8]:
# `tokenize_function` is already defined in the preceding cell; the identical
# redefinition that used to live here was redundant and has been dropped.

# Tokenize both splits, handing the function whole batches of examples.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Return PyTorch tensors and expose only the columns listed here.
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [9]:
# `model_name` and `tokenizer` were already created in an earlier cell and are
# reused as-is; reloading them here was redundant.

# Load the checkpoint with a two-label sequence-classification head and move it
# to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Training configuration: a single epoch with small batches, evaluating and
# checkpointing once per epoch.
training_args = TrainingArguments(
    # Output locations.
    output_dir="./results",
    logging_dir="./logs",
    # Reporting: no external logging integration; log every 10 steps.
    report_to="none",
    logging_steps=10,
    # Evaluate and save on the same per-epoch schedule.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
)

# Tie the model, the training configuration and the tokenized splits together.
# The "test" split is used as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run fine-tuning with the model, arguments and datasets held by `trainer`.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
def predict_sentiment(text):
    """Classify a review as ``"Positive"`` or ``"Negative"``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. Intended for
    a single review: the arg-max is taken over the whole output tensor, so a
    batch of texts is not supported.

    Args:
        text: The review text to classify.

    Returns:
        ``"Positive"`` when class index 1 scores highest, otherwise
        ``"Negative"``.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    # Inference only: put the model in evaluation mode (disables dropout) and
    # skip autograd tracking so no gradient graph is built for the forward pass.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment = "Positive" if torch.argmax(probs) == 1 else "Negative"
    return sentiment

# Smoke-test the classifier on one example review.
test_text = "I didn't like the movie! It was bad."
predicted = predict_sentiment(test_text)
print("Predicted Sentiment:", predicted)


Predicted Sentiment: Negative
