In [4]:
# Load the dataset
from  datasets import load_dataset
dataset = load_dataset("stanfordnlp/imdb")  
train_data = dataset["train"]
test_data = dataset["test"]

train_data[0]
# test_data[0]
# print(test_data[0])

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [2]:
# def get_movie_title():
#
#     sample_titles = []
#     with open("imdb_top_movies.txt", "r") as f:
#         for l in f.readlines():
#             line = l.strip()
#             sample_titles.append(line)
#
#         print("movie titles",sample_titles)
#     return sample_titles


In [5]:
import re
import unicodedata
import pandas as pd
import requests
import os
from dotenv import load_dotenv
from time import sleep
import spacy

# ------------------------------
# Load API key and spaCy model
# ------------------------------
load_dotenv()
API_KEY = os.getenv("API_KEY")
BASE_URL = "https://api.themoviedb.org/3"

nlp = spacy.load("en_core_web_sm")  # NER model

# ------------------------------
# Normalize text function
# ------------------------------
def normalize_text(text):
    return unicodedata.normalize("NFKC", text).lower()

# ------------------------------
# Movie titles
# ------------------------------

#
sample_titles = [
   "The Shawshank Redemption", "The Godfather", "The Dark Knight", "Pulp Fiction", "Forrest Gump",
    "Fight Club", "Inception", "The Matrix", "Goodfellas", "The Lord of the Rings: The Return of the King",
    "Interstellar", "Parasite", "The Silence of the Lambs", "Saving Private Ryan", "Schindler’s List",
    "Gladiator", "Titanic", "The Green Mile", "The Departed", "Django Unchained",
    "The Prestige", "Whiplash", "The Lion King", "Toy Story", "Avengers: Endgame",
    "Avengers: Infinity War", "Iron Man", "Black Panther", "Joker", "The Social Network",
    "The Wolf of Wall Street", "La La Land", "Mad Max: Fury Road", "The Revenant", "Get Out",
    "Oppenheimer", "Barbie", "Dune", "The Batman", "Spider-Man: No Way Home",
    "Everything Everywhere All at Once", "The Irishman", "12 Years a Slave", "Moonlight", "Spotlight",
    "Birdman", "Arrival", "Blade Runner 2049", "No Country for Old Men", "The Big Short",
    "The Hateful Eight", "Once Upon a Time in Hollywood", "There Will Be Blood",
    "The Curious Case of Benjamin Button", "The Shape of Water", "The Theory of Everything",
    "Bohemian Rhapsody", "Rocketman", "A Star Is Born", "The Imitation Game", "The King's Speech",
    "Slumdog Millionaire", "Life of Pi", "Gravity", "Cast Away", "The Truman Show",
    "Eternal Sunshine of the Spotless Mind", "Requiem for a Dream", "American Beauty", "The Sixth Sense",
    "Se7en", "The Usual Suspects", "Memento", "Oldboy", "Pan’s Labyrinth",
    "Amélie", "The Pianist", "The Lives of Others", "City of God", "Crouching Tiger, Hidden Dragon",
    "Spirited Away", "Howl’s Moving Castle", "Princess Mononoke", "My Neighbor Totoro", "WALL·E",
    "Up", "Inside Out", "Coco", "Soul", "Minari", "The Banshees of Inisherin",
    "Casino Royale", "South Park: Bigger, Longer & Uncut", "A Fistful of Dollars", "Rosemary's Baby",
    "The Incredibles", "Black Swan", "Deadpool", "The Breakfast Club", "The Untouchables",
    "Shaun of the Dead", "True Romance", "Harry Potter and the Prisoner of Azkaban", "Hot Fuzz",
    "In Bruges", "Boyhood", "Straight Outta Compton", "Drive", "Moneyball", "Brazil", "Chronicle",
    "Still Alice", "Triangle", "The Endless", "The Man from Earth",
    "The Secret in Their Eyes", "The Fall", "The Hunt", "Incendies", "The Intouchables",
    "Prisoners", "Enemy", "Locke", "The Lobster", "Under the Skin",
    "Ex Machina", "Annihilation", "The Florida Project", "Room", "Brooklyn",
    "Carol", "The Farewell", "Portrait of a Lady on Fire", "The Handmaiden", "Shoplifters",
    "A Separation", "Toni Erdmann", "Cold War", "Wild Tales", "The Square"
]

# normalize titles for regex patterns
sample_titles_norm = [normalize_text(t) for t in sample_titles]

title_patterns = {
    title: re.compile(r"(?<!\w)" + re.escape(title) + r"(?!\w)", re.IGNORECASE)
    for title in sample_titles_norm
}

# ------------------------------
# Detect titles function
# ------------------------------
def detect_titles_regex(text, patterns):
    text_norm = normalize_text(text)
    detected = []
    for title, pattern in patterns.items():
        if pattern.search(text_norm):
            detected.append(title)
    return detected

# ------------------------------
# TMDb metadata caching
# ------------------------------
metadata_cache = {}

def get_movie_metadata(title):
    title_key = title.strip().lower()
    if title_key in metadata_cache:
        return metadata_cache[title_key]

    query_title = title.strip().title()  # proper case for API
    search_url = f"{BASE_URL}/search/movie?api_key={API_KEY}&query={query_title}"
    try:
        search_response = requests.get(search_url).json()
    except requests.RequestException:
        return None

    if not search_response.get("results"):
        return None

    movie_id = search_response["results"][0]["id"]

    credits_url = f"{BASE_URL}/movie/{movie_id}/credits?api_key={API_KEY}"
    try:
        credits_response = requests.get(credits_url).json()
    except requests.RequestException:
        return None

    actors = [member["name"] for member in credits_response.get("cast", [])[:7]]
    directors = [member["name"] for member in credits_response.get("crew", []) if member["job"] == "Director"]

    result = {"actors": actors, "directors": directors}
    metadata_cache[title_key] = result
    sleep(0.25)  # rate limit
    return result

# ------------------------------
# Enrich with metadata
# ------------------------------
def enrich_with_metadata(row):
    titles = row.get("detected_titles", [])
    if not titles:
        row["actors"] = None
        row["directors"] = None
        row["ner_entities"] = []
        return row

    metadata = get_movie_metadata(titles[0])
    row["actors"] = metadata.get("actors") if metadata else []
    row["directors"] = metadata.get("directors") if metadata else []

    # ------------------------------
    # Use spaCy NER to detect persons in review
    # ------------------------------
    doc = nlp(row["text"])
    persons_in_review = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

    # ------------------------------
    # Link detected persons to movie metadata
    # ------------------------------
    relevant_entities = []
    for person in persons_in_review:
        if person in row["actors"] or person in row["directors"]:
            relevant_entities.append(person)

    row["ner_entities"] = relevant_entities
    return row

# ------------------------------
# Load dataset
# ------------------------------
train_df = pd.DataFrame(train_data)

# Detect titles
train_df["detected_titles"] = train_df["text"].apply(lambda x: detect_titles_regex(x, title_patterns))

# Filter rows with at least one detected title
matched_reviews = train_df[train_df["detected_titles"].map(len) > 0].copy()

# Initialize columns
matched_reviews["actors"] = None
matched_reviews["directors"] = None
matched_reviews["ner_entities"] = None

# Enrich with metadata and NER
matched_reviews = matched_reviews.apply(enrich_with_metadata, axis=1)

# ------------------------------
# Save final DataFrame - for development purposes
# ------------------------------
matched_reviews.to_csv("matched_reviews_with_metadata_ner.csv", index=False)
print("Pipeline complete. Saved matched reviews with NER metadata.")

#Took about 38 minutes to run the full pipeline on the training set.


Pipeline complete. Saved matched reviews with NER metadata.


In [6]:
filtered = matched_reviews[matched_reviews["detected_titles"] != "up"][10:]
filtered.head(10)




Unnamed: 0,text,label,detected_titles,actors,directors,ner_entities
28,Some films that you pick up for a pound turn o...,0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
29,"I received this movie as a gift, I knew from t...",0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
30,I have not seen many low budget films i must a...,0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
31,"..Oh wait, I can! This movie is not for the ty...",0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
33,THE ZOMBIE CHRONICLES <br /><br />Aspect ratio...,0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
42,WARNING: This review contains SPOILERS. Do not...,0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
44,"Jill Dunne (played by Mitzi Kapture), is an at...",0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
45,This movie sucked. It really was a waste of my...,0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
46,Lifetime did it again. Can we say stupid? I co...,0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]
49,The annoying mouse and lullaby really got to m...,0,[up],"[Ed Asner, Christopher Plummer, Jordan Nagai, ...",[Pete Docter],[]


In [14]:
# Baseline Classification Models without NER Metadata

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Prepare data for modeling
df = matched_reviews.copy()

# Ensure text is string
df["text"] = df["text"].astype(str)

X = df["text"]
y = df["label"]     # 0 = negative, 1 = positive

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    max_features=20_000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)



In [15]:
# Logistic Regression Model
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train_vec, y_train)

# Evaluation
pred_lr = log_reg.predict(X_test_vec)

print("==== Logistic Regression ====")
print("Accuracy:", accuracy_score(y_test, pred_lr))
print("F1 Score:", f1_score(y_test, pred_lr))
print(classification_report(y_test, pred_lr))

==== Logistic Regression ====
Accuracy: 0.8694191070571291
F1 Score: 0.8698564593301435
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      1064
           1       0.85      0.89      0.87      1019

    accuracy                           0.87      2083
   macro avg       0.87      0.87      0.87      2083
weighted avg       0.87      0.87      0.87      2083



In [16]:
# SVM Model
svm_clf = LinearSVC()
svm_clf.fit(X_train_vec, y_train)

# Evaluation
pred_svm = svm_clf.predict(X_test_vec)

print("==== Linear SVM ====")
print("Accuracy:", accuracy_score(y_test, pred_svm))
print("F1 Score:", f1_score(y_test, pred_svm))
print(classification_report(y_test, pred_svm))


==== Linear SVM ====
Accuracy: 0.8684589534325492
F1 Score: 0.8676328502415459
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      1064
           1       0.85      0.88      0.87      1019

    accuracy                           0.87      2083
   macro avg       0.87      0.87      0.87      2083
weighted avg       0.87      0.87      0.87      2083



In [17]:
# Baseline Classification Models with NER Metadata
import numpy as np
from textblob import TextBlob

def compute_entity_features(row):
    titles = row["detected_titles"]

    # actors/directors are lists, not strings
    actors = row.get("actors", [])
    directors = row.get("directors", [])

    num_titles = len(titles)
    num_actors = len(actors)
    num_directors = len(directors)

    text = row["text"].lower()

    # Count actor mentions
    actor_mentions = 0
    for a in actors:
        actor_mentions += text.count(a.lower())

    # Count director mentions
    director_mentions = 0
    for d in directors:
        director_mentions += text.count(d.lower())

    # Sentiment toward entity names
    entity_tokens = actors + directors
    entity_sentiment = 0

    if entity_tokens:
        combined = " ".join(entity_tokens)
        try:
            entity_sentiment = TextBlob(combined).sentiment.polarity
        except:
            entity_sentiment = 0

    return pd.Series({
        "num_titles": num_titles,
        "num_actors": num_actors,
        "num_directors": num_directors,
        "actor_mentions": actor_mentions,
        "director_mentions": director_mentions,
        "entity_sentiment": entity_sentiment
    })


# Compute entity features
entity_features = matched_reviews.apply(compute_entity_features, axis=1)
full_df = pd.concat([matched_reviews, entity_features], axis=1)

In [18]:
from scipy.sparse import hstack

# TF-IDF Vectorization on review text with NER features
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_text = tfidf.fit_transform(full_df["text"])

# Combine text features with NER features
X_entity = full_df[[
    "num_titles", "num_actors", "num_directors",
    "actor_mentions", "director_mentions",
    "entity_sentiment"
]].fillna(0).values

X = hstack([X_text, X_entity])
y = full_df["label"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [23]:
# Logistic Regression with NER features
log_clf = LogisticRegression(max_iter=500)
log_clf.fit(X_train, y_train)

pred_log = log_clf.predict(X_test)

print("==== Entity-Aware Logistic Regression ====")
print("Accuracy:", accuracy_score(y_test, pred_log))
print("F1 Score:", f1_score(y_test, pred_log))
print(classification_report(y_test, pred_log))

==== Entity-Aware Logistic Regression ====
Accuracy: 0.8665386461833894
F1 Score: 0.8635917566241413
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      1086
           1       0.85      0.88      0.86       997

    accuracy                           0.87      2083
   macro avg       0.87      0.87      0.87      2083
weighted avg       0.87      0.87      0.87      2083



In [24]:
# SVM with NER features
svm_clf = LinearSVC()
svm_clf.fit(X_train, y_train)

pred_svm = svm_clf.predict(X_test)

print("==== Entity-Aware SVM ====")
print("Accuracy:", accuracy_score(y_test, pred_svm))
print("F1 Score:", f1_score(y_test, pred_svm))
print(classification_report(y_test, pred_svm))

==== Entity-Aware SVM ====
Accuracy: 0.8679788766202592
F1 Score: 0.8645987198424422
              precision    recall  f1-score   support

           0       0.89      0.86      0.87      1086
           1       0.85      0.88      0.86       997

    accuracy                           0.87      2083
   macro avg       0.87      0.87      0.87      2083
weighted avg       0.87      0.87      0.87      2083

