In [1]:
# Load the "stanfordnlp/imdb" dataset and keep handles to its two splits
from datasets import load_dataset

dataset = load_dataset("stanfordnlp/imdb")
train_data, test_data = dataset["train"], dataset["test"]

# Display the first training example as the cell output
train_data[0]

  from .autonotebook import tqdm as notebook_tqdm


{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [4]:
import requests
from dotenv import load_dotenv
import os

# Pull variables from the local .env file into the environment, then read
# the key that get_movie_metadata sends to api.themoviedb.org.
load_dotenv()
api_key = os.environ.get("API_KEY")

# Define a Function to Fetch movie metadata --> get actors and directors from the movie title
def get_movie_metadata(title):
    """Look up a movie on TMDB by title and return its main cast and directors.

    The first search hit for ``title`` is used. The module-level ``api_key``
    is sent with both requests.

    Args:
        title: Movie title to search for.

    Returns:
        ``{"actors": [...], "directors": [...]}`` where ``actors`` holds the
        names of at most the first 7 cast entries and ``directors`` the names
        of crew entries whose job is ``"Director"``; ``None`` if the search
        returned no results.

    Raises:
        requests.HTTPError: if the search request returns an error status
            (e.g. a missing or invalid API key).
        requests.RequestException: on connection problems or timeouts.
    """
    # Let requests build the query string so titles containing "&", "#", "+"
    # or spaces are percent-encoded instead of corrupting the URL.
    search_response = requests.get(
        "https://api.themoviedb.org/3/search/movie",
        params={"api_key": api_key, "query": title},
        timeout=10,  # never hang the whole notebook on a stalled connection
    )
    # Fail with a clear HTTP error rather than a KeyError on the error payload.
    search_response.raise_for_status()

    results = search_response.json().get("results")
    if not results:
        return None

    movie_id = results[0]["id"]

    # Get credits (cast and crew); missing keys are treated as empty lists.
    credits_response = requests.get(
        f"https://api.themoviedb.org/3/movie/{movie_id}/credits",
        params={"api_key": api_key},
        timeout=10,
    ).json()

    actors = [member["name"] for member in credits_response.get("cast", [])[:7]]
    directors = [
        member["name"]
        for member in credits_response.get("crew", [])
        if member.get("job") == "Director"
    ]

    return {"actors": actors, "directors": directors}

In [5]:
import re

# Hand-picked movie titles to look for inside the reviews
sample_titles = [
    "The Dark Knight",
    "Titanic",
    "Inception",
    "Pulp Fiction",
    "Fight Club",
    "The Godfather",
    "Forrest Gump",
]

# One case-insensitive, word-bounded regex per title, keyed by the title itself
title_patterns = {}
for _title in sample_titles:
    title_patterns[_title] = re.compile(rf"\b{re.escape(_title)}\b", flags=re.IGNORECASE)


In [6]:
def detect_titles_regex(text, patterns):
    """Return the titles whose compiled pattern is found somewhere in ``text``.

    Args:
        text: Review text to scan.
        patterns: Mapping of title -> compiled regular expression.

    Returns:
        List of matching titles, in the iteration order of ``patterns``.
    """
    return [name for name, regex in patterns.items() if regex.search(text)]

In [7]:
import pandas as pd
train_df = pd.DataFrame(dataset["train"])

# Tag every review with the sample titles it mentions (possibly none)
train_df["detected_titles"] = train_df["text"].map(
    lambda review: detect_titles_regex(review, title_patterns)
)

# Keep only the reviews that mention at least one sample title
mentions_any_title = train_df["detected_titles"].map(len) > 0
matched_reviews = train_df.loc[mentions_any_title]


In [8]:
def enrich_with_metadata(row):
    """Return a copy of ``row`` with ``actors`` and ``directors`` columns added.

    Metadata is fetched with ``get_movie_metadata`` for the first entry of
    ``row["detected_titles"]`` only. Both columns are always present in the
    result: they hold comma-separated names when metadata was found and an
    empty string otherwise, so later string operations such as ``.split(",")``
    never encounter NaN.

    Args:
        row: A row (e.g. a pandas Series passed by ``DataFrame.apply``) that
            contains a ``detected_titles`` list.

    Returns:
        A new row object; the input ``row`` is left unmodified.
    """
    # Copy first so the caller's row is never mutated as a side effect.
    enriched = row.copy()
    enriched["actors"] = ""
    enriched["directors"] = ""

    titles = row["detected_titles"]
    if titles:
        metadata = get_movie_metadata(titles[0])
        if metadata:
            enriched["actors"] = ", ".join(metadata["actors"])
            enriched["directors"] = ", ".join(metadata["directors"])
    return enriched

# Enrich every matched review row by row.
# A full pass took about 1 minute 20 seconds when this was recorded.
matched_reviews = matched_reviews.apply(enrich_with_metadata, axis="columns")

In [9]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline; `nlp` is called on each review
# text by match_named_entities.
nlp = spacy.load("en_core_web_sm")

In [15]:
# matching named entities
def _match_candidates(entity_names, candidates):
    """Return the candidates (input order, de-duplicated) that overlap any entity name."""
    matched = []
    for candidate in candidates:
        key = candidate.strip().lower()
        # An empty key would be a substring of every entity name.
        if not key or candidate in matched:
            continue
        if any(key in name or name in key for name in entity_names):
            matched.append(candidate)
    return matched


def match_named_entities(text, actor_list, director_list, pipeline=None):
    """Find which known actors/directors are mentioned as PERSON entities in ``text``.

    A candidate counts as mentioned when its lower-cased name contains, or is
    contained in, the lower-cased text of some PERSON entity (so a surname
    alone matches the full name).

    Args:
        text: Review text to analyse.
        actor_list: Actor names to look for.
        director_list: Director names to look for.
        pipeline: Optional callable returning an object with ``.ents`` (each
            entity exposing ``.text`` and ``.label_``). Defaults to the
            module-level ``nlp`` pipeline.

    Returns:
        Dict with ``mentions_actor`` / ``mentions_director`` booleans and
        ``matched_actors`` / ``matched_directors`` lists. The lists follow the
        order of the input lists, so results are reproducible across runs.
    """
    if pipeline is None:
        pipeline = nlp
    doc = pipeline(text)

    # Collect the PERSON entity texts once; blank names are dropped because an
    # empty string is a substring of every candidate and would match them all.
    person_names = []
    for ent in doc.ents:
        if ent.label_ != "PERSON":
            continue
        name = ent.text.strip().lower()
        if name:
            person_names.append(name)

    matched_actors = _match_candidates(person_names, actor_list)
    matched_directors = _match_candidates(person_names, director_list)

    return {
        "mentions_actor": bool(matched_actors),
        "mentions_director": bool(matched_directors),
        "matched_actors": matched_actors,
        "matched_directors": matched_directors,
    }


In [16]:
# Apply NER matching to rows with metadata
# (original timing note read "takes about 45"; the unit was not recorded)
def _split_names(value):
    """Split a comma-separated name string into clean names; non-strings (None/NaN) give []."""
    if not isinstance(value, str):
        return []
    return [part.strip() for part in value.split(",") if part.strip()]


# Run NER once per review and collect the result dicts in row order.
ner_results = [
    match_named_entities(
        row["text"],
        _split_names(row.get("actors")),
        _split_names(row.get("directors")),
    )
    for _, row in matched_reviews.iterrows()
]

# Assign whole columns at once so the flag columns get a real bool dtype
# instead of being grown cell by cell as object columns.
matched_reviews["mentions_actor"] = [res["mentions_actor"] for res in ner_results]
matched_reviews["mentions_director"] = [res["mentions_director"] for res in ner_results]
matched_reviews["matched_actors"] = [", ".join(res["matched_actors"]) for res in ner_results]
matched_reviews["matched_directors"] = [", ".join(res["matched_directors"]) for res in ner_results]

In [17]:
# Add a 20-character preview of each review so the printed table stays narrow
matched_reviews["text_short"] = matched_reviews["text"].str[:20]

preview_columns = [
    "text_short",
    "detected_titles",
    "actors",
    "directors",
    "mentions_actor",
    "mentions_director",
    "matched_actors",
    "matched_directors",
]
print(matched_reviews[preview_columns].head(30))


                text_short  detected_titles  \
74    I'm studying Catalan   [Pulp Fiction]   
312   Even if you're a hug        [Titanic]   
359   God, I was bored out        [Titanic]   
451   (SPOILERS IN THIS)<b        [Titanic]   
667   Well, on it's credit        [Titanic]   
751   Note: I couldn't for  [The Godfather]   
861   Like so many media e   [Pulp Fiction]   
1020  On the 1998 summer b        [Titanic]   
1272  Some movies are repe   [Pulp Fiction]   
1470  A thematic staple of      [Inception]   
1510  Images are great and        [Titanic]   
1622  This 1997 film-blanc        [Titanic]   
1623  this film has it all  [The Godfather]   
1808  This movie should be   [Pulp Fiction]   
1866  Losing Control is an   [Pulp Fiction]   
1900  Need a lesson in pur        [Titanic]   
1903  In his 1966 film "Bl     [Fight Club]   
2140  At what point exactl   [Pulp Fiction]   
2493  Forgive me for stati  [The Godfather]   
2709  Well, because I'm a    [Pulp Fiction]   
2792  Even if

In [28]:
# Baseline classifiers for movie reviews with named-entity features (actors and directors)

# Prepare features
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorization of review text.
# NOTE(review): the vectorizer is fitted on every matched review before the
# train/test split, so held-out rows influence the vocabulary and IDF weights.
# Consider splitting first and fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_text = vectorizer.fit_transform(matched_reviews["text"])

# Convert binary features to numeric.
# Casting to the nullable "boolean" dtype first lets fillna() run without the
# object-downcasting FutureWarning that a plain .fillna(False) on object
# columns triggers.
import numpy as np
X_meta = (
    matched_reviews[["mentions_actor", "mentions_director"]]
    .astype("boolean")
    .fillna(False)
    .astype(int)
    .to_numpy()
)

# Combine text and metadata features
from scipy.sparse import hstack
X_combined = hstack([X_text, X_meta])

# Target labels
y = matched_reviews["label"]

print("Total enriched reviews:", len(matched_reviews))
print("Reviews with actor mentions:", matched_reviews["mentions_actor"].sum())
print("Reviews with director mentions:", matched_reviews["mentions_director"].sum())

Total enriched reviews: 275
Reviews with actor mentions: 41
Reviews with director mentions: 25


  .fillna(False)


In [22]:
# Hold out 20% of the combined-feature rows for evaluation (fixed seed)
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    X_combined,
    y,
    random_state=42,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = split_parts

In [23]:
# Fit the two baseline classifiers on the combined (text + metadata) features
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# fit() returns the estimator itself, so construction and training can be chained
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
svm_model = LinearSVC().fit(X_train, y_train)

# Show the fitted SVM as the cell output
svm_model

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [24]:
# Report per-class precision/recall/F1 on the held-out rows for each model
from sklearn.metrics import classification_report, confusion_matrix

for clf_label, clf in (("Logistic Regression:", lr_model), ("SVM:", svm_model)):
    print(clf_label)
    print(classification_report(y_test, clf.predict(X_test)))

Logistic Regression:
              precision    recall  f1-score   support

           0       0.88      0.52      0.65        27
           1       0.67      0.93      0.78        28

    accuracy                           0.73        55
   macro avg       0.77      0.72      0.71        55
weighted avg       0.77      0.73      0.71        55

SVM:
              precision    recall  f1-score   support

           0       0.86      0.67      0.75        27
           1       0.74      0.89      0.81        28

    accuracy                           0.78        55
   macro avg       0.80      0.78      0.78        55
weighted avg       0.80      0.78      0.78        55



In [25]:
# Baseline classifiers for movie reviews without named-entity features (actors and directors)
from sklearn.model_selection import train_test_split

# Text-only features from the already-fitted TF-IDF vectorizer, plus the labels
X_text_only = vectorizer.transform(matched_reviews["text"])
y = matched_reviews["label"]

# Same test fraction and seed as the combined-feature split
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text_only,
    y,
    random_state=42,
    test_size=0.2,
)

In [26]:
# Fit the same two baselines on the text-only features.
# fit() returns the estimator itself, so construction and training are chained.
lr_text_model = LogisticRegression(max_iter=1000).fit(X_train_text, y_train)
svm_text_model = LinearSVC().fit(X_train_text, y_train)

# Show the fitted SVM as the cell output
svm_text_model

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [27]:
from sklearn.metrics import classification_report

# Report held-out metrics for the text-only baselines
text_only_models = (
    ("Logistic Regression (Without Named Entities):", lr_text_model),
    ("SVM (Without Named Entities):", svm_text_model),
)
for clf_label, clf in text_only_models:
    print(clf_label)
    print(classification_report(y_test, clf.predict(X_test_text)))

Logistic Regression (Without Named Entities):
              precision    recall  f1-score   support

           0       0.93      0.48      0.63        27
           1       0.66      0.96      0.78        28

    accuracy                           0.73        55
   macro avg       0.79      0.72      0.71        55
weighted avg       0.79      0.73      0.71        55

SVM (Without Named Entities):
              precision    recall  f1-score   support

           0       0.83      0.74      0.78        27
           1       0.77      0.86      0.81        28

    accuracy                           0.80        55
   macro avg       0.80      0.80      0.80        55
weighted avg       0.80      0.80      0.80        55

