In [1]:
import pandas as pd

# Load the labelled train and test splits from the local data directory.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Bare last expression: the notebook renders the preview with its rich
# DataFrame display instead of the plain-text print() form.
train_data.head()

                                                text  label
0  One of the other reviewers has mentioned that ...      1
1  A wonderful little production. <br /><br />The...      1
2  I thought this was a wonderful way to spend ti...      1
3  Basically there's a family where a little boy ...      0
4  Petter Mattei's "Love in the Time of Money" is...      1


In [2]:
import re
import unicodedata
import pandas as pd
import requests
import os
from dotenv import load_dotenv
from time import sleep
import spacy
import difflib


# ------------------------------
# Load API key and spaCy model
# ------------------------------
# Read a local .env file (if present) so API_KEY can be supplied outside the notebook.
load_dotenv()
# NOTE(review): os.getenv returns None when API_KEY is unset and nothing below
# checks for that -- confirm the key is present before starting a full run.
API_KEY = os.getenv("API_KEY")
# Root URL that the TMDb endpoint paths used below are appended to.
BASE_URL = "https://api.themoviedb.org/3"

# spaCy pipeline; enrich_with_metadata uses it to pull PERSON entities from review text.
nlp = spacy.load("en_core_web_trf")

# ------------------------------
# Normalize text function
# ------------------------------
# Typographic quotation marks that NFKC leaves untouched, folded to their ASCII
# forms so that "Schindler’s List" and "Schindler's List" normalize identically.
_QUOTE_FOLD = str.maketrans({
    "\u2018": "'", "\u2019": "'", "\u201a": "'", "\u201b": "'",
    "\u201c": '"', "\u201d": '"', "\u201e": '"', "\u201f": '"',
})


def normalize_text(text):
    """Return ``text`` in a canonical form for title matching.

    Steps, in order:
      1. Unicode NFKC normalization (folds compatibility forms such as
         full-width letters).
      2. Curly single/double quotes are replaced by ASCII ``'`` / ``"``.
         NFKC does not do this, so without it a title stored with a
         typographic apostrophe never matches text typed with a straight one.
      3. Lower-casing.

    Args:
        text: the string to normalize (must be a ``str``).

    Returns:
        The normalized, lower-cased string.
    """
    return unicodedata.normalize("NFKC", text).translate(_QUOTE_FOLD).lower()

# ------------------------------
# Movie titles: 425 popular movies
# ------------------------------

#
# Titles searched for in the review text (425 entries).
# Apostrophes are not uniform across entries: some use the typographic ’
# (e.g. "Schindler’s List"), others the ASCII ' (e.g. "The King's Speech").
# NOTE(review): several entries ("Up", "Room", "Drive", "Soul", ...) are also
# everyday English words; the whole-word, case-insensitive patterns built from
# this list will match those ordinary uses too.
sample_titles = [
   "The Shawshank Redemption", "The Godfather", "The Dark Knight", "Pulp Fiction", "Forrest Gump",
    "Fight Club", "Inception", "The Matrix", "Goodfellas", "The Lord of the Rings: The Return of the King",
    "Interstellar", "Parasite", "The Silence of the Lambs", "Saving Private Ryan", "Schindler’s List",
    "Gladiator", "Titanic", "The Green Mile", "The Departed", "Django Unchained",
    "The Prestige", "Whiplash", "The Lion King", "Toy Story", "Avengers: Endgame",
    "Avengers: Infinity War", "Iron Man", "Black Panther", "Joker", "The Social Network",
    "The Wolf of Wall Street", "La La Land", "Mad Max: Fury Road", "The Revenant", "Get Out",
    "Oppenheimer", "Barbie", "Dune", "The Batman", "Spider-Man: No Way Home",
    "Everything Everywhere All at Once", "The Irishman", "12 Years a Slave", "Moonlight", "Spotlight",
    "Birdman", "Arrival", "Blade Runner 2049", "No Country for Old Men", "The Big Short",
    "The Hateful Eight", "Once Upon a Time in Hollywood", "There Will Be Blood",
    "The Curious Case of Benjamin Button", "The Shape of Water", "The Theory of Everything",
    "Bohemian Rhapsody", "Rocketman", "A Star Is Born", "The Imitation Game", "The King's Speech",
    "Slumdog Millionaire", "Life of Pi", "Gravity", "Cast Away", "The Truman Show",
    "Eternal Sunshine of the Spotless Mind", "Requiem for a Dream", "American Beauty", "The Sixth Sense",
    "Se7en", "The Usual Suspects", "Memento", "Oldboy", "Pan’s Labyrinth",
    "Amélie", "The Pianist", "The Lives of Others", "City of God", "Crouching Tiger, Hidden Dragon",
    "Spirited Away", "Howl’s Moving Castle", "Princess Mononoke", "My Neighbor Totoro", "WALL·E",
    "Up", "Inside Out", "Coco", "Soul", "Minari", "The Banshees of Inisherin",
    "Casino Royale", "South Park: Bigger, Longer & Uncut", "A Fistful of Dollars", "Rosemary's Baby",
    "The Incredibles", "Black Swan", "Deadpool", "The Breakfast Club", "The Untouchables",
    "Shaun of the Dead", "True Romance", "Harry Potter and the Prisoner of Azkaban", "Hot Fuzz",
    "In Bruges", "Boyhood", "Straight Outta Compton", "Drive", "Moneyball", "Brazil", "Chronicle",
    "Still Alice", "Triangle", "The Endless", "The Man from Earth",
    "The Secret in Their Eyes", "The Fall", "The Hunt", "Incendies", "The Intouchables",
    "Prisoners", "Enemy", "Locke", "The Lobster", "Under the Skin",
    "Ex Machina", "Annihilation", "The Florida Project", "Room", "Brooklyn",
    "Carol", "The Farewell", "Portrait of a Lady on Fire", "The Handmaiden", "Shoplifters",
    "A Separation", "Toni Erdmann", "Cold War", "Wild Tales", "The Square",
        "The Godfather Part II", "12 Angry Men", "The Lord of the Rings: The Fellowship of the Ring",
    "The Lord of the Rings: The Two Towers", "Star Wars", "Return of the Jedi",
    "The Phantom Menace", "Revenge of the Sith", "A New Hope", "The Force Awakens",
    "The Last Jedi", "The Rise of Skywalker", "Rogue One", "Solo: A Star Wars Story",
    "Jurassic Park", "The Lost World: Jurassic Park", "Jurassic World", "Jurassic World: Fallen Kingdom",
    "Jurassic World Dominion", "E.T. the Extra-Terrestrial", "Jaws", "Close Encounters of the Third Kind",
    "Raiders of the Lost Ark", "Indiana Jones and the Temple of Doom", "Indiana Jones and the Last Crusade",
    "Indiana Jones and the Kingdom of the Crystal Skull", "Indiana Jones and the Dial of Destiny",
    "Back to the Future", "Back to the Future Part II", "Back to the Future Part III",
    "The Terminator", "Terminator 2: Judgment Day", "Terminator 3: Rise of the Machines",
    "Terminator Salvation", "Terminator Genisys", "Terminator: Dark Fate",
    "Alien", "Aliens", "Alien 3", "Alien Resurrection", "Prometheus", "Alien: Covenant",
    "Predator", "Predator 2", "Predators", "The Predator", "Prey",
    "Rocky", "Rocky II", "Rocky III", "Rocky IV", "Rocky V", "Rocky Balboa",
    "Creed", "Creed II", "Creed III",
    "The Hunger Games", "Catching Fire", "Mockingjay Part 1", "Mockingjay Part 2",
    "The Ballad of Songbirds and Snakes",
    "Twilight", "New Moon", "Eclipse", "Breaking Dawn Part 1", "Breaking Dawn Part 2",
    "The Avengers", "Age of Ultron", "Captain America: The First Avenger",
    "Captain America: The Winter Soldier", "Captain America: Civil War",
    "Thor", "Thor: The Dark World", "Thor: Ragnarok", "Thor: Love and Thunder",
    "Doctor Strange", "Doctor Strange in the Multiverse of Madness",
    "Guardians of the Galaxy", "Guardians of the Galaxy Vol. 2", "Guardians of the Galaxy Vol. 3",
    "Ant-Man", "Ant-Man and the Wasp", "Ant-Man and the Wasp: Quantumania",
    "Captain Marvel", "The Marvels",
    "Shang-Chi and the Legend of the Ten Rings", "Eternals",
    "Man of Steel", "Batman v Superman: Dawn of Justice", "Justice League",
    "Zack Snyder's Justice League", "Aquaman", "Aquaman and the Lost Kingdom",
    "Wonder Woman", "Wonder Woman 1984", "Suicide Squad", "The Suicide Squad",
    "Shazam!", "Shazam! Fury of the Gods", "Black Adam", "The Flash",
    "The Lego Movie", "The Lego Batman Movie", "The Lego Movie 2: The Second Part",
    "Frozen", "Frozen II", "Moana", "Zootopia", "Big Hero 6",
    "Ratatouille", "Finding Nemo", "Finding Dory", "Monsters, Inc.", "Monsters University",
    "Cars", "Cars 2", "Cars 3", "The Good Dinosaur",
    "Despicable Me", "Despicable Me 2", "Despicable Me 3", "Despicable Me 4",
    "Minions", "Minions: The Rise of Gru",
    "Shrek", "Shrek 2", "Shrek the Third", "Shrek Forever After",
    "Kung Fu Panda", "Kung Fu Panda 2", "Kung Fu Panda 3",
    "How to Train Your Dragon", "How to Train Your Dragon 2", "How to Train Your Dragon: The Hidden World",
    "Ice Age", "Ice Age: The Meltdown", "Ice Age: Dawn of the Dinosaurs",
    "Ice Age: Continental Drift", "Ice Age: Collision Course",
    "Madagascar", "Madagascar: Escape 2 Africa", "Madagascar 3: Europe's Most Wanted",
    "Megamind", "The Croods", "The Croods: A New Age",
     # Drama / Romance
    "Echoes of Tomorrow", "The Last Letter Home", "Shadows Between Us", "A Winter’s Promise",
    "The Forgotten Garden", "Beneath the Willow Tree", "Silent Horizons", "The Weight of Rain",
    "Letters Never Sent", "The Glass Bridge", "Songs for the Broken", "The Edge of Innocence",
    "A Thousand Autumns", "The Quiet Harbor", "Dust and Roses", "The Longest Goodbye",
    "The Painted Veil of Time", "The Secret of Bluebells", "Ashes of Summer", "The Road to Evermore",

    # Sci-Fi / Fantasy
    "Starlight Dominion", "The Quantum Voyage", "Beyond the Singularity", "The Last Colony",
    "Fractured Realms", "The Crystal Spire", "Echoes from Andromeda", "The Forgotten Nebula",
    "The Iron Horizon", "The Ninth Planet", "Dreamers of the Void", "The Eternal Circuit",
    "The Shadow Engine", "The Phoenix Protocol", "The Orbital Divide", "The Lost Dimension",
    "The Prism War", "The Time Weaver", "The Gravity Paradox", "The Silent Constellation",

    # Thriller / Mystery
    "The Vanishing Point", "Cold Ashes", "The Silent Witness", "The Crimson Key",
    "The Hollow Man’s Diary", "The Last Cipher", "The Forgotten Room", "The Blackened Mirror",
    "The Widow’s Secret", "The Seventh Floor", "The Glass Labyrinth", "The Shadow Corridor",
    "The Final Manuscript", "The Whispering Walls", "The Hidden Agenda", "The Midnight Caller",
    "The Pale Mask", "The Raven’s Pact", "The Frozen Hour", "The Collector’s Game",

    # Comedy / Lighthearted
    "Pancakes at Midnight", "The Awkward Reunion", "Dancing with Disaster", "The Great Pretenders",
    "Roommates from Mars", "The Wedding That Never Was", "Coffee Shop Chronicles", "The Misfit Club",
    "Karaoke Apocalypse", "The Wrong Best Man", "Pizza Planet Blues", "The Couch Potato Revolution",
    "The Almost Honeymoon", "The Babysitter’s Guide to Chaos", "The Office Prank Wars",
    "The Great Escape Room", "The Substitute Teacher’s Revenge", "The Dating Algorithm",
    "The Elevator Incident", "The Last Slice",

    # Animation / Family
    "The Clockwork Fox", "The Secret of Fireflies", "Journey to Cloudberry Hill", "The Little Lantern",
    "The Whispering Forest", "The Song of Crystals", "The Rainbow Caravan", "The Adventures of Pebble and Twig",
    "The Hidden Valley", "The Star Painter", "The Ocean’s Lullaby", "The Curious Compass",
    "The Moonlight Circus", "The Golden Acorn", "The Dreammaker’s Apprentice", "The Skybound Garden",
    "The Tale of Frost and Flame", "The Enchanted Carousel", "The River of Wishes", "The Cloud Shepherd",

    # Action / Adventure
    "The Crimson Horizon", "The Last Outpost", "The Iron Fortress", "The Savage Frontier",
    "The Black Sun Rising", "The Siege of Avalon", "The Dragon’s Oath", "The Shadow Legion",
    "The Desert Storm", "The Forgotten Crusade", "The Pirate’s Legacy", "The Hidden Blade",
    "The Stormbreaker", "The Lost Expedition", "The Bloodstone Pact", "The Eternal Warrior",
    "The Silent Armada", "The Firebrand", "The Savage Tide", "The Broken Banner",

    # Horror
    "The House on Hollow Hill", "The Forgotten Crypt", "The Blood Moon Ritual", "The Silent Woods",
    "The Dollmaker’s Curse", "The Last Séance", "The Blackened Church", "The Hollow Eyes",
    "The Crimson Fog", "The Whispering Dolls", "The Frozen Graveyard", "The Shadowed Lake",
    "The Forgotten Orphanage", "The Screaming Portrait", "The Midnight Ritual", "The Pale Children",
    "The Haunted Carnival", "The Widow’s Curse", "The Silent Choir", "The Darkened Cellar"
]


# Normalized copy of every title; these strings are also the keys of
# title_patterns and therefore the values reported as "detected".
sample_titles_norm = list(map(normalize_text, sample_titles))

# One pre-compiled pattern per normalized title. The lookarounds reject a match
# that touches a word character on either side, so "up" does not fire inside "cup".
title_patterns = dict(
    (title, re.compile(r"(?<!\w)" + re.escape(title) + r"(?!\w)", re.IGNORECASE))
    for title in sample_titles_norm
)

# ------------------------------
# Detect titles function
# ------------------------------
def detect_titles_regex(text, patterns):
    """Return the titles whose compiled pattern occurs in ``text``.

    Args:
        text: raw review text; it is passed through ``normalize_text`` before
            any pattern is applied.
        patterns: mapping of title -> compiled regex.

    Returns:
        A list of the mapping's keys whose pattern matched, in the mapping's
        iteration order. Empty when nothing matched.
    """
    haystack = normalize_text(text)
    return [name for name, rx in patterns.items() if rx.search(haystack)]

# ------------------------------
# TMDb metadata caching
# ------------------------------
# title (stripped, lower-cased) -> {"actors": [...], "directors": [...]},
# or None when the search returned no results for that title.
metadata_cache = {}

# Seconds to wait on a single HTTP request, so one stalled connection cannot
# hang the whole row-by-row enrichment pass.
_TMDB_TIMEOUT = 10


def _tmdb_get(path, **params):
    """GET ``BASE_URL + path`` and return the decoded JSON body, or None on any failure."""
    try:
        response = requests.get(
            f"{BASE_URL}{path}",
            # Passing the values through `params` lets requests URL-encode them;
            # a raw "&" in a title would otherwise cut the query string short.
            params={"api_key": API_KEY, **params},
            timeout=_TMDB_TIMEOUT,
        )
        # Treat HTTP errors (bad key, rate limit, ...) as failures instead of
        # mistaking their JSON error body for an empty result.
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a non-JSON body on requests versions whose decode
        # error is not a RequestException subclass.
        return None


def get_movie_metadata(title):
    """Look up cast and director names for ``title``, with caching.

    The first search result for the title is taken as the movie; its credits
    endpoint supplies the names.

    Args:
        title: movie title; surrounding whitespace and letter case are ignored
            for cache lookups.

    Returns:
        ``{"actors": [...], "directors": [...]}`` with at most the first seven
        cast names and every crew member whose job is "Director", or ``None``
        when the search has no results or a request fails.

    Notes:
        A "no results" answer is cached, so a title that is unknown to the API
        costs one request in total rather than one per review mentioning it.
        Request failures are not cached, so a later call can retry.
    """
    title_key = title.strip().lower()
    if title_key in metadata_cache:
        return metadata_cache[title_key]

    query_title = title.strip().title()  # proper case for API
    search_response = _tmdb_get("/search/movie", query=query_title)
    if search_response is None:
        return None

    results = search_response.get("results")
    if not results:
        metadata_cache[title_key] = None
        sleep(0.25)  # rate limit
        return None

    movie_id = results[0]["id"]
    credits_response = _tmdb_get(f"/movie/{movie_id}/credits")
    if credits_response is None:
        return None

    actors = [member["name"] for member in credits_response.get("cast", [])[:7]]
    directors = [
        member["name"]
        for member in credits_response.get("crew", [])
        # .get: a crew entry without a "job" field is skipped, not a KeyError.
        if member.get("job") == "Director"
    ]

    result = {"actors": actors, "directors": directors}
    metadata_cache[title_key] = result
    sleep(0.25)  # rate limit
    return result

# ------------------------------
# Enrich with metadata
# ------------------------------
def normalize_name(name):
    """Return ``name`` NFKC-normalized, lower-cased and stripped of outer whitespace."""
    folded = unicodedata.normalize("NFKC", name)
    folded = folded.lower()
    return folded.strip()

def fuzzy_match(person, metadata_list, threshold=0.8):
    """Tell whether ``person`` is close enough to any name in ``metadata_list``.

    Both sides are passed through ``normalize_name`` and compared with
    ``difflib.SequenceMatcher``; a similarity ratio of at least ``threshold``
    counts as a match.

    Returns:
        True on the first name that reaches the threshold, False otherwise
        (including when ``metadata_list`` is empty).
    """
    needle = normalize_name(person)
    return any(
        difflib.SequenceMatcher(None, needle, normalize_name(candidate)).ratio() >= threshold
        for candidate in metadata_list
    )

def enrich_with_metadata(row):
    """Attach cast, directors and matching PERSON entities to one review row.

    Args:
        row: a mapping-like row (e.g. one row from ``DataFrame.apply(axis=1)``)
            with a ``"text"`` entry and, optionally, ``"detected_titles"``.

    Returns:
        The same ``row`` with three entries set:
          * ``"actors"`` / ``"directors"``: names collected via
            ``get_movie_metadata`` for every detected title, de-duplicated in
            first-seen order so repeated runs give the same list.
          * ``"ner_entities"``: PERSON entities found by the spaCy pipeline in
            ``row["text"]`` that ``fuzzy_match`` links to one of those names.
        All three are empty lists when no title was detected.
    """
    titles = row.get("detected_titles", [])
    if not titles:
        row["actors"] = []
        row["directors"] = []
        row["ner_entities"] = []
        return row

    all_actors, all_directors = [], []
    for title in titles:
        metadata = get_movie_metadata(title)
        if metadata:
            all_actors.extend(metadata.get("actors", []))
            all_directors.extend(metadata.get("directors", []))

    # dict.fromkeys drops duplicates while keeping insertion order; a set would
    # hand back the names in an order that varies between interpreter runs.
    row["actors"] = list(dict.fromkeys(all_actors))
    row["directors"] = list(dict.fromkeys(all_directors))

    known_people = row["actors"] + row["directors"]
    if not known_people:
        # Nothing to link against, so the NER pass could only yield an empty list.
        row["ner_entities"] = []
        return row

    # Run spaCy NER and keep the PERSON mentions that match a known name.
    doc = nlp(row["text"])
    row["ner_entities"] = [
        ent.text
        for ent in doc.ents
        if ent.label_ == "PERSON" and fuzzy_match(ent.text, known_people)
    ]
    return row

# ------------------------------
# Load dataset
# ------------------------------
# Wrap the frame loaded in the first cell (train_data is already a DataFrame there).
train_df = pd.DataFrame(train_data)

# Detect titles: every row gets the list of normalized titles found in its text
train_df["detected_titles"] = train_df["text"].apply(lambda x: detect_titles_regex(x, title_patterns))

# Filter rows with at least one detected title; .copy() gives the column
# assignments below their own frame to write to
matched_reviews = train_df[train_df["detected_titles"].map(len) > 0].copy()

# Initialize columns (enrich_with_metadata assigns all three on every row)
matched_reviews["actors"] = None
matched_reviews["directors"] = None
matched_reviews["ner_entities"] = None

# Enrich with metadata and NER, one row at a time
matched_reviews = matched_reviews.apply(enrich_with_metadata, axis=1)

# ------------------------------
# Save final DataFrame - for development purposes
# ------------------------------
# NOTE(review): the list-valued columns will presumably come back as plain
# strings when this CSV is re-read -- confirm how downstream code parses them.
matched_reviews.to_csv("matched_reviews_with_metadata_ner.csv", index=False)
print("Pipeline complete. Saved matched reviews with NER metadata.")

# Took about 38 minutes to run the full pipeline on the training set.


Pipeline complete. Saved matched reviews with NER metadata.


In [None]:
# len() of the frame counts rows, i.e. reviews with at least one detected
# title -- not distinct titles.
print("Number of matched reviews:", len(matched_reviews))

# filtered = matched_reviews[matched_reviews["detected_titles"] != "up"][10:]
# filtered.head(10)

Number of titles: 10990
