In [28]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sentence_transformers import SentenceTransformer
import umap
from sklearn.cluster import KMeans
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
import re
from sklearn.metrics.pairwise import cosine_similarity
from utils import jaccard_similarity_score
import hdbscan

In [9]:
# Load the plot dataset and keep a fixed-size random subset of recent movies.
df = pd.read_csv('wiki_movie_plots_deduped.csv')
# Optional filter, currently disabled:
# df = df[df['Origin/Ethnicity'] == 'Bollywood']
# Seed the draw so a fresh "Restart & Run All" selects the same 250 rows;
# without it every run samples different movies and the saved outputs go stale.
df = df[df['Release Year'] >= 2010].sample(250, random_state=0)
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
17894,2013,"Railway Man, TheThe Railway Man",Australian,Jonathan Teplitzky,"Nicole Kidman, Colin Firth Jeremy Irvine, Stel...",drama,https://en.wikipedia.org/wiki/The_Railway_Man_...,"During the Second World War, Eric Lomax is a B..."
21676,2017,Loving Vincent,British,Directors: Dorota Kobiela,"Directors: Dorota Kobiela, Hugh Welchman\r\nCa...",unknown,https://en.wikipedia.org/wiki/Loving_Vincent,"One year after Vincent van Gogh's suicide, Pos..."
28235,2011,Scene No: 001,Malayalam,Snehajith,"Saiju Kurup, Niyas, Roopasree, Priyanandanan",unknown,https://en.wikipedia.org/wiki/Scene_No:_001,Scene No: 001 tells the life of Chandramohan (...
22358,2014,Pompeii,Canadian,Paul W. S. Anderson,"Kit Harington, Emily Browning, Carrie-Anne Mos...",historical-disaster romance,https://en.wikipedia.org/wiki/Pompeii_(2014_film),The film opens with scenes of plaster casts of...
16517,2013,Frozen,American,"Chris Buck, Jennifer Lee","Kristen Bell, Idina Menzel, Jonathan Groff","animated, fantasy, musical",https://en.wikipedia.org/wiki/Frozen_(2013_film),Princess Elsa of Arendelle possesses cryokinet...


In [10]:
import spacy
import pandas as pd
from tqdm import tqdm

# Load the spaCy pipeline a single time; `mask_named_entities` reads this
# module-level `nlp` object on every call instead of reloading the model.
SPACY_MODEL_NAME = "en_core_web_md"
nlp = spacy.load(SPACY_MODEL_NAME)

def mask_named_entities(text, labels=("PERSON",), placeholder="[PERSON_NAME]"):
    """
    Replace spaCy entity spans in ``text`` with a placeholder string.

    The text is run through the module-level ``nlp`` pipeline and every entity
    whose ``label_`` is in ``labels`` is swapped for ``placeholder``. With the
    default arguments only PERSON entities are masked, each becoming
    ``"[PERSON_NAME]"``.

    Args:
        text: The string to mask. Anything that is not a ``str`` (for example
            NaN from a pandas column) is returned unchanged.
        labels: Entity label or iterable of entity labels to mask.
        placeholder: Replacement text inserted for each masked entity.

    Returns:
        The masked string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text  # NaN / non-string inputs pass straight through

    # Accept a single label given as a bare string as well as an iterable.
    wanted_labels = {labels} if isinstance(labels, str) else set(labels)

    doc = nlp(text)

    # Walk the entities left to right, copying the untouched text between them.
    # Building the result from slices avoids index shifting entirely.
    pieces = []
    cursor = 0
    for ent in sorted(doc.ents, key=lambda span: span.start_char):
        if ent.label_ not in wanted_labels:
            continue
        if ent.start_char < cursor:
            # Defensive: skip a span that overlaps one already replaced.
            continue
        pieces.append(text[cursor:ent.start_char])
        pieces.append(placeholder)
        cursor = ent.end_char
    pieces.append(text[cursor:])

    return "".join(pieces)


# --- Mask person names in every plot, then rebuild the combined text ---
#
# NER is run on the 'Plot' column only (not on a pre-joined string), so the
# Origin / Title / Genre text that gets prepended afterwards is left unmasked.

print("Starting Named Entity Masking...")

# Register `progress_apply` on pandas objects; the NER pass is slow enough
# that a progress bar is worth having.
tqdm.pandas()

# Drop parentheticals that contain only the letters a-z/A-Z and whitespace
# (possibly nothing at all) before handing each plot to the masker.
plots_without_parentheticals = df['Plot'].str.replace(r'\([a-zA-Z\s]*\)', '', regex=True)
df['Plot_Masked'] = plots_without_parentheticals.progress_apply(mask_named_entities)

print("Masking complete.")

# Space-joined metadata followed by the masked plot.
metadata_prefix = df['Origin/Ethnicity'] + ' ' + df['Title'] + ' ' + df['Genre']
df['combined_masked'] = metadata_prefix + ' ' + df['Plot_Masked']

Starting Named Entity Masking...


100%|██████████| 250/250 [01:14<00:00,  3.36it/s]

Masking complete.





In [11]:
# Side-by-side preview of the original and masked plot text.
df.loc[:, ['Plot', 'Plot_Masked']].head()

Unnamed: 0,Plot,Plot_Masked
17894,"During the Second World War, Eric Lomax is a B...","During the Second World War, [PERSON_NAME] is ..."
21676,"One year after Vincent van Gogh's suicide, Pos...","One year after [PERSON_NAME] suicide, Postman ..."
28235,Scene No: 001 tells the life of Chandramohan (...,Scene No: 001 tells the life of Chandramohan ...
22358,The film opens with scenes of plaster casts of...,The film opens with scenes of plaster casts of...
16517,Princess Elsa of Arendelle possesses cryokinet...,Princess Elsa of Arendelle possesses cryokinet...


In [42]:
# HDBSCAN settings, named so they are easy to tune.
MIN_CLUSTER_SIZE = 3
MIN_SAMPLES = 1

# Unmasked counterpart of `combined_masked`: metadata followed by the raw plot.
raw_metadata_prefix = df['Origin/Ethnicity'] + ' ' + df['Title'] + ' ' + df['Genre']
df['combined'] = raw_metadata_prefix + ' ' + df['Plot']

# Documents fed to the topic model: the masked plots only.
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# other cells may still refer to it.
input = df['Plot_Masked'].to_list()

# Bag-of-words settings used for the topic representations.
vectorizer_model = CountVectorizer(
    stop_words="english",
    max_df=0.5,
    min_df=0.2,
)

# Sentence embedding model.
emb_minilm = SentenceTransformer("all-MiniLM-L6-v2")

# Dimensionality reduction, seeded via random_state.
umap_model = umap.UMAP(n_neighbors=5, min_dist=0.01, random_state=0)

# Clustering. Alternative that was tried: KMeans(n_clusters=12).
clustering_model = hdbscan.HDBSCAN(
    min_cluster_size=MIN_CLUSTER_SIZE,
    min_samples=MIN_SAMPLES,
    prediction_data=True,
)

# Class-based TF-IDF weighting and the keyword representation model.
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)
representation_model = KeyBERTInspired()

# Wire the components together and fit on the masked plots.
bertopic_components = {
    "embedding_model": emb_minilm,
    "vectorizer_model": vectorizer_model,
    "umap_model": umap_model,
    "hdbscan_model": clustering_model,
    "ctfidf_model": ctfidf_model,
    "representation_model": representation_model,
}
topic_model = BERTopic(
    language="english",
    calculate_probabilities=True,
    verbose=True,
    **bertopic_components,
)
topics, probs = topic_model.fit_transform(input)

2025-11-30 00:29:13,081 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 8/8 [00:06<00:00,  1.27it/s]
2025-11-30 00:29:19,482 - BERTopic - Embedding - Completed ✓
2025-11-30 00:29:19,486 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-30 00:29:20,157 - BERTopic - Dimensionality - Completed ✓
2025-11-30 00:29:20,159 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-30 00:29:20,243 - BERTopic - Cluster - Completed ✓
2025-11-30 00:29:20,273 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-30 00:29:22,339 - BERTopic - Representation - Completed ✓


In [43]:
# Display the per-topic summary table (Topic, Count, Name, Representation,
# Representative_Docs) produced by the topic model.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,20,-1_scene_victims_director_revenge,"[scene, victims, director, revenge, sister, at...","[[PERSON_NAME] , a renowned actress and film h..."
1,0,18,0_movie_narrates_affair_relationship,"[movie, narrates, affair, relationship, book, ...",[[PERSON_NAME] around the story of three men. ...
2,1,12,1_situation_kidnapped_incident_person,"[situation, kidnapped, incident, person, relat...",[The story opens at a railway station where th...
3,2,11,2_murder_victims_kills_scene,"[murder, victims, kills, scene, murdered, act,...","[In February, at a prestigious Catholic boardi..."
4,3,10,3_criminal_situation_act_narrates,"[criminal, situation, act, narrates, marriage,...","[In August 1947, after India sought independen..."
5,4,10,4_movie_person_grandfather_escapes,"[movie, person, grandfather, escapes, revenge,...",[The film starts in the 1980s. A young [PERSON...
6,5,7,5_murder_gang_crime_kidnapped,"[murder, gang, crime, kidnapped, flees, incide...","[In July 1985, Dallas electrician and rodeo co..."
7,6,7,6_fate_movie_scene_classmate,"[fate, movie, scene, classmate, role, realizes...",[The film revolves around village girl [PERSON...
8,7,7,7_suspects_flees_gang_criminal,"[suspects, flees, gang, criminal, confronts, t...","[On March 21, 2023, the media credits the annu..."
9,8,7,8_murder_classmate_scene_incident,"[murder, classmate, scene, incident, suicide, ...","[At the [PERSON_NAME] homestead, a birthday pa..."


In [60]:
# Keep the topic summary table and derive a {topic id -> representation} lookup from it.
topic_info = topic_model.get_topic_info()
topic_rep_map = dict(zip(topic_info["Topic"], topic_info["Representation"]))

# Spot-check the summary row for topic 0.
topic_info.loc[topic_info['Topic'] == 0]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
1,0,18,0_movie_narrates_affair_relationship,"[movie, narrates, affair, relationship, book, ...",[[PERSON_NAME] around the story of three men. ...


In [61]:
# Attach the fitted topic assignment to every movie row.
df['Topic'] = topics

# With calculate_probabilities=True BERTopic documents `probs` as a 2-D
# (documents x topics) array. Assigning that matrix directly to one column does
# not give each row its own probability vector, so split it into a list of
# per-document 1-D arrays first. The similarity function stacks these vectors
# again with np.vstack. If the model returned no probabilities, the column is
# filled with None as before.
df['Topic_Probs'] = list(probs) if probs is not None else None

# Human-readable keyword list for each row's topic.
df['Topic_Rep'] = df['Topic'].map(topic_rep_map)
df[~df['Topic_Probs'].isna()].head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Plot_Masked,combined_masked,combined,Topic,Topic_Probs,Topic_Rep
17894,2013,"Railway Man, TheThe Railway Man",Australian,Jonathan Teplitzky,"Nicole Kidman, Colin Firth Jeremy Irvine, Stel...",drama,https://en.wikipedia.org/wiki/The_Railway_Man_...,"During the Second World War, Eric Lomax is a B...","During the Second World War, [PERSON_NAME] is ...","Australian Railway Man, TheThe Railway Man dr...","Australian Railway Man, TheThe Railway Man dr...",-1,0.007860725,"[scene, victims, director, revenge, sister, at..."
21676,2017,Loving Vincent,British,Directors: Dorota Kobiela,"Directors: Dorota Kobiela, Hugh Welchman\r\nCa...",unknown,https://en.wikipedia.org/wiki/Loving_Vincent,"One year after Vincent van Gogh's suicide, Pos...","One year after [PERSON_NAME] suicide, Postman ...",British Loving Vincent unknown One year after ...,British Loving Vincent unknown One year after ...,8,0.005894854,"[murder, classmate, scene, incident, suicide, ..."
28235,2011,Scene No: 001,Malayalam,Snehajith,"Saiju Kurup, Niyas, Roopasree, Priyanandanan",unknown,https://en.wikipedia.org/wiki/Scene_No:_001,Scene No: 001 tells the life of Chandramohan (...,Scene No: 001 tells the life of Chandramohan ...,Malayalam Scene No: 001 unknown Scene No: 001 ...,Malayalam Scene No: 001 unknown Scene No: 001 ...,6,0.0140954,"[fate, movie, scene, classmate, role, realizes..."
22358,2014,Pompeii,Canadian,Paul W. S. Anderson,"Kit Harington, Emily Browning, Carrie-Anne Mos...",historical-disaster romance,https://en.wikipedia.org/wiki/Pompeii_(2014_film),The film opens with scenes of plaster casts of...,The film opens with scenes of plaster casts of...,Canadian Pompeii historical-disaster romance T...,Canadian Pompeii historical-disaster romance T...,9,4.573374e-308,"[affair, fate, victims, law, convinces, destro..."
16517,2013,Frozen,American,"Chris Buck, Jennifer Lee","Kristen Bell, Idina Menzel, Jonathan Groff","animated, fantasy, musical",https://en.wikipedia.org/wiki/Frozen_(2013_film),Princess Elsa of Arendelle possesses cryokinet...,Princess Elsa of Arendelle possesses cryokinet...,"American Frozen animated, fantasy, musical Pri...","American Frozen animated, fantasy, musical Pri...",19,1.396579e-307,"[kidnapped, convinces, confronts, act, threate..."


In [62]:
import numpy as np

def calculate_hierarchical_similarity(
    df: pd.DataFrame,
    movie_id_or_title: "int | str",
    top_n: int = 10
) -> pd.DataFrame:
    """
    Rank the movies in ``df`` by an additive, weighted similarity to one query movie.

    Four component scores are summed into ``similarity_score``:

    * ``score_origin`` - 5 when 'Origin/Ethnicity' equals the query's, else 0.
    * ``score_genre``  - ``jaccard_similarity_score(genre, query_genre) * 5``.
    * ``score_date``   - up to 0.5, decaying linearly to 0 once the release
      years are 5 or more apart.
    * ``score_topic``  - cosine similarity of the 'Topic_Probs' vectors, times 5.

    The weights are additive, not a strict hierarchy: origin, genre and topic
    carry the same maximum weight and release year acts as a small tie-breaker.

    Args:
        df: Movie table with the columns 'Title', 'Origin/Ethnicity', 'Genre',
            'Release Year', 'Topic' and 'Topic_Probs'.
        movie_id_or_title: A title (matched case-insensitively) or an integer
            identifier. An integer is looked up in the 'id' column when the
            frame has one, otherwise in the frame's index labels. When several
            rows match, the first one is used as the query.
        top_n: Number of most similar movies to return.

    Returns:
        The ``top_n`` highest-scoring rows, sorted by ``similarity_score``
        descending, with the identifying columns and all component scores.

    Raises:
        ValueError: If ``movie_id_or_title`` is neither an int nor a str.
        IndexError: If no row matches ``movie_id_or_title``.
    """

    # 1. Identify the reference movie (query movie) by its row position.
    if isinstance(movie_id_or_title, (int, np.integer)):
        if 'id' in df.columns:
            match_mask = (df['id'] == movie_id_or_title).to_numpy()
        else:
            # No 'id' column: treat the integer as an index label.
            match_mask = np.asarray(df.index == movie_id_or_title)
    elif isinstance(movie_id_or_title, str):
        match_mask = (df['Title'].str.lower() == movie_id_or_title.lower()).to_numpy()
    else:
        raise ValueError("Input must be a movie index (int) or title (str).")

    match_positions = np.flatnonzero(match_mask)
    if match_positions.size == 0:
        raise IndexError(f"No movie found for {movie_id_or_title!r}.")
    query_pos = match_positions[0]
    query_movie = df.iloc[query_pos]

    # Exclude only the query row itself. Filtering on the title would also
    # drop other movies that happen to share the same title.
    candidates = df.iloc[np.arange(len(df)) != query_pos].copy()

    # Maximum contribution of each component to the summed score.
    WEIGHT_ORIGIN = 5
    WEIGHT_GENRE = 5
    WEIGHT_DATE = 0.5
    WEIGHT_TOPIC = 5

    print(f"--- Calculating Similarity for: {query_movie['Title']} ({query_movie['Release Year']}) ---")

    # 2. Calculate Individual Scores

    # Score 1: Origin/Ethnicity - all-or-nothing on an exact match.
    candidates['score_origin'] = np.where(
        candidates['Origin/Ethnicity'] == query_movie['Origin/Ethnicity'],
        WEIGHT_ORIGIN,
        0
    )

    # Score 2: Genre - graded by the project's Jaccard helper, so partial
    # overlap earns partial credit.
    # NOTE(review): assumes jaccard_similarity_score returns a value in [0, 1]
    # for two genre strings - confirm against utils.
    candidates['score_genre'] = candidates['Genre'].apply(
        lambda x: jaccard_similarity_score(x, query_movie['Genre'])
    ) * WEIGHT_GENRE

    # Score 3: Release year - linear decay, 0 once the gap reaches max_diff_years.
    max_diff_years = 5
    date_diff = np.abs(candidates['Release Year'] - query_movie['Release Year'])
    decay_factor = np.maximum(0, 1 - (date_diff / max_diff_years))
    candidates['score_date'] = decay_factor * WEIGHT_DATE

    # Score 4: Plot topic - cosine similarity between topic-probability vectors.
    # np.asarray lets a list or a scalar be reshaped as well as an ndarray.
    query_prob = np.asarray(query_movie['Topic_Probs'], dtype=float).reshape(1, -1)
    # One row per candidate, one column per topic.
    candidate_probs = np.vstack(list(candidates['Topic_Probs'].values))

    topic_similarity_vector = cosine_similarity(query_prob, candidate_probs)[0]
    candidates['score_topic'] = topic_similarity_vector * WEIGHT_TOPIC

    # 3. Sum the component scores.
    candidates['similarity_score'] = (
        candidates['score_origin'] +
        candidates['score_genre'] +
        candidates['score_date'] +
        candidates['score_topic']
    )

    # 4. Sort and return top N
    candidates = candidates.sort_values(
        by='similarity_score',
        ascending=False
    ).head(top_n)

    # Keep the identifying columns plus every component score for inspection.
    result = candidates[[
        'Title', 'Origin/Ethnicity', 'Genre', 'Release Year', 'Topic',
        'similarity_score', 'score_origin', 'score_genre', 'score_date', 'score_topic'
    ]]

    return result

In [63]:
# Example query: the ten movies most similar to 'Frozen'.
top_10_by_title = calculate_hierarchical_similarity(df, 'Frozen')

# Columns of the query movie shown for context above the ranking.
query_columns = ['Title', 'Topic', 'Release Year', 'Genre', 'Origin/Ethnicity','Topic_Rep']
query_rows = df.loc[df['Title'] == 'Frozen', query_columns]

print("\n--- TOP 10 SIMILAR MOVIES (Query: Frozen) ---")
print(query_rows)
print(top_10_by_title.to_markdown(index=False))

print("-" * 60)

--- Calculating Similarity for: Frozen (2013) ---

--- TOP 10 SIMILAR MOVIES (Query: Frozen) ---
        Title  Topic  Release Year                       Genre  \
16517  Frozen     19          2013  animated, fantasy, musical   

      Origin/Ethnicity                                          Topic_Rep  
16517         American  [kidnapped, convinces, confronts, act, threate...  
| Title                         | Origin/Ethnicity   | Genre                              |   Release Year |   Topic |   similarity_score |   score_origin |   score_genre |   score_date |   score_topic |
|:------------------------------|:-------------------|:-----------------------------------|---------------:|--------:|-------------------:|---------------:|--------------:|-------------:|--------------:|
| Beasts of the Southern Wild   | American           | drama, fantasy                     |           2012 |      19 |            6.65    |              5 |      1.25     |          0.4 |  0            |
| Spar