In [52]:
import pandas as pd

df=pd.read_csv("/content/movies_with_images.csv")
df.head()

Unnamed: 0,id,title,overview,genres,cast,director,keywords,rating,vote_count,release_date,imdb_rating,imdb_votes,poster_url,backdrop_url
0,533533,TRON: Ares,A highly sophisticated Program called Ares is ...,"['Science Fiction', 'Adventure', 'Action']","['Jared Leto', 'Greta Lee', 'Evan Peters', 'Gi...",Joachim Rønning,"['artificial intelligence (a.i.)', 'computer p...",6.548,656,2025-10-08,,,https://image.tmdb.org/t/p/w500/chpWmskl3aKm1a...,https://image.tmdb.org/t/p/w500/min9ZUDZbiguTi...
1,1180831,Troll 2,When a dangerous new troll unleashes devastati...,"['Action', 'Fantasy', 'Thriller']","['Ine Marie Wilmann', 'Kim S. Falck-Jørgensen'...",Roar Uthaug,"['mythical creature', 'sequel', 'troll', 'kaij...",6.8,218,2025-11-30,,,https://image.tmdb.org/t/p/w500/p6xAExLNFbHcLf...,https://image.tmdb.org/t/p/w500/lZYMXx74pWmbj5...
2,1083637,Kantara - A Legend: Chapter 1,"During the Kadamba reign, King Vijayendra, the...","['Action', 'Drama', 'Fantasy']","['Rishab Shetty', 'Rukmini Vasanth', 'Jayaram'...",Rishab Shetty,[],6.957,46,2025-10-01,,,https://image.tmdb.org/t/p/w500/zBvw25afDn93em...,https://image.tmdb.org/t/p/w500/w57nxiBIODAYHL...
3,23527,First Squad: The Moment of Truth,Set during the opening days of World War II on...,"['Fantasy', 'Animation', 'Action', 'Science Fi...","['Sergei Aisman', 'Michael Beskorovainy', 'Ele...",Yoshiharu Ashino,"['supernatural', 'super soldier', 'russian army']",6.2,129,2009-05-13,,,https://image.tmdb.org/t/p/w500/hBj1aTnGf4564K...,https://image.tmdb.org/t/p/w500/3k1PKmzNEosWFa...
4,1448560,Wildcat,An ex-black ops team reunite to pull off a des...,"['Action', 'Thriller', 'Crime']","['Kate Beckinsale', 'Lewis Tan', 'Alice Krige'...",James Nunn,"['playful', 'embarrassed']",5.838,34,2025-11-19,,,https://image.tmdb.org/t/p/w500/h893ImjM6Fsv5D...,https://image.tmdb.org/t/p/w500/pAyImoslSnpMgj...


In [53]:
df = df.dropna(subset=["overview", "genres", "cast", "director", "keywords"])
print("After cleaning:", df.shape)


After cleaning: (1402, 14)


In [54]:
import unicodedata

def normalize_text(t):
    if not isinstance(t, str):
        return ""
    t = unicodedata.normalize('NFKD', t).encode('ascii','ignore').decode('utf-8')
    return t.lower().strip()

def join_list(x):
    return " ".join(eval(x)) if isinstance(x, str) and x.startswith("[") else x


In [55]:
df["genres"] = df["genres"].apply(join_list)
df["cast"] = df["cast"].apply(join_list)
df["keywords"] = df["keywords"].apply(join_list)

df["title_norm"] = df["title"].apply(normalize_text)


In [56]:
df['genres']

Unnamed: 0,genres
0,Science Fiction Adventure Action
1,Action Fantasy Thriller
2,Action Drama Fantasy
3,Fantasy Animation Action Science Fiction
4,Action Thriller Crime
...,...
1397,Horror
1398,Animation Drama
1399,Action Animation Science Fiction
1400,Action


In [57]:
df["combined_text"] = (
    df["overview"].fillna("") + " " +
    df["genres"].fillna("") + " " +
    df["keywords"].fillna("") + " " +
    df["cast"].fillna("") + " " +
    df["director"].fillna("")
)


In [58]:
df["combined_text"]

Unnamed: 0,combined_text
0,A highly sophisticated Program called Ares is ...
1,When a dangerous new troll unleashes devastati...
2,"During the Kadamba reign, King Vijayendra, the..."
3,Set during the opening days of World War II on...
4,An ex-black ops team reunite to pull off a des...
...,...
1397,Masked thugs torture an innocent woman in incr...
1398,"In the hyper-masculine criminal underworld, a ..."
1399,"One peaceful day on Earth, two remnants of Fri..."
1400,"At the SCRS (Seiko Cup Rally Series), talented..."


In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = tfidf.fit_transform(df["combined_text"])

tfidf_matrix.shape


(1402, 5000)

In [60]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim.shape


(1402, 1402)

In [61]:
from difflib import SequenceMatcher

def seq_ratio(a, b):
    return SequenceMatcher(None, a, b).ratio()

def jaccard(a_tokens, b_tokens):
    A, B = set(a_tokens), set(b_tokens)
    if not A or not B:
        return 0
    return len(A & B) / len(A | B)

df["title_tokens"] = df["title_norm"].apply(lambda t: t.split())

def find_movie(query):
    q = normalize_text(query)
    q_tokens = q.split()

    # -----------------------------------------
    # SPECIAL CASE: short titles (length <= 4)
    # -----------------------------------------
    if len(q) <= 4:
        # 1. exact normalized match
        exact = df[df["title_norm"] == q]
        if len(exact) > 0:
            return exact["title"].iloc[0]

        # 2. substring match (VERY STRONG)
        sub = df[df["title_norm"].str.contains(q)]
        if len(sub) > 0:
            # pick most popular movie
            return sub.sort_values("vote_count", ascending=False)["title"].iloc[0]

        # 3. prefix match
        pre = df[df["title_norm"].str.startswith(q)]
        if len(pre) > 0:
            return pre.sort_values("vote_count", ascending=False)["title"].iloc[0]

        # 4. last fallback: fuzzy on short titles
        best_title = None
        best_score = 0

        for _, row in df.iterrows():
            t = row["title_norm"]
            score = seq_ratio(q, t)
            if score > best_score:
                best_score = score
                best_title = row["title"]

        return best_title

    # -----------------------------------------
    # MAIN LOGIC for normal-sized titles
    # -----------------------------------------

    # 1. exact match
    exact = df[df["title_norm"] == q]
    if len(exact) > 0:
        return exact["title"].iloc[0]

    # 2. substring match
    sub = df[df["title_norm"].str.contains(q)]
    if len(sub) > 0:
        return sub.sort_values("vote_count", ascending=False)["title"].iloc[0]

    # 3. fuzzy scoring
    best_title = None
    best_score = 0

    for _, row in df.iterrows():
        t = row["title_norm"]
        score = 0.7 * jaccard(q_tokens, row["title_tokens"]) + 0.3 * seq_ratio(q, t)

        if score > best_score:
            best_score = score
            best_title = row["title"]

    return best_title



In [62]:
def recommend(movie_input, top_n=10):
    title = find_movie(movie_input)

    if title is None:
        return f"Movie '{movie_input}' not found."

    print("Matched:", title)

    idx = df.index[df["title"] == title][0]

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1 : top_n+1]

    movie_indices = [i[0] for i in sim_scores]

    return df.iloc[movie_indices][["title", "genres", "rating", "poster_url"]]


In [66]:
recommend("12th fail")

Matched: 12th Fail


Unnamed: 0,title,genres,rating,poster_url
560,Saiyaara,Romance Drama Music,6.4,https://image.tmdb.org/t/p/w500/hQBIsi3ZfBYEay...
390,Dada,Drama Comedy Romance,7.438,https://image.tmdb.org/t/p/w500/x7C2u4oXoMFQBa...
1316,Maa,Horror,6.704,https://image.tmdb.org/t/p/w500/kc5n7LJUmvBsVx...
441,Santosh,Crime Drama Thriller,7.045,https://image.tmdb.org/t/p/w500/c4LdJKjE7Du2of...
1304,Naseeb,Drama,7.1,https://image.tmdb.org/t/p/w500/oDUddLemim047H...
1301,Like Stars on Earth,Drama,7.977,https://image.tmdb.org/t/p/w500/puHRt6Raovm5uj...
546,Lover,Romance Drama,7.286,https://image.tmdb.org/t/p/w500/5rN7wzvM9oAR35...
1322,Chhaava,History Action Drama,6.955,https://image.tmdb.org/t/p/w500/ubRsrzb6NRW8Yh...
1323,A Thursday,Thriller Crime Drama,6.591,https://image.tmdb.org/t/p/w500/xnoyG3rPuKAHeR...
1320,Kushi,Romance Comedy Drama,5.1,https://image.tmdb.org/t/p/w500/tiEnkpOarH20cC...


In [67]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

def evaluate_recommender(k=10):
    true_positive = 0
    false_positive = 0
    false_negative = 0

    n = len(df)

    for idx in range(n):
        # TRUE SIMILAR MOVIES (top K from cosine_sim)
        true_scores = list(enumerate(cosine_sim[idx]))
        true_scores = sorted(true_scores, key=lambda x: x[1], reverse=True)[1:k+1]
        true_indices = [i[0] for i in true_scores]

        # MODEL PREDICTED MOVIES
        title = df.iloc[idx]["title"]
        pred_df = recommend(title, top_n=k)

        if isinstance(pred_df, str):
            continue

        pred_titles = pred_df["title"].tolist()
        pred_indices = [df.index[df["title"] == t][0] for t in pred_titles]

        # BUILD BINARY VECTORS
        true_vec = np.zeros(n)
        pred_vec = np.zeros(n)

        true_vec[true_indices] = 1
        pred_vec[pred_indices] = 1

        # ACCUMULATE RESULTS
        true_positive += np.sum((true_vec == 1) & (pred_vec == 1))
        false_positive += np.sum((true_vec == 0) & (pred_vec == 1))
        false_negative += np.sum((true_vec == 1) & (pred_vec == 0))

    # CALCULATE METRICS
    precision = true_positive / (true_positive + false_positive + 1e-6)
    recall = true_positive / (true_positive + false_negative + 1e-6)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-6)

    print("Precision@{}: {:.4f}".format(k, precision))
    print("Recall@{}: {:.4f}".format(k, recall))
    print("F1-score@{}: {:.4f}".format(k, f1))

    return precision, recall, f1


In [68]:
evaluate_recommender(k=10)


Matched: TRON: Ares
Matched: Troll 2
Matched: Kantara - A Legend: Chapter 1
Matched: First Squad: The Moment of Truth
Matched: Wildcat
Matched: Bureau 749
Matched: Predator: Badlands
Matched: The Shadow's Edge
Matched: Chainsaw Man - The Movie: Reze Arc
Matched: High Forces
Matched: The Family Plan 2
Matched: Altered
Matched: JUJUTSU KAISEN: Execution -Shibuya Incident x The Culling Game Begins-
Matched: Demon Slayer: Kimetsu no Yaiba Infinity Castle
Matched: She Rides Shotgun
Matched: A Legend
Matched: One Battle After Another
Matched: Art of Eight Limbs
Matched: Mission: Impossible - The Final Reckoning
Matched: The Fantastic 4: First Steps
Matched: Playdate
Matched: High Ground
Matched: Dhurandhar
Matched: Superman
Matched: xXx
Matched: Jurassic World Rebirth
Matched: The Prosecutor
Matched: Sisu: Road to Revenge
Matched: Stand Your Ground
Matched: The Gentleman
Matched: F1
Matched: The Avengers
Matched: Operation Blood Hunt
Matched: Avatar
Matched: The Storm
Matched: Beast of War
M

(np.float64(0.9872078895885651),
 np.float64(0.985306704637282),
 np.float64(0.9862558808962838))

In [69]:
def genre_precision_at_k(k=10):
    total = 0
    match = 0

    for idx in range(len(df)):
        title = df.iloc[idx]["title"]
        genres = set(df.iloc[idx]["genres"].split())

        preds = recommend(title, top_n=k)
        if isinstance(preds, str):
            continue

        for _, row in preds.iterrows():
            total += 1
            rec_genres = set(row["genres"].split())
            if len(genres & rec_genres) > 0:
                match += 1

    precision = match / total
    print(f"Genre Precision@{k}: {precision:.4f}")
    return precision


In [70]:
genre_precision_at_k(10)


Matched: TRON: Ares
Matched: Troll 2
Matched: Kantara - A Legend: Chapter 1
Matched: First Squad: The Moment of Truth
Matched: Wildcat
Matched: Bureau 749
Matched: Predator: Badlands
Matched: The Shadow's Edge
Matched: Chainsaw Man - The Movie: Reze Arc
Matched: High Forces
Matched: The Family Plan 2
Matched: Altered
Matched: JUJUTSU KAISEN: Execution -Shibuya Incident x The Culling Game Begins-
Matched: Demon Slayer: Kimetsu no Yaiba Infinity Castle
Matched: She Rides Shotgun
Matched: A Legend
Matched: One Battle After Another
Matched: Art of Eight Limbs
Matched: Mission: Impossible - The Final Reckoning
Matched: The Fantastic 4: First Steps
Matched: Playdate
Matched: High Ground
Matched: Dhurandhar
Matched: Superman
Matched: xXx
Matched: Jurassic World Rebirth
Matched: The Prosecutor
Matched: Sisu: Road to Revenge
Matched: Stand Your Ground
Matched: The Gentleman
Matched: F1
Matched: The Avengers
Matched: Operation Blood Hunt
Matched: Avatar
Matched: The Storm
Matched: Beast of War
M

0.8239657631954351