<a href="https://colab.research.google.com/github/ayushpratapno1/content_recommendation_system/blob/main/movie_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# New Section

In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ahsanaseer/top-rated-tmdb-movies-10k")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/ahsanaseer/top-rated-tmdb-movies-10k?dataset_version_number=1...


100%|██████████| 1.43M/1.43M [00:00<00:00, 151MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/ahsanaseer/top-rated-tmdb-movies-10k/versions/1





In [7]:
import glob
import pandas as pd
# Try to find a main movies CSV
candidates = glob.glob(os.path.join(path, "*.csv"))
print("CSV files:", [os.path.basename(p) for p in candidates])

# Pick the most relevant one (adjust if the dataset has different names)
# Common names: movies.csv, tmdb_movies.csv, top_rated_movies.csv, etc.
movies_path = None
for name in ["movies.csv", "tmdb_movies.csv", "top_rated_movies.csv", "top_rated_tmdb_movies.csv"]:
    match = [p for p in candidates if os.path.basename(p).lower() == name]
    if match:
        movies_path = match[0]
        break

# Fallback: choose the largest CSV if names are unknown
if movies_path is None and candidates:
    movies_path = max(candidates, key=lambda p: os.path.getsize(p))

assert movies_path is not None, "Could not find a movies CSV in the dataset directory."
print("Using:", os.path.basename(movies_path))

df = pd.read_csv(movies_path)
print(df.head(3))
print(df.columns.tolist())


CSV files: ['top10K-TMDB-movies.csv']
Using: top10K-TMDB-movies.csv
      id                        title                 genre original_language  \
0    278     The Shawshank Redemption           Drama,Crime                en   
1  19404  Dilwale Dulhania Le Jayenge  Comedy,Drama,Romance                hi   
2    238                The Godfather           Drama,Crime                en   

                                            overview  popularity release_date  \
0  Framed in the 1940s for the double murder of h...      94.075   1994-09-23   
1  Raj is a rich, carefree, happy-go-lucky second...      25.408   1995-10-19   
2  Spanning the years 1945 to 1955, a chronicle o...      90.585   1972-03-14   

   vote_average  vote_count  
0           8.7       21862  
1           8.7        3731  
2           8.7       16280  
['id', 'title', 'genre', 'original_language', 'overview', 'popularity', 'release_date', 'vote_average', 'vote_count']


In [8]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Pick columns that exist in this dataset (inspect df.columns above)
# Common fields: 'title', 'overview', 'genres', 'keywords', 'cast', 'director'
title_col = None
for cand in ["title", "movie_title", "original_title", "name"]:
    if cand in df.columns:
        title_col = cand
        break
assert title_col is not None, "No title column found. Check dataset columns."

text_fields = []
for cand in ["overview", "genres", "keywords", "cast", "director", "tagline"]:
    if cand in df.columns:
        text_fields.append(cand)

# Prepare text "soup"
def norm_text(s):
    if isinstance(s, str):
        return s
    if isinstance(s, (list, tuple, np.ndarray)):
        return " ".join(map(str, s))
    return ""

df[text_fields] = df[text_fields].fillna("")
df["soup"] = df[text_fields].apply(lambda row: " ".join(norm_text(v) for v in row), axis=1)

# Vectorize and compute similarity
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1,2), min_df=2)
tfidf_matrix = tfidf.fit_transform(df["soup"])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Build title index (handle duplicates by keeping first occurrence)
indices = pd.Series(df.index, index=df[title_col]).drop_duplicates()

def recommend(title, k=10, extra_cols=None):
    if title not in indices:
        raise ValueError(f"Title not found: {title}")
    idx = int(indices[title])
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:k+1]
    movie_indices = [i for i, _ in sim_scores]
    cols = [title_col]
    if extra_cols:
        cols += [c for c in extra_cols if c in df.columns and c != title_col]
    return df.iloc[movie_indices][cols].reset_index(drop=True)

# Try an example (replace with an existing title from your dataset)
example_title = df[title_col].dropna().iloc[0]
print("Query:", example_title)
recs = recommend(example_title, k=10, extra_cols=["vote_average", "vote_count", "release_date", "genres"])
recs

Query: The Shawshank Redemption


Unnamed: 0,title,vote_average,vote_count,release_date
0,Sherlock Jr.,8.2,685,1924-04-17
1,The Blues Brothers,7.8,3440,1980-06-16
2,Sleuth,7.7,458,1972-12-10
3,In Hell,6.4,406,2003-11-24
4,The 40 Year Old Virgin,6.3,5626,2005-08-11
5,Wasp Network,6.1,282,2020-01-29
6,Brubaker,7.1,359,1980-06-20
7,The Woodsman,6.9,376,2004-12-24
8,Escape Plan,6.7,4370,2013-10-09
9,1922,5.9,2265,2017-10-20


In [9]:
import os, glob
import pandas as pd

# Use the path you already got from kagglehub
# path = kagglehub.dataset_download("ahsanaseer/top-rated-tmdb-movies-10k")
print("Using dataset path:", path)

# Find a main movies CSV
candidates = glob.glob(os.path.join(path, "*.csv"))
print("CSV files:", [os.path.basename(p) for p in candidates])

# Try common names, else pick largest file
movies_path = None
for name in ["movies.csv", "tmdb_movies.csv", "top_rated_movies.csv", "top_rated_tmdb_movies.csv", "tmdb_top_rated_10k.csv"]:
    match = [p for p in candidates if os.path.basename(p).lower() == name]
    if match:
        movies_path = match[0]
        break

if movies_path is None and candidates:
    movies_path = max(candidates, key=lambda p: os.path.getsize(p))

assert movies_path is not None, "Could not find a movies CSV in the dataset directory."
print("Chosen CSV:", os.path.basename(movies_path))

df = pd.read_csv(movies_path)
print("Rows:", len(df))
print("Columns:", df.columns.tolist())
df.head(3)


Using dataset path: /root/.cache/kagglehub/datasets/ahsanaseer/top-rated-tmdb-movies-10k/versions/1
CSV files: ['top10K-TMDB-movies.csv']
Chosen CSV: top10K-TMDB-movies.csv
Rows: 10000
Columns: ['id', 'title', 'genre', 'original_language', 'overview', 'popularity', 'release_date', 'vote_average', 'vote_count']


Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280


In [10]:
import numpy as np

# Pick a title column
title_col = None
for cand in ["title", "movie_title", "original_title", "name"]:
    if cand in df.columns:
        title_col = cand
        break
assert title_col is not None, "No title-like column found. Inspect df.columns."

# Optional: standardize a date column if present
date_col = None
for cand in ["release_date", "release", "year", "release_year"]:
    if cand in df.columns:
        date_col = cand
        break

if date_col:
    # If it's a year/int column, convert to datetime-like strings for consistency
    if np.issubdtype(df[date_col].dtype, np.number):
        df["year"] = df[date_col].astype("Int64")
    else:
        # Try to parse full date; fallback to year
        parsed = pd.to_datetime(df[date_col], errors="coerce")
        df["year"] = parsed.dt.year

# Optional: standardize language if present
lang_col = None
for cand in ["original_language", "language", "lang"]:
    if cand in df.columns:
        lang_col = cand
        break

# Ensure text fields exist for “soup”
text_candidates = ["overview", "genres", "keywords", "cast", "director", "tagline"]
text_fields = [c for c in text_candidates if c in df.columns]

# Fill NAs
for c in text_fields:
    df[c] = df[c].fillna("")
df[title_col] = df[title_col].fillna("").astype(str)

print("Using title_col:", title_col)
print("Using text_fields:", text_fields)


Using title_col: title
Using text_fields: ['overview']


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def norm_text(v):
    if isinstance(v, str):
        return v
    if isinstance(v, (list, tuple, np.ndarray)):
        return " ".join(map(str, v))
    return str(v)

df["soup"] = df[text_fields].apply(lambda row: " ".join(norm_text(v) for v in row), axis=1)

# Vectorize
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1,2), min_df=2)
tfidf_matrix = tfidf.fit_transform(df["soup"])

# Cosine similarity via linear kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Title index (deduplicate by first)
indices = pd.Series(df.index, index=df[title_col]).drop_duplicates()


In [12]:
# Optional popularity columns
pop_cols = [c for c in ["vote_count", "vote_average", "popularity"] if c in df.columns]

def filter_mask(language=None, min_year=None, max_year=None):
    m = pd.Series(True, index=df.index)
    if language and lang_col and lang_col in df.columns:
        m &= (df[lang_col].astype(str).str.lower() == str(language).lower())
    if min_year is not None and "year" in df.columns:
        m &= (df["year"] >= int(min_year))
    if max_year is not None and "year" in df.columns:
        m &= (df["year"] <= int(max_year))
    return m

def popularity_score(idx):
    # Normalize and combine available popularity signals
    score = np.zeros(len(idx), dtype=np.float32)
    if "vote_count" in df.columns:
        vc = df.loc[idx, "vote_count"].astype(float).fillna(0.0)
        vc = (vc - vc.min()) / (vc.max() - vc.min() + 1e-9)
        score += vc.values
    if "vote_average" in df.columns:
        va = df.loc[idx, "vote_average"].astype(float).fillna(0.0)
        va = (va - va.min()) / (va.max() - va.min() + 1e-9)
        score += va.values
    if "popularity" in df.columns:
        pop = df.loc[idx, "popularity"].astype(float).fillna(0.0)
        pop = (pop - pop.min()) / (pop.max() - pop.min() + 1e-9)
        score += pop.values
    return score

def recommend(
    title,
    k=10,
    language=None,
    min_year=None,
    max_year=None,
    popularity_alpha=0.2,
    extra_cols=None
):
    if title not in indices:
        raise ValueError(f"Title not found: {title}")
    base_idx = int(indices[title])

    # Raw similarity scores
    sims = cosine_sim[base_idx].ravel()

    # Apply candidate mask
    m = filter_mask(language=language, min_year=min_year, max_year=max_year)
    m.iloc[base_idx] = False  # exclude self

    candidate_idx = np.where(m.values)[0]
    if len(candidate_idx) == 0:
        return pd.DataFrame()

    # Select candidates and optionally blend popularity
    cand_scores = sims[candidate_idx]

    if popularity_alpha and popularity_alpha > 0:
        pop = popularity_score(candidate_idx)
        pop = (pop - pop.min()) / (pop.max() - pop.min() + 1e-9)
        # Blend: final = (1-alpha)*similarity + alpha*popularity
        cand_scores = (1 - popularity_alpha) * cand_scores + popularity_alpha * pop

    # Top-k
    order = np.argsort(-cand_scores)[:k]
    top_idx = candidate_idx[order]

    cols = [title_col]
    if extra_cols:
        cols += [c for c in extra_cols if c in df.columns and c != title_col]

    return df.iloc[top_idx][cols].reset_index(drop=True)

# Try it
example_title = df[title_col].dropna().iloc[0]
print("Query:", example_title)
recs = recommend(
    example_title,
    k=10,
    language="en",        # set None if unavailable
    min_year=2000,        # set None to disable
    popularity_alpha=0.2, # set 0.0 to use pure similarity
    extra_cols=["year", "vote_average", "vote_count", "popularity", "genres", "overview"]
)
recs


Query: The Shawshank Redemption


Unnamed: 0,title,year,vote_average,vote_count,popularity,overview
0,Inception,2010,8.4,31917,111.757,"Cobb, a skilled thief who commits corporate es..."
1,Interstellar,2014,8.4,28920,191.22,The adventures of a group of explorers who mak...
2,The Dark Knight,2008,8.5,27925,82.23,Batman raises the stakes in his war on crime. ...
3,Toy Story 3,2010,7.8,12665,116.195,"Woody, Buzz, and the rest of Andy's toys haven..."
4,Deadpool,2016,7.6,27028,230.309,The origin story of former Special Forces oper...
5,Avengers: Infinity War,2018,8.3,25020,475.535,As the Avengers and their allies have continue...
6,The Avengers,2012,7.7,27250,278.98,When an unexpected enemy emerges and threatens...
7,Top Gun: Maverick,2022,8.4,1690,7567.017,After more than thirty years of service as one...
8,Django Unchained,2012,8.2,22784,66.924,"With the help of a German bounty hunter, a fre..."
9,Jurassic World Dominion,2022,7.0,2054,10436.917,"Four years after Isla Nublar was destroyed, di..."


In [13]:
import joblib
import os

os.makedirs("artifacts", exist_ok=True)
joblib.dump(tfidf, "artifacts/tfidf_vectorizer.joblib")
joblib.dump(tfidf_matrix, "artifacts/tfidf_matrix.joblib")
joblib.dump(indices, "artifacts/title_indices.joblib")
df.to_parquet("artifacts/movies.parquet", index=False)

# Reload later
tfidf = joblib.load("artifacts/tfidf_vectorizer.joblib")
tfidf_matrix = joblib.load("artifacts/tfidf_matrix.joblib")
indices = joblib.load("artifacts/title_indices.joblib")
df = pd.read_parquet("artifacts/movies.parquet")

# Reconstruct cosine similarity lazily if needed to save RAM:
# cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [15]:
!pip -q install sentence-transformers
from sentence_transformers import SentenceTransformer
import torch

model_st = SentenceTransformer("all-MiniLM-L6-v2")  # small, fast
texts = df["overview"].fillna("").astype(str).tolist()
emb = model_st.encode(texts, batch_size=64, convert_to_tensor=True, show_progress_bar=True)
cosine_sim_dense = (emb @ emb.T).cpu().numpy()  # cosine if embeddings are normalized

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [16]:
import numpy as np
import pandas as pd
import torch

# If embeddings aren't L2-normalized, normalize now so dot product = cosine similarity
if isinstance(emb, torch.Tensor):
    emb = torch.nn.functional.normalize(emb, p=2, dim=1)
    emb_np = emb.cpu().numpy()
else:
    # If emb is a NumPy array, normalize with NumPy
    emb_norm = np.linalg.norm(emb, axis=1, keepdims=True) + 1e-9
    emb_np = emb / emb_norm

# Title index for lookup
# Make sure title_col is set (from earlier code)
# title_col = "title"  # set this if not already defined
indices = pd.Series(df.index, index=df[title_col]).drop_duplicates()


In [17]:
def filter_mask(language=None, min_year=None, max_year=None, lang_col=None):
    m = pd.Series(True, index=df.index)
    if language and lang_col and lang_col in df.columns:
        m &= (df[lang_col].astype(str).str.lower() == str(language).lower())
    if min_year is not None and "year" in df.columns:
        m &= (df["year"] >= int(min_year))
    if max_year is not None and "year" in df.columns:
        m &= (df["year"] <= int(max_year))
    return m

# Detect a language column if present
lang_col = None
for cand in ["original_language", "language", "lang"]:
    if cand in df.columns:
        lang_col = cand
        break

# Popularity helpers (optional)
def popularity_score(idx):
    score = np.zeros(len(idx), dtype=np.float32)
    if "vote_count" in df.columns:
        vc = df.loc[idx, "vote_count"].astype(float).fillna(0.0)
        vc = (vc - vc.min()) / (vc.max() - vc.min() + 1e-9)
        score += vc.values
    if "vote_average" in df.columns:
        va = df.loc[idx, "vote_average"].astype(float).fillna(0.0)
        va = (va - va.min()) / (va.max() - va.min() + 1e-9)
        score += va.values
    if "popularity" in df.columns:
        pop = df.loc[idx, "popularity"].astype(float).fillna(0.0)
        pop = (pop - pop.min()) / (pop.max() - pop.min() + 1e-9)
        score += pop.values
    return score

def recommend_dense(
    title,
    k=10,
    language=None,
    min_year=None,
    max_year=None,
    popularity_alpha=0.2,
    extra_cols=None
):
    if title not in indices:
        raise ValueError(f"Title not found: {title}")

    base_idx = int(indices[title])

    # Compute cosine similarities via dot with normalized embeddings
    # emb_np: (N, D)
    sims = emb_np @ emb_np[base_idx, :]

    # Candidate mask
    m = filter_mask(language=language, min_year=min_year, max_year=max_year, lang_col=lang_col)
    m.iloc[base_idx] = False  # exclude self

    candidate_idx = np.where(m.values)[0]
    if candidate_idx.size == 0:
        return pd.DataFrame()

    cand_scores = sims[candidate_idx]

    # Optional popularity blend
    if popularity_alpha and popularity_alpha > 0:
        pop = popularity_score(candidate_idx)
        pop = (pop - pop.min()) / (pop.max() - pop.min() + 1e-9)
        cand_scores = (1 - popularity_alpha) * cand_scores + popularity_alpha * pop

    # Top-k
    order = np.argsort(-cand_scores)[:k]
    top_idx = candidate_idx[order]

    cols = [title_col]
    if extra_cols:
        cols += [c for c in extra_cols if c in df.columns and c != title_col]

    return df.iloc[top_idx][cols].reset_index(drop=True)


In [18]:
example_title = df[title_col].dropna().iloc[0]
print("Query:", example_title)

recs = recommend_dense(
    example_title,
    k=10,
    language="en",        # set None if not applicable
    min_year=2000,        # set None to disable
    popularity_alpha=0.2, # set 0.0 for pure semantic similarity
    extra_cols=["year", "vote_average", "vote_count", "popularity", "genres", "overview"]
)
recs


Query: The Shawshank Redemption


Unnamed: 0,title,year,vote_average,vote_count,popularity,overview
0,Catch Me If You Can,2002,8.0,12740,45.878,"A true story about Frank Abagnale Jr. who, bef..."
1,The Man Who Wasn't There,2001,7.6,1251,9.441,"A tale of murder, crime and punishment set in ..."
2,No Country for Old Men,2007,7.9,9850,35.532,"Llewelyn Moss stumbles upon dead bodies, $2 mi..."
3,Get Hard,2015,6.0,2301,19.205,When obscenely rich hedge-fund manager James i...
4,The Imitation Game,2014,8.0,14808,79.903,Based on the real life story of legendary cryp...
5,Good Time,2017,7.2,2298,18.12,After a botched bank robbery lands his younger...
6,The Boss,2016,5.9,1172,13.017,A titan of industry is sent to prison after sh...
7,Ocean's Thirteen,2007,6.7,5340,22.608,Danny Ocean's team of criminals are back and c...
8,Chaos,2005,6.3,816,13.857,"In Seattle, detective Quentin Conners is unfai..."
9,The Informer,2019,6.5,591,21.441,"In New York, former convict Pete Koslow, relat..."


In [20]:
!pip -q install faiss-cpu
import faiss
import numpy as np

# Build a FAISS index for inner product (use normalized embeddings -> cosine)
d = emb_np.shape[1]
index = faiss.IndexFlatIP(d)
index.add(emb_np.astype(np.float32))

def recommend_dense_faiss(
    title,
    k=10,
    language=None,
    min_year=None,
    max_year=None,
    popularity_alpha=0.0,  # blending complicates pure ANN; keep 0 for speed
    extra_cols=None
):
    if title not in indices:
        raise ValueError(f"Title not found: {title}")
    base_idx = int(indices[title])

    # Candidate filtering: if filters are used, fallback to brute-force on filtered subset
    use_filters = any(v is not None for v in [language, min_year, max_year])
    if use_filters:
        return recommend_dense(title, k=k, language=language, min_year=min_year,
                               max_year=max_year, popularity_alpha=popularity_alpha,
                               extra_cols=extra_cols)

    # Query FAISS
    q = emb_np[base_idx:base_idx+1].astype(np.float32)
    scores, idxs = index.search(q, k+1)  # includes self
    idxs = idxs[0].tolist()
    scores = scores[0].tolist()

    # Remove self
    if base_idx in idxs:
        si = idxs.index(base_idx)
        del idxs[si]
        del scores[si]

    idxs = idxs[:k]
    cols = [title_col]
    if extra_cols:
        cols += [c for c in extra_cols if c in df.columns and c != title_col]
    return df.iloc[idxs][cols].reset_index(drop=True)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [22]:
import joblib, os
os.makedirs("artifacts_dense", exist_ok=True)

# Save embeddings and auxiliaries
np.save("artifacts_dense/embeddings.npy", emb_np)
indices.to_csv("artifacts_dense/title_indices.csv")
df.to_parquet("artifacts_dense/movies.parquet", index=False)

# Reload later
emb_np = np.load("artifacts_dense/embeddings.npy")
indices = pd.read_csv("artifacts_dense/title_indices.csv", index_col=0)
df = pd.read_parquet("artifacts_dense/movies.parquet")