In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import TruncatedSVD
from scipy import sparse
import pickle

In [2]:
movies = pd.read_csv("data/movies_metadata.csv", low_memory=False)
credits = pd.read_csv("data/credits.csv")
keywords = pd.read_csv("data/keywords.csv")
ratings = pd.read_csv("data/ratings.csv")   

CONTENT BASED

In [3]:
movies['id'] = movies['id'].astype(str)
credits['id'] = credits['id'].astype(str)
keywords['id'] = keywords['id'].astype(str)

movies = movies.merge(credits, on='id')
movies = movies.merge(keywords, on='id')

movies = movies[['id','title','overview','genres','keywords','cast','crew']]


In [4]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [5]:
def convert(obj):
    try:
        L = []
        for i in ast.literal_eval(obj):
            L.append(i['name'])
        return L
    except:
        return []


In [6]:
movies['genres']   = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast']     = movies['cast'].apply(lambda x: convert(x)[:3])


In [7]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devine]","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short]","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [8]:
def fetch_director(obj):
    try:
        for i in ast.literal_eval(obj):
            if i['job'] == 'Director':
                return [i['name']]
        return []
    except:
        return []


In [9]:
movies['crew'] = movies['crew'].apply(fetch_director)


In [10]:
movies['overview'] = movies['overview'].astype(str).apply(lambda x: x.split())

movies['genres']   = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast']     = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew']     = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])


In [11]:
movies['tags'] = (movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew'])

In [12]:
new = movies[['id','title','tags']]
new['tags'] = new['tags'].apply(lambda x: " ".join(x))


In [13]:
new.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [14]:
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1,2)  # unigram + bigram
)

vectors = tfidf.fit_transform(new['tags'])
similarity = cosine_similarity(vectors)


In [15]:
def recommend(movie):
    if movie not in new['title'].values:
        return ["Movie not found"]
    movie_index = new[new['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),
                         reverse=True,
                         key=lambda x: x[1])[1:6]
    return [new.iloc[i[0]].title for i in movies_list]


In [16]:
recommend("The Dark Knight")


['The Dark Knight Rises',
 'LEGO DC Comics Super Heroes: Batman: Be-Leaguered',
 'Batman: Under the Red Hood',
 'Batman Begins',
 "Batman Beyond Darwyn Cooke's Batman 75th Anniversary Short"]

POPULARITY BASED


In [17]:
# Ensure movie ID types match
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
movies = movies.dropna(subset=['id'])
movies['id'] = movies['id'].astype(int)


In [18]:
# Merge ratings with movie titles
ratings_with_title = ratings.merge(
    movies[['id','title']],
    left_on='movieId',
    right_on='id'
)

In [19]:
# Num ratings per movie
num_rating_df = ratings_with_title.groupby('title').count()['rating'].reset_index()
num_rating_df.rename(columns={'rating':'num_ratings'}, inplace=True)

In [20]:
# Avg rating per movie
avg_rating_df = ratings_with_title.groupby('title').mean()['rating'].reset_index()
avg_rating_df.rename(columns={'rating':'avg_rating'}, inplace=True)


In [21]:
# Merge
popularity_df = num_rating_df.merge(avg_rating_df, on='title')

# Filter threshold (tune if needed)
popularity_df = popularity_df[popularity_df['num_ratings'] >= 50]
popularity_df = popularity_df.sort_values(
    ['avg_rating','num_ratings'],
    ascending=False
)


In [22]:
popularity_df.head(10)


Unnamed: 0,title,num_ratings,avg_rating
6107,The Million Dollar Hotel,91082,4.429015
4825,Sleepless in Seattle,57070,4.339811
3930,Once Were Warriors,67662,4.266531
2336,Hard Target,13994,4.255074
3160,License to Wed,60024,4.230716
1961,Five Dances,273,4.217949
3669,Murder She Said,28280,4.21303
1321,"Cousin, Cousine",20855,4.202589
1451,Dead Man,7930,4.20082
244,"A Woman, a Gun and a Noodle Shop",8948,4.199039


COLLABORATIVE FILTERING

In [23]:
# === COLLABORATIVE FILTERING STARTS ===

ratings = pd.read_csv("data/ratings.csv")

# Filter users with >= 50 ratings
user_counts = ratings['userId'].value_counts()
active_users = user_counts[user_counts >= 50].index
ratings = ratings[ratings['userId'].isin(active_users)]

# Filter movies with >= 50 ratings
movie_counts = ratings['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 50].index
ratings = ratings[ratings['movieId'].isin(popular_movies)]

print("Filtered ratings shape:", ratings.shape)


Filtered ratings shape: (22610090, 4)


In [24]:
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

user_enc = LabelEncoder()
movie_enc = LabelEncoder()

ratings['user_idx'] = user_enc.fit_transform(ratings['userId'])
ratings['movie_idx'] = movie_enc.fit_transform(ratings['movieId'])

num_users = ratings['user_idx'].nunique()
num_movies = ratings['movie_idx'].nunique()

R = csr_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
    shape=(num_users, num_movies)
)

print("Sparse matrix:", R.shape)


Sparse matrix: (103839, 12341)


In [25]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

svd = TruncatedSVD(n_components=20, random_state=42)
latent_matrix = svd.fit_transform(R)

movie_factors = R.T.dot(latent_matrix)
cf_similarity = cosine_similarity(movie_factors)

movie_ids = movie_enc.inverse_transform(np.arange(num_movies))
movieid_to_idx = {movie_ids[i]: i for i in range(len(movie_ids))}


In [26]:
def recommend_cf(title, n=10):
    if title not in new['title'].values:
        return ["Movie not found"]

    idx = new[new['title'] == title].index[0]
    tmdb_id = int(new.iloc[idx]['id'])

    if tmdb_id not in movieid_to_idx:
        return ["No CF data available"]

    cf_idx = movieid_to_idx[tmdb_id]
    distances = cf_similarity[cf_idx]

    similar_indices = distances.argsort()[::-1][1:n+1]

    recs = []
    for idx in similar_indices:
        rec_tmdb = movie_ids[idx]
        match = new[new['id'] == rec_tmdb]
        if not match.empty:
            recs.append(match.iloc[0]['title'])

    return recs if recs else ["No CF recommendations"]


HYBRID

In [27]:
# Match TMDB ids to CF index or -1 if not exists
tmdb_to_cf = {-1: -1}
for tmdb_id in new['id']:
    if tmdb_id in movieid_to_idx:
        tmdb_to_cf[tmdb_id] = movieid_to_idx[tmdb_id]
    else:
        tmdb_to_cf[tmdb_id] = -1


In [28]:
def hybrid_recommend(title, alpha=0.6, n=10):
    if title not in new['title'].values:
        return ["Movie not found"]

    idx = new[new['title'] == title].index[0]
    tmdb_id = int(new.iloc[idx]['id'])

    # content vector
    content_scores = similarity[idx]

    # CF vector or zeros padded
    cf_idx = tmdb_to_cf.get(tmdb_id, -1)
    if cf_idx == -1:
        cf_scores = np.zeros(len(content_scores))
    else:
        # pad cf similarity to full movie size
        cf_scores = np.zeros(len(content_scores))
        # fill ONLY movies that exist in CF
        cf_scores[new.index.isin(new[new['id'].isin(movie_ids)].index)] = \
            cf_similarity[cf_idx]

    final_scores = alpha * content_scores + (1 - alpha) * cf_scores

    similar_indices = final_scores.argsort()[::-1]
    similar_indices = [i for i in similar_indices if i != idx][:n]

    return new['title'].iloc[similar_indices].tolist()


In [29]:
def recommend(title, n=10):
    recs = hybrid_recommend(title, n=n)

    if recs == ["Movie not found"]:
        return popularity_df['title'].head(n).tolist()

    return recs


In [30]:
print(recommend("The Dark Knight"))
print(recommend("Toy Story"))
print(recommend("Jumanji"))
print(recommend("Avatar"))
print(recommend("Random Movie"))


['The Dark Knight Rises', 'LEGO DC Comics Super Heroes: Batman: Be-Leaguered', 'Batman: Under the Red Hood', 'Batman Begins', "Batman Beyond Darwyn Cooke's Batman 75th Anniversary Short", 'Batman Returns', 'Batman Unmasked: The Psychology of the Dark Knight', 'The Lego Batman Movie', 'Batman & Mr. Freeze: SubZero', 'Batman: The Dark Knight Returns, Part 2']
['Toy Story 3', 'The 40 Year Old Virgin', 'The Champ', 'Andy Kaufman Plays Carnegie Hall', 'Toy Story 2', "Andy Hardy's Blonde Trouble", 'Andy Peters: Exclamation Mark Question Point', 'Superstar: The Life and Times of Andy Warhol', "Andy Hardy's Private Secretary", 'Wabash Avenue']
['The Dark Angel', 'Liar Game: Reborn', 'Table No. 21', 'Word Wars', 'Snowed Under', 'The Mend', 'The Mend', 'Brainscan', 'Dungeons & Dragons', 'Minecraft: The Story of Mojang']
['Dead Sea', 'Project Moon Base', 'Welcome to the Space Show', 'First Men in the Moon', 'Aliens vs Predator: Requiem', 'Avatar 2', 'Miss Sadie Thompson', 'Horror of the Blood Mon

In [31]:
with open("tfidf.pkl", "wb") as f:
    pickle.dump(tfidf, f)
with open("content_sim.pkl", "wb") as f:
    pickle.dump(similarity, f)
with open("cf_sim.pkl", "wb") as f:
    pickle.dump(cf_similarity, f)
metadata = {
    "new": new,
    "movies": movies,
    "movie_ids": movie_ids,
    "movieid_to_idx": movieid_to_idx,
    "tmdb_to_cf": tmdb_to_cf,
    "user_enc": user_enc,
    "movie_enc": movie_enc
}

with open("metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)
