In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split


In [90]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

ratings = movies[['id', 'title', 'vote_average', 'vote_count']].copy()
ratings = ratings.rename(columns={'id': 'movieId'})  
ratings = ratings[ratings['vote_count'] > 50]  
ratings['userId'] = np.random.randint(1, 1000, size=len(ratings)) 
ratings['rating'] = ratings['vote_average'] / 2  

movies = movies.merge(credits, on="title")

import ast
def convert_to_list(obj):
    try:
        return [i["name"] for i in ast.literal_eval(obj)]
    except:
        return []

movies["genres"] = movies["genres"].apply(convert_to_list)
movies["keywords"] = movies["keywords"].apply(convert_to_list)
movies["cast"] = movies["cast"].apply(lambda x: convert_to_list(x)[:3])  # Top 3 actors
movies["crew"] = movies["crew"].apply(lambda x: [i["name"] for i in ast.literal_eval(x) if i["job"] == "Director"])

# Convert features into a single text column tags
movies["tags"] = movies["genres"] + movies["keywords"] + movies["cast"] + movies["crew"]
movies["tags"] = movies["tags"].apply(lambda x: " ".join(x))
# movies['tags'] = movies['tags'].apply(lambda x:x.lower())
# movies['title'] = movies['title'].apply(lambda x:x.lower())


In [None]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split into train and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Train SVD model
svd_model = SVD()
svd_model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x14a7893a0>

In [102]:
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies["tags"])

cosine_sim = cosine_similarity(tfidf_matrix)

knn_model = NearestNeighbors(metric="cosine", algorithm="brute")
knn_model.fit(tfidf_matrix)

def content_recommend(title, n=5):
    idx = movies[movies["title"] == title].index[0]
    distances, indices = knn_model.kneighbors(tfidf_matrix[idx], n_neighbors=n+1)
    return movies["title"].iloc[indices[0][1:]].tolist()


In [103]:
def collaborative_recommend(user_id, n=5):
    movie_ids = ratings["movieId"].unique()
    
    user_rated = ratings[ratings["userId"] == user_id]["movieId"].values
    movie_ids = [m for m in movie_ids if m not in user_rated]

    predictions = [svd_model.predict(user_id, m) for m in movie_ids]
    
    predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]
    
    recommended_ids = [p.iid for p in predictions]
    recommended_movies = movies[movies["id"].isin(recommended_ids)]["title"].tolist()
    
    return recommended_movies


In [106]:
def hybrid_recommend(user_id, title, alpha=0.5, n=5):
    content = content_recommend(title, n=5)
    collab = collaborative_recommend(user_id, n=5)
    # print(content)
    # print(collab)
    hybrid = list(set(content + collab))
    
    if title not in hybrid:
        hybrid.append(title)  
        
    movie_s = {movie: (alpha * content.count(movie) + (1 - alpha) * collab.count(movie))
                    for movie in hybrid}

    sorted_movies = sorted(movie_s, key=movie_s.get, reverse=True)

    if title in sorted_movies:
        sorted_movies.remove(title)
    sorted_movies.insert(0, title)  # Place searched movie at the top

    return sorted_movies[:n] 

print(hybrid_recommend(user_id=1, title="Avatar", alpha=0.5, n=8))


['Avatar', 'Star Trek Into Darkness', 'Alien', 'The Rundown', 'Alien³', 'The Green Mile', 'Planet of the Apes', 'The Fast and the Furious']
