In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, KFold
from implicit.als import AlternatingLeastSquares
from scipy.sparse import coo_matrix
from sklearn.neighbors import NearestNeighbors
import joblib
import ast


In [52]:
movie=pd.read_csv('/Users/antropravin/Desktop/Bezohminds/Task/Movie Recommendation System/Cleaned Movie Data.csv')
user=pd.read_csv('/Users/antropravin/Desktop/Bezohminds/Task/Movie Recommendation System/User_Profiles.csv')

Movie Data

In [53]:
movie.head(2)

Unnamed: 0.1,Unnamed: 0,movie_id,title,overview,genres,release_date,release_year,runtime,original_language,poster_path,vote_average,vote_count,popularity,production_companies
0,0,447273,Snow White,A princess joins forces with seven dwarfs to l...,"Family, Fantasy",2025-03-19,2025,109,en,https://image.tmdb.org/t/p/w500/xWWg47tTfparvj...,3.232,41,24.058,"Walt Disney Pictures, Marc Platt Productions"
1,1,1437446,The Twister: Caught in the Storm,"In May 2011, a massive tornado ripped through ...",Documentary,2025-03-18,2025,89,en,https://image.tmdb.org/t/p/w500/ggXb37lX9gW4SR...,6.9,8,15.751,RAW


In [54]:
movie=movie.drop(columns=['Unnamed: 0', 'poster_path'])

In [55]:
movie = movie.dropna(subset=['genres'])

In [56]:
movie.head(2)

Unnamed: 0,movie_id,title,overview,genres,release_date,release_year,runtime,original_language,vote_average,vote_count,popularity,production_companies
0,447273,Snow White,A princess joins forces with seven dwarfs to l...,"Family, Fantasy",2025-03-19,2025,109,en,3.232,41,24.058,"Walt Disney Pictures, Marc Platt Productions"
1,1437446,The Twister: Caught in the Storm,"In May 2011, a massive tornado ripped through ...",Documentary,2025-03-18,2025,89,en,6.9,8,15.751,RAW


In [57]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9880 entries, 0 to 9879
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movie_id              9880 non-null   int64  
 1   title                 9880 non-null   object 
 2   overview              9880 non-null   object 
 3   genres                9880 non-null   object 
 4   release_date          9880 non-null   object 
 5   release_year          9880 non-null   int64  
 6   runtime               9880 non-null   int64  
 7   original_language     9880 non-null   object 
 8   vote_average          9880 non-null   float64
 9   vote_count            9880 non-null   int64  
 10  popularity            9880 non-null   float64
 11  production_companies  9880 non-null   object 
dtypes: float64(2), int64(4), object(6)
memory usage: 926.4+ KB


In [58]:
#movie['runtime_min'] = movie['runtime']

In [59]:
scaler=MinMaxScaler()
movie[['popularity','vote_average', 'runtime']]=scaler.fit_transform(movie[['popularity','vote_average', 'runtime']])

In [60]:
movie.head(2)

Unnamed: 0,movie_id,title,overview,genres,release_date,release_year,runtime,original_language,vote_average,vote_count,popularity,production_companies
0,447273,Snow White,A princess joins forces with seven dwarfs to l...,"Family, Fantasy",2025-03-19,2025,0.252315,en,0.3232,41,1.0,"Walt Disney Pictures, Marc Platt Productions"
1,1437446,The Twister: Caught in the Storm,"In May 2011, a massive tornado ripped through ...",Documentary,2025-03-18,2025,0.206019,en,0.69,8,0.60303,RAW


User Data

In [61]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   user_id                   5000 non-null   int64 
 1   watched_movies            5000 non-null   object
 2   liked_movies              5000 non-null   object
 3   disliked_movies           5000 non-null   object
 4   liked_genres              5000 non-null   object
 5   disliked_genres           5000 non-null   object
 6   language_preferred        5000 non-null   object
 7   watch_timestamps          5000 non-null   object
 8   user_ratings              5000 non-null   object
 9   viewing_frequency         5000 non-null   object
 10  fav_production_companies  4987 non-null   object
dtypes: int64(1), object(10)
memory usage: 429.8+ KB


In [62]:
print(user.columns)

Index(['user_id', 'watched_movies', 'liked_movies', 'disliked_movies',
       'liked_genres', 'disliked_genres', 'language_preferred',
       'watch_timestamps', 'user_ratings', 'viewing_frequency',
       'fav_production_companies'],
      dtype='object')


Converts string representations of lists into actual Python lists

In [63]:
user['watched_movies'] = user['watched_movies'].apply(ast.literal_eval)
user['liked_movies'] = user['liked_movies'].apply(ast.literal_eval)
user['disliked_movies'] = user['disliked_movies'].apply(ast.literal_eval)
user['liked_genres'] = user['liked_genres'].apply(ast.literal_eval)
user['disliked_genres'] = user['disliked_genres'].apply(ast.literal_eval)

In [64]:
mlb=MultiLabelBinarizer()
liked_genres_encoded=mlb.fit_transform(user['liked_genres'])
disliked_genres_encoded=mlb.transform(user['disliked_genres'])
liked_genres_user=pd.DataFrame(liked_genres_encoded, columns=mlb.classes_)
disliked_genres_user=pd.DataFrame(disliked_genres_encoded, columns=mlb.classes_)
user=pd.concat([user, liked_genres_user,disliked_genres_user],axis=1)

In [65]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 49 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   user_id                   5000 non-null   int64 
 1   watched_movies            5000 non-null   object
 2   liked_movies              5000 non-null   object
 3   disliked_movies           5000 non-null   object
 4   liked_genres              5000 non-null   object
 5   disliked_genres           5000 non-null   object
 6   language_preferred        5000 non-null   object
 7   watch_timestamps          5000 non-null   object
 8   user_ratings              5000 non-null   object
 9   viewing_frequency         5000 non-null   object
 10  fav_production_companies  4987 non-null   object
 11  Action                    5000 non-null   int64 
 12  Adventure                 5000 non-null   int64 
 13  Animation                 5000 non-null   int64 
 14  Comedy                  

In [66]:
viewing_freq_map = {
    'High': 3,
    'Medium': 2,
    'Low': 1
}
user['viewing_frequency_numerical'] = user['viewing_frequency'].map(viewing_freq_map)
user['viewing_frequency_numerical'] = user['viewing_frequency_numerical'].fillna(0)


In [67]:
print(user['viewing_frequency_numerical'].unique())

[2 1 3]


In [68]:
user=user.dropna(subset=['fav_production_companies'])
count=user['fav_production_companies'].isnull().sum()
print(f"Missung values is:{count}")

Missung values is:0


Feature Engineering

Movie Dataset:

In [69]:
print(movie['genres'].unique())

['Family, Fantasy' 'Documentary' 'Thriller, Horror' ...
 'Drama, Adventure, Fantasy, Science Fiction, War' 'Family'
 'Drama, Adventure, Mystery']


In [70]:
movie['genres_list'] = movie['genres'].apply(lambda x: x.split(', '))

In [71]:
genres_encoded = mlb.fit_transform(movie['genres_list'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)
movie = pd.concat([movie, genres_df], axis=1)

In [72]:
##movie = movie.drop(columns=['genres'])

In [73]:
movie['decade'] = (movie['release_year'] // 10) * 10
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9880 entries, 0 to 9879
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movie_id              9880 non-null   int64  
 1   title                 9880 non-null   object 
 2   overview              9880 non-null   object 
 3   genres                9880 non-null   object 
 4   release_date          9880 non-null   object 
 5   release_year          9880 non-null   int64  
 6   runtime               9880 non-null   float64
 7   original_language     9880 non-null   object 
 8   vote_average          9880 non-null   float64
 9   vote_count            9880 non-null   int64  
 10  popularity            9880 non-null   float64
 11  production_companies  9880 non-null   object 
 12  genres_list           9880 non-null   object 
 13  Action                9880 non-null   int64  
 14  Adventure             9880 non-null   int64  
 15  Animation            

User Dataset

In [74]:
genre_columns = mlb.classes_
for genre in genre_columns:
    user[f'liked_{genre}_preference'] = user['liked_genres'].apply(lambda x: genre in x).astype(int)

In [75]:
user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4987 entries, 0 to 4999
Data columns (total 69 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   user_id                           4987 non-null   int64 
 1   watched_movies                    4987 non-null   object
 2   liked_movies                      4987 non-null   object
 3   disliked_movies                   4987 non-null   object
 4   liked_genres                      4987 non-null   object
 5   disliked_genres                   4987 non-null   object
 6   language_preferred                4987 non-null   object
 7   watch_timestamps                  4987 non-null   object
 8   user_ratings                      4987 non-null   object
 9   viewing_frequency                 4987 non-null   object
 10  fav_production_companies          4987 non-null   object
 11  Action                            4987 non-null   int64 
 12  Adventure           

In [76]:
user_movie_interaction = pd.DataFrame(0, index=user['user_id'], columns=movie['movie_id'])
for i, row in user.iterrows():
    for movie_id in row['liked_movies']:
        user_movie_interaction.loc[row['user_id'], movie_id] = 1

In [77]:
user_production_companies_encoded = pd.get_dummies(user['fav_production_companies'], prefix='production_company')
user = pd.concat([user, user_production_companies_encoded], axis=1)

In [78]:
user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4987 entries, 0 to 4999
Columns: 939 entries, user_id to production_company_Yong Film
dtypes: int64(59), object(10), uint8(870)
memory usage: 6.8+ MB


## Model Selection & Training

- 1.Collaborative Filtering (Matrix Factorization using SVD) 
- 2.Content-Based Filtering

### Collaborative Filtering using SVD (Singular Value Decomposition)

In [79]:
svd = TruncatedSVD(n_components=50, random_state=42)
svd_matrix = svd.fit_transform(user_movie_interaction.fillna(0))
svd_reconstructed = np.dot(svd_matrix, svd.components_)

In [80]:
mse = mean_squared_error(user_movie_interaction.fillna(0), svd_reconstructed)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.0009652109454170243


### Content-Based Filtering

In [81]:
movie_features = movie[['release_year', 'popularity', 'runtime'] + list(genres_df.columns)]

In [82]:
scaler = StandardScaler()
movie_features_scaled = scaler.fit_transform(movie_features)
cosine_sim = cosine_similarity(movie_features_scaled)

In [83]:
def get_content_based_recommendations(movie_id, cosine_sim, movie_df, top_n=10):
    print(f"Requested movie_id: {movie_id}")
    print(f"Available movie_ids in movie_df: {movie_df['movie_id'].unique()}")
    if movie_id not in movie_df['movie_id'].values:
        print(f"Movie ID {movie_id} not found in the movie dataframe.")
        return pd.DataFrame() 
    idx = movie_df[movie_df['movie_id'] == movie_id].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n + 1]
    movie_indices = [i[0] for i in sim_scores]
    return movie_df.iloc[movie_indices][['movie_id', 'title', 'genres']]

In [84]:
recommendations = get_content_based_recommendations(1726, cosine_sim, movie, top_n=5)
print(recommendations)

Requested movie_id: 1726
Available movie_ids in movie_df: [ 447273 1437446 1204967 ...  518978  517024  515908]
      movie_id                       title                              genres
4079     13475                   Star Trek  Science Fiction, Action, Adventure
466      10138                  Iron Man 2  Adventure, Action, Science Fiction
1017     20526                TRON: Legacy  Adventure, Action, Science Fiction
579      68721                  Iron Man 3  Action, Adventure, Science Fiction
907     127585  X-Men: Days of Future Past  Action, Adventure, Science Fiction


### Hybrid Model

In [85]:
user_movie_interaction = pd.DataFrame(0, index=user['user_id'], columns=movie['movie_id'])

In [86]:
for i, row in user.iterrows():
    for movie_id in row['liked_movies']:
        user_movie_interaction.loc[row['user_id'], movie_id] = 1
user_similarity = cosine_similarity(user_movie_interaction.fillna(0))
user_similarity_df = pd.DataFrame(user_similarity, index=user['user_id'], columns=user['user_id'])

In [87]:
print(user_similarity_df.head())

user_id  1     2     3       4       5     6     7         8         9     \
user_id                                                                     
1         1.0   0.0   0.0  0.0000  0.0000   0.0   0.0  0.000000  0.083333   
2         0.0   1.0   0.0  0.0000  0.0000   0.0   0.0  0.069007  0.000000   
3         0.0   0.0   1.0  0.0000  0.0000   0.0   0.0  0.000000  0.000000   
4         0.0   0.0   0.0  1.0000  0.1557   0.0   0.0  0.155700  0.000000   
5         0.0   0.0   0.0  0.1557  1.0000   0.0   0.0  0.000000  0.000000   

user_id  10    ...  4991  4992      4993  4994  4995      4996  4997  \
user_id        ...                                                     
1         0.0  ...   0.0   0.0  0.000000   0.0   0.0  0.000000   0.0   
2         0.0  ...   0.0   0.0  0.000000   0.0   0.0  0.089087   0.0   
3         0.0  ...   0.0   0.0  0.000000   0.0   0.0  0.000000   0.0   
4         0.0  ...   0.0   0.0  0.000000   0.0   0.0  0.000000   0.0   
5         0.0  ...   0.0   0

In [88]:
def hybrid_recommendation(user_id, movie_id, content_sim, user_sim, movie_df, user_movie_interaction, alpha=0.5, top_n=5):
    idx = movie_df[movie_df['movie_id'] == movie_id].index[0]
    content_sim_scores = list(enumerate(content_sim[idx]))
    content_sim_scores = sorted(content_sim_scores, key=lambda x: x[1], reverse=True)
    user_idx = user_movie_interaction[user_movie_interaction.index == user_id].index[0]
    user_sim_scores = user_sim[user_idx]
    content_sim_scores = [score[1] for score in content_sim_scores]
    content_sim_scores = [score / max(content_sim_scores) for score in content_sim_scores] 
    user_sim_scores = user_sim_scores.tolist()
    user_sim_scores = [score / max(user_sim_scores) for score in user_sim_scores]
    hybrid_scores = []
    for i, (content_score, user_score) in enumerate(zip(content_sim_scores, user_sim_scores)):
        combined_score = alpha * content_score + (1 - alpha) * user_score
        hybrid_scores.append((movie_df.iloc[i]['movie_id'], movie_df.iloc[i]['title'], combined_score))
    hybrid_scores = sorted(hybrid_scores, key=lambda x: x[2], reverse=True)
    top_recommendations = hybrid_scores[:top_n]
    print(f"Top {top_n} Hybrid Recommendations for User {user_id} on Movie {movie_id}:")
    for movie in top_recommendations:
        print(f"Movie ID: {movie[0]}, Title: {movie[1]}, Combined Score: {movie[2]}")
    
    return top_recommendations

In [89]:
user_id = 1 
movie_id = 1726
recommendations = hybrid_recommendation(user_id, movie_id, cosine_sim, user_similarity, movie, user_movie_interaction, top_n=5)

Top 5 Hybrid Recommendations for User 1 on Movie 1726:
Movie ID: 1437446, Title: The Twister: Caught in the Storm, Combined Score: 0.9998764974505508
Movie ID: 1436366, Title: Jasmine That Blooms in Autumn, Combined Score: 0.5849660028938579
Movie ID: 1436712, Title: Girl-Boy, Combined Score: 0.5381568836293948
Movie ID: 1434009, Title: We Live Here, Combined Score: 0.5355395769714884
Movie ID: 1417181, Title: Dawn of Impressionism: Paris 1874, Combined Score: 0.5322756375666748


In [90]:
recommendations = hybrid_recommendation(5, 1726, cosine_sim, user_similarity, movie, user_movie_interaction, top_n=5)
print(recommendations)

Top 5 Hybrid Recommendations for User 5 on Movie 1726:
Movie ID: 1437920, Title: Centered: Joe Lieberman, Combined Score: 0.998284782507556
Movie ID: 1440404, Title: Ambivalence, Combined Score: 0.582440863891718
Movie ID: 1441966, Title: Take the Money and Run, Combined Score: 0.5562170041066365
Movie ID: 1436815, Title: Oy'una Geldik, Combined Score: 0.5513961923696953
Movie ID: 1434009, Title: We Live Here, Combined Score: 0.5281460135182078
[(1437920, 'Centered: Joe Lieberman', 0.998284782507556), (1440404, 'Ambivalence', 0.582440863891718), (1441966, 'Take the Money and Run', 0.5562170041066365), (1436815, "Oy'una Geldik", 0.5513961923696953), (1434009, 'We Live Here', 0.5281460135182078)]


## Model Training

In [91]:
def evaluate_recommendations(user_id, movie_id, recommendations, top_n=5):
    actual_liked_movies = user.loc[user['user_id'] == user_id, 'liked_movies'].values[0]
    recommended_movies = [movie[0] for movie in recommendations] 
    relevant_recommended_movies = [movie for movie in recommended_movies if movie in actual_liked_movies]
    precision_at_k = len(relevant_recommended_movies) / top_n if top_n > 0 else 0
    recall_at_k = len(relevant_recommended_movies) / len(actual_liked_movies) if len(actual_liked_movies) > 0 else 0
    print(f"Precision@{top_n}: {precision_at_k}")
    print(f"Recall@{top_n}: {recall_at_k}")
    return precision_at_k, recall_at_k
precision, recall = evaluate_recommendations(1, 1726, recommendations, top_n=5)

Precision@5: 0.0
Recall@5: 0.0


In [92]:
def evaluate_for_multiple_users(user_ids, movie_ids, top_n=5):
    precisions = []
    recalls = []
    for user_id in user_ids:
        for movie_id in movie_ids:
            recommendations = hybrid_recommendation(user_id, movie_id, cosine_sim, user_similarity, movie, user_movie_interaction, top_n=top_n)
            precision, recall = evaluate_recommendations(user_id, movie_id, recommendations, top_n)
            precisions.append(precision)
            recalls.append(recall)
    avg_precision = sum(precisions) / len(precisions) if len(precisions) > 0 else 0
    avg_recall = sum(recalls) / len(recalls) if len(recalls) > 0 else 0
    print(f"Average Precision@{top_n}: {avg_precision}")
    print(f"Average Recall@{top_n}: {avg_recall}")
    return avg_precision, avg_recall
user_ids = [1, 2, 3]
movie_ids = [1726, 447273, 1204967]
avg_precision, avg_recall = evaluate_for_multiple_users(user_ids, movie_ids, top_n=5)

Top 5 Hybrid Recommendations for User 1 on Movie 1726:
Movie ID: 1437446, Title: The Twister: Caught in the Storm, Combined Score: 0.9998764974505508
Movie ID: 1436366, Title: Jasmine That Blooms in Autumn, Combined Score: 0.5849660028938579
Movie ID: 1436712, Title: Girl-Boy, Combined Score: 0.5381568836293948
Movie ID: 1434009, Title: We Live Here, Combined Score: 0.5355395769714884
Movie ID: 1417181, Title: Dawn of Impressionism: Paris 1874, Combined Score: 0.5322756375666748
Precision@5: 0.0
Recall@5: 0.0
Top 5 Hybrid Recommendations for User 1 on Movie 447273:
Movie ID: 1437446, Title: The Twister: Caught in the Storm, Combined Score: 0.9464313346145627
Movie ID: 447273, Title: Snow White, Combined Score: 0.5
Movie ID: 1436366, Title: Jasmine That Blooms in Autumn, Combined Score: 0.48793457167229354
Movie ID: 1417181, Title: Dawn of Impressionism: Paris 1874, Combined Score: 0.47784615651284723
Movie ID: 1434009, Title: We Live Here, Combined Score: 0.4671139334491524
Precision@5

In [93]:
def evaluate_hybrid_model(user_ids, movie_ids, top_n=5):
    avg_precision, avg_recall = evaluate_for_multiple_users(user_ids, movie_ids, top_n)
    
    print(f"Hybrid Model Evaluation Metrics (Top {top_n} Recommendations):")
    print(f"Average Precision@{top_n}: {avg_precision:.4f}")
    print(f"Average Recall@{top_n}: {avg_recall:.4f}")
    
    return avg_precision, avg_recall
evaluate_hybrid_model(user_ids=[1, 2, 3], movie_ids=[1726, 447273, 1204967], top_n=5)

Top 5 Hybrid Recommendations for User 1 on Movie 1726:
Movie ID: 1437446, Title: The Twister: Caught in the Storm, Combined Score: 0.9998764974505508
Movie ID: 1436366, Title: Jasmine That Blooms in Autumn, Combined Score: 0.5849660028938579
Movie ID: 1436712, Title: Girl-Boy, Combined Score: 0.5381568836293948
Movie ID: 1434009, Title: We Live Here, Combined Score: 0.5355395769714884
Movie ID: 1417181, Title: Dawn of Impressionism: Paris 1874, Combined Score: 0.5322756375666748
Precision@5: 0.0
Recall@5: 0.0
Top 5 Hybrid Recommendations for User 1 on Movie 447273:
Movie ID: 1437446, Title: The Twister: Caught in the Storm, Combined Score: 0.9464313346145627
Movie ID: 447273, Title: Snow White, Combined Score: 0.5
Movie ID: 1436366, Title: Jasmine That Blooms in Autumn, Combined Score: 0.48793457167229354
Movie ID: 1417181, Title: Dawn of Impressionism: Paris 1874, Combined Score: 0.47784615651284723
Movie ID: 1434009, Title: We Live Here, Combined Score: 0.4671139334491524
Precision@5

(0.0, 0.0)

In [94]:
mse = mean_squared_error(user_movie_interaction.fillna(0), svd_reconstructed)
print(f'Mean Squared Error (MSE): {mse}')

Mean Squared Error (MSE): 0.0009652109454170243


Combined collaborative filtering (SVD + user similarity) and content-based filtering (cosine similarity) into a hybrid recommendation system and set up the evaluation framework to assess the performance of your model

In [95]:
movie.to_csv('processed_movie_data.csv', index=False) 
user.to_csv('processed_user_data.csv', index=False)
print("Successfully saved the processed data")

Successfully saved the processed data


#### Model

In [45]:
user["liked_movies"] = user["liked_movies"].apply(lambda x: eval(x) if isinstance(x, str) else x)

In [46]:
best_mse = float("inf")
best_n_components = 50

In [47]:
for n in [10, 20, 50, 100, 150]:
    svd = TruncatedSVD(n_components=n, random_state=42)
    svd_matrix = svd.fit_transform(user_movie_interaction.fillna(0))
    svd_reconstructed = np.dot(svd_matrix, svd.components_)
    mse = mean_squared_error(user_movie_interaction.fillna(0), svd_reconstructed)
    if mse < best_mse:
        best_mse = mse
        best_n_components = n

print(f"Best n_components: {best_n_components}, MSE: {best_mse}")

Best n_components: 150, MSE: 0.0008283749625260083


In [48]:
def recommend_for_new_user(preferred_genres, language_preference, top_n=5):
    filtered_movies = movies[movies["genres"].apply(lambda x: any(genre in x for genre in preferred_genres))]
    if language_preference:
        filtered_movies = filtered_movies[filtered_movies["original_language"] == language_preference]
    return filtered_movies.sort_values(by="popularity", ascending=False).head(top_n)

In [49]:
def evaluate_recommendations(user_id, recommendations, top_n=5):
    actual_liked_movies = users.loc[users["user_id"] == user_id, "liked_movies"].values[0]
    recommended_movies = [movie["movie_id"] for movie in recommendations]
    relevant_recommended_movies = [m for m in recommended_movies if m in actual_liked_movies]
    precision = len(relevant_recommended_movies) / top_n if top_n > 0 else 0
    recall = len(relevant_recommended_movies) / len(actual_liked_movies) if len(actual_liked_movies) > 0 else 0
    return precision, recall

## TESTING

In [96]:
import numpy as np
import pandas as pd
from flask import Flask, render_template, request, jsonify
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read movie and user data
movies = pd.read_csv("/Users/antropravin/Desktop/Bezohminds/Task/Movie Recommendation System/processed_movie_data.csv")  # Ensure this contains 'movie_id', 'title', 'genres'
users = pd.read_csv("/Users/antropravin/Desktop/Bezohminds/Task/Movie Recommendation System/processed_user_data.csv")  # Ensure this contains 'user_id', 'liked_movies'

# Convert liked_movies from string to list
users["liked_movies"] = users["liked_movies"].apply(lambda x: eval(x) if isinstance(x, str) else x)

# Movie-User Interaction Matrix
user_movie_interaction = pd.DataFrame(0, index=users['user_id'], columns=movies['movie_id'])

# Fill in the interaction matrix
for i, row in users.iterrows():
    for movie_id in row['liked_movies']:
        user_movie_interaction.loc[row['user_id'], movie_id] = 1

# SVD Model (TruncatedSVD for dimensionality reduction)
svd = TruncatedSVD(n_components=50, random_state=42)
svd_matrix = svd.fit_transform(user_movie_interaction.fillna(0))
svd_reconstructed = np.dot(svd_matrix, svd.components_)

# Similarity Matrix for Content-Based Filtering (Cosine Similarity)
scaler = StandardScaler()
movie_features_scaled = scaler.fit_transform(movies.iloc[:, 13:33])  # Assuming genres columns start from index 13
cosine_sim = cosine_similarity(movie_features_scaled)

# Content-Based Recommendation Function
def get_content_based_recommendations(movie_id, cosine_sim, top_n=10):
    idx = movies[movies['movie_id'] == movie_id].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n + 1]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices][['movie_id', 'title', 'genres']]

# Hybrid Recommendation Function
def hybrid_recommendation(user_id, movie_id, alpha=0.5, top_n=5):
    # Content-based scores
    idx = movies[movies['movie_id'] == movie_id].index[0]
    content_sim_scores = list(enumerate(cosine_sim[idx]))
    content_sim_scores = sorted(content_sim_scores, key=lambda x: x[1], reverse=True)

    # User-based similarity scores
    user_idx = user_movie_interaction[user_movie_interaction.index == user_id].index[0]
    user_sim_scores = cosine_similarity(user_movie_interaction.loc[user_idx].values.reshape(1, -1), user_movie_interaction.fillna(0))
    user_sim_scores = user_sim_scores.flatten()

    content_sim_scores = [score[1] for score in content_sim_scores]
    content_sim_scores = [score / max(content_sim_scores) for score in content_sim_scores]
    user_sim_scores = user_sim_scores.tolist()
    user_sim_scores = [score / max(user_sim_scores) for score in user_sim_scores]

    # Combine scores
    hybrid_scores = []
    for i, (content_score, user_score) in enumerate(zip(content_sim_scores, user_sim_scores)):
        combined_score = alpha * content_score + (1 - alpha) * user_score
        hybrid_scores.append((movies.iloc[i]['movie_id'], movies.iloc[i]['title'], combined_score))

    hybrid_scores = sorted(hybrid_scores, key=lambda x: x[2], reverse=True)
    return hybrid_scores[:top_n]

# Function to Handle Cold Start (New User)
def recommend_for_new_user(preferred_genres, language_preference, top_n=5):
    filtered_movies = movies[movies["genres"].apply(lambda x: any(genre in x for genre in preferred_genres))]
    if language_preference:
        filtered_movies = filtered_movies[filtered_movies["original_language"] == language_preference]
    return filtered_movies.sort_values(by="popularity", ascending=False).head(top_n)

# Flask app setup
app = Flask(__name__)

@app.route("/")
def home():
    return render_template("index.html")  # Serve the HTML page

@app.route("/recommend", methods=["GET"])
def recommend():
    try:
        # Get parameters from the URL
        user_id = int(request.args.get("user_id"))
        movie_id = int(request.args.get("movie_id"))
        top_n = int(request.args.get("top_n", 5))

        # Log inputs for debugging
        print(f"Received request with user_id: {user_id}, movie_id: {movie_id}, top_n: {top_n}")

        # Check if the user exists in the users DataFrame
        user = users[users["user_id"] == user_id]
        if user.empty:
            print(f"No user found with user_id {user_id}")  # Log this error
            return jsonify({"message": f"No user found with user_id {user_id}"}), 404

        # Check if the movie exists in the movies DataFrame
        movie = movies[movies["movie_id"] == movie_id]
        if movie.empty:
            print(f"No movie found with movie_id {movie_id}")  # Log this error
            return jsonify({"message": f"No movie found with movie_id {movie_id}"}), 404

        # Get the liked_movies list for the user
        liked_movies = user.iloc[0]["liked_movies"]
        
        if not liked_movies:
            # Cold Start: New User (No watched history)
            preferred_genres = ["Action", "Comedy"]  # This can be dynamic based on user data
            language_preference = "en"
            recommendations = recommend_for_new_user(preferred_genres, language_preference, top_n)
            return jsonify({"message": "New User - No watched history", "recommendations": recommendations.to_dict(orient="records")})

        # Hybrid Recommendation
        recommendations = hybrid_recommendation(user_id, movie_id, alpha=0.5, top_n=top_n)

        # Convert the recommendation data to native Python types (e.g., int) to avoid JSON serialization issues
        recommendations = [
            {
                "movie_id": int(movie[0]),  # Convert numpy.int64 to int
                "title": movie[1],
                "combined_score": movie[2]
            }
            for movie in recommendations
        ]

        return jsonify({"user_id": user_id, "movie_id": movie_id, "recommendations": recommendations})

    except Exception as e:
        # Log the error and provide detailed message
        print(f"Error occurred: {str(e)}")  # This will print the error in the console/logs
        return jsonify({"error": f"An error occurred while fetching recommendations: {str(e)}"}), 500

@app.route("/movie-details", methods=["GET"])
def movie_details():
    try:
        movie_id = int(request.args.get("movie_id"))

        # Check if the movie exists
        movie = movies[movies["movie_id"] == movie_id]
        if movie.empty:
            print(f"No movie found with movie_id {movie_id}")  # Log this error
            return jsonify({"message": f"No movie found with movie_id {movie_id}"}), 404

        # Fetch the movie details
        movie_details = movie.iloc[0]

        # Prepare the response data
        response = {
            "title": movie_details["title"],
            "overview": movie_details["overview"],
            "genres": movie_details["genres"],
            "release_date": movie_details["release_date"],
            "original_language": movie_details["original_language"],
            "runtime": movie_details["runtime"]
        }

        return jsonify(response)

    except Exception as e:
        # Log the error and provide detailed message
        print(f"Error occurred: {str(e)}")
        return jsonify({"error": f"An error occurred while fetching movie details: {str(e)}"}), 500

In [50]:
import numpy as np
import pandas as pd
import ast  # Import ast for safe evaluation
from flask import Flask, render_template, request, jsonify
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer

# Read movie and user data
movies = pd.read_csv("/Users/antropravin/Desktop/Bezohminds/Task/Movie Recommendation System/processed_movie_data.csv")  # Ensure this contains 'movie_id', 'title', 'genres'
users = pd.read_csv("/Users/antropravin/Desktop/Bezohminds/Task/Movie Recommendation System/processed_user_data.csv")  # Ensure this contains 'user_id', 'liked_movies'

# Convert liked_movies from string to list
users["liked_movies"] = users["liked_movies"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# One-Hot Encoding the genres column
mlb = MultiLabelBinarizer()
# Safely parse the genres column using ast.literal_eval()
genre_one_hot = mlb.fit_transform(movies['genres'].apply(ast.literal_eval))  # Assuming genres are stored as string lists

# Convert the result into a DataFrame and concatenate it with the original movies DataFrame
genre_columns = mlb.classes_
genre_df = pd.DataFrame(genre_one_hot, columns=genre_columns)

# Now merge the one-hot encoded genres with the movies DataFrame
movies = pd.concat([movies, genre_df], axis=1)

# Movie-User Interaction Matrix
user_movie_interaction = pd.DataFrame(0, index=users['user_id'], columns=movies['movie_id'])

# Fill in the interaction matrix
for i, row in users.iterrows():
    for movie_id in row['liked_movies']:
        user_movie_interaction.loc[row['user_id'], movie_id] = 1

# SVD Model (TruncatedSVD for dimensionality reduction)
svd = TruncatedSVD(n_components=50, random_state=42)
svd_matrix = svd.fit_transform(user_movie_interaction.fillna(0))
svd_reconstructed = np.dot(svd_matrix, svd.components_)

# Similarity Matrix for Content-Based Filtering (Cosine Similarity)
scaler = StandardScaler()
movie_features_scaled = scaler.fit_transform(movies[genre_columns])  # Only use the new one-hot encoded columns
cosine_sim = cosine_similarity(movie_features_scaled)

# Content-Based Recommendation Function
def get_content_based_recommendations(movie_id, cosine_sim, top_n=10):
    idx = movies[movies['movie_id'] == movie_id].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n + 1]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices][['movie_id', 'title', 'genres']]

# Hybrid Recommendation Function
def hybrid_recommendation(user_id, movie_id, alpha=0.5, top_n=5):
    # Content-based scores
    idx = movies[movies['movie_id'] == movie_id].index[0]
    content_sim_scores = list(enumerate(cosine_sim[idx]))
    content_sim_scores = sorted(content_sim_scores, key=lambda x: x[1], reverse=True)

    # User-based similarity scores
    user_idx = user_movie_interaction[user_movie_interaction.index == user_id].index[0]
    user_sim_scores = cosine_similarity(user_movie_interaction.loc[user_idx].values.reshape(1, -1), user_movie_interaction.fillna(0))
    user_sim_scores = user_sim_scores.flatten()

    content_sim_scores = [score[1] for score in content_sim_scores]
    content_sim_scores = [score / max(content_sim_scores) for score in content_sim_scores]
    user_sim_scores = user_sim_scores.tolist()
    user_sim_scores = [score / max(user_sim_scores) for score in user_sim_scores]

    # Combine scores
    hybrid_scores = []
    for i, (content_score, user_score) in enumerate(zip(content_sim_scores, user_sim_scores)):
        combined_score = alpha * content_score + (1 - alpha) * user_score
        hybrid_scores.append((movies.iloc[i]['movie_id'], movies.iloc[i]['title'], combined_score))

    hybrid_scores = sorted(hybrid_scores, key=lambda x: x[2], reverse=True)
    return hybrid_scores[:top_n]

# Function to Handle Cold Start (New User)
def recommend_for_new_user(preferred_genres, language_preference, top_n=5):
    filtered_movies = movies[movies["genres"].apply(lambda x: any(genre in x for genre in preferred_genres))]
    if language_preference:
        filtered_movies = filtered_movies[filtered_movies["original_language"] == language_preference]
    return filtered_movies.sort_values(by="popularity", ascending=False).head(top_n)

# Flask app setup
app = Flask(__name__)

@app.route("/")
def home():
    return render_template("index.html")  # Serve the HTML page

@app.route("/recommend", methods=["GET"])
def recommend():
    try:
        # Get parameters from the URL
        user_id = int(request.args.get("user_id"))
        movie_id = int(request.args.get("movie_id"))
        top_n = int(request.args.get("top_n", 5))

        # Log inputs for debugging
        print(f"Received request with user_id: {user_id}, movie_id: {movie_id}, top_n: {top_n}")

        # Check if the user exists in the users DataFrame
        user = users[users["user_id"] == user_id]
        if user.empty:
            print(f"No user found with user_id {user_id}")  # Log this error
            return jsonify({"message": f"No user found with user_id {user_id}"}), 404

        # Check if the movie exists in the movies DataFrame
        movie = movies[movies["movie_id"] == movie_id]
        if movie.empty:
            print(f"No movie found with movie_id {movie_id}")  # Log this error
            return jsonify({"message": f"No movie found with movie_id {movie_id}"}), 404

        # Get the liked_movies list for the user
        liked_movies = user.iloc[0]["liked_movies"]
        
        if not liked_movies:
            # Cold Start: New User (No watched history)
            preferred_genres = ["Action", "Comedy"]  # This can be dynamic based on user data
            language_preference = "en"
            recommendations = recommend_for_new_user(preferred_genres, language_preference, top_n)
            return jsonify({"message": "New User - No watched history", "recommendations": recommendations.to_dict(orient="records")})

        # Hybrid Recommendation
        recommendations = hybrid_recommendation(user_id, movie_id, alpha=0.5, top_n=top_n)

        # Convert the recommendation data to native Python types (e.g., int) to avoid JSON serialization issues
        recommendations = [
            {
                "movie_id": int(movie[0]),  # Convert numpy.int64 to int
                "title": movie[1],
                "combined_score": movie[2]
            }
            for movie in recommendations
        ]

        return jsonify({"user_id": user_id, "movie_id": movie_id, "recommendations": recommendations})

    except Exception as e:
        # Log the error and provide detailed message
        print(f"Error occurred: {str(e)}")  # This will print the error in the console/logs
        return jsonify({"error": f"An error occurred while fetching recommendations: {str(e)}"}), 500

@app.route("/movie-details", methods=["GET"])
def movie_details():
    try:
        movie_id = int(request.args.get("movie_id"))

        # Check if the movie exists
        movie = movies[movies["movie_id"] == movie_id]
        if movie.empty:
            print(f"No movie found with movie_id {movie_id}")  # Log this error
            return jsonify({"message": f"No movie found with movie_id {movie_id}"}), 404

        # Fetch the movie details
        movie_details = movie.iloc[0]

        # Prepare the response data
        response = {
            "title": movie_details["title"],
            "overview": movie_details["overview"],
            "genres": movie_details["genres"],
            "release_date": movie_details["release_date"],
            "original_language": movie_details["original_language"],
            "runtime": movie_details["runtime"]
        }

        return jsonify(response)

    except Exception as e:
        # Log the error and provide detailed message
        print(f"Error occurred: {str(e)}")
        return jsonify({"error": f"An error occurred while fetching movie details: {str(e)}"}), 500

if __name__ == "__main__":
    app.run(debug=True)


ValueError: malformed node or string on line 1: <ast.Name object at 0x158a5e770>

In [None]:
print(movies['runtime'].head())  # Check the first few rows to verify the data format

0    0.252315
1    0.206019
2    0.000000
3    0.000000
4    0.317130
Name: runtime, dtype: float64


: 