# Movie Recommendations Machine Learning Algorithm Analysis

In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds

## Loading the datasets

In [55]:
# Read the movie catalogue and the user ratings from the datasets folder.
movies, ratings = (
  pd.read_csv(csv_path)
  for csv_path in ('../datasets/movies.csv', '../datasets/ratings.csv')
)

In [56]:
# Report the number of missing values per column, first for movies, then for ratings.
for frame in (movies, ratings):
  print(frame.isnull().sum())

movieId    0
title      0
genres     0
dtype: int64
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [57]:
# Attach each movie's title and genres to every rating (inner join on movieId).
data = ratings.merge(movies, how='inner', on='movieId')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


## Core

### User-item interaction matrix

In [58]:
# One row per user, one column per movie title; a user/title pair with no
# rating is stored as 0.
user_item_matrix = (
  data
  .pivot_table(index='userId', columns='title', values='rating')
  .fillna(0)
)

user_item_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Matrix factorization using SVD

In [59]:
# Centre each user's row on that user's mean over ALL titles
# (the zero-filled, unrated entries are included in the mean).
ratings_array = user_item_matrix.to_numpy()
user_ratings_mean = ratings_array.mean(axis=1)
user_item_matrix_demeaned = ratings_array - user_ratings_mean[:, np.newaxis]

In [60]:
# Centre each user's row on that user's mean over ALL titles
# (the zero-filled, unrated entries are included in the mean).
ratings_array = user_item_matrix.to_numpy()
user_ratings_mean = ratings_array.mean(axis=1)
user_item_matrix_demeaned = ratings_array - user_ratings_mean[:, np.newaxis]

# Truncated SVD keeping 50 singular values/vectors.
U, sigma, Vt = svds(user_item_matrix_demeaned, k=50)

# svds returns the singular values as a 1-D array; expand them to a diagonal matrix.
sigma = np.diag(sigma)

# Low-rank reconstruction, with each user's mean added back to their row.
reconstructed_matrix = U @ sigma @ Vt + user_ratings_mean[:, np.newaxis]
# Rows keep a default 0..N-1 index (not userId); columns are the movie titles.
predicted_ratings = pd.DataFrame(reconstructed_matrix, columns=user_item_matrix.columns)

In [61]:
# Preview the first five rows of the predicted ratings.
predicted_ratings.head(n=5)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
0,-0.067955,0.01862,-0.041533,-0.037173,-0.047273,-0.007202,0.549227,-0.0089,-0.607781,0.221628,...,0.014309,-0.459805,0.022586,-0.052623,-0.020094,0.346683,-0.284519,-0.18676,1.499991,0.034606
1,-0.028293,-0.011688,-0.010462,0.001095,-0.002724,-0.007396,0.00429,0.008886,0.150959,-0.005892,...,0.004586,-0.014558,-0.027255,-0.034335,0.016768,0.05921,-0.104489,-0.009522,0.05707,0.000111
2,0.023213,0.009783,0.013288,0.010796,0.010376,0.006465,0.091815,-0.002024,0.016746,-0.003368,...,0.007332,0.045168,0.027881,0.027486,0.00645,0.019643,0.000363,0.013496,0.052682,0.011861
3,-0.008667,0.006796,-0.014741,-0.005001,0.014988,-0.033562,-0.372983,0.009115,-0.204434,0.045343,...,-0.004028,-0.133861,-0.069741,-0.057557,0.004191,0.086672,-0.199954,-0.035476,0.019514,-0.005279
4,0.011838,-0.000451,-0.00275,-0.010783,-0.01269,-0.013753,-0.105477,0.000415,0.02214,-0.09382,...,0.001723,0.047886,0.011493,0.003101,-0.007398,-0.074235,-0.004574,0.044573,-0.09133,-0.001727


### Building the recommend_movies function

In [62]:
def recommend_movies(predictions_df: pd.DataFrame, user_id: int, movies_df: pd.DataFrame, original_ratings_df: pd.DataFrame, num_recommendations=10):
  """Return the movies a user has rated and the top predicted movies they have not.

  Args:
    predictions_df: Predicted ratings with one row per user and one column
      per movie title. The row is selected by position (``user_id - 1``), so
      row 0 must belong to user 1, row 1 to user 2, and so on; the row labels
      themselves are ignored.
    user_id: 1-based id of the user to build recommendations for.
    movies_df: Movie catalogue with at least ``movieId`` and ``title`` columns.
    original_ratings_df: Ratings with at least ``userId``, ``movieId`` and
      ``rating`` columns.
    num_recommendations: Maximum number of recommendations to return.

  Returns:
    A tuple ``(already_rated, recommendations)``. ``already_rated`` holds the
    user's ratings joined with the movie details, highest rating first.
    ``recommendations`` holds up to ``num_recommendations`` rows of
    ``movies_df`` the user has not rated, with a ``Predictions`` column,
    highest prediction first. Movies whose title has no prediction get NaN
    and sort last.

  Raises:
    IndexError: If ``user_id - 1`` is not a valid row position of
      ``predictions_df``.
  """
  user_row_number = user_id - 1
  # Reject ids below 1 explicitly: a negative position would otherwise wrap
  # around and silently select a row counted from the end.
  if not 0 <= user_row_number < len(predictions_df):
    raise IndexError(
      f"user_id {user_id} is out of range for predictions with {len(predictions_df)} rows"
    )

  # Name the value column and the title axis explicitly so the result does not
  # depend on how predictions_df happens to label its rows or its columns axis.
  user_predictions = (
    predictions_df.iloc[user_row_number]
    .rename('Predictions')
    .rename_axis('title')
    .reset_index()
  )

  # The user's own ratings joined with the movie details, best rated first.
  user_data = original_ratings_df[original_ratings_df['userId'] == user_id]
  already_rated = user_data.merge(movies_df, how='left', on='movieId').sort_values(['rating'], ascending=False)

  # Keep only movies the user has not rated and rank them by predicted rating.
  unseen_movies = movies_df[~movies_df['movieId'].isin(already_rated['movieId'])]
  recommendations = unseen_movies.merge(
    user_predictions, how='left', on='title'
  ).sort_values('Predictions', ascending=False)

  return already_rated, recommendations.head(num_recommendations)

In [63]:
# Ten recommendations for user 1, based on the SVD-predicted ratings.
already_rated, recommendations = recommend_movies(
  predictions_df=predicted_ratings,
  user_id=1,
  movies_df=movies,
  original_ratings_df=ratings,
  num_recommendations=10,
)

In [67]:
# Number of rating rows returned for the user.
rated_count = len(already_rated)
print(f"User has already rated {rated_count} movies")

User has already rated 232 movies


In [64]:
# Show the user's ten highest-rated movies (already_rated is sorted by rating, descending).
print("User with ID 1 has already rated these movies:")
already_rated.iloc[:10]

User with ID 1 has already rated these movies:


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
231,1,5060,5.0,964984002,M*A*S*H (a.k.a. MASH) (1970),Comedy|Drama|War
185,1,2872,5.0,964981680,Excalibur (1981),Adventure|Fantasy
89,1,1291,5.0,964981909,Indiana Jones and the Last Crusade (1989),Action|Adventure
90,1,1298,5.0,964984086,Pink Floyd: The Wall (1982),Drama|Musical
190,1,2948,5.0,964982191,From Russia with Love (1963),Action|Adventure|Thriller
189,1,2947,5.0,964982176,Goldfinger (1964),Action|Adventure|Thriller
188,1,2944,5.0,964981872,"Dirty Dozen, The (1967)",Action|Drama|War
186,1,2899,5.0,964982703,Gulliver's Travels (1939),Adventure|Animation|Children
184,1,2858,5.0,964980868,American Beauty (1999),Drama|Romance
179,1,2700,5.0,964980985,"South Park: Bigger, Longer and Uncut (1999)",Animation|Comedy|Musical


In [66]:
# Heading and table printed on separate lines, exactly as two consecutive print calls would.
print("\nTop 10 movie recommendations for the user with ID 1:", recommendations, sep="\n")


Top 10 movie recommendations for the user with ID 1:
      movieId                           title  \
736      1036                 Die Hard (1988)   
844      1221  Godfather: Part II, The (1974)   
974      1387                     Jaws (1975)   
615       858           Godfather, The (1972)   
1328     1968      Breakfast Club, The (1985)   
874      1259              Stand by Me (1986)   
1927     2804       Christmas Story, A (1983)   
1416     2080       Lady and the Tramp (1955)   
2765     4011                   Snatch (2000)   
1417     2081      Little Mermaid, The (1989)   

                                         genres  Predictions  
736                       Action|Crime|Thriller     4.023800  
844                                 Crime|Drama     3.326251  
974                               Action|Horror     3.303518  
615                                 Crime|Drama     2.894619  
1328                               Comedy|Drama     2.870946  
874                         

## Evaluation

In [68]:
def rmse(y_true, y_pred):
  """Return the root mean squared error between y_true and y_pred.

  Computed as the square root of ``mean_squared_error(y_true, y_pred)``.
  """
  mse = mean_squared_error(y_true, y_pred)
  return np.sqrt(mse)

In [71]:
# In-sample reconstruction error: compare the reconstruction with the non-zero
# entries of user_item_matrix, the same matrix the SVD was computed from.
rated_positions = user_item_matrix.values.nonzero()
true_ratings = user_item_matrix.values[rated_positions].ravel()
pred_ratings = reconstructed_matrix[rated_positions].ravel()
error = rmse(true_ratings, pred_ratings)

In [72]:
# Report the error computed in the previous cell.
print("RMSE: " + str(error))

RMSE: 1.9965238759965793
