In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Part 1 : Preparing the Data

In [3]:
#Reading in the Data - https://grouplens.org/datasets/movielens/1m/
movies = pd.io.parsers.read_csv('/content/drive/My Drive/Big Data & AI Project/Data/ml-1m/movies.dat',
    names=['movieId', 'title', 'genre'],
    engine='python', delimiter='::')

ratings = pd.io.parsers.read_csv('/content/drive/My Drive/Big Data & AI Project/Data/ml-1m/ratings.dat', 
    names=['userId', 'movieId', 'rating', 'timestamp'],
    engine='python', delimiter='::')

In [4]:
ratings_v2 = ratings.drop(columns = ['timestamp'])
train_ratings,test_ratings = train_test_split(ratings_v2,test_size=0.7, random_state=0)

In [5]:
training_matrix = train_ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

test_matrix = test_ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

# Part 2 : Model Comparisons

## 2.1 User-User and Item-Item Collaborative Filtering

In [6]:
from sklearn.metrics.pairwise import pairwise_distances

user_sim = pairwise_distances(training_matrix, metric = 'cosine')
item_sim = pairwise_distances(training_matrix.T, metric = 'cosine')

In [7]:
def user_item_pred(rating_matrix, similarity_matrix, type):
  if type == 'user':
    mean_user_rating = rating_matrix.mean(axis = 1)
    ratings_diff = rating_matrix - mean_user_rating[:,np.newaxis]
    pred = mean_user_rating[:,np.newaxis] + similarity_matrix.dot(ratings_diff)/ np.array([np.abs(similarity_matrix).sum(axis = 1)]).T
  elif type == 'item':
    pred = rating_matrix.dot(similarity_matrix)/np.array([np.abs(similarity_matrix).sum(axis = 1)])
  return pred

In [8]:
user_preds = user_item_pred(training_matrix,user_sim, type = 'user')
item_preds = user_item_pred(training_matrix,item_sim, type = 'item')

  after removing the cwd from sys.path.
  """


In [9]:
convert_dict = {'userId': str,'movieId': str}
  
test_ratings = test_ratings.astype(convert_dict) 

In [10]:
from sklearn.metrics import mean_squared_error

def rmse(preds,actuals):
  preds = pd.DataFrame(preds)
  preds.columns = training_matrix.columns
  preds['userId'] = ratings.userId.unique()
  preds = preds.melt(id_vars= 'userId')
  preds['userId'] = preds['userId'].astype(str)
  preds['movieId'] = preds['movieId'].astype(str)
  final_preds = pd.merge(actuals, preds, how='left', on=['userId',"movieId"])
  error = (final_preds['rating'] - final_preds['value']).tolist()
  newlist = [x for x in error if np.isnan(x) == False]
  rmse_val = np.sqrt(sum(np.square(newlist))/len(newlist))
  return rmse_val

In [12]:
rmse(user_preds,test_ratings)

3.537943529275091

In [13]:
rmse(item_preds,test_ratings)

3.6481988810422616

## 2.2 SVD

In [11]:
!pip install git+https://github.com/mayukh18/reco.git
from reco.metrics import rmse
import sys
sys.path.insert(0,'/content/drive/My Drive/Big Data & AI Project/Data/')
import svd_updated2

Collecting git+https://github.com/mayukh18/reco.git
  Cloning https://github.com/mayukh18/reco.git to /tmp/pip-req-build-x7kmvb4y
  Running command git clone -q https://github.com/mayukh18/reco.git /tmp/pip-req-build-x7kmvb4y
Building wheels for collected packages: reco
  Building wheel for reco (setup.py) ... [?25l[?25hdone
  Created wheel for reco: filename=reco-0.2.1-cp37-cp37m-linux_x86_64.whl size=9631019 sha256=fd20ccb9466bf8aff8b2be862ce0b58821c726630f7e2b2b019ff40980d9d820
  Stored in directory: /tmp/pip-ephem-wheel-cache-47_j0p2t/wheels/cf/63/60/5fc8ea800203f6dfc9b24d3d42c332b0e6c85d00808ea02292
Successfully built reco
Installing collected packages: reco
Successfully installed reco-0.2.1


In [12]:
svd = svd_updated2.SVDRecommender(no_of_features=8)
user_item_matrix, users, items = svd.create_utility_matrix(train_ratings, formatizer={'user':'userId', 'item':'movieId', 'value':'rating'})
svd.fit(user_item_matrix, users, items)

In [13]:
preds = svd.predict(test_ratings, formatizer = {'user':'userId', 'item': 'movieId'})
print(rmse(preds, list(test_ratings['rating'])))

1.1262310581847832


# Part 3: Recommendations using SVD

## 3.1 Training SVD on Entire Data

In [17]:
ratings_matrix = ratings.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)

In [18]:
ratings_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,3913,3914,3915,3916,3917,3918,3919,3920,3921,3922,3923,3924,3925,3926,3927,3928,3929,3930,3931,3932,3933,3934,3935,3936,3937,3938,3939,3940,3941,3942,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,4.0,0.0,3.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
ratings_matrix_SVD = ratings_matrix.to_numpy()
user_ratings_mean = np.mean(ratings_matrix_SVD, axis = 1)
ratings_matrix_SVD = ratings_matrix_SVD - user_ratings_mean.reshape(-1, 1)

In [20]:
from scipy.sparse.linalg import svds

U, sigma, Vt = svds(ratings_matrix_SVD, k = 50)
sigma = np.diag(sigma)

In [21]:
SVD_final_preds = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)
preds_df = pd.DataFrame(SVD_final_preds, columns = ratings_matrix.columns)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,3913,3914,3915,3916,3917,3918,3919,3920,3921,3922,3923,3924,3925,3926,3927,3928,3929,3930,3931,3932,3933,3934,3935,3936,3937,3938,3939,3940,3941,3942,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.288861,0.143055,-0.19508,-0.018843,0.012232,-0.176604,-0.07412,0.141358,-0.059553,-0.19595,0.512867,-0.089172,0.310181,-0.002005,-0.052401,-0.189827,0.23836,0.006466,-0.099315,-0.069682,-0.321492,0.111577,0.034795,0.320576,-0.118217,-0.012647,0.065573,-0.098318,0.064081,-0.005914,0.091936,0.180563,-0.009566,2.641693,-0.012495,0.765179,0.019784,0.002917,0.053079,0.014856,...,0.01881,-0.018782,0.022249,0.227852,-0.067653,-0.046039,-0.023574,-0.019405,-0.005116,-0.032921,-0.008259,-0.019157,0.007527,-0.008687,-0.02563,-0.013563,0.01524,-0.044665,-0.009568,-0.043549,-0.003131,-0.008221,-0.005948,0.031885,-0.003424,-0.001159,-0.002124,-0.002827,0.010393,-0.001068,0.027807,0.00164,0.026395,-0.022024,-0.085415,0.403529,0.105579,0.031912,0.05045,0.08891
1,0.744716,0.169659,0.335418,0.000758,0.022475,1.35305,0.051426,0.071258,0.161601,1.567246,0.772656,0.046179,-0.054562,0.042344,0.04839,0.347313,1.074905,-0.099782,0.008163,0.250869,2.186638,0.018789,-0.002199,0.218934,0.824475,0.139274,-0.007135,0.053071,-0.156952,0.044739,-0.00296,0.453298,-0.007484,0.920325,0.016566,1.335129,-0.015066,-0.045602,0.034649,0.12201,...,-0.042363,-0.137822,-0.112071,0.380783,-0.036273,-0.016174,0.00292,-0.148021,-0.017614,-0.033474,0.086133,0.008153,-0.126819,0.109208,0.001798,0.151866,0.014118,0.032897,0.005764,0.042259,0.022404,0.00326,0.010556,0.137181,-0.042184,0.006759,-0.005789,0.00034,0.002024,0.016013,-0.056502,-0.013733,-0.01058,0.062576,-0.016248,0.15579,-0.418737,-0.101102,-0.054098,-0.140188
2,1.818824,0.456136,0.090978,-0.043037,-0.025694,-0.158617,-0.131778,0.098977,0.030551,0.73547,-0.023476,0.034796,0.065942,0.008661,0.110348,-0.002952,-0.122061,0.063974,0.061033,0.081799,0.329471,0.149579,0.095352,-0.161493,0.022545,-0.009284,-0.002677,-0.14271,0.012345,-0.085331,0.076139,-0.355795,-0.008579,1.046871,-0.088946,0.383583,-0.018144,-0.038618,0.113984,0.006942,...,0.007233,-0.047221,0.066474,-0.179455,0.097428,0.034113,0.008098,-0.024784,-0.012749,-0.007394,-0.01722,0.004719,0.113348,-0.074943,-0.145795,0.128619,0.112567,0.0455,-0.018027,-0.058946,-0.00277,-0.035276,-0.008085,0.132182,-0.017005,0.014383,0.006598,-0.006217,-0.000342,0.000518,0.040481,-0.005301,0.012832,0.029349,0.020866,0.121532,0.076205,0.012345,0.015148,-0.109956
3,0.408057,-0.07296,0.039642,0.089363,0.04195,0.237753,-0.049426,0.009467,0.045469,-0.11137,-0.375831,0.068658,0.011199,0.069699,-0.037529,-0.238788,0.060607,-0.043418,0.053152,0.078237,0.357185,-0.096005,-0.028243,-0.067169,0.246164,-0.020379,0.034461,-0.022225,-0.012327,0.009182,0.01473,0.215893,-0.019687,-0.293933,-0.011511,0.145326,-0.029213,0.030029,-0.045409,-0.030684,...,-0.015077,-0.030208,0.028357,-0.072643,-0.135727,-0.053318,-0.012962,-0.054465,0.00587,-0.018048,-0.006836,-0.008222,-0.027214,-0.071677,-0.094072,-0.010745,-0.103191,-0.031297,-0.02392,-0.015053,-0.017914,-0.029561,-0.024299,-0.057678,-0.11145,-0.015473,-0.007123,-0.007416,-0.011508,-0.010038,0.008571,-0.005425,-0.0085,-0.003417,-0.083982,0.094512,0.057557,-0.02605,0.014841,-0.034224
4,1.574272,0.021239,-0.0513,0.246884,-0.032406,1.552281,-0.19963,-0.01492,-0.060498,0.450512,-0.251178,0.012337,-0.084051,0.258937,0.01657,0.980536,1.267869,0.275619,-0.008139,-0.038832,1.849627,0.107649,-0.168424,0.386541,1.790343,0.192379,-0.054356,0.267566,1.027817,0.374665,-0.010445,1.94798,0.017468,2.784035,0.274397,1.422393,0.040553,0.022926,1.3458,0.104507,...,0.075475,0.330767,0.15047,-0.261636,0.085163,-0.014229,-0.029247,0.124172,0.092875,0.061895,0.034757,0.054386,0.047055,0.048403,0.082926,0.129035,-0.174646,0.102727,0.024732,0.04728,0.017818,0.041451,0.041595,-0.007138,-0.080448,0.018639,0.034068,0.026941,0.035905,0.024459,0.110151,0.04601,0.006934,-0.01594,-0.05008,-0.052539,0.507189,0.03383,0.125706,0.199244


## 3.2 Recommending Movies for a User

In [21]:
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations=5):

    user_row_number = userID - 1
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False)

    user_data = original_ratings_df[original_ratings_df.userId == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

    recommendations = (movies_df[~movies_df['movieId'].isin(user_full['movieId'])]).merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left', left_on = 'movieId',
               right_on = 'movieId').rename(columns = {user_row_number: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommendations, :-1]
                      

    return user_full, recommendations

In [101]:
user_ratings, user_recommendations = recommend_movies(preds_df, 4, movies, ratings, 5)
user_ratings.drop(columns = ['timestamp']).head(10)

Unnamed: 0,userId,movieId,rating,title,genre
0,4,3468,5,"Hustler, The (1961)",Drama
9,4,1198,5,Raiders of the Lost Ark (1981),Action|Adventure
19,4,2947,5,Goldfinger (1964),Action
18,4,2692,5,Run Lola Run (Lola rennt) (1998),Action|Crime|Romance
17,4,1201,5,"Good, The Bad and The Ugly, The (1966)",Action|Western
15,4,1387,5,Jaws (1975),Action|Horror
10,4,1954,5,Rocky (1976),Action|Drama
6,4,2028,5,Saving Private Ryan (1998),Action|Drama|War
5,4,260,5,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
20,4,1240,5,"Terminator, The (1984)",Action|Sci-Fi|Thriller


In [102]:
user_recommendations

Unnamed: 0,movieId,title,genre
2488,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
1261,1291,Indiana Jones and the Last Crusade (1989),Action|Adventure
583,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
1176,1200,Aliens (1986),Action|Sci-Fi|Thriller|War
1274,1304,Butch Cassidy and the Sundance Kid (1969),Action|Comedy|Western


## 3.3 Identifying Similar Movies

In [28]:
corr_mat = np.corrcoef(preds_df.T)
col_idx = ratings_matrix.columns.get_loc(296)
corr_specific = corr_mat[col_idx]

In [29]:
sim_movies = pd.DataFrame({'corr_specific':corr_specific, 'movieId': ratings_matrix.columns}).sort_values('corr_specific', ascending=False).head(10)
sim_movies = sim_movies[sim_movies['movieId'] != 296]

In [30]:
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(movie_data[movie_data.movieId == movie_id].title.values[0]))
    for id in top_indexes:
        print(movie_data[movie_data.movieId == id].title.values[0])

In [31]:
print_similar_movies(movies, 296, sim_movies['movieId'])

Recommendations for Pulp Fiction (1994): 

GoodFellas (1990)
Fargo (1996)
Reservoir Dogs (1992)
Usual Suspects, The (1995)
Jackie Brown (1997)
Silence of the Lambs, The (1991)
Trainspotting (1996)
L.A. Confidential (1997)
Seven (Se7en) (1995)
