In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [31]:
# import the data, covert that into a convenient data
PATH= 'ratings_small.csv'
df = pd.read_csv(PATH, sep= ",") # taking 100000 user-movie rating tuples
# df = pd.read_csv(PATH, sep= ",")
df.shape

(100004, 4)

In [3]:
df.drop('timestamp', axis=1, inplace=True)
df.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [4]:
# shuffling the data
from sklearn.utils import shuffle
df = shuffle(df)

In [5]:
df.head()

Unnamed: 0,userId,movieId,rating
30318,214,2467,5.0
1942,15,5553,2.0
84520,564,3572,2.0
68386,475,7454,2.0
82496,562,1220,3.5


In [6]:
# train - 80% of the data.
num_rows = df.shape[0]
train, test = df.copy(deep = True), df[int(num_rows*0.8):].copy(deep = True)
train[int(num_rows*0.8):]['rating']  = train[int(num_rows*0.8):]['rating']*0
train = shuffle(train, random_state=0)

In [7]:
# creating user-movie rating utility matrix
user_movie_rating =train.groupby(['userId','movieId'])['rating'].max().unstack().fillna(0)
user_movie_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



- Null values replaced  with zeros

In [8]:
def low_rank_k(rank, u, s, v):
    # SVD 
    u_new = u[:,:rank]
    v_new = v[:rank, :]
    s_new = s[:rank]
    singular_mat = np.diag(s_new)
    low_rank_mat = np.dot(np.dot(u_new, singular_mat), v_new)
    return low_rank_mat

In [9]:
def error(y_hat, y):
    # MSE
    y_hat = np.array(y_hat)
    y = np.array(y)
    MSE_ = (y_hat- y)**2
    MSE = MSE_.sum()/len(y_hat)
    return MSE

In [10]:
relevance_threshold = 1
relevant_df = df.loc[(df['rating'] >= relevance_threshold)]

In [11]:
print("Number of relevant user item pair is {} out of {}".format(len(relevant_df), len(df)))

Number of relevant user item pair is 98903 out of 100004


In [12]:
%%time
u, s, v = np.linalg.svd(user_movie_rating, full_matrices=False)
optimal_k=8
Pred = low_rank_k(optimal_k, u, s, v)
Pred = pd.DataFrame(Pred, columns=user_movie_rating.columns, index= user_movie_rating.index)
y_hat = [Pred.loc[int(test.iloc[i]['userId']), int(test.iloc[i]['movieId'])] 
             for i in range(len(test))] 

CPU times: user 18.3 s, sys: 240 ms, total: 18.5 s
Wall time: 19.4 s


In [13]:
u.shape, v.shape

((671, 671), (671, 9066))

### MSE

In [14]:
Pred
error(y_hat, test['rating'])

9.000763165797308

In [15]:
test['prediction'] = y_hat
test.head()

Unnamed: 0,userId,movieId,rating,prediction
2712,17,318,5.0,3.500242
30106,213,97742,2.5,0.111157
86111,575,5247,1.0,0.319659
61096,445,3999,2.5,-0.004328
79222,547,3188,3.0,0.303503


In [175]:
from collections import Counter
user_freq_dict = Counter(test['userId'].tolist())
max_voting = max(user_freq_dict.values())
for i in user_freq_dict:
    if user_freq_dict[i] == max_voting:
        user_choice =i

In [176]:
one_user= test.loc[(test['userId'] == 17)].sort_values('rating', 
                      inplace=False, 
                      ascending=False)
one_user.head()

Unnamed: 0,userId,movieId,rating,prediction
2712,17,318,5.0,3.500242
2777,17,1252,5.0,2.035789
2821,17,2076,5.0,1.286774
2786,17,1348,5.0,0.49513
2897,17,4226,5.0,3.112099


### Recall and Precision

In [155]:
relevance_threshold = 2
recommended_movies = set(one_user.loc[(one_user['prediction'] >= relevance_threshold)]['movieId'].tolist())
relavent_movies = set(one_user.loc[(one_user['rating'] >= relevance_threshold)]['movieId'].tolist())
len(relavent_movies), len(recommended_movies)

(74, 9)

In [156]:
recommended_movies_intersection_relavant_movies = [i for i in recommended_movies if i in relavent_movies]
len(recommended_movies_intersection_relavant_movies)

9

In [157]:
len(recommended_movies), len(recommended_movies_intersection_relavant_movies), len(relavent_movies)

(9, 9, 74)

In [158]:
recall = len(recommended_movies_intersection_relavant_movies)/len(relavent_movies)
precision = len(recommended_movies_intersection_relavant_movies)/len(recommended_movies)

In [159]:
precision, recall

(1.0, 0.12162162162162163)

In [160]:
f_score = 2*precision*recall/(precision+recall)
f_score

0.21686746987951808

### Recall@k and Precision@k
- recall and precision on top k predictions

In [161]:
k=18
top_k_predictions_df = one_user.sort_values('prediction', ascending=False)[:k]
top_k_predictions_df.head()

Unnamed: 0,userId,movieId,rating,prediction
2857,17,2858,4.5,4.744039
2749,17,1089,5.0,3.572829
2712,17,318,5.0,3.500242
2696,17,47,5.0,3.236452
2897,17,4226,5.0,3.112099


In [162]:
relevance_threshold=2
top_k_movies_recommended = set(top_k_predictions_df.loc[(top_k_predictions_df['prediction'] >= relevance_threshold)]['movieId'].tolist())
top_k_movies_relevant = set(top_k_predictions_df.loc[(top_k_predictions_df['rating'] >= relevance_threshold)]['movieId'].tolist())
# top_k_predictions_df['rating']

In [163]:
recommended_movies_intersection_relavant_movies = [i for i in top_k_movies_recommended
                                                   if i in top_k_movies_relevant]
recommended_movies_intersection_relavant_movies;

In [164]:
len(top_k_movies_recommended), len(recommended_movies_intersection_relavant_movies), len(top_k_movies_relevant)

(9, 9, 18)

In [165]:
if len(top_k_movies_relevant)!=0:
    recall = len(recommended_movies_intersection_relavant_movies)/len(top_k_movies_relevant)
else:
    recall = 1
if len(top_k_movies_recommended)!=0:
    precision = len(recommended_movies_intersection_relavant_movies)/len(top_k_movies_recommended)
else:
    precision = 1

In [166]:
precision, recall

(1.0, 0.5)

In [167]:
f_score = 2*precision*recall/(precision+recall)
f_score

0.6666666666666666

### Making top k predictions

In [171]:
one_user= test.loc[(test['userId'] == 20)].sort_values('rating', 
                      inplace=False, 
                      ascending=False)
one_user.head()

Unnamed: 0,userId,movieId,rating,prediction
3572,20,1148,5.0,0.352982
3563,20,780,5.0,1.102425
3618,20,38038,4.5,0.031081
3597,20,3406,4.5,0.02689
3579,20,1269,4.0,0.33912


In [177]:
movie_meta_data_PATH= "movies_metadata.csv"
movie_df = pd.read_csv(movie_meta_data_PATH, sep= ",") # taking 100000 user-movie rating tuples
movie_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [178]:
top_k_predictions_df = one_user.sort_values('prediction', ascending=False)[:k]
a=top_k_predictions_df['movieId'].unique().tolist()

In [179]:
[movie_df[movie_df.id== str(i)].title.item() for i in a if str(i) in movie_df.id.tolist()]

['Point Break',
 'The Million Dollar Hotel',
 'Shriek If You Know What I Did Last Friday the Thirteenth',
 '5 Card Stud',
 'Lonely Hearts',
 'Cool Hand Luke',
 'Judgment Night']