### MSE, Precision, Recall and F1 - SVD


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
# import the data, covert that into a convenient data
PATH= '/home/ac-optimus/data_science/recommendation_sys/ratings_small.csv'
df = pd.read_csv(PATH, sep= ",") # taking 100000 user-movie rating tuples
# df = pd.read_csv(PATH, sep= ",")
df.shape

(100004, 4)

In [4]:
df.drop('timestamp', axis=1, inplace=True)
df.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [5]:
# shuffling the data
from sklearn.utils import shuffle
df = shuffle(df)

In [6]:
df.head()

Unnamed: 0,userId,movieId,rating
42714,306,1515,3.0
50343,370,4105,5.0
46521,342,916,5.0
66502,468,5932,4.0
6845,40,44191,4.5


In [7]:
# train - 80% of the data.
num_rows = df.shape[0]
train, test = df.copy(deep = True), df[int(num_rows*0.8):].copy(deep = True)
train[int(num_rows*0.8):]['rating']  = train[int(num_rows*0.8):]['rating']*0
train = shuffle(train, random_state=0)

In [8]:
# creating user-movie rating utility matrix
user_movie_rating =train.groupby(['userId','movieId'])['rating'].max().unstack().fillna(0)
user_movie_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



- Null values replaced  with zeros

In [9]:
def low_rank_k(rank, u, s, v):
    # SVD 
    u_new = u[:,:rank]
    v_new = v[:rank, :]
    s_new = s[:rank]
    singular_mat = np.diag(s_new)
    low_rank_mat = np.dot(np.dot(u_new, singular_mat), v_new)
    return low_rank_mat

In [10]:
def error(y_hat, y):
    # MSE
    y_hat = np.array(y_hat)
    y = np.array(y)
    MSE_ = (y_hat- y)**2
    MSE = MSE_.sum()
    return MSE

In [11]:
relevance_threshold = 1
relevant_df = df.loc[(df['rating'] >= relevance_threshold)]

In [12]:
print("Number of relevant user item pair is {} out of {}".format(len(relevant_df), len(df)))

Number of relevant user item pair is 98903 out of 100004


In [13]:
u, s, v = np.linalg.svd(user_movie_rating, full_matrices=False)
optimal_k=10
Pred = low_rank_k(optimal_k, u, s, v)
Pred = pd.DataFrame(Pred, columns=user_movie_rating.columns, index= user_movie_rating.index)
y_hat = [Pred.loc[int(test.iloc[i]['userId']), int(test.iloc[i]['movieId'])] 
             for i in range(len(test))] 

In [14]:
u.shape, v.shape

((671, 671), (671, 9066))

In [15]:
Pred
error(y_hat, test['rating'])

180558.99904622458

In [16]:
test['prediction'] = y_hat
test.head()

Unnamed: 0,userId,movieId,rating,prediction
98511,659,724,2.0,0.209932
24033,171,2683,3.0,0.313255
74602,518,2403,3.0,0.370018
7747,48,76251,3.5,1.761016
97326,652,68835,4.5,0.0


In [17]:
test['prediction'].min()

-2.872866818686975

In [18]:
from collections import Counter
user_freq_dict = Counter(test['userId'].tolist())
max_voting = max(user_freq_dict.values())
for i in user_freq_dict:
    if user_freq_dict[i] == max_voting:
        user_choice =i

In [36]:
one_user= test.loc[(test['userId'] == user_choice)].sort_values('rating', 
                      inplace=False, 
                      ascending=False)
one_user.head()

Unnamed: 0,userId,movieId,rating,prediction
79347,547,3741,5.0,0.47542
78898,547,1945,5.0,0.205474
79578,547,4969,5.0,0.202307
79488,547,4428,5.0,0.0
79137,547,2925,5.0,0.088564


In [37]:
one_user

Unnamed: 0,userId,movieId,rating,prediction
79347,547,3741,5.0,0.475420
78898,547,1945,5.0,0.205474
79578,547,4969,5.0,0.202307
79488,547,4428,5.0,0.000000
79137,547,2925,5.0,0.088564
...,...,...,...,...
79917,547,7577,0.5,0.087529
79714,547,6127,0.5,-0.072134
80125,547,26485,0.5,0.000000
78552,547,562,0.5,-0.028781


In [58]:
one_user= test.loc[(test['userId'] == 19)].sort_values('rating', 
                      inplace=False, 
                      ascending=False)
one_user.head()

Unnamed: 0,userId,movieId,rating,prediction
3355,19,968,5.0,0.835827
3327,19,898,5.0,1.60328
3216,19,356,5.0,4.094253
3145,19,111,5.0,3.041217
3415,19,1207,5.0,1.173142


In [59]:
test['prediction'].max()

6.136070357112792

### Recall and Precision

In [60]:
relevance_threshold = 2
recommended_movies = set(one_user.loc[(one_user['prediction'] >= relevance_threshold)]['movieId'].tolist())
relavent_movies = set(one_user.loc[(one_user['rating'] >= relevance_threshold)]['movieId'].tolist())
len(relavent_movies), len(recommended_movies)

(87, 16)

In [61]:
recommended_movies_intersection_relavant_movies = [i for i in recommended_movies if i in relavent_movies]
len(recommended_movies_intersection_relavant_movies)

16

In [62]:
len(recommended_movies), len(recommended_movies_intersection_relavant_movies), len(relavent_movies)

(16, 16, 87)

In [63]:
recall = len(recommended_movies_intersection_relavant_movies)/len(relavent_movies)
precision = len(recommended_movies_intersection_relavant_movies)/len(recommended_movies)

In [64]:
precision, recall

(1.0, 0.1839080459770115)

In [65]:
f_score = 2*precision*recall/(precision+recall)
f_score

0.3106796116504854

### Recall@k and Precision@k
- recall and precision on top k predictions

In [66]:
k=18
top_k_predictions_df = one_user.sort_values('prediction', ascending=False)[:k]
top_k_predictions_df.head()

Unnamed: 0,userId,movieId,rating,prediction
3198,19,318,4.0,5.417728
3130,19,50,4.0,4.375613
3275,19,527,4.0,4.214355
3216,19,356,5.0,4.094253
3128,19,47,5.0,3.366782


In [67]:
relevance_threshold=2
top_k_movies_recommended = set(top_k_predictions_df.loc[(top_k_predictions_df['prediction'] >= relevance_threshold)]['movieId'].tolist())
top_k_movies_relevant = set(top_k_predictions_df.loc[(top_k_predictions_df['rating'] >= relevance_threshold)]['movieId'].tolist())
# top_k_predictions_df['rating']

In [68]:
recommended_movies_intersection_relavant_movies = [i for i in top_k_movies_recommended
                                                   if i in top_k_movies_relevant]
recommended_movies_intersection_relavant_movies

[356, 36, 39, 329, 924, 1259, 47, 527, 111, 50, 912, 21, 1304, 25, 316, 318]

In [69]:
len(top_k_movies_recommended), len(recommended_movies_intersection_relavant_movies), len(top_k_movies_relevant)

(16, 16, 18)

In [70]:
recall = len(recommended_movies_intersection_relavant_movies)/len(top_k_movies_relevant)
precision = len(recommended_movies_intersection_relavant_movies)/len(top_k_movies_recommended)

In [71]:
precision, recall

(1.0, 0.8888888888888888)

In [56]:
f_score = 2*precision*recall/(precision+recall)
f_score

0.56