# Users Predictions

In [1]:
#################################
## 1. Importing Libraries
#################################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#################################
## 2. Importing Dataset
#################################
# Read the tab-separated ratings file. It has no header row, so the four
# column names are supplied explicitly.
df = pd.read_csv(
    '../datasets/ml-100k/u.data.csv',
    sep='\t',
    header=None,
    names=['UserID', 'ItemID', 'Rating', 'TimeStamp'],
)

#################################
## 3. Sparse Matrix
#################################
# Number of distinct users
n_users = df['UserID'].nunique()
print('Number of Users: %s'%(n_users))

# Number of distinct items (films)
n_items = df['ItemID'].nunique()
print('Number of Items: %s'%(n_items))

# Build the dense user x item rating matrix; a 0 means "no rating recorded".
# Cells are addressed as (UserID-1, ItemID-1), so the matrix is sized by the
# largest ID rather than by the number of distinct IDs: the two only coincide
# when IDs run contiguously from 1, and sizing by the count would make the
# assignment below index out of bounds otherwise.
ratings = np.zeros((df['UserID'].max(), df['ItemID'].max()))

# Keep only the last row of every (user, item) pair so that the vectorized
# assignment is equivalent to filling the matrix row by row, where a later
# row overwrites an earlier one (NumPy does not define which value wins when
# the same cell is assigned more than once in a single indexed assignment).
df_latest = df.drop_duplicates(subset=['UserID', 'ItemID'], keep='last')
ratings[
    df_latest['UserID'].to_numpy() - 1,
    df_latest['ItemID'].to_numpy() - 1,
] = df_latest['Rating'].to_numpy()

# "Sparsity coefficient" as used here is the fill rate of the matrix:
# the percentage of cells that are NOT empty (i.e. hold a non-zero rating).
sparsity = np.count_nonzero(ratings) / ratings.size * 100
print('Sparsity Coefficient: %.2f '%(sparsity))

#################################
## 4. Train and Test
#################################
from sklearn.model_selection import train_test_split

# Split the matrix by rows (users): 30% of the rows are held out for testing,
# and the fixed random_state makes the split repeatable.
ratings_train, ratings_test = train_test_split(
    ratings,
    test_size=0.3,
    random_state=42,
)

# Each matrix is 2-D, so its shape tuple fills both placeholders directly.
print('Training Matrix: Rows %s Columns %s' % ratings_train.shape)
print('Testing Matrix: Rows %s Columns %s' % ratings_test.shape)

#################################
## 5. Similarity Matrix
#################################
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics.pairwise` has been loaded, so the attribute lookup
# below would otherwise depend on another import having pulled it in first.
import sklearn.metrics.pairwise

# Cosine similarity between every pair of rows (users) of the training
# matrix; the result is square, with one row and one column per training user.
sim_matrix = sklearn.metrics.pairwise.cosine_similarity(ratings_train)

Number of Users: 943
Number of Items: 1682
Sparsity Coefficient: 6.30 
Training Matrix: Rows 660 Columns 1682
Testing Matrix: Rows 283 Columns 1682


In [3]:
#################################
## 6. Predictions
#################################
# Ratings a user has not given are estimated as a similarity-weighted average
# of the training users' ratings: every rating row is weighted by the
# user-user similarity, and each resulting row is then normalized by that
# user's total absolute similarity (kept as a column so it broadcasts per row).
users_pred = (
    (sim_matrix @ ratings_train)
    / np.abs(sim_matrix).sum(axis=1, keepdims=True)
)

In [4]:
# Show the predicted-ratings matrix (NumPy abbreviates large arrays when displaying them).
users_pred

array([[2.10259747e+00, 5.86975978e-01, 3.40264192e-01, ...,
        0.00000000e+00, 7.33611460e-03, 6.04379414e-03],
       [1.40999723e+00, 2.91863934e-01, 2.68085289e-01, ...,
        0.00000000e+00, 3.50378592e-03, 2.32963985e-03],
       [1.69014833e+00, 3.13648440e-01, 3.26127887e-01, ...,
        0.00000000e+00, 3.25391767e-03, 1.77210119e-03],
       ...,
       [1.73393747e+00, 4.06719333e-01, 3.21166908e-01, ...,
        0.00000000e+00, 2.71269625e-03, 9.00511411e-03],
       [2.34361031e+00, 8.10544770e-01, 4.73941025e-01, ...,
        0.00000000e+00, 1.01130066e-02, 9.66427605e-03],
       [2.36796969e+00, 5.98146138e-01, 3.85569804e-01, ...,
        0.00000000e+00, 6.39996638e-03, 5.37442746e-03]])

In [5]:
from sklearn.metrics import mean_squared_error
def get_mse(preds, actuals):
    """Mean squared error restricted to the entries that have a rating.

    Parameters
    ----------
    preds : array of predicted ratings, indexed the same way as ``actuals``.
    actuals : array of known ratings; entries equal to 0 are treated as
        missing and are left out of the error.

    Returns
    -------
    The mean squared error between ``actuals`` and ``preds`` over the
    positions where ``actuals`` is non-zero.
    """
    # Locate the rated cells once and reuse the same index for both arrays.
    rated = actuals.nonzero()
    # Indexing with the nonzero() tuple already yields 1-D vectors, so no
    # flattening is needed; arguments follow the documented
    # (y_true, y_pred) order of mean_squared_error.
    return mean_squared_error(actuals[rated], preds[rated])
# NOTE(review): the predictions are scored against ratings_train, the same
# matrix they were computed from, so this is a training error; ratings_test
# is not evaluated in this cell.
get_mse(users_pred,ratings_train)

7.878218313143215