# This notebook runs a Collaborative Filtering Engine with nearest neightbors fitting and an Item Based Engine and then the prediction errors are compared

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
import sklearn
from sklearn.metrics import mean_squared_error



In [2]:
df = pd.read_csv('df_to_recommender.csv')

In [3]:
df=df[['user_id', 'Id','overall']].copy()

In [4]:
df.shape

(7771, 3)

### It is necessary to make a zeros matrix to fill with ratings values

In [5]:
def pivotarray(df):
    users,i= np.unique(df['user_id'],return_inverse=True)
    item,j= np.unique(df['Id'],return_inverse=True)
    a=np.zeros((len(users),len(item)))
    a[i,j]=df['overall']
    return a

In [6]:
ratings1 = pivotarray (df)
ratings1.shape

(214, 7771)

In [7]:
sparsity = float(len(ratings1.nonzero()[0])) 
sparsity /= (ratings1.shape[0] * ratings1.shape[1]) 
sparsity *= 100 
print('Sparsity: {:4.2f}%'.format(sparsity)) 
 

Sparsity: 0.47%


### Then try with stronger delimitations to see if it improves Sparsity

In [8]:
df2 = df[df.overall>4.0]

In [9]:
df2.shape

(2542, 3)

In [10]:
ratings2 = pivotarray (df2)

In [11]:
sparsity = float(len(ratings2.nonzero()[0])) 
sparsity /= (ratings2.shape[0] * ratings2.shape[1]) 
sparsity *= 100 
print('Sparsity: {:4.2f}%'.format(sparsity)) 

Sparsity: 0.49%


In [12]:
df3 = df[df.overall>=5.0]

In [13]:
df3.shape

(379, 3)

In [14]:
ratings3 = pivotarray (df3)

### As it can be seen, taking only 379 rows we get 0.89% sparsity. It means hat we only have rating information for 0.89% the data and for the others it is just zeros

In [15]:
sparsity = float(len(ratings3.nonzero()[0])) 
sparsity /= (ratings3.shape[0] * ratings3.shape[1]) 
sparsity *= 100 
print('Sparsity: {:4.2f}%'.format(sparsity)) 

Sparsity: 0.89%


### It has been selected the first option to make the engines

In [16]:
train, test = train_test_split(ratings1, test_size=0.33, random_state = 42)
print (train.shape, test.shape)


((143, 7771), (71, 7771))


In [17]:
dist_out = 1- sklearn.metrics.pairwise.cosine_distances (train)

In [18]:
user_pred= dist_out.dot(train) / np.array ([np.abs(dist_out.sum(axis = 1))]).T

In [19]:
user_pred

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

### Define a function for calculating the root mean square error (RMSE) to get the error/accuracy idea

In [20]:
def get_mse (pred, actual):
    pred = pred [actual.nonzero()].flatten()
    actual = actual [actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)
   

In [21]:
get_mse (user_pred, train)

0.0

In [22]:
get_mse (user_pred, test)

17.663474240422723

In [23]:
k= 5
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors (k, 'cosine')
neigh.fit (train)


NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius='cosine')

In [24]:
top_k_distances, top_k_users = neigh.kneighbors (train, return_distance= True)

In [25]:
top_k_distances.shape

(143, 5)

In [26]:
top_k_users.shape

(143, 5)

In [27]:
top_k_distances[0]

array([  0.        ,  30.41381265,  30.48360215,  30.6757233 ,  30.6757233 ])

In [28]:
top_k_users[0]

array([  0, 121,  99,  34, 135])

### Now get the top k users for each user and use their rating information while predicting the ratings using the weighted sum of all of the ratings of these top k similar users

In [29]:
user_pred_k = np.zeros (train.shape)
for i in range (train.shape[0]):
    user_pred_k [i, :] = top_k_distances[i].T.dot(train[top_k_users][i])/ np.array([np.abs(top_k_distances[i].T)
                                                                                    .sum(axis = 0)]).T

In [30]:
user_pred_k

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [31]:
user_pred_k.shape

(143, 7771)

### Evaluating predictions

In [32]:
get_mse (user_pred_k, train)

17.595351043643262

In [33]:
get_mse (user_pred_k, test)

17.663474240422723

# Item based recommender

In [34]:
k = train.shape[1]
neigh = NearestNeighbors(k,'cosine') 

In [35]:
neigh.fit(train.T) 

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=7771, p=2,
         radius='cosine')

In [36]:
top_k_distances,top_k_users = neigh.kneighbors(train.T, return_distance=True) 

### Predict the beer ratings

In [37]:
item__pred = train.dot(top_k_distances) / np.array([np.abs(top_k_distances).sum(axis=1)])

In [38]:
item__pred.shape

(143, 7771)

In [39]:
item__pred

array([[  0.00000000e+00,   6.34625513e-05,   1.25960448e-04, ...,
          3.55691868e-02,   3.40800766e-02,   7.00597406e-02],
       [  0.00000000e+00,   6.34625513e-05,   1.25960448e-04, ...,
          1.46034897e-02,   1.39921121e-02,   2.87641297e-02],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          4.44454288e-02,   4.25847133e-02,   8.75430533e-02],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   2.26728807e-04, ...,
          8.75547211e-03,   8.38892276e-03,   1.72454352e-02],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          9.11293832e-03,   8.73142358e-03,   1.79495275e-02],
       [  0.00000000e+00,   0.00000000e+00,   2.01536717e-04, ...,
          5.18194079e-03,   4.96499794e-03,   1.02067396e-02]])

### Evaluating predictions

In [40]:
get_mse(item__pred, train)

17.237507235136277

In [41]:
get_mse (item__pred, test)

17.311478757505622