### Import libraries to work with, open file and explore data

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import sklearn
from sklearn import linear_model

In [2]:
data = pd.read_csv('/Users/AlexDiez/Documents/repoml/datasets/ml-100k/u.data.csv', header = None, sep = '\t')
data.head(3)

Unnamed: 0,0,1,2,3
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116


In [3]:
data.columns= ['UserID', 'ItemID', 'Rating', 'Timestamp']

In [4]:
data.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
n_items = data.ItemID.unique().shape[0]
n_items

1682

In [6]:
n_users = data.UserID.unique().shape[0]
n_users

943

## Create arrays to work with and similarity matrix

In [7]:
ratings = np.zeros((n_users, n_items))

In [8]:
for row in data.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]

In [9]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0]*ratings.shape[1])
sparsity *= 100
sparsity

6.304669364224531

In [10]:
ratings_train, ratings_test = sklearn.model_selection.train_test_split(ratings, test_size = 0.3, random_state = 42)

In [11]:
sim_matrix = 1 - sklearn.metrics.pairwise.cosine_distances(ratings_train)

In [12]:
users_prediction = sim_matrix.dot(ratings_train) / np.array([np.abs(sim_matrix).sum(axis=1)]).T

In [19]:
users_prediction

array([[2.10259747e+00, 5.86975978e-01, 3.40264192e-01, ...,
        0.00000000e+00, 7.33611460e-03, 6.04379414e-03],
       [1.40999723e+00, 2.91863934e-01, 2.68085289e-01, ...,
        0.00000000e+00, 3.50378592e-03, 2.32963985e-03],
       [1.69014833e+00, 3.13648440e-01, 3.26127887e-01, ...,
        0.00000000e+00, 3.25391767e-03, 1.77210119e-03],
       ...,
       [1.73393747e+00, 4.06719333e-01, 3.21166908e-01, ...,
        0.00000000e+00, 2.71269625e-03, 9.00511411e-03],
       [2.34361031e+00, 8.10544770e-01, 4.73941025e-01, ...,
        0.00000000e+00, 1.01130066e-02, 9.66427605e-03],
       [2.36796969e+00, 5.98146138e-01, 3.85569804e-01, ...,
        0.00000000e+00, 6.39996638e-03, 5.37442746e-03]])

## Mean squared error function to determine precision

In [14]:
from sklearn.metrics import mean_squared_error

In [20]:
def get_mse(preds, actuals):
    if preds.shape[0]!= actuals.shape[0]:
        actuals = actuals.T
    preds = preds[actuals.nonzero()].flatten()
    actuals = actuals[actuals.nonzero()].flatten()
    return mean_squared_error(actuals, preds)

In [21]:
get_mse(users_prediction, ratings_train)

7.878218313143215

## KNN sobre users

In [22]:
from sklearn.neighbors import NearestNeighbors

In [23]:
k = 12

In [24]:
neighbors = NearestNeighbors(k , 'cosine')

In [25]:
neighbors.fit(ratings_train)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=12, p=2,
                 radius='cosine')

In [26]:
top_k_distances, top_k_users = neighbors.kneighbors(ratings_train, return_distance = True)

In [28]:
users_predict_k = np.zeros(ratings_train.shape)
for i in range(ratings_train.shape[0]):
    users_predict_k[i,:] = top_k_distances[i].T.dot(ratings_train[top_k_users][i]) / np.array([np.abs(top_k_distances[i].T.sum(axis=0))]).T

In [29]:
get_mse(users_predict_k, ratings_train)

8.295210936585795

## KNN sobre ítems

In [30]:
n_movies = ratings_train.shape[1]
n_movies

1682

In [31]:
neighbors = NearestNeighbors(n_movies, 'cosine')

In [32]:
neighbors.fit(ratings_train.T)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=1682, p=2,
                 radius='cosine')

In [33]:
top_k_distances, top_k_items = neighbors.kneighbors(ratings_train.T, return_distance=True)

In [34]:
item_pred = ratings_train.dot(top_k_distances) / np.array([np.abs(top_k_distances).sum(axis=1)])

In [35]:
item_pred.shape

(660, 1682)

In [36]:
get_mse(item_pred, ratings_train)

11.172565375125632

## Recommendation on KNN

In [37]:
k = 30
neighbors = NearestNeighbors(k, 'cosine')
neighbors.fit(ratings_train.T)
top_k_distances, top_k_items = neighbors.kneighbors(ratings_train.T, return_distance=True)

In [38]:
np.seterr(invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [39]:
preds = np.zeros(ratings_train.T.shape)
for i in range(ratings_train.T.shape[0]):
    preds[i, :] = top_k_distances[i].dot(ratings_train.T[top_k_items][i]) / np.array([np.abs(top_k_distances[i]).sum(axis=0)]).T

In [40]:
get_mse(preds, ratings_train)

8.732017359673444