In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [3]:
header = ['user_id','item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=header)

In [5]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   item_id    100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [9]:
# Unique users and items
u_users = df.user_id.unique().shape[0]
u_items = df.item_id.unique().shape[0]
print('Number of users: ' + str(u_users) + ' | Number of items: ' + str(u_items))

Number of users: 943 | Number of items: 1682


In [11]:
# Split data to train and to test
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size=0.25)

In [13]:
# Create matrix for train and test data 
train_data_mat = np.zeros((u_users, u_items))
for line in train_data.itertuples():
    train_data_mat[line[1]-1, line[2]-1] = line[3]

test_data_mat = np.zeros((u_users, u_items))
for line in test_data.itertuples():
    test_data_mat[line[1]-1, line[2]-1] = line[3]


In [37]:
# Function to predict
def predict(ratings, similarity, type='users'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [51]:
# Create a pairwise distance 
from sklearn.metrics.pairwise import pairwise_distances
user_sim = pairwise_distances(train_data_mat)
item_sim = pairwise_distances(test_data_mat.T)


In [53]:
# Make prediction
item_prediction = predict(train_data_mat, item_sim, type='item')
user_prediction = predict(train_data_mat, user_sim, type='user')

In [55]:
item_prediction

array([[0.46708999, 0.56561492, 0.61496081, ..., 0.96526619, 0.89738569,
        0.96526619],
       [0.11723986, 0.15034378, 0.16198799, ..., 0.2619014 , 0.24338501,
        0.2619014 ],
       [0.07329662, 0.08161624, 0.0858383 , ..., 0.12378683, 0.11566322,
        0.12378683],
       ...,
       [0.03830002, 0.05770324, 0.06363132, ..., 0.11049458, 0.10202345,
        0.11049458],
       [0.15011185, 0.18518098, 0.20459883, ..., 0.32999329, 0.30644142,
        0.32999329],
       [0.27502715, 0.332717  , 0.36247297, ..., 0.58114072, 0.5390628 ,
        0.58114072]])

In [57]:
user_prediction

array([[ 1.68410834,  0.60935845,  0.48569244, ...,  0.26638512,
         0.26413575,  0.26732531],
       [ 1.45974163,  0.34584584,  0.16867426, ..., -0.09603985,
        -0.09793997, -0.09316929],
       [ 1.47754094,  0.32086273,  0.13654879, ..., -0.14534657,
        -0.14705382, -0.14202966],
       ...,
       [ 1.41162247,  0.29094761,  0.1009971 , ..., -0.18248737,
        -0.18456914, -0.17935765],
       [ 1.47204593,  0.35243725,  0.1969458 , ..., -0.06106049,
        -0.06306986, -0.05867416],
       [ 1.53438475,  0.43653058,  0.30366473, ...,  0.07193197,
         0.06967887,  0.07332149]])

In [63]:
# Evaluation of the recommendation
from sklearn.metrics import mean_squared_error
from math import sqrt

def rsme(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print('User-based cf rsme: ' + str(rsme(user_prediction, test_data_mat)))
print('Item-based cf rsme: ' + str(rsme(item_prediction, test_data_mat)))

User-based cf rsme: 3.088050717474797
Item-based cf rsme: 3.3591132693342063
