## Sample Surprise Usage - Recommendation Model

In [1]:
import numpy as np
import pandas as pd

In [2]:
names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings_df = pd.read_csv('./ml-100k/u.data', sep='\t', names=names)
ratings_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
print(ratings_df.describe())
print("")
nusers = ratings_df["user_id"].nunique()
nproducts = ratings_df["item_id"].nunique()
print(str(nusers)+ " number of unique users")
print(str(nproducts)+" number of unique items")

            user_id        item_id         rating     timestamp
count  100000.00000  100000.000000  100000.000000  1.000000e+05
mean      462.48475     425.530130       3.529860  8.835289e+08
std       266.61442     330.798356       1.125674  5.343856e+06
min         1.00000       1.000000       1.000000  8.747247e+08
25%       254.00000     175.000000       3.000000  8.794487e+08
50%       447.00000     322.000000       4.000000  8.828269e+08
75%       682.00000     631.000000       4.000000  8.882600e+08
max       943.00000    1682.000000       5.000000  8.932866e+08

943 number of unique users
1682 number of unique items


In [4]:
ratings = np.zeros((nusers , nproducts))

In [5]:

# create a nusers X nitems matrix and fill in the ratings values
for row in ratings_df.itertuples():    
    ratings[row[1]-1, row[2]-1] = row[3]
ratings

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [6]:
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                        size=10, 
                                        replace=False)
#         print(test_ratings)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]
        
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    return train, test

In [7]:
train, test = train_test_split(ratings)
print(train.shape)
print(test.shape)

(943, 1682)
(943, 1682)


In [11]:
import sklearn.metrics.pairwise

sim = sklearn.metrics.pairwise.cosine_similarity(train)

In [29]:
sim.shape

(943, 943)

In [19]:
sim.dot(ratings).shape

(943, 1682)

In [30]:
# find top k users
k = 40
topk_users = np.argsort(sim)[::-1][:,:k]
topk_users.shape

(943, 40)

In [34]:
sim[1,:][topk_users].shape

(943, 40)

In [41]:
pred = np.zeros(ratings.shape)

for i in range(1):
    top_k_users = np.argsort(sim)[::-1][:,:k]
    for j in range(1):
        print(sim[i, :][top_k_users])
#         pred[i, j] = sim[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 
#         pred[i, j] /= np.sum(np.abs(sim[i, :][top_k_users]))

[[0.04246507 0.04857668 0.03659149 ... 0.04281694 0.05590607 0.06953194]
 [0.0112352  0.16323997 0.04710228 ... 0.08591283 0.07015717 0.0483224 ]
 [0.02958247 0.04109062 0.04934126 ... 0.20348051 0.03136474 0.06485375]
 ...
 [0.08756336 0.19213031 0.17929848 ... 0.36943377 0.16917381 0.21963097]
 [0.1168792  0.15026322 0.09648089 ... 0.10065468 0.14273825 0.1498689 ]
 [0.         0.         0.         ... 0.02112611 0.02317461 0.02340399]]


In [43]:
top_k_users = np.argsort(sim)[::-1][:,:k]

In [44]:
top_k_users.shape

(943, 40)

In [56]:
ratings[1, [top_k_users]].shape

(1, 943, 40)

In [14]:
def cosine_similarity(ratings, kind='user', epsilon=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [15]:

user_cosine_similarity = cosine_similarity(train, kind='user')
item_cosine_similarity = cosine_similarity(train, kind='item')
print(item_cosine_similarity[:4, :4])

[[1.         0.41099516 0.32290243 0.45816468]
 [0.41099516 1.         0.26223334 0.49585896]
 [0.32290243 0.26223334 1.         0.2949136 ]
 [0.45816468 0.49585896 0.2949136  1.        ]]


In [16]:
user_cosine_similarity

array([[1.        , 0.15861357, 0.03762763, ..., 0.15518078, 0.17000039,
        0.37819237],
       [0.15861357, 1.        , 0.08459062, ..., 0.06878963, 0.13714026,
        0.11044833],
       [0.03762763, 0.08459062, 1.        , ..., 0.02938477, 0.10251852,
        0.01425584],
       ...,
       [0.15518078, 0.06878963, 0.02938477, ..., 1.        , 0.11296811,
        0.08159914],
       [0.17000039, 0.13714026, 0.10251852, ..., 0.11296811, 1.        ,
        0.1735517 ],
       [0.37819237, 0.11044833, 0.01425584, ..., 0.08159914, 0.1735517 ,
        1.        ]])

### Surprise package usage for recommendations

In [4]:
from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate

In [2]:
data = Dataset.load_builtin('ml-100k')

algo = SVD()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9389  0.9371  0.9343  0.9415  0.9311  0.9366  0.0036  
MAE (testset)     0.7410  0.7387  0.7354  0.7405  0.7341  0.7379  0.0027  
Fit time          4.13    4.17    4.33    4.00    4.08    4.14    0.11    
Test time         0.19    0.12    0.14    0.15    0.16    0.15    0.02    


{'fit_time': (4.1325812339782715,
  4.172383785247803,
  4.325289249420166,
  4.001381874084473,
  4.0825560092926025),
 'test_mae': array([0.74098938, 0.73870579, 0.7353792 , 0.74047967, 0.73410978]),
 'test_rmse': array([0.93888748, 0.93713818, 0.93425613, 0.94148612, 0.93109926]),
 'test_time': (0.18724584579467773,
  0.1197199821472168,
  0.1377861499786377,
  0.15451622009277344,
  0.16004014015197754)}

In [None]:
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=.25)

In [None]:
algo = SVDpp()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
algo = NMF()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)