In [1]:
from domino.utils.jupyter import notebook_init
notebook_init()

In [2]:
import os.path
import re

import pandas as pd
np = pd.np

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
path = lambda filename, sep='/': os.path.join(*filename.split(sep))

In [4]:
ratings = pd.read_csv(path('data/ratings-matrix.csv'))

In [5]:
ratings.head()

Unnamed: 0,user,restaurant,rating
0,http://schema.org/resource/UID_DA1B2081FC9049D...,http://schema.org/resource/eatery_997678,5
1,http://schema.org/resource/UID_BEFC285B7F08B09...,http://schema.org/resource/eatery_1966404,5
2,http://schema.org/resource/UID_520F77291D513D5...,http://schema.org/resource/eatery_717364,3
3,http://schema.org/resource/UID_C2D157D8CE6AA2E...,http://schema.org/resource/eatery_931622,5
4,http://schema.org/resource/UID_F604743AFCC6E45...,http://schema.org/resource/eatery_2054462,4


In [6]:
ratings.shape

(511753, 3)

In [7]:
ratings.apply(lambda x: len(x.unique()))

user          301350
restaurant      7029
rating             5
dtype: int64

In [8]:
user_counts = ratings.groupby('user').size()

In [9]:
user_counts.groupby(user_counts).size().head(20)

1     207872
2      52311
3      19558
4       8914
5       4375
6       2448
7       1506
8       1028
9        691
10       526
11       367
12       298
13       233
14       186
15       150
16       117
17        86
18       100
19        70
20        54
dtype: int64

To avoid dealing with the cold-start problem, we'll only consider users with at least 10 reviews

In [10]:
ratings = ratings[ratings.user.isin(set(user_counts[user_counts >= 10].index))].copy()

In [11]:
ratings.shape

(43381, 3)

In [12]:
restaurant_counts = ratings.groupby('restaurant').size()

In [13]:
restaurant_counts.groupby(restaurant_counts).size().head(20)

1     1227
2      687
3      420
4      323
5      267
6      181
7      175
8      136
9      111
10     103
11      80
12      79
13      78
14      59
15      54
16      50
17      49
18      46
19      37
20      31
dtype: int64

Again, to avoid the cold-start problem with the restaurants, let's keep only those restaurants with at least 10 reviews

In [14]:
ratings = ratings[
    ratings.restaurant.isin(
        set(restaurant_counts[restaurant_counts >= 10].index)
    )
].copy()

In [15]:
ratings.shape

(32495, 3)

In [16]:
ratings.apply(lambda x: len(x.unique()))

user          2643
restaurant    1227
rating           5
dtype: int64

In [17]:
ratings = ratings.pivot_table(index='user', columns='restaurant', values='rating', aggfunc='mean')

In [18]:
nusers, nitems = ratings.shape
nusers, nitems

(2643, 1227)

In [19]:
X = ratings.values.copy()

means = np.nanmean(X, axis=1)
X -= means.reshape((nusers, 1))

stds = np.nanstd(X, axis=1)
X /= stds.reshape((nusers, 1))

voted = ~np.isnan(X)

In [20]:
def intersection(n1):
    return [
        np.arange(nitems)[
            (voted[n1,] * voted[n2,]) > 0
        ]
        
        for n2 in range(nusers)
    ]

def similarities(n1):
    res = []
    
    for n2, i in zip(range(nusers), intersection(n1)):
        u1, u2 = X[n1, i], X[n2, i]
        
        res.append(
            u1.dot(u2) / np.sqrt(u1.dot(u1) * u1.dot(u2))
        )
        
    return np.array(res)

In [21]:
def prediction(u1, K=10):
    sims = np.nan_to_num(similarities(u1))
    predictions = []
    
    for item in range(nitems):
        if voted[u1, item]: 
            predictions.append(X[u1, item])
            continue
        
        votes = voted[:, item]
        
        ratings = X[votes, item]
        users = np.arange(nusers)[votes]
        sims_f = sims[votes]
        
        neighs = (sims_f ** 2).argsort()[-K:]
        pred = sims_f[neighs].dot(ratings[neighs]) / np.abs(sims_f[neighs]).sum()
        
        predictions.append(pred)
        
    pred = np.array(predictions)
    pred = (pred * stds[u1] + means[u1]).round().astype(int)
    
    pred = np.where(pred < 1, 1, pred)
    pred = np.where(pred > 5, 5, pred)
    
    return pred

pred = prediction(0)

pred

array([3, 4, 3, ..., 1, 4, 4])