## Collaborative filtering 

In [16]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

### Initialization

In [3]:
# Column layout of the tab-separated rating files.
names = ['user_id', 'item_id', 'rating', 'timestamp']

# Training / test split files.
trainingset_file, testset_file = 'dataset/ml-100k/u3.base', 'dataset/ml-100k/u3.test'

# Dimensions of the user x item rating matrix.
n_users, n_items = 943, 1682

# Dense rating matrix, initialised to all zeros.
ratings = np.zeros(shape=(n_users, n_items))

### Load training set 

In [10]:
# Read the tab-separated training ratings; column names are supplied explicitly.
df = pd.read_csv(trainingset_file, sep='\t', names=names)
print('Load training set...')
# Fill the rating matrix with one vectorised assignment instead of a Python-level
# loop over rows. Ids in the file are 1-based, matrix indices are 0-based.
# Repeated (user, item) pairs are collapsed to their last occurrence first, so the
# result is the same as overwriting row by row in file order.
latest = df.drop_duplicates(subset=['user_id', 'item_id'], keep='last')
ratings[latest['user_id'].values - 1,
        latest['item_id'].values - 1] = latest['rating'].values
print('Finished loading.')
print('Size of rating matrix %d*%d.' % (n_users, n_items))
print('Effective score number of the training set %d.' % len(df))
df.head()

Load training set...
Finished loading.
Size of rating matrix 943*1682.
Effective score number of the training set 80000.


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,6,5,887431973


### Matrix density of training set

In [11]:
def cal_sparsity():
    '''Print the percentage of non-zero entries in the global ``ratings`` matrix.'''
    filled = float(np.count_nonzero(ratings))
    n_cells = ratings.shape[0] * ratings.shape[1]
    density = filled / n_cells * 100
    print('Matrix density of training set is: {:4.2f}%'.format(density))

# Report the density of the rating matrix, then emit an empty line.
cal_sparsity()
print()

Matrix density of training set is: 5.04%



### Naive baseline model 

$$\hat{r}_{xi}= \bar{r}_{user\; x} + \bar{r}_{item\; i} - \mu$$

In [17]:
def rmse(pred, actual):
    '''Root-mean-square error between ``pred`` and ``actual``.

    Only positions where ``actual`` is non-zero are scored; a zero in
    ``actual`` is treated as "no rating available" and skipped.

    Args:
        pred: array-like of predicted ratings.
        actual: array-like of true ratings, indexable the same way as ``pred``.

    Returns:
        The RMSE over the scored positions as a NumPy float.

    Raises:
        ValueError: if ``actual`` has no non-zero entry, so there is
            nothing to score.
    '''
    pred = np.asarray(pred, dtype=float)
    actual = np.asarray(actual, dtype=float)
    rated = actual.nonzero()
    if rated[0].size == 0:
        raise ValueError('rmse needs at least one non-zero entry in `actual`')
    # Plain NumPy keeps this helper free of a function-local third-party import.
    errors = pred[rated] - actual[rated]
    return np.sqrt(np.mean(errors ** 2))

def cal_mean():
    '''Compute the overall, per-user and per-item mean ratings.

    Reads the global ``ratings`` matrix (rows are users, columns are items;
    zeros are ignored) and stores the results in the globals ``all_mean``
    (scalar), ``user_mean`` (one value per row) and ``item_mean`` (one value
    per column). A user or item without any rating gets the overall mean.
    Progress is printed, including whether any NaN had to be filled.
    '''
    # population mean, each user mean, each item mean
    global all_mean, user_mean, item_mean
    rated = ratings != 0
    all_mean = np.mean(ratings[rated])
    # Axis reductions replace the builtin sum(), which walked the matrix row
    # by row in Python. A row/column without ratings yields 0/0 = NaN; that is
    # expected and repaired below, so the warning is silenced only here.
    with np.errstate(divide='ignore', invalid='ignore'):
        user_mean = ratings.sum(axis=1) / rated.sum(axis=1)
        item_mean = ratings.sum(axis=0) / rated.sum(axis=0)
    print('Exist User/Item mean NaN?', np.isnan(user_mean).any(), np.isnan(item_mean).any())
    # fill in NaN with population mean
    user_mean = np.where(np.isnan(user_mean), all_mean, user_mean)
    item_mean = np.where(np.isnan(item_mean), all_mean, item_mean)
    print('Exist User/Item mean NaN?', np.isnan(user_mean).any(), np.isnan(item_mean).any())
    print('Finsh，population mean is %.4f' % all_mean)

# Populate the globals all_mean, user_mean and item_mean.
cal_mean()

Exist User/Item mean NaN? False True
Exist User/Item mean NaN? False False
Finsh，population mean is 3.5311


In [18]:
def predict_naive(user, item):
    '''Baseline estimate: item mean plus user mean minus the overall mean.

    ``user`` and ``item`` index into the global ``user_mean`` / ``item_mean``.
    '''
    item_part = item_mean[item]
    user_part = user_mean[user]
    return item_part + user_part - all_mean

In [20]:
# Score the naive baseline on the test split.
print('Loading test set...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
print('Test set size: %d' % len(test_df))
print('Navie model:')

# Rows are read by column name; ids are shifted from 1-based to 0-based.
predictions, targets = [], []
for row in test_df.itertuples(index=False):
    user, item, actual = row.user_id - 1, row.item_id - 1, row.rating
    predictions.append(predict_naive(user, item))
    targets.append(actual)

test_rmse = rmse(np.array(predictions), np.array(targets))
print('Test set rmse: %.4f' % test_rmse)
print()

Loading test set...
Test set size: 20000
Navie model:
Test set rmse: 0.9691



### item-item based cf model 

+ Use cosine similarity as the similarity measure:
    $$sim(x, y) = \frac{r_x\cdot r_y}{\|r_x\|\|r_y\|}$$
    
+ weighted prediction:
    $$\hat{r}_{xi} = \frac{\sum_{j\in N(x)}s_{ij}\cdot r_{xj}}{\sum_{j\in N(x)} s_{ij}}\;,$$
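    where $N(x)$ is the set of items rated by user $x$, $s_{ij}$ is the similarity between items $i$ and $j$, and $r_{xj}$ is the rating user $x$ gave to item $j$.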


In [23]:
def cal_similarity(ratings, kind, epsilon=1e-9):
    '''Cosine similarity between every pair of users or every pair of items.

    Args:
        ratings: 2-D array whose rows are users and whose columns are items.
        kind: 'user' for row-by-row similarity, 'item' for column-by-column.
        epsilon: small constant added to every dot product so that an
            all-zero row/column still has a non-zero norm (avoids 0/0).

    Returns:
        A square array; entry (a, b) is the similarity of a and b.

    Raises:
        ValueError: if ``kind`` is neither 'user' nor 'item'.
    '''
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        # Without this branch an unknown kind surfaced as an UnboundLocalError.
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))
    # The diagonal holds the squared norms; divide by them along both axes.
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [25]:
print('Calculate similarity martrix...')
user_similarity = cal_similarity(ratings, kind='user')
item_similarity = cal_similarity(ratings, kind='item')
print('Finished.')
print('example: (item-item)')
# np.round_ was a deprecated alias that NumPy 2.0 removed; np.round is the
# supported spelling and gives the same result.
print(np.round(item_similarity[:10, :10], 3))

Calculate similarity martrix...
Finished.
example: (item-item)
[[1.    0.296 0.279 0.388 0.252 0.114 0.518 0.41  0.416 0.199]
 [0.296 1.    0.177 0.405 0.211 0.099 0.331 0.31  0.207 0.152]
 [0.279 0.177 1.    0.275 0.118 0.104 0.311 0.125 0.207 0.121]
 [0.388 0.405 0.275 1.    0.265 0.091 0.411 0.391 0.357 0.219]
 [0.252 0.211 0.118 0.265 1.    0.016 0.28  0.214 0.202 0.031]
 [0.114 0.099 0.104 0.091 0.016 1.    0.128 0.065 0.164 0.139]
 [0.518 0.331 0.311 0.411 0.28  0.128 1.    0.342 0.43  0.279]
 [0.41  0.31  0.125 0.391 0.214 0.065 0.342 1.    0.364 0.166]
 [0.416 0.207 0.207 0.357 0.202 0.164 0.43  0.364 1.    0.25 ]
 [0.199 0.152 0.121 0.219 0.031 0.139 0.279 0.166 0.25  1.   ]]


In [26]:
def predict_itemCF(user, item, k=100):
    '''item-item CF, predict rating

    The estimate is the similarity-weighted average of the ratings ``user``
    has already given, weighted by each rated item's similarity to ``item``.

    Args:
        user: row index into the global ``ratings`` matrix.
        item: column index into ``ratings`` / index into ``item_similarity``.
        k: accepted for signature compatibility; not used by the computation.

    Returns:
        The predicted rating. If the user has no stored ratings (or the
        weights sum to zero) the baseline ``user_mean[user] +
        item_mean[item] - all_mean`` is returned instead of NaN.
    '''
    rated = ratings[user].nonzero()[0]
    weights = item_similarity[item, rated]
    total_weight = weights.sum()
    # Cold start: nothing to average over, so the ratio would be 0/0.
    if rated.size == 0 or total_weight == 0:
        return user_mean[user] + item_mean[item] - all_mean
    return ratings[user, rated].dot(weights) / total_weight

In [28]:
# Score the item-item CF model on the test split.
print('Loading test set...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
print('Test set size %d' % len(test_df))
print('item-item CF:')

# Rows are read by column name; ids are shifted from 1-based to 0-based.
predictions, targets = [], []
for row in test_df.itertuples(index=False):
    user, item, actual = row.user_id - 1, row.item_id - 1, row.rating
    predictions.append(predict_itemCF(user, item))
    targets.append(actual)

test_rmse = rmse(np.array(predictions), np.array(targets))
print('Test set rmse: %.4f' % test_rmse)
print()

Loading test set...
Test set size 20000
item-item CF:
Test set rmse: 1.0042



### User-user based CF model 
+ Cold start problem: when the denominator is $0$ (the item has not been rated by anyone in the training set), the result is $NaN$, so we replace it with the baseline prediction.

In [29]:
def predict_userCF(user, item, k=100):
    '''user-user CF, predict rating

    Averages the ratings other users gave ``item``, weighted by their
    similarity to ``user``. ``k`` is accepted but not used.
    '''
    raters = ratings[:, item].nonzero()[0]
    weights = user_similarity[user, raters]
    weighted_avg = ratings[raters, item].dot(weights) / sum(weights)
    if not np.isnan(weighted_avg):
        return weighted_avg
    # Cold start problem: the item has not been scored yet, use the baseline
    return (user_mean + item_mean[item] - all_mean)[user]

In [30]:
# Score the user-user CF model on the test split.
print('Loading test set...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
print('Test set size %d' % len(test_df))
print('user-user CF:')

# Rows are read by column name; ids are shifted from 1-based to 0-based.
predictions, targets = [], []
for row in test_df.itertuples(index=False):
    user, item, actual = row.user_id - 1, row.item_id - 1, row.rating
    predictions.append(predict_userCF(user, item))
    targets.append(actual)

test_rmse = rmse(np.array(predictions), np.array(targets))
print('Test set rmse: %.4f' % test_rmse)
print()

Loading test set...
Test set size 20000
user-user CF:
Test set rmse: 1.0133



### User-based CF combining baseline model 
+ prediction model:
    $$\hat{r}_{xi} = b_{xi} + \frac{\sum_{y\in N(i)}s_{xy}\cdot (r_{yi} - b_{yi})}{\sum_{y\in N(i)} s_{xy}}\;,$$
    where $b_{xi}$ is the baseline prediction for user $x$ on item $i$, $s_{xy}$ is the similarity between users $x$ and $y$, and $N(i)$ is the set of users who rated item $i$.

In [31]:
def predict_userCF_baseline(user, item, k=100):
    '''user-user CF combining baseline, predict rating

    Starts from the baseline for (user, item) and adds the similarity-weighted
    average of how far other users' ratings of ``item`` sit from their own
    baselines. ``k`` is accepted but not used.
    '''
    raters = ratings[:, item].nonzero()[0]
    bias = user_mean + item_mean[item] - all_mean
    weights = user_similarity[user, raters]
    residuals = ratings[raters, item] - bias[raters]
    estimate = residuals.dot(weights) / sum(weights) + bias[user]
    # No rater for this item gives 0/0 = NaN; fall back to the plain baseline.
    return bias[user] if np.isnan(estimate) else estimate

# Score the user-based CF + baseline model on the test split.
print('loading test dataset...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
print('Test set size %d' % len(test_df))
print('user-based CF with baseline:')

# Rows are read by column name; ids are shifted from 1-based to 0-based.
predictions, targets = [], []
for row in test_df.itertuples(index=False):
    user, item, actual = row.user_id - 1, row.item_id - 1, row.rating
    predictions.append(predict_userCF_baseline(user, item))
    targets.append(actual)

test_rmse = rmse(np.array(predictions), np.array(targets))
print('Test set rmse: %.4f' % test_rmse)
print()

loading test dataset...
Test set size 20000
user-based CF with baseline:
Test set rmse: 0.9519



### Modified model
+ Further improve the item-based CF with baseline model by clipping predicted ratings to the range $[1, 5]$.

In [35]:
def predict_biasCF(user, item, k=100):
    '''item based CF combining baseline, predict rating

    Starts from the baseline for (user, item) and adds the similarity-weighted
    average of how far the user's existing ratings sit from their baselines.
    The result is limited to the range 1..5.

    Args:
        user: row index into the global ``ratings`` matrix.
        item: column index into ``ratings`` / index into ``item_similarity``.
        k: accepted for signature compatibility; not used by the computation.

    Returns:
        The predicted rating, never below 1 or above 5. If the user has no
        stored ratings the (clamped) baseline is returned.
    '''
    nzero = ratings[user].nonzero()[0]
    baseline = item_mean + user_mean[user] - all_mean
    weights = item_similarity[item, nzero]
    prediction = (ratings[user, nzero] - baseline[nzero]).dot(weights)\
                / sum(weights) + baseline[item]
    # Cold start: a user without ratings gives 0/0 = NaN; use the baseline.
    if np.isnan(prediction):
        prediction = baseline[item]
    if prediction > 5:
        prediction = 5
    # The lower clamp used to assign to a misspelled name and had no effect.
    if prediction < 1:
        prediction = 1
    return prediction

# Score the item-based CF + baseline model on the test split.
print('loading test dataset...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
test_df.head()
print('Test set size %d' % len(test_df))
print('item-based CF with baseline:')

# Rows are read by column name; ids are shifted from 1-based to 0-based.
predictions, targets = [], []
for row in test_df.itertuples(index=False):
    user, item, actual = row.user_id - 1, row.item_id - 1, row.rating
    predictions.append(predict_biasCF(user, item))
    targets.append(actual)

test_rmse = rmse(np.array(predictions), np.array(targets))
print('Test set rmse: %.4f' % test_rmse)
print()

loading test dataset...
Test set size 20000
item-based CF with baseline:
Test set rmse: 0.9344

