## Collaborative filtering 

In [16]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

### Initialization

In [3]:
# Column names applied to the tab-separated rating files when they are read.
names = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]

# Train / test split files (paths are relative to the notebook).
testset_file = 'dataset/ml-100k/u3.test'
trainingset_file = 'dataset/ml-100k/u3.base'

# Dimensions of the user x item rating matrix.
n_users, n_items = 943, 1682

# Dense rating matrix, initially all zeros; later cells treat 0 as "no rating".
ratings = np.zeros(shape=(n_users, n_items))

### Load training set 

In [10]:
# Read the training split. Column names are supplied explicitly, so pandas
# does not take them from the first line of the file.
df = pd.read_csv(trainingset_file, sep='\t', names=names)
print('Load training set...')

# IDs in the file are 1-based while the matrix is 0-based. Validate the range
# first: an ID of 0 would otherwise become index -1 and silently overwrite the
# last row/column of the matrix.
assert df['user_id'].between(1, n_users).all(), 'user_id outside 1..n_users'
assert df['item_id'].between(1, n_items).all(), 'item_id outside 1..n_items'

# Fill the matrix with a single vectorized assignment instead of a Python-level
# loop over every row. Only the last occurrence of each (user, item) pair is
# kept, which matches the "later row overwrites earlier row" outcome of a
# sequential fill; NumPy does not promise an order for repeated indices.
latest = df.drop_duplicates(subset=['user_id', 'item_id'], keep='last')
ratings[latest['user_id'].values - 1,
        latest['item_id'].values - 1] = latest['rating'].values

print('Finished loading.')
print('Size of rating matrix %d*%d.' % (n_users, n_items))
print('Effective score number of the training set %d.' % len(df))
df.head()

Load training set...
Finished loading.
Size of rating matrix 943*1682.
Effective score number of the training set 80000.


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,6,5,887431973


### Matrix density of training set

In [11]:
def cal_sparsity():
    '''Print the percentage of non-zero cells in the global ``ratings`` matrix.

    Despite the function name, the printed figure is the density (share of
    filled cells), not the share of empty ones. Returns nothing.
    '''
    n_rows, n_cols = ratings.shape[0], ratings.shape[1]
    filled_cells = np.count_nonzero(ratings)
    density = float(filled_cells) / (n_rows * n_cols) * 100
    print('Matrix density of training set is: {:4.2f}%'.format(density))

# Print the density of the training rating matrix, followed by a blank line.
cal_sparsity()
print()

Matrix density of training set is: 5.04%



### Naive baseline model 

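$$\hat{r}_{xi} = \bar{r}_{\text{user } x} + \bar{r}_{\text{item } i} - \mu$$

where $\bar{r}_{\text{user } x}$ is the mean rating given by user $x$, $\bar{r}_{\text{item } i}$ is the mean rating received by item $i$, and $\mu$ is the mean of all ratings in the training set.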

In [17]:
def rmse(pred, actual):
    '''Root-mean-square error over the entries where ``actual`` is non-zero.

    Parameters
    ----------
    pred : array-like
        Predicted ratings, same shape as ``actual``.
    actual : array-like
        Ground-truth ratings. Entries equal to 0 are treated as "no rating"
        and are excluded from the error, together with the matching
        predictions.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((pred - actual) ** 2))`` over the non-zero entries of
        ``actual``.

    Raises
    ------
    ValueError
        If the shapes differ, if ``actual`` has no non-zero entry, or if a
        compared value is NaN or infinite.
    '''
    pred = np.asarray(pred, dtype=float)
    actual = np.asarray(actual, dtype=float)
    if pred.shape != actual.shape:
        raise ValueError(
            'pred and actual must have the same shape, got %s and %s'
            % (pred.shape, actual.shape))

    rated = actual != 0
    if not rated.any():
        # An empty selection would make the mean undefined.
        raise ValueError('actual contains no non-zero entries to score')

    errors = pred[rated] - actual[rated]
    if not np.isfinite(errors).all():
        raise ValueError('pred/actual contain NaN or infinite values')

    # Computed directly with NumPy, so no import is needed inside the function.
    return np.sqrt(np.mean(errors ** 2))

def cal_mean():
    '''Compute the overall, per-user and per-item mean ratings.

    Reads the module-level ``ratings`` matrix (rows are users, columns are
    items, 0 means "no rating") and stores the results in the globals
    ``all_mean`` (scalar), ``user_mean`` (one value per row) and
    ``item_mean`` (one value per column). Only non-zero entries contribute
    to a mean. A user or item without any rating gets ``all_mean`` instead.

    Prints whether any NaN was present before and after the fill, then the
    overall mean. Returns nothing.
    '''
    # population mean, each user mean, each item mean
    global all_mean, user_mean, item_mean
    rated = ratings != 0
    all_mean = np.mean(ratings[rated])

    # Rows/columns without ratings produce 0/0 = NaN, which is filled below.
    # The floating-point warning is silenced only for this division, so the
    # function does not rely on a global warnings filter.
    with np.errstate(divide='ignore', invalid='ignore'):
        user_mean = ratings.sum(axis=1) / rated.sum(axis=1)
        item_mean = ratings.sum(axis=0) / rated.sum(axis=0)
    print('Exist User/Item mean NaN?', np.isnan(user_mean).any(), np.isnan(item_mean).any())

    # fill in NaN with population mean
    user_mean = np.where(np.isnan(user_mean), all_mean, user_mean)
    item_mean = np.where(np.isnan(item_mean), all_mean, item_mean)
    print('Exist User/Item mean NaN?', np.isnan(user_mean).any(), np.isnan(item_mean).any())
    print('Finsh，population mean is %.4f' % all_mean)

# Populate the globals all_mean, user_mean and item_mean read by predict_naive.
cal_mean()

Exist User/Item mean NaN? False True
Exist User/Item mean NaN? False False
Finsh，population mean is 3.5311


In [18]:
def predict_naive(user, item):
    '''Baseline rating estimate for a (user, item) pair.

    ``user`` and ``item`` are used as indices into the global ``user_mean``
    and ``item_mean``. The result is the item's mean plus the user's mean
    minus the global ``all_mean``.
    '''
    item_avg = item_mean[item]
    user_avg = user_mean[user]
    return item_avg + user_avg - all_mean

In [20]:
# Read the test split with the same column layout as the training split.
print('Loading test set...')
test_df = pd.read_csv(testset_file, sep='\t', names=names)
print('Test set size: %d' % len(test_df))
print('Navie model:')

# Score every test rating with the baseline predictor. Fields are read by
# column name rather than tuple position, so the loop does not depend on the
# column order of the frame.
predictions = []
targets = []
for row in test_df.itertuples(index=False):
    # IDs in the file are 1-based; the mean arrays are 0-based.
    user, item, actual = row.user_id - 1, row.item_id - 1, row.rating
    predictions.append(predict_naive(user, item))
    targets.append(actual)

print('Test set rmse: %.4f' % rmse(np.array(predictions), np.array(targets)))
print()

Loading test set...
Test set size: 20000
Navie model:
Test set rmse: 0.9691



In [21]:
# Display the first rows of the test set to check the parsed columns.
test_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,5,3,889751712
1,1,11,2,875072262
2,1,16,5,878543541
3,1,25,4,875071805
4,1,35,1,878542420
