# Recommender system project:
Code based on a tutorial by Agnes Johannsdottir using MovieLens data set.  
Here I use the million song data set from Kaggle.

P1.  Problems include that the similiarity matrix is extremely sparse (with 163000 users for instance) and the kernel crashes when generating it.  I addressed these issues by removing users and items with fewer than 30 entries.  This still provides a very sparse matrix (99.8%), and resulted in a RMSE of ~7 with either a item-item collaboration or user-item collaboration. Note that this RMSE is worse than the standard deviation! 

Limiting the data set to heavily reviewed songs and 52 active users provided a 92% sparse matrix; now the RMSE is better than the std, although not by a large amount.  Now the RMSE is 6~7 and the std is ~8.1 (note the RMSE has some variance due to how the makeup of the test and training set).

P2.  Here I've written a function with generates a similarity matrix identical to that of the sklearn function.

P3.  Here I've assigned all 'ratings' greater than 5 to 5, so that the scale is fixed.  This does not help though, and the std is still less than the RMSE. (note that in this part I did not limit users and items to minimize the sparsness of the matrix).

P4.  Imputed with user averages.  This didn't improve the RMSE by much.

conclusions: Overall, I find that the recommender system performs at best equal to the 'average' value, and so is providing little value.  I think the next steps are to find a better way to impute missing values, perhaps by taking item averages and then imputing with user bias...


# P1: using sklearn pairwise_distance to generate the similarity matrix

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [235]:
data = []
with open("/Users/alexandersatz/Documents/Cuny/IS643_recommenderSys/project1/kaggle_visible_evaluation_triplets.txt", "r") as f:
    for line in f:
        line = line[:-1]
        line =line.split('\t')
        data.append(line)

In [236]:
data[:10]

[['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 'SOBONKR12A58A7A7E0', '1'],
 ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 'SOEGIYH12A6D4FC0E3', '1'],
 ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 'SOFLJQZ12A6D4FADA6', '1'],
 ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 'SOHTKMO12AB01843B0', '1'],
 ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 'SODQZCY12A6D4F9D11', '1'],
 ['fd50c4007b68a3737fe052d5a4f78ce8aa117f3d', 'SOXLOQG12AF72A2D55', '1'],
 ['d7083f5e1d50c264277d624340edaaf3dc16095b', 'SOUVUHC12A67020E3B', '1'],
 ['d7083f5e1d50c264277d624340edaaf3dc16095b', 'SOUQERE12A58A75633', '1'],
 ['d7083f5e1d50c264277d624340edaaf3dc16095b', 'SOIPJAX12A8C141A2D', '1'],
 ['d7083f5e1d50c264277d624340edaaf3dc16095b', 'SOEFCDJ12AB0185FA0', '2']]

In [237]:
de_item = defaultdict(int)
for row in data:
    de_item[row[1]] += 1
    

In [238]:
#Here I'm removing items with few reviews.
d_itemGTE5 = {}
for key, value in de_item.iteritems():
    if value > 100:
        d_itemGTE5[key] = 1

In [239]:
print len(d_itemGTE5)
print len(de_item)

1758
163206


In [240]:
shortdata = []
for x in data:
    if x[1] in d_itemGTE5:
        shortdata.append(x)

In [241]:
de_user = defaultdict(int)
for row in shortdata:
    de_user[row[0]] += 1
    

In [242]:
#Here I'm removing users with few reviews.
d_usersGTE10 = {}
for key, value in de_user.iteritems():
    if value > 30:
        d_usersGTE10[key] = 1

In [243]:
shortdata2 = []
for x in shortdata:
    if x[0] in d_usersGTE10:
        shortdata2.append(x)

In [244]:
shortdata = shortdata2

In [245]:
## at atleast 20 occurances of each, this leaves 92219 rows.
print(len(data))
print(len(shortdata))

1450933
1998


In [246]:
#data from https://www.kaggle.com/c/msdchallenge/data
#the number of times a user listens to a song is tracked

#here 'rating' is number of listens to the song.
header = ['user_id', 'item_id', 'rating']
df = pd.DataFrame(shortdata, columns=header)

In [247]:
df.shape

(1998, 3)

In [248]:
## So each row is a userID, an item_id, and a rating
##df_o = df_o.head(100000) this causes the matrix to be too sparse
df.head(5)

Unnamed: 0,user_id,item_id,rating
0,37a309640004e11829ff173e9c3f9fbd2e07ab29,SOGTQNI12AB0184A5C,6
1,37a309640004e11829ff173e9c3f9fbd2e07ab29,SOPUCYA12A8C13A694,39
2,37a309640004e11829ff173e9c3f9fbd2e07ab29,SOELOOM12AB017DB4C,62
3,37a309640004e11829ff173e9c3f9fbd2e07ab29,SONYKOW12AB01849C9,7
4,37a309640004e11829ff173e9c3f9fbd2e07ab29,SOOFTNW12AB017DB3E,1


In [249]:
d_users, d_items = {}, {}
c = 0
for x in arrayusers:
    c +=1
    d_users[x] = c 
c = 0
for x in arrayitems:
    c +=1
    d_items[x] = c 


In [250]:
#I need to convert each user_id to an int, and each item_id to an int.
arrayusers = df.user_id.unique()
arrayitems = df.item_id.unique()

In [251]:
# Number of users left
len(d_users)

58

In [252]:
for index, row in df.iterrows():
    df.set_value(index, 'user_id', d_users[row['user_id']])
    df.set_value(index, 'item_id', d_items[row['item_id']])
    #df[index]['user_id'] = d_users[row['user_id']]
    #df[index]['item_id'] = d_items[row['item_id']]

In [253]:
df.head(5)

Unnamed: 0,user_id,item_id,rating
0,1,1,6
1,1,2,39
2,1,3,62
3,1,4,7
4,1,5,1


In [254]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of items = ' + str(n_items)  

Number of users = 58 | Number of items = 423


In [255]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)

In [256]:
print train_data.head(10)
print train_data.shape
print df.shape

     user_id item_id rating
190        6     139      1
654       20     123      6
1461      43     138      1
638       19     271      1
660       20      86      1
1104      33      59      1
1381      41     121      2
1556      46      62      2
838       25     272      1
738       22       4      3
(1498, 3)
(1998, 3)


In [257]:
#Create two user-item matrices, one for training and another for testing
## row number = user_id-1, column number is item_id-1, and value is the rating.
## so each row has all ratings for every movie for that user (else a zero)
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [258]:
##943 users
train_data_matrix.shape

(58, 423)

In [259]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [260]:
## a 2Dmatrix with similarity score 0-1 between all users
user_similarity.shape

(58, 58)

In [261]:
## a 2D matrix with similarity score 0-1 between all items
## note that when a rating is 0, it gives a 0 score automatically...
item_similarity.shape

(423, 423)

In [262]:
#predict(train_data_matrix, user_similarity, type='user')
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)  # array with mean for each user
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [263]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [264]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [265]:
print 'User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
print 'Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))

User-based CF RMSE: 5.82894997334
Item-based CF RMSE: 5.92528052976


In [266]:
sparsity=round(1.0-len(df)/float(n_users*n_items),3)
print 'The sparsity level is ' +  str(sparsity*100) + '%'

The sparsity level is 91.9%


In [267]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print 'User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix))

User-based CF MSE: 6.07667169796


In [268]:
df[['rating']] = df[['rating']].apply(pd.to_numeric)
np.std((df['rating']))

8.341333292967498

# P2:  Handcoded similarity function

I use a small test matrix to test my handcoded function with the sklearn function and show the two functions to provide equivalent results.  Granted, my function is much slower with large matrix (brutally slow).



In [190]:
#a test matrix
rows  = 3
cols = 4
low = 0
high = 5
step = 2

test = np.random.choice([x for x in xrange(low,high,step)],rows*cols)
test.resize(rows,cols)
test

array([[2, 2, 0, 0],
       [0, 0, 0, 2],
       [2, 4, 4, 4]])

In [196]:
def genSimMatrix(mat):
    newmat = np.matlib.zeros((mat.shape[0],mat.shape[0]))
    for x in range(0, mat.shape[0]):
        for y in range(0, mat.shape[0]):
            sim =  np.dot(mat[x], mat[y,].T)/(np.dot(mat[x], mat[x,].T)*np.dot(mat[y], mat[y,].T))**.5
            newmat[x,y] = 1-sim
    return newmat
        
        

#user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
#item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [197]:
import numpy.matlib
handcoded = genSimMatrix(test)

In [198]:
# the handcoded similarity matrix is below.  ## note that larger numbers mean MORE different.
handcoded

matrix([[ 0.        ,  1.        ,  0.41165159],
        [ 1.        ,  0.        ,  0.4452998 ],
        [ 0.41165159,  0.4452998 ,  0.        ]])

In [194]:
usingsci = pairwise_distances(test, metric='cosine', Y = None)

In [195]:
# the sklearn matrix is below.
usingsci

array([[  2.22044605e-16,   1.00000000e+00,   4.11651595e-01],
       [  1.00000000e+00,   0.00000000e+00,   4.45299804e-01],
       [  4.11651595e-01,   4.45299804e-01,   0.00000000e+00]])

# P3:  set all 'ratings' greater than 5 to 5

The scale for 'ratings' in confusing, as one can listen to a song as many times as one like, akin to ranking it as high as one desires.  Here I've set the limit to 5 listens.  This makes understanding the RMSE easier.  

Note that the RMSE is ~2.6 while the standard deviation is 1.6.  And so our collaborative recommender is worse than just taking the average (unless we take efforts to reduce how sparse the data is).

In [207]:
#data from https://www.kaggle.com/c/msdchallenge/data
#the number of times a user listens to a song is tracked

#here 'rating' is number of listens to the song.
header = ['user_id', 'item_id', 'rating']
df = pd.DataFrame(shortdata, columns=header)

In [209]:
# max listen is now 5, which represents 'liking' a song
for index, row in df.iterrows():
    if int(row['rating']) > 5:
        row['rating'] = 5
         

In [211]:
arrayusers = df.user_id.unique()
arrayitems = df.item_id.unique()
d_users, d_items = {}, {}
c = 0
for x in arrayusers:
    c +=1
    d_users[x] = c 
c = 0
for x in arrayitems:
    c +=1
    d_items[x] = c 
for index, row in df.iterrows():
    df.set_value(index, 'user_id', d_users[row['user_id']])
    df.set_value(index, 'item_id', d_items[row['item_id']])
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')
print 'User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
print 'Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))

User-based CF RMSE: 2.64583489919
Item-based CF RMSE: 2.65285747954


In [212]:
#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print 'User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix))

User-based CF MSE: 2.63190546368


In [216]:
df[['rating']] = df[['rating']].apply(pd.to_numeric)

In [217]:
np.std((df['rating']))

1.5516252476670083

# P4: impute with user averages.

In [303]:
data = []
with open("/Users/alexandersatz/Documents/Cuny/IS643_recommenderSys/project1/kaggle_visible_evaluation_triplets.txt", "r") as f:
    for line in f:
        line = line[:-1]
        line =line.split('\t')
        data.append(line)

de_item = defaultdict(int)
for row in data:
    de_item[row[1]] += 1
    


# In[201]:

#Here I'm removing items with few reviews.
d_itemGTE5 = {}
for key, value in de_item.iteritems():
    if value > 100:
        d_itemGTE5[key] = 1


# In[202]:

print len(d_itemGTE5)
print len(de_item)


# In[203]:

shortdata = []
for x in data:
    if x[1] in d_itemGTE5:
        shortdata.append(x)


# In[204]:

de_user = defaultdict(int)
for row in shortdata:
    de_user[row[0]] += 1

d_usersGTE10 = {}
for key, value in de_user.iteritems():
    if value > 30:
        d_usersGTE10[key] = 1


# In[206]:

shortdata2 = []
for x in shortdata:
    if x[0] in d_usersGTE10:
        shortdata2.append(x)
        

shortdata = shortdata2


header = ['user_id', 'item_id', 'rating']
df = pd.DataFrame(shortdata, columns=header)



arrayusers = df.user_id.unique()
arrayitems = df.item_id.unique()




d_users, d_items = {}, {}
c = 0
for x in arrayusers:
    c +=1
    d_users[x] = c 
c = 0
for x in arrayitems:
    c +=1
    d_items[x] = c 




for index, row in df.iterrows():
    df.set_value(index, 'user_id', d_users[row['user_id']])
    df.set_value(index, 'item_id', d_items[row['item_id']])


n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
 


from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)



train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

###
for row in range (0, len(train_data_matrix)):
    b = train_data_matrix[row]
    m = np.mean(b[b>0])
    for col in range(0, len(b)):
        if b[col] == 0:
            train_data_matrix[row,col] = m
            
    
    

from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')




item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')


print 'User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
print 'Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))


1758
163206
User-based CF RMSE: 6.805159312
Item-based CF RMSE: 6.86238598893
