In [15]:
import pandas as pd
import numpy as np
import math

In [233]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols, dtype={'user_id':str})

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols, dtype={'movie_id':str, 'user_id':str})

m_cols = ['movie_id', 'title', 'release_date']
movies = pd.read_csv('data/ml-100k/u.item', sep='|', names=m_cols, usecols=range(3), encoding='latin-1', dtype={'movie_id':str})

data = pd.merge(pd.merge(ratings, users), movies)
data = data[['user_id','title', 'movie_id','rating','release_date','sex','age']]


print("BD has", str(data.shape[0]),"ratings")
print("BD has",data.user_id.nunique(),"users")
print("BD has",data.movie_id.nunique(),"movies")
data.head()


BD has 100000 ratings
BD has 943 users
BD has 1682 movies


Unnamed: 0,user_id,title,movie_id,rating,release_date,sex,age
0,196,Kolya (1996),242,3,24-Jan-1997,M,49
1,305,Kolya (1996),242,5,24-Jan-1997,M,23
2,6,Kolya (1996),242,4,24-Jan-1997,M,42
3,234,Kolya (1996),242,4,24-Jan-1997,M,60
4,63,Kolya (1996),242,3,24-Jan-1997,M,31


In [234]:
def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.loc[sampled_ids, 'for_testing'] = True
    return df

data['for_testing'] = False
grouped = data.groupby('user_id', group_keys=False).apply(assign_to_set)
data_train = data[grouped.for_testing == False]
data_test = data[grouped.for_testing == True]
print(data_train.shape)
print(data_test.shape)
print(data_train.index & data_test.index)

print("Training data_set has", str(data_train.shape[0]),"ratings")
print("Test data set has", str(data_test.shape[0]),"ratings")
print("La BD has", data.movie_id.nunique(), "movies")


(79619, 8)
(20381, 8)
Int64Index([], dtype='int64')
Training data_set has 79619 ratings
Test data set has 20381 ratings
La BD has 1682 movies


In [16]:
data_train = pd.read_pickle('data_train.pkl')

In [17]:
data_test = pd.read_pickle('data_train.pkl')

##### How to get the set of movies from user with id  "1"?

In [18]:
def pivot_db(ratings, adjust_users=False):
    ratings_pivoted = ratings.pivot(
        index='movie_id',
        columns='user_id',
        values='rating'
    )
    if adjust_users:
        ratings_pivoted = ratings_pivoted.apply(lambda u: u-u.mean(), axis=0)
    ratings_pivoted['users_rated']=ratings_pivoted.apply(lambda m: m[-m.isnull()].index, axis=1)    
    return ratings_pivoted#.fillna(0)

In [19]:
# users = data_train_pivoted['users_rated'].iloc[0]

In [20]:
def sim_euclid(a,b):
    from scipy.spatial.distance import euclidean
    return 1.0/(1.0+euclidean(a,b))  

In [21]:
def sim_pearson(a,b):
    from scipy.spatial.distance import correlation
    return 1-correlation(a,b)

In [22]:
def sim_cosine(a,b):
    from scipy.spatial.distance import cosine
    return 1-cosine(a,b)

In [23]:
def compare(pivoted_m1, pivoted_m2, similarity):
#     m1 = df.loc[m_id1]
#     m2 = df.loc[m_id2]
    if (len(pivoted_m1)==0 or len(pivoted_m2)==0):
        return 1
    users_1 = pivoted_m1['users_rated']
    users_2 = pivoted_m2['users_rated']
    intersection = users_1[users_1.isin(users_2)]
    if len(intersection)==0: 
        return 1
    return similarity(pivoted_m1.loc[intersection], pivoted_m2.loc[intersection])

In [24]:
from tqdm import tqdm

In [25]:
class CollaborativeItemReco:
    """ Collaborative filtering using a custom sim(u,u'). """    
    def __init__(self, ratings, similarity, adjust_users=False, k=None, sim_matrix_path=None):
        """ Constructor """        
        self.df = pivot_db(ratings, adjust_users)        
        self.similarity=similarity
        if not sim_matrix_path:
            self.sim_matrix = pd.DataFrame(1, columns=self.df.index, index=self.df.index)
        else:
            self.sim_matrix = pd.read_pickle(sim_matrix_path)
        self.k = k+1 if k else len(self.df.index)
    def learn(self):
        """ Prepare data structures for estimation. Similarity matrix for users """
        for i, mId1 in enumerate(tqdm(self.df.index)):
            for j, mId2 in enumerate(self.df.index[i+1:]):
                sim = compare(self.df.loc[mId1],self.df.loc[mId2], self.similarity)
                self.sim_matrix.loc[mId1,mId2]=sim
                self.sim_matrix.loc[mId2,mId1]=sim
                
    def estimate_basic(self, u, j):
        # u is user
        # j is movie
        if u not in self.df:
            print('u_{} not in training'.format(u))
            return 3
        if j not in self.sim_matrix:
            print('m_{} not in training'.format(j))
            return 3
        u_ratings = self.df[u][self.df[u]>0]
        num=0
        den=0
        P_k = self.sim_matrix[j].loc[u_ratings.index].sort_values(ascending=False).iloc[1:self.k]
        means_movies = self.df.apply(lambda m: m.loc[m['users_rated']].mean(),axis=1)
        for i, sim in P_k.iteritems():
            num+=sim*u_ratings[i]
            den+=sim
        if den==0: 
            if means_movies[j]>0:
                # return the mean movie rating if there is no similar for the computation
                return means_movies[j]
            else:
                # else return mean user rating 
                return u_ratings.mean()
        return num/den
    
    def estimate_mean(self, u, j):
        # u is user
        # j is movie
        if u not in self.df:
            print('u_{} not in training'.format(u))
            return 3
        if j not in self.sim_matrix:
            print('m_{} not in training'.format(j))
            return 3
        u_ratings = self.df[u][self.df[u]>0]
        num=0
        den=0
        P_k = self.sim_matrix[j].loc[u_ratings.index].sort_values(ascending=False).iloc[1:]
        means_movies = self.df.apply(lambda m: m.loc[m['users_rated']].mean(),axis=1)
        for i, sim in P_k.iteritems():
            r_i_mean = means_movies[i]
            num+=sim*(u_ratings[i]-r_i_mean)
            den+=sim
        if den==0: 
            if means_movies[j]>0:
                # return the mean movie rating if there is no similar for the computation
                return means_movies[j]
            else:
                # else return mean user rating 
                return u_ratings.mean()
        return means_movies[j]+num/den

In [14]:
# data_train_sample = data_train[data_train.movie_id.apply(lambda x: int(x)<100)]
# data_test_sample = data_test[data_test.movie_id.apply(lambda x: int(x)<100)]

In [15]:
# item_cosine_sample = CollaborativeItemReco(data_train_sample, sim_cosine)
# item_cosine_sample.learn()

In [282]:
item_cosine_sample.estimate_mean("1","1")

4.1127366135337615

In [221]:
# data_test = data_test[data_test.movie_id.apply(lambda x: int(x)<100)]

In [26]:
item_cosine = CollaborativeItemReco(data_train, sim_cosine, sim_matrix_path='cosine.pkl')
# item_cosine.learn()


In [27]:
# item_cosine.estimate_mean("1","1")

In [244]:
# item_cosine.sim_matrix.to_pickle('cosine.pkl')

In [28]:
item_cosine_adjusted = CollaborativeItemReco(data_train, sim_cosine, adjust_users=True, sim_matrix_path='cosine_adjusted.pkl')
# item_cosine_adjusted.learn()

In [288]:
# item_cosine_adjusted.sim_matrix.to_pickle('cosine_adjusted.pkl')

In [29]:
item_pearson = CollaborativeItemReco(data_train, sim_pearson, sim_matrix_path='pearson.pkl')
# item_pearson.learn()

In [253]:
# item_pearson.sim_matrix.to_pickle('pearson.pkl')

In [30]:
item_pearson_adjusted = CollaborativeItemReco(data_train, sim_pearson, adjust_users=True, sim_matrix_path='pearson_adjusted.pkl')
# item_pearson_adjusted.learn()

In [290]:
# item_pearson_adjusted.sim_matrix.to_pickle('pearson_adjusted.pkl')

In [31]:
item_euclid = CollaborativeItemReco(data_train, sim_euclid, sim_matrix_path='euclid.pkl')
# item_euclid.learn()

In [23]:
# item_euclid.sim_matrix.to_pickle('euclid.pkl')

In [225]:
# data

In [62]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [63]:
def evaluate(estimate_f,data_train,data_test):
    """ RMSE-based predictive performance evaluation with pandas. """
    u_train = set(data_train.user_id)
    m_train = set(data_train.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in zip(data_test.user_id, data_test.movie_id)])
    real = data_test.rating.values
    return compute_rmse(estimated, real)

In [64]:
import importlib
import tqdm
importlib.reload(tqdm)

<module 'tqdm' from 'C:\\Users\\annae\\miniconda3\\lib\\site-packages\\tqdm\\__init__.py'>

In [32]:
all_models = {'cosine':item_cosine, 'cosine_adj':item_cosine_adjusted, 'pearson':item_pearson, 'pearson_adj':item_pearson_adjusted, 'euclid':item_euclid}

In [46]:
# evaluate(model.estimate_basic, data_train, data_test.sample(1))
data_small_test = data_test.sample(5000)

In [None]:
rmse = []
for model_label, model in all_models.items():
    print('--------- {} ---------'.format(model_label))
    rmse.append(evaluate(model.estimate_basic, data_train, data_small_test))
    print('RMSE basic of {}: {}'.format(model_label,rmse[-1]))
    rmse.append(evaluate(model.estimate_mean, data_train, data_small_test))
    print('RMSE mean of {}: {}'.format(model_label,rmse[-1]))

--------- cosine ---------
RMSE basic of cosine: 1.0398174011473917
RMSE mean of cosine: 0.9193122545127611
--------- cosine_adj ---------


In [72]:
item_cosine_adjusted.sim_matrix

movie_id,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.025098,0.162196,-0.875737,-0.117133,-0.046471,0.708204,-0.909721,-0.492082,-0.674801,...,0.009940,-0.671812,1.0,0.124780,0.559209,-0.219901,-0.562044,-0.677364,-0.564178,-0.580007
10,0.025098,1.000000,0.116664,1.000000,1.000000,-1.000000,-0.129503,1.000000,0.391735,0.316081,...,-0.950209,-0.256820,1.0,-1.000000,-1.000000,0.571167,1.000000,0.083036,-0.477636,1.000000
100,0.162196,0.116664,1.000000,-0.817677,-0.727805,-0.730226,-0.917233,0.214148,0.650415,-0.174623,...,0.096840,-0.647018,1.0,0.259696,-0.280391,-0.034903,-0.859685,-0.402282,0.358543,-0.682630
1000,-0.875737,1.000000,-0.817677,1.000000,1.000000,0.854249,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.0,-1.000000,1.000000,1.000000,0.876966,0.967888,1.000000,-1.000000
1001,-0.117133,1.000000,-0.727805,1.000000,1.000000,-0.174419,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.0,-1.000000,1.000000,1.000000,-1.000000,-0.490367,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.219901,0.571167,-0.034903,1.000000,1.000000,1.000000,-1.000000,1.000000,1.000000,1.000000,...,-0.139150,1.000000,1.0,-1.000000,1.000000,1.000000,1.000000,-1.000000,-1.000000,1.000000
996,-0.562044,1.000000,-0.859685,0.876966,-1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.0,-0.035374,1.000000,1.000000,1.000000,0.818792,1.000000,0.188672
997,-0.677364,0.083036,-0.402282,0.967888,-0.490367,1.000000,0.563062,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.0,1.000000,1.000000,-1.000000,0.818792,1.000000,0.975430,-0.463109
998,-0.564178,-0.477636,0.358543,1.000000,1.000000,-1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.0,0.863904,1.000000,-1.000000,1.000000,0.975430,1.000000,1.000000


In [60]:
pd.read_pickle('pearson.pkl')

movie_id,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.220107,0.039075,0.333333,-0.413670,1.000000,1.0,0.5000,0.000000,-0.350000,...,0.743792,-0.188982,1.0,0.232586,,0.353553,0.360041,0.273861,0.171499,0.0
10,0.220107,1.000000,-0.052973,1.000000,,,,1.0000,0.500000,1.000000,...,1.000000,-1.000000,1.0,,,1.000000,,,1.000000,1.0
100,0.039075,-0.052973,1.000000,1.000000,0.064875,0.485643,,0.6875,0.294884,0.266254,...,0.634762,-0.232625,1.0,0.113739,0.547723,0.224733,0.707107,0.740741,0.883883,
1000,0.333333,1.000000,1.000000,1.000000,,,,1.0000,1.000000,1.000000,...,1.000000,1.000000,1.0,,1.000000,1.000000,,,1.000000,
1001,-0.413670,,0.064875,,1.000000,-0.188982,1.0,1.0000,1.000000,1.000000,...,,,1.0,,1.000000,1.000000,,1.000000,1.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.353553,1.000000,0.224733,1.000000,1.000000,1.000000,,1.0000,,,...,0.307794,,1.0,,,1.000000,1.000000,,,1.0
996,0.360041,,0.707107,,,,1.0,,,,...,1.000000,1.000000,1.0,0.500000,,1.000000,1.000000,0.577350,,1.0
997,0.273861,,0.740741,,1.000000,,,,,,...,1.000000,1.000000,1.0,,,,0.577350,1.000000,1.000000,0.5
998,0.171499,1.000000,0.883883,1.000000,1.000000,,1.0,1.0000,,1.000000,...,1.000000,1.000000,1.0,,1.000000,,,1.000000,1.000000,


In [None]:
# evaluate(item_cosine_sample.estimate_basic, data_train_sample, data_test_sample)

In [None]:
# print('RMSE for Collaborative Recomender: %s' % evaluate(item_cosine.estimate_mean,data_train,data_test))

In [None]:
# print('RMSE for Collaborative Recomender: %s' % evaluate(item_cosine.estimate_basic,data_train,data_test))

In [48]:
def precision_recall(estimate_f, data_train, data_test, N=25):
    all_movies_ids = set(data_train.movie_id.values).union(set(data_train.movie_id.values))
    in_top = 0
    ratings_5_test = data_test[data_test.rating==5]
    print(len(ratings_5_test))
    for i, row in ratings_5_test.iterrows():
        user_seen = list(data_train[data_train.user_id==row.user_id].movie_id.values)+list(data_test[data_test.user_id==row.user_id].movie_id.values)
        unseen = all_movies_ids.difference(user_seen)
        choosen_unseen = np.random.choice(list(unseen), min(len(unseen), 100), replace=False)
        ranked_random = pd.Series(list(map(lambda i:estimate_f(u=row.user_id,j=i),choosen_unseen))+[estimate_f(row.user_id, row.movie_id)],index=list(choosen_unseen)+[row.movie_id])
        index_row = np.argwhere(ranked_random.sort_values(ascending=False).index.values==row.movie_id).flatten()[0]
        if index_row<25:
            in_top+=1
    return in_top/len(ratings_5_test)
#     print(row.user_id)
#     break

In [49]:
rmse = []
for model_label, model in all_models.items():
    print('--------- {} ---------'.format(model_label))
    rmse.append(precision_recall(model.estimate_basic, data_train, data_small_test))
    print('P/R basic of {}: {}'.format(model_label,rmse[-1]))
    rmse.append(precision_recall(model.estimate_mean, data_train, data_small_test))
    print('P/R mean of {}: {}'.format(model_label,rmse[-1]))

--------- cosine ---------
1096


KeyboardInterrupt: 

In [45]:
precision_recall(item_cosine.estimate_basic,data_train,data_small_test)

1


0.0

In [218]:
len(data_train)

11501

In [188]:
# data_test, data_train


In [169]:
def rank(u,i):
    return np.sum([int(c) for c in i])

Index(['98', '97', '69', '78', '77', '86', '85', '76', '58', '49', '57', '84',
       '75', '39', '93', '48', '74', '38', '19', '37', '82', '91', '55', '46',
       '72', '9', '54', '36', '45', '18', '8', '35', '26', '53', '25', '43',
       '34', '15', '6', '51', '14', '23', '5', '3', '20', '11'],
      dtype='object')

34

In [177]:
np.where(ranked_random.sort_values(ascending=False).index==row.movie_id)

array([[37],
       [38]], dtype=int64)

In [None]:
data_train.user_id

In [44]:
for u,i in ids_to_estimate:
    print(u,i, str(u) in data_train.user_id)

92 67 False
110 67 False
184 67 False
326 67 False
453 67 False
622 67 False
643 67 False
684 67 False
712 67 False
727 67 False
741 67 False
833 67 False
868 67 False
892 67 False
196 25 False
157 25 False
99 25 False
59 25 False
243 25 False
222 25 False
279 25 False
145 25 False
90 25 False
271 25 False
44 25 False
264 25 False
268 25 False
256 25 False
15 25 False
207 25 False
14 25 False
193 25 False
78 25 False
185 25 False
159 25 False
24 25 False
277 25 False
343 25 False
354 25 False
406 25 False
417 25 False
424 25 False
450 25 False
452 25 False
453 25 False
458 25 False
503 25 False
504 25 False
518 25 False
525 25 False
520 25 False
517 25 False
533 25 False
537 25 False
560 25 False
577 25 False
555 25 False
588 25 False
614 25 False
625 25 False
632 25 False
636 25 False
672 25 False
692 25 False
703 25 False
765 25 False
790 25 False
823 25 False
835 25 False
843 25 False
848 25 False
852 25 False
896 25 False
897 25 False
899 25 False
933 25 False
936 25 False
194 94 F

640 96 False
645 96 False
648 96 False
659 96 False
661 96 False
698 96 False
709 96 False
712 96 False
758 96 False
753 96 False
774 96 False
796 96 False
807 96 False
815 96 False
830 96 False
826 96 False
867 96 False
871 96 False
883 96 False
889 96 False
901 96 False
913 96 False
932 96 False
301 29 False
246 29 False
13 29 False
267 29 False
11 29 False
110 29 False
1 29 False
56 29 False
158 29 False
197 29 False
109 29 False
313 29 False
328 29 False
387 29 False
495 29 False
541 29 False
545 29 False
543 29 False
642 29 False
650 29 False
660 29 False
796 29 False
798 29 False
830 29 False
846 29 False
864 29 False
881 29 False
896 29 False
286 85 False
291 85 False
308 85 False
10 85 False
293 85 False
49 85 False
327 85 False
453 85 False
476 85 False
551 85 False
640 85 False
690 85 False
751 85 False
788 85 False
804 85 False
916 85 False
308 24 False
178 24 False
251 24 False
279 24 False
54 24 False
159 24 False
314 24 False
318 24 False
363 24 False
371 24 False
374 24 

592 22 False
600 22 False
618 22 False
615 22 False
638 22 False
682 22 False
683 22 False
686 22 False
694 22 False
737 22 False
788 22 False
823 22 False
826 22 False
833 22 False
881 22 False
916 22 False
933 22 False
286 72 False
200 72 False
102 72 False
42 72 False
216 72 False
49 72 False
307 72 False
144 72 False
159 72 False
327 72 False
328 72 False
336 72 False
343 72 False
363 72 False
389 72 False
393 72 False
406 72 False
716 72 False
727 72 False
848 72 False
881 72 False
889 72 False
892 72 False
887 72 False
921 72 False
934 72 False
943 72 False
6 69 False
62 69 False
210 69 False
276 69 False
7 69 False
42 69 False
267 69 False
90 69 False
271 69 False
198 69 False
41 69 False
269 69 False
5 69 False
77 69 False
187 69 False
184 69 False
144 69 False
65 69 False
235 69 False
288 69 False
188 69 False
311 69 False
106 69 False
312 69 False
318 69 False
327 69 False
331 69 False
339 69 False
343 69 False
407 69 False
416 69 False
419 69 False
456 69 False
487 69 False


305 48 False
291 48 False
308 48 False
201 48 False
13 48 False
269 48 False
18 48 False
1 48 False
296 48 False
65 48 False
221 48 False
156 48 False
106 48 False
393 48 False
406 48 False
458 48 False
478 48 False
487 48 False
533 48 False
556 48 False
561 48 False
577 48 False
615 48 False
655 48 False
716 48 False
747 48 False
741 48 False
851 48 False
883 48 False
903 48 False
916 48 False
305 83 False
122 83 False
299 83 False
249 83 False
292 83 False
11 83 False
18 83 False
151 83 False
70 83 False
307 83 False
297 83 False
158 83 False
270 83 False
187 83 False
235 83 False
361 83 False
393 83 False
409 83 False
417 83 False
429 83 False
435 83 False
488 83 False
500 83 False
536 83 False
642 83 False
648 83 False
716 83 False
747 83 False
746 83 False
741 83 False
798 83 False
815 83 False
840 83 False
899 83 False
913 83 False
253 87 False
59 87 False
87 87 False
42 87 False
174 87 False
56 87 False
16 87 False
374 87 False
409 87 False
416 87 False
465 87 False
496 87 False

<div class  = "alert alert-success">**EXERCISE 3**<p>
Modify the similarity function with the following:
$$new\_sim(a,b) = sim(a,b) * \frac{min(50,|P_{ab}|)}{50} $$
where $|P_{ab}|$ is the number of common items with user $a$ and user $b$
</div>

<div class  = "alert alert-success">**EXERCISE 4**<p>
Is there a set of users where the systems work better than with othes users?
Does it depend on the number of rating per user?
</div>