In [1]:
import pandas as pd
import numpy as np
import math


In [233]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols, dtype={'user_id':str})

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols, dtype={'movie_id':str, 'user_id':str})

m_cols = ['movie_id', 'title', 'release_date']
movies = pd.read_csv('data/ml-100k/u.item', sep='|', names=m_cols, usecols=range(3), encoding='latin-1', dtype={'movie_id':str})

data = pd.merge(pd.merge(ratings, users), movies)
data = data[['user_id','title', 'movie_id','rating','release_date','sex','age']]


print("BD has", str(data.shape[0]),"ratings")
print("BD has",data.user_id.nunique(),"users")
print("BD has",data.movie_id.nunique(),"movies")
data.head()


BD has 100000 ratings
BD has 943 users
BD has 1682 movies


Unnamed: 0,user_id,title,movie_id,rating,release_date,sex,age
0,196,Kolya (1996),242,3,24-Jan-1997,M,49
1,305,Kolya (1996),242,5,24-Jan-1997,M,23
2,6,Kolya (1996),242,4,24-Jan-1997,M,42
3,234,Kolya (1996),242,4,24-Jan-1997,M,60
4,63,Kolya (1996),242,3,24-Jan-1997,M,31


In [234]:
def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.loc[sampled_ids, 'for_testing'] = True
    return df

data['for_testing'] = False
grouped = data.groupby('user_id', group_keys=False).apply(assign_to_set)
data_train = data[grouped.for_testing == False]
data_test = data[grouped.for_testing == True]
print(data_train.shape)
print(data_test.shape)
print(data_train.index & data_test.index)

print("Training data_set has", str(data_train.shape[0]),"ratings")
print("Test data set has", str(data_test.shape[0]),"ratings")
print("La BD has", data.movie_id.nunique(), "movies")


(79619, 8)
(20381, 8)
Int64Index([], dtype='int64')
Training data_set has 79619 ratings
Test data set has 20381 ratings
La BD has 1682 movies


In [4]:
data_train = pd.read_pickle('data/data_train.pkl')

In [5]:
data_test = pd.read_pickle('data/data_train.pkl')

##### How to get the set of movies from user with id  "1"?

In [6]:
def pivot_db(ratings, adjust_users=False):
    ratings_pivoted = ratings.pivot(
        index='movie_id',
        columns='user_id',
        values='rating'
    )
    if adjust_users:
        ratings_pivoted = ratings_pivoted.apply(lambda u: u-u.mean(), axis=0)
    ratings_pivoted['users_rated']=ratings_pivoted.apply(lambda m: m[-m.isnull()].index, axis=1)    
    return ratings_pivoted#.fillna(0)

In [7]:
# users = data_train_pivoted['users_rated'].iloc[0]

In [8]:
def sim_euclid(a,b):
    from scipy.spatial.distance import euclidean
    return 1.0/(1.0+euclidean(a,b))  

In [9]:
def sim_pearson(a,b):
    from scipy.spatial.distance import correlation
    return 1-correlation(a,b)

In [10]:
def sim_cosine(a,b):
    from scipy.spatial.distance import cosine
    return 1-cosine(a,b)

In [11]:
def compare(pivoted_m1, pivoted_m2, similarity):
#     m1 = df.loc[m_id1]
#     m2 = df.loc[m_id2]
    if (len(pivoted_m1)==0 or len(pivoted_m2)==0):
        return 1
    users_1 = pivoted_m1['users_rated']
    users_2 = pivoted_m2['users_rated']
    intersection = users_1[users_1.isin(users_2)]
    if len(intersection)==0: 
        return 1
    return similarity(pivoted_m1.loc[intersection], pivoted_m2.loc[intersection])

In [12]:
from tqdm import tqdm

In [61]:
class CollaborativeItemReco:
    """ Collaborative filtering using a custom sim(u,u'). """    
    def __init__(self, ratings, similarity, adjust_users=False, k=None, sim_matrix_path=None):
        """ Constructor """        
        self.df = pivot_db(ratings, adjust_users)        
        self.similarity=similarity
        if not sim_matrix_path:
            self.sim_matrix = pd.DataFrame(1, columns=self.df.index, index=self.df.index)
        else:
            self.sim_matrix = pd.read_pickle(sim_matrix_path)
        self.k = k+1 if k else len(self.df.index)
    def learn(self):
        """ Prepare data structures for estimation. Similarity matrix for users """
        for i, mId1 in enumerate(tqdm(self.df.index)):
            for j, mId2 in enumerate(self.df.index[i+1:]):
                sim = compare(self.df.loc[mId1],self.df.loc[mId2], self.similarity)
                self.sim_matrix.loc[mId1,mId2]=sim
                self.sim_matrix.loc[mId2,mId1]=sim
                
    def estimate_basic(self, u, j):
        # u is user
        # j is movie
        if u not in self.df:
            print('u_{} not in training'.format(u))
            return 3
        if j not in self.sim_matrix:
            print('m_{} not in training'.format(j))
            return 3
        u_ratings = self.df[u][self.df[u]>0]
        num=0
        den=0
        P_k = self.sim_matrix[j].loc[u_ratings.index].sort_values(ascending=False).iloc[1:self.k]
        means_movies = self.df.apply(lambda m: m.loc[m['users_rated']].mean(),axis=1)
        for i, sim in P_k.iteritems():
            num+=sim*u_ratings[i]
            den+=sim
        if den==0: 
            if means_movies[j]>0:
                # return the mean movie rating if there is no similar for the computation
                return means_movies[j]
            else:
                # else return mean user rating 
                return u_ratings.mean()
        return num/den
    
    def estimate_mean(self, u, j):
        # u is user
        # j is movie
        if u not in self.df:
            print('u_{} not in training'.format(u))
            return 3
        if j not in self.sim_matrix:
            print('m_{} not in training'.format(j))
            return 3
        u_ratings = self.df[u][self.df[u]>0]
        num=0
        den=0
        P_k = self.sim_matrix[j].loc[u_ratings.index].sort_values(ascending=False).iloc[1:]
        means_movies = self.df.apply(lambda m: m.loc[m['users_rated']].mean(),axis=1)
        for i, sim in P_k.iteritems():
            r_i_mean = means_movies[i]
            num+=sim*(u_ratings[i]-r_i_mean)
            den+=sim
        if den==0: 
            if means_movies[j]>0:
                # return the mean movie rating if there is no similar for the computation
                return means_movies[j]
            else:
                # else return mean user rating 
                return u_ratings.mean()
        return means_movies[j]+num/den

In [14]:
# data_train_sample = data_train[data_train.movie_id.apply(lambda x: int(x)<100)]
# data_test_sample = data_test[data_test.movie_id.apply(lambda x: int(x)<100)]

In [15]:
# item_cosine_sample = CollaborativeItemReco(data_train_sample, sim_cosine)
# item_cosine_sample.learn()

In [221]:
# data_test = data_test[data_test.movie_id.apply(lambda x: int(x)<100)]

In [66]:
item_cosine = CollaborativeItemReco(data_train, sim_cosine, sim_matrix_path='data/cosine.pkl')
# item_cosine.learn()


In [244]:
# item_cosine.sim_matrix.to_pickle('data/cosine.pkl')

In [67]:
item_cosine_adjusted = CollaborativeItemReco(data_train, sim_cosine, adjust_users=True, sim_matrix_path='data/cosine_adjusted.pkl')
# item_cosine_adjusted.learn()

In [288]:
# item_cosine_adjusted.sim_matrix.to_pickle('cosine_adjusted.pkl')

In [68]:
item_pearson = CollaborativeItemReco(data_train, sim_pearson, sim_matrix_path='data/pearson.pkl')
# item_pearson.learn()

In [253]:
# item_pearson.sim_matrix.to_pickle('data/pearson.pkl')

In [88]:
item_pearson_adjusted = CollaborativeItemReco(data_train, sim_pearson, adjust_users=True, sim_matrix_path='pearson_adjusted.pkl')
# item_pearson_adjusted.learn()

In [92]:
# item_pearson_adjusted.sim_matrix

In [290]:
# item_pearson_adjusted.sim_matrix.to_pickle('data/pearson_adjusted.pkl')

In [94]:
item_euclid = CollaborativeItemReco(data_train, sim_euclid, sim_matrix_path='data/euclid.pkl')
# item_euclid.learn()

In [23]:
# item_euclid.sim_matrix.to_pickle('data/euclid.pkl')

In [62]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [63]:
def evaluate(estimate_f,data_train,data_test):
    """ RMSE-based predictive performance evaluation with pandas. """
    u_train = set(data_train.user_id)
    m_train = set(data_train.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in zip(data_test.user_id, data_test.movie_id)])
    real = data_test.rating.values
    return compute_rmse(estimated, real)

In [64]:
# import importlib
# import tqdm
# importlib.reload(tqdm)

<module 'tqdm' from 'C:\\Users\\annae\\miniconda3\\lib\\site-packages\\tqdm\\__init__.py'>

In [108]:
all_models = {'cosine':item_cosine, 'pearson':item_pearson, 'euclid':item_euclid}

In [109]:
# evaluate(model.estimate_basic, data_train, data_test.sample(1))
data_small_test = data_test.sample(1000)

In [110]:
rmse = []
for model_label, model in all_models.items():
    model.sim_matrix = model.sim_matrix.fillna(0)
    print('--------- {} ---------'.format(model_label))
    rmse.append(evaluate(model.estimate_basic, data_train, data_small_test))
    print('RMSE basic of {}: {}'.format(model_label,rmse[-1]))
    rmse.append(evaluate(model.estimate_mean, data_train, data_small_test))
    print('RMSE mean of {}: {}'.format(model_label,rmse[-1]))

--------- cosine ---------
RMSE basic of cosine: 1.0366071543385287
RMSE mean of cosine: 0.9285720967063059
--------- pearson ---------
RMSE basic of pearson: 4.378503532488526
RMSE mean of pearson: 4.535351302983304
--------- euclid ---------
RMSE basic of euclid: 0.9879176904758223
RMSE mean of euclid: 0.9013302016495014


In [116]:
def precision_recall(estimate_f, data_train, data_test, N=25):
    all_movies_ids = set(data_train.movie_id.values).union(set(data_train.movie_id.values))
    in_top = 0
    ratings_5_test = data_test[data_test.rating==5]
#     print(len(ratings_5_test))
    for i, row in ratings_5_test.iterrows():
        user_seen = list(data_train[data_train.user_id==row.user_id].movie_id.values)+list(data_test[data_test.user_id==row.user_id].movie_id.values)
        unseen = all_movies_ids.difference(user_seen)
        choosen_unseen = np.random.choice(list(unseen), min(len(unseen), 50), replace=False)
#         print(i, len(choosen_unseen))
        ranked_random = pd.Series(list(map(lambda i:estimate_f(u=row.user_id,j=i),choosen_unseen))+[estimate_f(row.user_id, row.movie_id)],index=list(choosen_unseen)+[row.movie_id])
        index_row = np.argwhere(ranked_random.sort_values(ascending=False).index.values==row.movie_id).flatten()[0]
        if index_row<15:
            in_top+=1
    return in_top/len(ratings_5_test)
#     print(row.user_id)
#     break

In [119]:
prs = []
for model_label, model in all_models.items():
    print('--------- {} ---------'.format(model_label))
    prs.append(precision_recall(model.estimate_basic, data_train, data_small_test))
    print('P/R basic of {}: {}'.format(model_label,prs[i]))
    prs.append(precision_recall(model.estimate_mean, data_train, data_small_test))
    print('P/R mean of {}: {}'.format(model_label,prs[i+1]))


--------- cosine ---------
P/R basic of cosine: 0.423728813559322
P/R mean of cosine: 0.7584745762711864
--------- pearson ---------
P/R basic of pearson: 0.8559322033898306
P/R mean of pearson: 0.9279661016949152
--------- euclid ---------
P/R basic of euclid: 0.6991525423728814
P/R mean of euclid: 0.8177966101694916
