In [2]:
#first download data!! movielens 100k
import pandas as pd
import numpy as np
import math
import matplotlib.pylab as plt
import tqdm
# Load Data set
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols, dtype={'user_id':str})

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols, dtype={'movie_id':str, 'user_id':str})

m_cols = ['movie_id', 'title', 'release_date']
movies = pd.read_csv('data/ml-100k/u.item', sep='|', names=m_cols, usecols=range(3), encoding='latin-1', dtype={'movie_id':str})

data = pd.merge(pd.merge(ratings, users), movies)
data = data[['user_id','title', 'movie_id','rating','release_date','sex','age']]


print("BD has", str(data.shape[0]),"ratings")
print("BD has",data.user_id.nunique(),"users")
print("BD has",data.movie_id.nunique(),"movies")
data.head()



def evaluate(estimate_f,data_train,data_test):
    """ RMSE-based predictive performance evaluation with pandas. """
    ids_to_estimate = zip(data_test.user_id, data_test.movie_id)
    estimated = np.array([estimate_f(u,i) if u in data_train.user_id else 3 for (u,i) in ids_to_estimate ])
    real = data_test.rating.values
    return compute_rmse(estimated, real)

def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))


## Divide the data in two sets: training and test
def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.loc[sampled_ids, 'for_testing'] = True
    return df

data['for_testing'] = False
grouped = data.groupby('user_id', group_keys=False).apply(assign_to_set)
data_train = data[grouped.for_testing == False]
data_test = data[grouped.for_testing == True]
print(data_train.shape)
print(data_test.shape)
print(data_train.index & data_test.index)

print("Training data_set has "+ str(data_train.shape[0]) +" ratings")
print("Test data set has "+ str(data_test.shape[0]) +" ratings")
print("La BD has ", data.movie_id.nunique(), " movies")

Autosaving every 150 seconds
BD has 100000 ratings
BD has 943 users
BD has 1682 movies
(79619, 8)
(20381, 8)
Int64Index([], dtype='int64')
Training data_set has 79619 ratings
Test data set has 20381 ratings
La BD has  1682  movies


In [11]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

class SVD_CollaborativeFiltering:
    """ Collaborative filtering using a custom sim(u,u'). """
    
    def __init__(self,DataFrame, num_components=10,
                 item_fact_reg=0.0, 
                 user_fact_reg=0.0):
        """ Constructor """
        self.df = DataFrame
        self.num_components = num_components
        self.item_fact_reg = item_fact_reg
        self.user_fact_reg = user_fact_reg
        
        urm = pd.pivot_table(self.df[['user_id','movie_id','rating']],columns='movie_id',index='user_id',values='rating',fill_value=0)
        self.n_users, self.n_items = urm.shape
        self.ratings = np.float32(urm.values)
        
        user_index = np.arange(len(urm.index))
        self.users = dict(zip(user_index,urm.index ))
        self.users_index2id = dict(zip(urm.index,user_index)) 
        
        movie_index = np.arange(len(urm.columns))
        self.movies = dict(zip(movie_index,urm.columns ))   
        self.movies_index2id = dict(zip(urm.columns, movie_index))

        self.sample_row, self.sample_col = self.ratings.nonzero()
        self.n_samples = len(self.sample_row)
    
    def __sdg__(self):
        for idx in self.training_indices:
            u = self.sample_row[idx]
            i = self.sample_col[idx]
            user_id = self.users[u]
            item_id = self.movies[i]
            
            prediction = self.estimate(user_id, item_id)
            error = (self.ratings[u,i] - prediction) # error
            
            #Update latent factors
            self.user_vecs[u, :] += self.learning_rate * \
                                    (error * self.item_vecs[i, :] - \
                                     self.user_fact_reg * self.user_vecs[u,:])
            self.item_vecs[i, :] += self.learning_rate * \
                                    (error * self.user_vecs[u, :] - \
                                     self.item_fact_reg * self.item_vecs[i,:])
                
                
    def learn(self,n_iter = 10, learning_rate=0.001):
        """ Train the model. """
        self.train_mse =[]
        self.test_mse = []
        iter_diff = 0
        
        # initialize latent vectors
        self.user_vecs = np.random.normal(scale=1./self.num_components,\
                                          size=(self.n_users, self.num_components))
        self.item_vecs = np.random.normal(scale=1./self.num_components,
                                          size=(self.n_items, self.num_components))

        self.learning_rate = learning_rate
        
        ctr =1
        while ctr <= n_iter:
            if ctr % 1 == 0:
                print('Iteration: {}'.format(ctr))
            self.training_indices = np.arange(self.n_samples)
            #shuffle training samples
            np.random.shuffle(self.training_indices)
            self.__sdg__()
            ctr += 1
                
            print('\tTrain mse: %s' % evaluate(reco.estimate,data_train,data_train))
            print('\tTest mse: %s' % evaluate(reco.estimate,data_test,data_train))
    
                
    def estimate(self, user_id, movie_id):
        """ Single user and item prediction."""
        try:
            u = self.users_index2id[user_id]
            i = self.movies_index2id[movie_id]
            prediction =  self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
            return prediction
        except:
            return 3
        
    def estimate_cosine(self, user_id, movie_id):
        """ Single user and item prediction."""
        from scipy.spatial.distance import cosine
        try:
            u = self.users_index2id[user_id]
            i = self.movies_index2id[movie_id]
            prediction =  1-cosine(self.user_vecs[u, :],self.item_vecs[i, :].T)
            return prediction
        except:
            return 3

In [12]:
reco = SVD_CollaborativeFiltering(data_train,num_components=40)


In [29]:
reco.learn(n_iter = 100)

Iteration: 1
	Train mse: 3.702579756365126
	Test mse: 3.709352144846075
Iteration: 2
	Train mse: 3.7023015428393755
	Test mse: 3.709073024475523
Iteration: 3
	Train mse: 3.70179632242225
	Test mse: 3.708565193591598
Iteration: 4
	Train mse: 3.700659612920658
	Test mse: 3.7074210399183403
Iteration: 5
	Train mse: 3.697787504667652
	Test mse: 3.7045281033201514
Iteration: 6
	Train mse: 3.6901462156529785
	Test mse: 3.696828938892019
Iteration: 7
	Train mse: 3.6695798509582427
	Test mse: 3.67610369429681
Iteration: 8
	Train mse: 3.6152978014156663
	Test mse: 3.6213964289343235
Iteration: 9
	Train mse: 3.481870542602031
	Test mse: 3.4869028026906914
Iteration: 10
	Train mse: 3.206682824413323
	Test mse: 3.2094205973843586
Iteration: 11
	Train mse: 2.7998167406480814
	Test mse: 2.798884247451207
Iteration: 12
	Train mse: 2.4051008297652774
	Test mse: 2.4002217001283737
Iteration: 13
	Train mse: 2.1114901001008692
	Test mse: 2.1033998966255005
Iteration: 14
	Train mse: 1.8974320308700465
	Te

In [30]:
# reco.estimate_cosine('1','2')

In [31]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [32]:
def evaluate(estimate_f,data_train,data_test):
    """ RMSE-based predictive performance evaluation with pandas. """
    data_testable = data_test[data_test.movie_id.isin(data_train.movie_id)&data_test.user_id.isin(data_train.user_id)].copy()
    u_train = set(data_train.user_id)
    m_train = set(data_train.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in zip(data_testable.user_id, data_testable.movie_id)])
    real = data_testable.rating.values
    return compute_rmse(estimated, real)

In [33]:
print('--------- {} ---------'.format('SVD_estimate'))
print('RMSE dot {}'.format(evaluate(reco.estimate,data_train,data_test)))
print('RMSE cosine {}'.format(evaluate(reco.estimate_cosine,data_train,data_test)))

--------- SVD_estimate ---------
RMSE dot 0.9340701096010212
RMSE cosine 2.890285834744717


In [37]:
def precision_recall(estimate_f, data_train, data_test):
    all_movies_ids = set(data_train.movie_id.values).union(set(data_train.movie_id.values))
    in_top = 0
    ratings_5_test = data_test[data_test.rating==5]
#     print(len(ratings_5_test))
    for i, row in ratings_5_test.iterrows():
        user_seen = list(data_train[data_train.user_id==row.user_id].movie_id.values)+list(data_test[data_test.user_id==row.user_id].movie_id.values)
        unseen = all_movies_ids.difference(user_seen)
        choosen_unseen = np.random.choice(list(unseen), min(len(unseen), 50), replace=False)
        ranked_random = pd.Series(list(map(lambda i:estimate_f(row.user_id,i),choosen_unseen))+[estimate_f(row.user_id, row.movie_id)],index=list(choosen_unseen)+[row.movie_id])
        index_row = np.argwhere(ranked_random.sort_values(ascending=False).index.values==row.movie_id).flatten()[0]
        if index_row<15:
            in_top+=1
    return in_top/len(ratings_5_test)

In [38]:
print('--------- {} ---------'.format('SVD_estimate'))
print('P/R dot: {}'.format(precision_recall(reco.estimate,data_train,data_test)))
print('P/R cosine: {}'.format(precision_recall(reco.estimate_cosine,data_train,data_test)))

--------- SVD_estimate ---------
P/R dot: 0.8572741898414158
P/R cosine: 0.29648356699609285
