In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
%matplotlib inline

In [2]:
df = pd.read_csv('u.user',sep='|',header=None)
df.head()

Unnamed: 0,0,1,2,3,4
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [3]:
df = pd.read_csv('u.item',encoding='latin-1',sep='|',header=None,names=['movie_id','movie_title','release_date','video_release_date','IMDb URL','unknown','action','adventure','animation','children\'s','comedy','crime','documentary','drama','fantasy','film-noir','horror','musical','mystery','romance','sci-fi','thriller','war','western'])
id_to_title = dict(zip(df.movie_id,df.movie_title))
df.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb URL,unknown,action,adventure,animation,children's,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
df_aux = pd.read_csv('u.data',sep='\t',header=None,names=['user_id','movie_id','rating','timestamp'])
print('rating values:',np.unique(df_aux.rating))
df = pd.merge(df_aux, df)
del df_aux
df.head()

rating values: [1 2 3 4 5]


Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,video_release_date,IMDb URL,unknown,action,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
1,63,242,3,875747190,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
2,226,242,5,883888671,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
3,154,242,3,879138235,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
4,306,242,5,876503793,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
from sklearn.metrics import pairwise_distances
from time import time as tt
from scipy.spatial.distance import euclidean, cosine
from scipy.stats import pearsonr



def custom(metric='euclidean', min_common_items=2):
    
    def cosine_sim(u,v):
        mask = (u==14.)&(v==14)
        if mask.sum()>2:
            return (1-cosine(u[mask],v[mask]))/2
        else:
            return 1

    # Returns a distance-based similarity score for person1 and person2
    def euclid_sim(u,v):
        mask = (u!=14) & (v!=14)
        u = u[mask]
        v = v[mask]
        if mask.sum() < min_common_items:
            return 0
        #return distEuclid(rep['rating_x'],rep['rating_y']) 
        return 1.0/(1+euclidean(u,v))

    # Returns a pearsonCorrealation-based similarity score for person1 and person2
    def pearson_sim(u,v):
        mask = (u!=14) & (v!=14)
        u = u[mask]
        v = v[mask]
        if mask.sum() < min_common_items:
            return 0    
        res=pearsonr(u,v)[0]
        if(np.isnan(res)):
            return 0
        return res
    
    if metric=='euclidean':
        return euclid_sim
    if metric=='cosine':
        return cosine_sim
    if metric=='pearson':
        return pearson_sim

# ratings_matrix = df.pivot_table(index=['movie_id'],columns=['user_id'],values='rating')
# ratings_matrix = ratings_matrix.sub(ratings_matrix.mean(axis=1), axis=0)
# ratings_matrix.fillna( 14, inplace = True )
# t = tt()
# movie_similarity = pairwise_distances( ratings_matrix.values, metric=custom('euclidean') )
# t = tt()-t
# print('Training time:',t)
# ratings_matrix = pd.DataFrame( data= movie_similarity, index=ratings_matrix.index, columns=ratings_matrix.index )
# movie_similarity

In [6]:
from sklearn.metrics import pairwise_distances
from time import time as tt
from scipy.spatial.distance import euclidean, cosine
from scipy.stats import pearsonr


class CollaborativeFiltering:
#     def __init__(self,metric='euclidean',min_common_users=2):
    def __init__(self):
        self.sim_mat=None
        self.movie_list = None
        self.user_list = None
        self.min_common_users = 2
        return None
    
    def fit(self, df, metric='euclidean'):
        t = tt()
        ratings_matrix = df.pivot_table(index=['movie_id'],columns=['user_id'],values='rating')
        self.movie_list = set(ratings_matrix.index)
        self.user_list = set(ratings_matrix.columns)
        ratings_matrix = ratings_matrix.sub(ratings_matrix.mean(axis=1), axis=0)
        ratings_matrix.fillna( 14, inplace = True )
        movie_similarity = pairwise_distances( ratings_matrix.values, metric=custom(metric, self.min_common_users) )
        np.fill_diagonal( movie_similarity, 0 )
        self.sim_mat = pd.DataFrame( data= movie_similarity, index=ratings_matrix.index, columns=ratings_matrix.index )
        t = tt()-t
        print('Training time:',t)
        return None
        
    def predict(self, df, user_id, movie_id):
        df_reduced = df[df.user_id == user_id]
        user_movie_list = set( df_reduced.movie_id).intersection(self.movie_list)
        user_mean_rating = df_reduced.rating.mean()
        df_reduced = df_reduced[df_reduced.movie_id.isin(user_movie_list)]
        df_reduced = df_reduced.pivot_table(index='movie_id',columns='user_id',values='rating')
        df_reduced = df_reduced.sub(df_reduced.mean(axis=0), axis=1)
        rating_num=0.0
        rating_den=0.0
        num = (sim_mat[movie_id].filter(items=user_movie_list, axis=0) * df_reduced[user_id].filter(items=user_movie_list, axis=0)).sum()
        den = (sim_mat[movie_id].filter(items=user_movie_list, axis=0) ).sum()
        if den==0 or num/den>10:
            movie_mean_rating = df.rating[df.movie_id == movie_id].mean()
            if movie_mean_rating >0:
#                 print('Movie mean rating:', movie_mean_rating)
                # return the mean movie rating if there is no similar for the computation
                return int(round(movie_mean_rating))
            else:
#                 print('User mean rating:', user_mean_rating)
                # else return mean user rating 
                return int(round(user_mean_rating))
#         print('Original formula:', user_mean_rating, rating_num/rating_den)
        return int(round(user_mean_rating + rating_num/rating_den))

    def complete_prediction(self, df):
        movies = set(df.movie_id).intersection(self.movie_list)
        users = set(df.user_id).intersection(self.user_list)
        df_reduced = df.filter()
        results = []
        print('Total number of predictions to do:',len(users)*len(movies))
        count=0
        for movie in movies:
            partial_result = []
            for user in users:
                count += 1
                if count%100==0:
                    print('Percentage of predictions done:',count/len(users)/len(movies), end='\r')
                partial_result.append(self.predict(df,user,movie))
            results.append(partial_result)
        df_results = pd.DataFrame(data=results, index=movies, columns=users)
        return df_results
          
    def mse(self,df1,df2): 
        users_1 = set(df1.user_id)
        movies_1 = set(df1.movie_id)
        users_2 = set(df2.columns)
        movies_2 = set(df2.index)
        users = users_1.intersection(users_2)
        movies = movies_1.intersection(movies_2)
        ratings_matrix = df1.pivot_table(index='movie_id', columns='user_id', values='rating')
        mse = 0
        n = 0
        for user in users:
            for movie in movies:
                elem = ratings_matrix[user][movie]
                if ~np.isnan(elem):
                    mse += (df2[user][movie]-elem)**2
                    n+=1
        mse = mse/n
        return mse


SyntaxError: invalid syntax (<ipython-input-6-e22b6b1a3b86>, line 72)

In [181]:
metric_list = ['euclidean','cosine','pearson']

# cf = CollaborativeFiltering()
# for metric in metric_list:
#     cf.fit(df, metric=metric)
#     df_results = cf.complete_prediction(df)
#     print(metric+' mse:', cf.mse(df,df_results))

cf = CollaborativeFiltering()
cf.fit(df)
df_results = cf.complete_prediction(df)
    

Training time: 29.667712688446045
Total number of predictions to do: 1586126
Percentage of predictions done: 0.01178973171109987646

KeyboardInterrupt: 

In [48]:
data = pd.DataFrame({'movie_id':[1,1,1,2,2,2,3,3,3,4,4,4,5,5,5], 'user_id':[1,2,4,1,3,4,2,3,4,1,2,3,1,2,3], 'rating':[1,2,3,4,3,1,5,4,3,5,2,3,4,2,2]})
data.pivot_table(index='movie_id', columns='user_id', values='rating')
from tabulate import tabulate
pdtabulate=lambda df:tabulate(df,headers='keys',tablefmt='psql')
print(pdtabulate(data))
users = set(data.user_id)
movies = set(data.movie_id)
results = []
cf = CollaborativeFiltering(metric='euclidean')
cf.fit(data)
for movie in movies:
    partial_result = []
    for user in users:
        partial_result.append(cf.predict(data,user,movie))
    results.append(partial_result)
df_results = pd.DataFrame(data=results, index=movies, columns=users)
print(pdtabulate(df_results))

+----+------------+-----------+----------+
|    |   movie_id |   user_id |   rating |
|----+------------+-----------+----------|
|  0 |          1 |         1 |        1 |
|  1 |          1 |         2 |        2 |
|  2 |          1 |         4 |        3 |
|  3 |          2 |         1 |        4 |
|  4 |          2 |         3 |        3 |
|  5 |          2 |         4 |        1 |
|  6 |          3 |         2 |        5 |
|  7 |          3 |         3 |        4 |
|  8 |          3 |         4 |        3 |
|  9 |          4 |         1 |        5 |
| 10 |          4 |         2 |        2 |
| 11 |          4 |         3 |        3 |
| 12 |          5 |         1 |        4 |
| 13 |          5 |         2 |        2 |
| 14 |          5 |         3 |        2 |
+----+------------+-----------+----------+
+------------+-----+-----+-----+-----+
|   movie_id |   1 |   2 |   3 |   4 |
|------------+-----+-----+-----+-----|
|          1 |   1 |   2 | nan |   3 |
|          2 |   4 | nan | 

In [122]:
def mse(df1,df2): 
    users_1 = set(df1.user_id)
    movies_1 = set(df1.movie_id)
    users_2 = set(df2.columns)
    movies_2 = set(df2.index)
    users = users_1.intersection(users_2)
    movies = movies_1.intersection(movies_2)
    ratings_matrix = df1.pivot_table(index='movie_id', columns='user_id', values='rating')
    mse = 0
    n = 0
    for user in users:
        for movie in movies:
            elem = ratings_matrix[user][movie]
            if ~np.isnan(elem):
                mse += (df2[user][movie]-elem)**2
                n+=1
    mse = mse/n
    return mse

mse(data, df_results)

2.322736941246297

In [93]:
metric_list = ['euclidean','cosine','pearson']

for metric in metric_list:
    cf = CollaborativeFiltering(metric=metric)
    cf.fit(data)
    

5

In [40]:
cf = CollaborativeFiltering()
cf.fit(df)
cf.predict(df,2,2)

user_id          2
movie_id          
1         0.290323
10       -1.709677
13        0.290323
14        0.290323
19       -0.709677
...            ...
312      -0.709677
313       1.290323
314      -2.709677
315      -2.709677
316       1.290323

[62 rows x 1 columns]
Original formula: 3.7096774193548385 -0.07333349472550948


3.636343924629329

In [82]:
~np.isnan(np.nan)

False

In [162]:
np.round(1.5)

2.0

In [177]:
import time
for i in range(4):
    print('loading'+'.'*i,end='\r')
    time.sleep(0.5)

loading...

In [191]:
pd.DataFrame(data=[[1,np.nan,],[2, 2,2]]).isnull().sum()

0    0
1    1
2    1
dtype: int64

In [216]:
data = pd.DataFrame({'movie_id':[1,1,1,2,2,2,3,3,3,4,4,4,5,5,5], 'user_id':[1,2,4,1,3,4,2,3,4,1,2,3,1,2,3], 'rating':[1,2,3,4,3,1,5,4,3,5,2,3,4,2,2]})
data_piv = data.pivot_table(index='movie_id', columns='user_id', values='rating')
data_piv.fillna(-1, inplace=True)
data_piv.where(data_piv<0, data_piv*mat)

user_id,1,2,3,4
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,2.0,-1.0,9.0
2,16.0,-1.0,18.0,7.0
3,-1.0,45.0,40.0,33.0
4,60.0,26.0,42.0,-1.0
5,64.0,34.0,36.0,-1.0


In [109]:
mat = np.array(range(20)).reshape(5,4)
dataf = pd.DataFrame(data=mat)
dataf.where(dataf<10, dataf*mat)
dataf.where((dataf.columns), 0)

ValueError: Array conditional must be same shape as self

In [227]:
from time import time
t = time()
ratings_matrix = df.pivot_table(index=['movie_id'],columns=['user_id'],values='rating').sub(ratings_matrix.mean(axis=1), axis=0).fillna( 14, inplace = True )
movie_similarity = pairwise_distances( ratings_matrix.values, metric=custom() )
print('time 1:',time()-t)
t = time()


AttributeError: 'NoneType' object has no attribute 'mean'

In [14]:
t = tt()
metric='euclidean'
min_common_users=2
ratings_matrix = df.pivot_table(index=['movie_id'],columns=['user_id'],values='rating')
movie_list = set(ratings_matrix.index)
user_list = set(ratings_matrix.columns)
ratings_matrix = ratings_matrix.sub(ratings_matrix.mean(axis=1), axis=0)
ratings_matrix.fillna( 14, inplace = True )
movie_similarity = pairwise_distances( ratings_matrix.values, metric=custom(metric, min_common_users) )
np.fill_diagonal( movie_similarity, 0 )
sim_mat = pd.DataFrame( data= movie_similarity, index=ratings_matrix.index, columns=ratings_matrix.index )
t = tt()-t
print('Training time:',t)

Training time: 52.34082865715027


In [127]:
from time import time



user_id=1
movie_id=1

t =  time()

df_reduced = df[df.user_id == user_id]
user_movie_list = set( df_reduced.movie_id)
user_mean_rating = df_reduced.rating.mean()
df_reduced = df_reduced[df_reduced.movie_id.isin(user_movie_list)]
df_reduced = df_reduced.pivot_table(index='movie_id',columns='user_id',values='rating')
df_reduced = df_reduced.sub(df_reduced.mean(axis=0), axis=1)
rating_num=0.0
rating_den=0.0

num = (sim_mat[movie_id].filter(items=user_movie_list, axis=0) * df_reduced[user_id].filter(items=user_movie_list, axis=0)).sum()
den = (sim_mat[movie_id].filter(items=user_movie_list, axis=0) ).sum()
# for other_movie_id in user_movie_list:
#     rating_num += sim_mat[movie_id][other_movie_id] * df_reduced[user_id][other_movie_id]
#     rating_den += sim_mat[movie_id][other_movie_id]
t = time()-t
print('Prediction time:',t)

Prediction time: 0.019208431243896484


In [13]:
data = pd.DataFrame({'movie_id':[1,1,1,2,2,2,3,3,3,4,4,4,5,5,5], 'user_id':[1,2,4,1,3,4,2,3,4,1,2,3,1,2,3], 'rating':[1,2,3,4,3,1,5,4,3,5,2,3,4,2,2]})
data_piv = data.pivot_table(index='movie_id', columns='user_id', values='rating')
user_list = set([1,2])
a = data_piv.filter(items=user_list, axis=0).filter(items=user_list, axis=1)**2
a

user_id,1,2
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,4.0
2,16.0,


In [8]:
data_piv

user_id,1,2,3,4
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,2.0,,3.0
2,4.0,,3.0,1.0
3,,5.0,4.0,3.0
4,5.0,2.0,3.0,
5,4.0,2.0,2.0,


In [117]:
user_id=1
movie_id=270

num = (sim_mat[movie_id].filter(items=user_movie_list, axis=0) * df_reduced[user_id].filter(items=user_movie_list, axis=0)).sum()
den = (sim_mat[movie_id].filter(items=user_movie_list, axis=0) ).sum()
num/den

-0.08395390252074358

In [108]:
a = sim_mat[movie_id].filter(items=user_movie_list, axis=0)

In [96]:
b = df_reduced[user_id].filter(items=user_movie_list, axis=0)

In [97]:
(a*b).sum()

-8.724250575773516

In [104]:
b[:-1].sum()

0.6102941176470509

In [106]:
b[-1:]

movie_id
272   -0.610294
Name: 1, dtype: float64

In [110]:
sim_mat

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.078128,0.075116,0.059985,0.112651,0.194015,0.045516,0.064782,0.048599,0.088970,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.078128,0.000000,0.110890,0.080668,0.124449,0.259807,0.077719,0.098818,0.080626,0.150029,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.075116,0.110890,0.000000,0.085485,0.165656,0.294847,0.077891,0.090637,0.085753,0.154082,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.059985,0.080668,0.085485,0.000000,0.078397,0.176777,0.063221,0.074923,0.072312,0.131344,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.112651,0.124449,0.165656,0.078397,0.000000,0.290690,0.086061,0.116197,0.096847,0.200201,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1680,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1681,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
rating_num = 0
rating_den = 0
for other_movie_id in user_movie_list:
    rating_num += sim_mat[movie_id][other_movie_id] * df_reduced[user_id][other_movie_id]
    rating_den += sim_mat[movie_id][other_movie_id]
rating_num / rating_den

-0.08395390252074356