# Collaborative Filtering

* Not all Users have rated every movie (This is logical, and not really a missing data)
* 

In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.metrics.pairwise import pairwise_distances

In [130]:
r_cols = ["user_id", "movie_id", "rating", "unix_timestamp"]
ratings_train = pd.read_csv("data/ml-100k/ua.base", names=r_cols, sep="\t", encoding="latin-1")
ratings_train.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [131]:
ratings_train.isna().sum()

user_id           0
movie_id          0
rating            0
unix_timestamp    0
dtype: int64

In [132]:
X_train = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')

In [133]:
X_train.shape

(943, 1680)

In [134]:
n_users, n_items = X_train.shape

In [95]:
user_sim = np.zeros((n_users, n_users))
item_sim = np.zeros((n_items, n_items))

In [135]:
X_isna = X_train.isna()

In [136]:
def calc_dist_nan(curr_id, other_id, calc, metric):
    if calc == "user":
        curr = X_train.loc[curr_id, :]
        other = X_train.loc[other_id, :]
        curr_isna = X_isna.loc[curr_id, :]
        other_isna = X_isna.loc[other_id, :]
        
    elif calc == "item":
        curr = X_train.loc[:, curr_id]
        other = X_train.loc[:, other_id]
        curr_isna = X_isna.loc[:, curr_id]
        other_isna = X_isna.loc[:, other_id]
        
        
    valid_idx = ~(curr_isna | other_isna)
    other = other.loc[valid_idx].values.reshape(1, -1)
    curr = curr.loc[valid_idx].values.reshape(1, -1)
    
    try:
        res = pairwise_distances(curr, other, metric=metric)[0, 0]
    except ValueError:
        res = np.nan
    
    return res

In [137]:
calc_dist_nan(1, 5, "user", "correlation")

0.6050554126894128

In [12]:
# for user_id in user_ids:
#     for other_id in user_ids:
#         user_sim[user_id-1, other_id-1] = calc_dist_nan(user_id, other_id, "user", "correlation")

* Cosine Similarity is not affected by ZERO padding (X -> X_zf (zero-fill))
* Pearson Correlation is not affected by MEAN padding (X -> X_umf, X_imf (user and item mean-fill))

In [138]:
X__zf = X_train.fillna(0)
X_umf = X_train.T.fillna(X_train.mean(axis=1), axis=0).T
X_imf = X_train.fillna(X_train.mean(axis=0), axis=0)

Here I choose the pearson correlation similarity

In [139]:
user_sim = 1 - pairwise_distances(X_umf, X_umf, metric="correlation")

In [140]:
item_sim = 1 - pairwise_distances(X_imf.T, X_imf.T, metric="correlation")

In [142]:
user_sim = pd.DataFrame(user_sim, index=X_train.index, columns=X_train.index)
item_sim = pd.DataFrame(item_sim, index=X_train.columns, columns=X_train.columns)

Item similarity has NaN values as some of the columns had the same entries for each movie 

In [143]:
item_sim.fillna(0, inplace=True)

In [144]:
user_sim.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.02859,-0.003828,0.020081,0.12411,0.106932,0.117792,0.188452,0.004759,0.006039,...,0.009763,-0.033367,0.086846,0.02838,0.070688,0.095859,0.080992,0.01184,-0.044896,0.043661
2,0.02859,1.0,-0.021223,-0.006146,0.025119,0.104709,0.087534,0.006623,0.007834,0.066855,...,-0.013352,0.025006,0.047662,0.182241,0.038222,0.039041,0.012068,-0.045436,-0.006165,0.039756
3,-0.003828,-0.021223,1.0,-0.013882,0.017186,-0.040819,0.007468,0.053729,0.0,0.010856,...,0.000876,0.000172,-0.028436,0.014123,-0.002138,-0.013758,-0.009838,-0.005399,0.022304,0.001957
4,0.020081,-0.006146,-0.013882,1.0,-0.001941,0.0,-0.020235,0.177352,0.0,0.002642,...,-0.009107,0.001272,-0.026177,-0.018798,0.023365,0.0,0.213826,0.329542,0.068909,-0.014356
5,0.12411,0.025119,0.017186,-0.001941,1.0,0.034633,0.074725,0.133686,0.00844,0.004918,...,0.074736,0.000898,0.024986,0.042377,0.101132,0.026222,0.034262,0.009612,0.034556,0.094492


In [145]:
item_sim.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.094325,0.088859,0.049576,0.111533,0.07435,0.137681,0.114308,0.070285,0.05371,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.094325,1.0,0.070472,0.120234,0.067024,-0.011068,0.068603,0.132597,-0.070552,0.046792,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.088859,0.070472,1.0,-0.0534,0.027067,0.063664,0.036606,-0.014717,0.015743,0.010401,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.049576,0.120234,-0.0534,1.0,-0.11768,-0.001884,0.079612,0.138738,0.094984,0.034637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.111533,0.067024,0.027067,-0.11768,1.0,-0.02272,0.078582,0.06286,0.028432,-0.030891,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## User-User Collaborative filtering

In [147]:
X_train.shape

(943, 1680)

In [107]:
user_sim.shape

(943, 943)

In [22]:
def predict_rating(ratings, similarity, method):
    if method == "user-user":
        pass
    
    elif method == "item-item":
        pass
    
    return None

In [108]:
from sklearn.base import BaseEstimator, ClassifierMixin

In [148]:
class Recommender(BaseEstimator, ClassifierMixin):
    def __init__(self, engine, method):
        '''
        Parameters
        ----------
        engine: str, ['user', 'item']
            user or item based collaborative filtering approach
            
        method: str, ['correlation', 'cosine']
            Similarity metric to use
            'correlation': pearson correlation,
            'cosine': cosine similarity
        '''
        self.engine = engine
        self.method = method
        
    
    @staticmethod
    def _impute(X, engine, method):

        if engine == 'user':
            if method == 'correlation':
                X = X.T.fillna(X.mean(axis=1), axis=0).T
            elif method == 'cosine':
                X = X.fillna(0)
                
        elif engine == 'item':
            if method == 'correlation':
                X = X.fillna(X.mean(axis=0), axis=0).T
            elif method == 'cosine':
                X = X.fillna(0).T
                
                
        return X
        
    def fit(self, user_item_ids, ratings):
        ''' fit method
        
        Parameters
        ----------
        user_item_ids: np.array
        ratings: np.array
            ratings
        '''
        
        user_ids, item_ids = user_item_ids[:, 0], user_item_ids[:, 1]
        X = (pd.DataFrame(
                {'user_id': user_ids, 'item_id': item_ids, 'rating': ratings}).
                 pivot(values='rating', index='user_id', columns='item_id')
                )
        X = self._impute(X, self.engine, self.method)
        
        self.similarity = pd.DataFrame(1 - pairwise_distances(X, X, metric=self.method),
                                       index=X.index, columns=X.index).fillna(0)
        
        self.pred_ratings = (self.similarity.dot(X)/abs(self.similarity).
                             sum(axis=1).values.reshape(-1, 1))
        
        if self.engine == 'item':
            self.pred_ratings = self.pred_ratings.T
        
        return self
    
#     def predict(self, user_item_ids):
            
#         user_id, item_id = user_item_ids[:, 0], user_item_ids[:, 1]
            
#         return pred
    


In [149]:
from sklearn.model_selection import cross_val_score

In [150]:
def rmse(actual, pred):
    size = (~actual.isna()).sum().sum()
    return (((pred - actual)**(2)).sum().sum()/size)**(1/2)

In [151]:
from sklearn.model_selection import train_test_split

In [152]:
rcmdr = Recommender("user", "correlation")
rcmdr.fit(ratings_train.loc[:, ["user_id", "movie_id"]].values, ratings_train["rating"].values)

Recommender(engine='user', method='correlation')

In [153]:
rcmdr.pred_ratings

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.244741,2.948537,2.964926,3.034042,2.963416,3.042999,3.195328,3.068992,3.215707,3.027927,...,3.018414,3.018999,3.018294,3.016755,3.018625,3.017713,3.018945,3.018329,3.019024,3.017785
2,2.347613,2.251412,2.250957,2.269158,2.266191,2.281506,2.318986,2.312724,2.368964,2.212118,...,2.276349,2.278359,2.278048,2.277329,2.278095,2.279266,2.278404,2.278835,2.278380,2.278351
3,0.396649,0.376989,0.384401,0.401924,0.381184,0.395649,0.403709,0.425180,0.440036,0.401508,...,0.393566,0.395278,0.393926,0.390937,0.395205,0.398190,0.395379,0.396784,0.395292,0.395188
4,0.642394,0.641829,0.633882,0.638196,0.643305,0.653738,0.664402,0.676174,0.655355,0.656639,...,0.648838,0.647349,0.647354,0.647113,0.646901,0.647531,0.647465,0.647498,0.647501,0.647665
5,2.833243,2.598973,2.580645,2.678890,2.616358,2.647664,2.849655,2.766981,2.738744,2.660775,...,2.652954,2.651809,2.651067,2.649565,2.651553,2.650886,2.651717,2.651301,2.651777,2.651442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,1.888321,1.700093,1.675433,1.748591,1.712033,1.725380,1.816390,1.742357,1.901127,1.722198,...,1.726844,1.726788,1.725527,1.722999,1.726857,1.725470,1.726629,1.726050,1.726675,1.726528
940,2.465095,2.300971,2.298583,2.267674,2.322128,2.336572,2.469926,2.506139,2.359464,2.346251,...,2.337274,2.336150,2.335518,2.334214,2.335599,2.335449,2.336086,2.335767,2.336135,2.335881
941,1.730485,1.356488,1.352026,1.387318,1.364625,1.380594,1.459263,1.447033,1.443604,1.413493,...,1.376277,1.376080,1.375963,1.375744,1.375954,1.375514,1.376043,1.375779,1.376070,1.375417
942,1.516816,1.387759,1.387238,1.419869,1.410922,1.405881,1.399744,1.479570,1.449581,1.411753,...,1.409017,1.409176,1.408028,1.405597,1.408948,1.408328,1.409099,1.408713,1.409148,1.408801


In [154]:
r_cols = ["user_id", "movie_id", "rating", "unix_timestamp"]
ratings_test = pd.read_csv("data/ml-100k/ua.test", names=r_cols, sep="\t", encoding="latin-1")
ratings_test.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,20,4,887431883
1,1,33,4,878542699
2,1,61,4,878542420
3,1,117,3,874965739
4,1,155,2,878542201


In [155]:
X_test = ratings_test.pivot_table(values='rating', index='user_id', columns='movie_id')

In [156]:
X_test

movie_id,1,2,3,4,5,6,7,8,9,10,...,1591,1592,1600,1612,1617,1646,1653,1656,1662,1664
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,,,,,...,,,,,,,,,,
941,,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [157]:
X_test.columns

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            1591, 1592, 1600, 1612, 1617, 1646, 1653, 1656, 1662, 1664],
           dtype='int64', name='movie_id', length=1129)

In [159]:
rmse(X_test, rcmdr.pred_ratings.loc[:, X_test.columns])

2.4615935214939744

In [160]:
rmse(X_train, rcmdr.pred_ratings)

1.9258894261434056