In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_openml

In [2]:
data= fetch_openml(data_id=1220)

In [3]:
df = pd.DataFrame(
    data['data'],
    columns=data['feature_names']
)[['user_id', 'ad_id']].astype(int)

df['user_rating'] = pd.Series(data['target']).astype(int)

In [4]:
df.head(10).groupby(['user_id', 'ad_id']).max().reset_index().pivot('user_id', 'ad_id','user_rating').fillna(0).astype(int)

ad_id,6803526,8343295,9027213,20017077,20366086,20886690,21186478,21348354,21367376,21811752
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,0,0,0,0,0,0,0,0
562934,0,0,0,1,0,0,0,0,0,0
579253,0,0,0,0,0,0,0,0,0,0
2886008,0,0,0,0,0,0,1,0,0,0
5277279,0,0,0,0,0,0,0,0,0,0
7589739,0,0,0,0,0,0,0,0,0,0
8778348,0,0,0,0,0,0,0,0,0,0
11621116,0,0,0,0,0,0,0,0,0,0
11808635,0,0,1,0,0,0,0,0,0,0
12118311,0,0,0,0,0,0,0,0,0,0


In [5]:
from surprise.dataset import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

In [6]:
reader = Reader(rating_scale=(0,1))
dataset = Dataset.load_from_df(df, reader)

In [7]:
trainset, testset = train_test_split(dataset, test_size=0.25)

In [8]:
def predict_evaluate(recsys, dataset, name='Algorithm'):
    scores = cross_validate(
        recsys, dataset, measures=['RMSE', 'MAE'], cv=4)
    print('Testset Avg. MAE: {:.2f} & Avg RMSE: {:.2f} [{}]'.format(
        scores['test_mae'].mean(),
        scores['test_rmse'].mean(),
        name
    )
)

In [9]:
from surprise import AlgoBase

class RandomRating(AlgoBase):
    def __init__(self, p=0.5):
        self.p = p
        AlgoBase.__init__(self)
        
    def estimate(self, u, i):
        return np.random.binomial(n=1, p=self.p, size=1)[0]
    
recsys = RandomRating(p=0.0168)
predict_evaluate(recsys, dataset, 'RandomRating')

Testset Avg. MAE: 0.18 & Avg RMSE: 0.42 [RandomRating]


In [10]:
from surprise.prediction_algorithms.knns import KNNBasic
recsys = KNNBasic()
predict_evaluate(recsys, dataset, 'KNNBasic')

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Testset Avg. MAE: 0.28 & Avg RMSE: 0.38 [KNNBasic]


In [12]:
from surprise.model_selection import GridSearchCV

param_grid = {
    'sim_options': {
        'name': ['cosine', 'pearson']
    },
    'k': [5,10,20,40],
    'verbose': [True]
}

dataset_subset = Dataset.load_from_df(df.sample(frac=0.25, random_state=0), reader)

gscv = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'],
                   cv=4, n_jobs=-1)

#print('Best MAE:', gscv.best_score['mae'].round(2))
#print('Best RMSE:', gscv.best_score['rmse'].round(2))
#print('Best Params', gscv.best_params['rmse'])

In [13]:
from surprise.prediction_algorithms.baseline_only import BaselineOnly
recsys = BaselineOnly(verbose=False)
predict_evaluate(recsys, dataset, 'BaselineOnly')

Testset Avg. MAE: 0.27 & Avg RMSE: 0.37 [BaselineOnly]


In [17]:
## SVD

In [18]:
music_ratings = [('U1', 'Metallica'), ('U1', 'Rammstein'), ('U2',
'Rammstein'), ('U3', 'Tiesto'), ('U3', 'Paul van Dyk'), ('U2',
'Metallica'), ('U4', 'Tiesto'), ('U4', 'Paul van Dyk'), ('U5',
'Metallica'), ('U5', 'Slipknot'), ('U6', 'Tiesto'), ('U6', 'Aly & Fila'),
('U3', 'Aly & Fila')]

In [19]:
df_music_ratings = pd.DataFrame(music_ratings, columns=['User', 'Artist'])

In [21]:
df_music_ratings['Rating'] = 1

In [23]:
df_music_ratings_pivoted = df_music_ratings.pivot(
    'User', 'Artist', 'Rating'
).fillna(0)

In [24]:
df_music_ratings_pivoted

Artist,Aly & Fila,Metallica,Paul van Dyk,Rammstein,Slipknot,Tiesto
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U1,0.0,1.0,0.0,1.0,0.0,0.0
U2,0.0,1.0,0.0,1.0,0.0,0.0
U3,1.0,0.0,1.0,0.0,0.0,1.0
U4,0.0,0.0,1.0,0.0,0.0,1.0
U5,0.0,1.0,0.0,0.0,1.0,0.0
U6,1.0,0.0,0.0,0.0,0.0,1.0


In [25]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2)
svd.fit_transform(df_music_ratings_pivoted).round(2)

array([[-0.  ,  1.37],
       [-0.  ,  1.37],
       [ 1.71,  0.  ],
       [ 1.21,  0.  ],
       [-0.  ,  1.  ],
       [ 1.21,  0.  ]])

In [26]:
pd.DataFrame(
    svd.fit_transform(df_music_ratings_pivoted),
    index=df_music_ratings_pivoted.index,
    columns=['SV1', 'SV2'],
    ).round(2).style.bar(
    subset=['SV1', 'SV2'], align='mid', color='#AAA'
)

Unnamed: 0_level_0,SV1,SV2
User,Unnamed: 1_level_1,Unnamed: 2_level_1
U1,0.0,1.37
U2,0.0,1.37
U3,1.71,0.0
U4,1.21,0.0
U5,-0.0,1.0
U6,1.21,0.0


In [27]:
from sklearn.metrics.pairwise import cosine_similarity

user_ids = ['U1', 'U2', 'U3', 'U5']

pd.DataFrame(
    cosine_similarity(
        df_music_ratings_pivoted.loc[user_ids, :].values
    ),
    index=user_ids,
    columns=user_ids
).round(2).style.bar(
    subset=user_ids, align='mid', color='#AAA'
)

Unnamed: 0,U1,U2,U3,U5
U1,1.0,1.0,0.0,0.5
U2,1.0,1.0,0.0,0.5
U3,0.0,0.0,1.0,0.0
U5,0.5,0.5,0.0,1.0


In [28]:
from sklearn.decomposition import TruncatedSVD

user_ids = ['U1', 'U2', 'U3', 'U5']

svd = TruncatedSVD(n_components=2)

df_user_svd = pd.DataFrame(
    svd.fit_transform(df_music_ratings_pivoted),
    index=df_music_ratings_pivoted.index,
    columns=['SV1', 'SV2'],
)
    
pd.DataFrame(
    cosine_similarity(
    df_user_svd.loc[user_ids, :].values
    ),
    index=user_ids,
    columns=user_ids
).round(2).style.bar(
    subset=user_ids, align='mid', color='#AAA'
)

Unnamed: 0,U1,U2,U3,U5
U1,1.0,1.0,-0.0,1.0
U2,1.0,1.0,-0.0,1.0
U3,-0.0,-0.0,1.0,-0.0
U5,1.0,1.0,-0.0,1.0


In [29]:
from surprise.prediction_algorithms.matrix_factorization import SVD
recsys = SVD()
predict_evaluate(recsys, dataset, 'SVD')

Testset Avg. MAE: 0.26 & Avg RMSE: 0.37 [SVD]


In [31]:
import joblib

recsys.fit(trainset)
joblib.dump(recsys, 'recsys.pkl')

['recsys.pkl']

In [None]:
from surprise import accuracy

recsys = joblib.load('recsys.pkl')
predictions = recsys.test(testset)