In [1]:
import pandas as pd
import numpy as np
import statistics as st
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
import heapq

In [2]:
# Load the raw interaction log; "train.csv" is resolved against the current working directory.
train_data = pd.read_csv(filepath_or_buffer="train.csv")

In [389]:
# Number of distinct session ids in the raw data.
train_data['session_id'].drop_duplicates().shape[0]

910683

In [390]:
# Number of distinct values in the `reference` column of the raw data.
train_data['reference'].drop_duplicates().shape[0]

400277

## Use Boston Data Only

In [350]:
# Action types kept for modelling. The order matters: it is paired positionally
# with the weight list when the per-action weights table is built.
action_type = [
    'interaction item image',
    'clickout item',
    'interaction item rating',
    'interaction item info',
    'search for item',
    'interaction item deals',
]

In [351]:
# Keep only Boston rows whose action is one of the item-related action types.
# reset_index() (without drop) keeps the original row labels in an 'index' column.
df_train = train_data.loc[
    train_data['city'].eq('Boston, USA')
    & train_data['action_type'].isin(action_type)
].reset_index()

In [352]:
# Split by session (not by row) so that a session never appears on both sides:
# test_size=0.4 holds out 40% of the sessions and keeps 60% for training.
# NOTE(review): if an 80/20 split was intended, test_size should be 0.2 -- confirm.
df_split = df_train.groupby('session_id')['session_id'].nunique()
train, test = train_test_split(df_split, test_size=0.4, random_state=93)
# Map the sampled session ids back to their full set of rows.
train = df_train.merge(pd.DataFrame(train.index), how='inner', on='session_id')
test = df_train.merge(pd.DataFrame(test.index), how='inner', on='session_id')

In [353]:
# For the held-out sessions, the last 'clickout item' row of every session that has
# more than one action becomes the prediction target (true_test); every other
# held-out row is handed back to the training pool (test_totrain).
test['count'] = test.groupby('session_id')['session_id'].transform('count')
# Expose the row labels as a column so they can be aggregated per group.
test['index'] = test.index
# Bracket selection is used for the 'index' column: attribute access (`.index`) shares
# its namespace with pandas' own attributes and is fragile for a column with this name.
# Rows that are not clickouts receive NaN here because the assignment aligns on labels.
test['max_clk'] = (
    test[test['action_type'] == 'clickout item']
    .groupby(['user_id', 'session_id'])['index']
    .transform('max')
)
# Back to training: clickouts of single-action sessions, plus every row that is not
# the last clickout of its session (NaN never equals the row label, so non-clickouts pass).
test_totrain = test[
    ((test['action_type'] == 'clickout item') & (test['count'] == 1))
    | (test['index'] != test['max_clk'])
]
# Evaluation targets: last clickout of each session with more than one action.
true_test = test[(test['index'] == test['max_clk']) & (test['count'] > 1)]

In [354]:
# Return the non-target rows of the held-out sessions to the training data,
# keeping one row per distinct (session_id, reference, action_type) combination.
columns = ['session_id', 'reference', 'action_type']
total_train = (
    pd.concat([train[columns], test_totrain[columns]])
    .drop_duplicates()
    .reset_index()
)

In [355]:
# Turn actions into an implicit rating: each action type carries a weight (paired
# positionally with `action_type`), and the rating of a (session, item) pair is the
# sum of the weights of its rows.
action_wt = pd.DataFrame({'action_type': action_type, 'weight': [0.1, 1, 0.5, 0.1, 0.8, 0.6]})
total_train = (
    pd.merge(total_train, action_wt, how='inner', on='action_type')
    .assign(rating=lambda frame: frame.groupby(['session_id', 'reference'])['weight'].transform('sum'))
    [['session_id', 'reference', 'rating']]
    .drop_duplicates()
)

In [356]:
# Split the evaluation targets by session into 50% development and 50% test.
df_split = true_test.groupby('session_id')['session_id'].nunique()
dev_df, test_df = train_test_split(df_split, test_size=0.5, random_state=9)
# Map the sampled session ids back to their rows in true_test.
dev_df = true_test.merge(pd.DataFrame(dev_df.index), how='inner', on='session_id')
test_df = true_test.merge(pd.DataFrame(test_df.index), how='inner', on='session_id')

In [357]:
# Same rating construction as for total_train, but with the development rows
# added to the training pool.
total_train_wdev = (
    pd.concat([train[columns], dev_df[columns], test_totrain[columns]])
    .drop_duplicates()
    .reset_index()
    .merge(action_wt, how='inner', on='action_type')
    .assign(rating=lambda frame: frame.groupby(['session_id', 'reference'])['weight'].transform('sum'))
    [['session_id', 'reference', 'rating']]
    .drop_duplicates()
)

## Item-Item CF Model

In [119]:
from surprise import Reader
from surprise import Dataset
from surprise import KNNBaseline

In [358]:
def cf(train_df, bsl_options, sim_options, k, rating_scale=(0, 3)):
    """Fit a Surprise ``KNNBaseline`` model on every row of ``train_df``.

    Args:
        train_df: DataFrame with ``session_id``, ``reference`` and ``rating``
            columns; they are handed to ``Dataset.load_from_df`` in that order.
        bsl_options: passed through as ``KNNBaseline(bsl_options=...)``.
        sim_options: passed through as ``KNNBaseline(sim_options=...)``.
        k: passed through as ``KNNBaseline(k=...)``.
        rating_scale: ``(min, max)`` pair given to ``Reader``. Defaults to
            ``(0, 3)``, the value that used to be hard-coded.
            NOTE(review): if ratings are sums of per-action weights they may
            slightly exceed 3 -- confirm the scale matches the data.

    Returns:
        The fitted ``KNNBaseline`` instance.
    """
    algo = KNNBaseline(bsl_options=bsl_options, sim_options=sim_options, k=k)
    reader = Reader(rating_scale=rating_scale)
    data = Dataset.load_from_df(train_df[['session_id', 'reference', 'rating']], reader)
    # No hold-out here: the full frame becomes the trainset.
    algo.fit(data.build_full_trainset())
    return algo

In [359]:
def eva(true_test,algorithm):
    """Mean reciprocal rank of the clicked item within each session's impressions.

    For every session, each impression is scored with
    ``algorithm.predict(session_id, item).est``; the 20 highest-scoring
    impressions form the ranked list. A session contributes
    ``1 / position`` (1-based) when its ``reference`` is in that list and 0
    otherwise.

    Args:
        true_test: DataFrame with ``session_id``, ``reference`` and
            ``impressions`` columns, where ``impressions`` is a
            ``'|'``-separated string. If a session id occurs more than once,
            the last row wins.
        algorithm: object exposing ``predict(uid, iid)`` whose result has an
            ``est`` attribute.

    Returns:
        The mean of the per-session reciprocal ranks.

    Raises:
        statistics.StatisticsError: if ``true_test`` contains no sessions.
    """
    true_response = dict(zip(true_test['session_id'], true_test['reference']))
    test_list = dict(zip(true_test['session_id'],
                         true_test['impressions'].apply(lambda x: x.split('|'))))
    reciprocal = {}
    for session_id, candidates in test_list.items():
        scores = {item: algorithm.predict(session_id, item).est for item in candidates}
        # nlargest already returns fewer than 20 items when fewer are available,
        # so no separate branch for short impression lists is needed.
        ranked = heapq.nlargest(20, scores, key=scores.get)
        clicked = true_response[session_id]
        if clicked in ranked:
            reciprocal[session_id] = 1 / (ranked.index(clicked) + 1)
        else:
            reciprocal[session_id] = 0
    # The statistics module is imported as `st` in this notebook; the bare
    # `statistics` name is not defined.
    return st.mean(list(reciprocal.values()))

In [360]:
# Baseline-estimation grids for KNNBaseline.
# SGD: every learning rate crossed with every regularisation strength
# (learning rate varies slowest, matching the order results are produced in).
sgd = [
    {'method': 'sgd', 'learning_rate': rate, 'reg': strength}
    for rate in (.005, .05, .1, .2)
    for strength in (0.02, 0.05, 0.1)
]
# Neighbourhood sizes to try.
k = [1, 2, 3, 4, 5]
# ALS: every item regularisation crossed with every user regularisation
# (reg_i varies slowest).
als = [
    {'method': 'als', 'reg_i': item_reg, 'reg_u': user_reg}
    for item_reg in (60, 80, 100)
    for user_reg in (60, 80, 100)
]

In [361]:
# Item-item grid search over the SGD baseline settings and neighbourhood sizes;
# each fitted model is scored with eva() on the development sessions.
sim_options = {'name': 'pearson_baseline', 'user_based': False}
result = {}
for bsl in sgd:
    for n_neighbors in k:
        label = '%s, k=%s' % (bsl, n_neighbors)
        algo = cf(total_train, bsl, sim_options, k=n_neighbors)
        result[label] = eva(dev_df, algo)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline si

In [362]:
# Configuration label with the highest development score (as a one-element list).
sorted(result, key=result.get, reverse=True)[:1]

["{'method': 'sgd', 'learning_rate': 0.005, 'reg': 0.02}, k=2"]

In [363]:
# Item-item grid search over the ALS baseline settings and neighbourhood sizes;
# each fitted model is scored with eva() on the development sessions.
sim_options = {'name': 'pearson_baseline', 'user_based': False}
result_als = {}
for bsl in als:
    for n_neighbors in k:
        label = '%s, k=%s' % (bsl, n_neighbors)
        algo = cf(total_train, bsl, sim_options, k=n_neighbors)
        result_als[label] = eva(dev_df, algo)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline si

In [364]:
# Configuration label with the highest development score (as a one-element list).
sorted(result_als, key=result_als.get, reverse=True)[:1]

["{'method': 'als', 'reg_i': 100, 'reg_u': 60}, k=2"]

In [365]:
# Development score stored for the item-item ALS configuration named below.
result_als[
    "{'method': 'als', 'reg_i': 100, 'reg_u': 60}, k=2"
]

0.2607030503623951

In [366]:
# Development score stored for one item-item SGD configuration.
# NOTE(review): this key uses 'reg': 0.1, whereas the best SGD label printed by the
# nlargest cell above is 'reg': 0.02 -- the value shown here may therefore not be the
# best SGD score; confirm before comparing it against the ALS result.
result["{'method': 'sgd', 'learning_rate': 0.005, 'reg': 0.1}, k=2"]

0.22933283052027417

In [368]:
# Refit with the selected ALS settings (reg_i=100, reg_u=60, k=2) using whatever
# sim_options is currently bound, then score on the held-out test sessions.
algo_final = cf(
    total_train,
    {'method': 'als', 'reg_i': 100, 'reg_u': 60},
    sim_options,
    k=2,
)
eva(test_df, algo_final)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


0.2587413775702282

## User-User CF Model

In [378]:
# User-user grid search over the SGD baseline settings and neighbourhood sizes;
# each fitted model is scored with eva() on the development sessions.
sim_options = {'name': 'pearson_baseline', 'user_based': True}
result_uu = {}
for bsl in sgd:
    for n_neighbors in k:
        label = '%s, k=%s' % (bsl, n_neighbors)
        algo_uu = cf(total_train, bsl, sim_options, k=n_neighbors)
        result_uu[label] = eva(dev_df, algo_uu)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline si

In [379]:
# Configuration label with the highest development score (as a one-element list).
sorted(result_uu, key=result_uu.get, reverse=True)[:1]

["{'method': 'sgd', 'learning_rate': 0.005, 'reg': 0.02}, k=2"]

In [380]:
# Development score stored for the user-user SGD configuration named below.
result_uu[
    "{'method': 'sgd', 'learning_rate': 0.005, 'reg': 0.02}, k=2"
]

0.2940221152746136

In [381]:
# User-user grid search over the ALS baseline settings and neighbourhood sizes;
# each fitted model is scored with eva() on the development sessions.
sim_options = {'name': 'pearson_baseline', 'user_based': True}
result_uu_als = {}
for bsl in als:
    for n_neighbors in k:
        label = '%s, k=%s' % (bsl, n_neighbors)
        algo_uu_als = cf(total_train, bsl, sim_options, k=n_neighbors)
        result_uu_als[label] = eva(dev_df, algo_uu_als)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline si

In [382]:
# Configuration label with the highest development score (as a one-element list).
sorted(result_uu_als, key=result_uu_als.get, reverse=True)[:1]

["{'method': 'als', 'reg_i': 80, 'reg_u': 80}, k=4"]

In [383]:
# Development score stored for the user-user ALS configuration named below.
result_uu_als[
    "{'method': 'als', 'reg_i': 80, 'reg_u': 80}, k=4"
]

0.32595033693125164

In [385]:
# Refit the ALS settings (reg_i=80, reg_u=80, k=4) on the training data that also
# contains the development rows, using whatever sim_options is currently bound,
# then score on the held-out test sessions.
algo_t = cf(
    total_train_wdev,
    {'method': 'als', 'reg_i': 80, 'reg_u': 80},
    sim_options,
    k=4,
)
eva(test_df, algo_t)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


0.269800919808176

In [386]:
# Same ALS settings (reg_i=80, reg_u=80, k=4) fitted on total_train only (no
# development rows), then scored on the held-out test sessions. Rebinds algo_t.
algo_t = cf(
    total_train,
    {'method': 'als', 'reg_i': 80, 'reg_u': 80},
    sim_options,
    k=4,
)
eva(test_df, algo_t)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


0.32681673514394105