### Collaborative Filtering - SVD - clickout items have rating 1, everything else is zero

In [20]:
import numpy as np
import pandas as pd
import seaborn as sns

from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate

from collections import Counter
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv('../data/train_split.csv')

In [3]:
df

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
5,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
6,00RL8Z82B2Z1,aff3928535f48,1541037532,7,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
7,00RL8Z82B2Z1,aff3928535f48,1541037532,8,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
8,00RL8Z82B2Z1,aff3928535f48,1541037542,9,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
9,00RL8Z82B2Z1,aff3928535f48,1541037542,10,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [13]:
df_ratings = df[df['action_type'] == 'clickout item'][['user_id', 'reference']].drop_duplicates()
df_ratings['rating'] = np.repeat(1, len(df_ratings))
df_ratings

Unnamed: 0,user_id,reference,rating
13,00RL8Z82B2Z1,109038,1
15,00RL8Z82B2Z1,1257342,1
115,02SRUT1NQYH1,2795374,1
121,03K8AXBL4BX2,1032816,1
176,03P4VFKK12UO,65685,1
177,03P4VFKK12UO,1320460,1
180,0473FZ8UNXRS,3143258,1
181,066TUPQWUEV5,2552514,1
184,06S61EKCW1JY,110591,1
188,06SZHKMYOOI8,2557176,1


In [15]:
df_ratings['user_id'].unique().size

588073

In [16]:
df_ratings['reference'].unique().size

263149

In [18]:
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_ratings, Reader(rating_scale=(0, 1)))

# # We can now use this dataset as we please, e.g. calling cross_validate
# cross_validate(NormalPredictor(), data, cv=2)

In [21]:
# We'll use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0491  0.0491  0.0492  0.0487  0.0490  0.0490  0.0002  
MAE (testset)     0.0218  0.0219  0.0219  0.0216  0.0217  0.0218  0.0001  
Fit time          47.26   48.52   51.78   50.18   51.32   49.81   1.70    
Test time         2.17    2.10    1.54    1.88    1.50    1.84    0.28    


{'test_rmse': array([0.04912045, 0.04913913, 0.049156  , 0.0486543 , 0.04896418]),
 'test_mae': array([0.021826  , 0.02185638, 0.02187887, 0.02162827, 0.02173525]),
 'fit_time': (47.26137614250183,
  48.517417907714844,
  51.77871084213257,
  50.179691791534424,
  51.31760096549988),
 'test_time': (2.1688590049743652,
  2.104248523712158,
  1.5439939498901367,
  1.8803796768188477,
  1.49550461769104)}

In [23]:
user_id = '00RL8Z82B2Z1'
item_id = '6696574'
algo.predict(user_id, item_id)

Prediction(uid='00RL8Z82B2Z1', iid='6696574', r_ui=None, est=1, details={'was_impossible': False})

In [24]:
df_validation = pd.read_csv('../data/validation.csv')

In [25]:
df_target = df_validation[(df_validation['action_type'] == 'clickout item') & df_validation['reference'].isnull()]
df_target

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
4,0BO8K084XKYE,ad89aba9bb002,1541458805,5,clickout item,,US,"Ledyard, USA",tablet,,76028|70388|1336053|317126|79506|9979396|71272...,81|70|104|153|100|86|121|113|113|94|39|94|75|4...
11,0CI40GAHP96Q,74986cb514e04,1541456884,7,clickout item,,BR,"Rio Quente, Brazil",mobile,,1668387|2260908|3926372|2343438|2571941|257499...,49|30|31|60|354|280|33|50|307|56|64|273|52|27|40
38,0OQU0SD8LPT3,0570c0ad1a064,1541455563,27,clickout item,,IT,"Leukerbad, Switzerland",mobile,,28940|351361|28939|28930|28947|28926|28927|289...,225|177|424|161|201|194|153|175|157|204|159|33...
42,101QC20YANDM,86052262c2b01,1541456080,31,clickout item,,UK,"Bristol, United Kingdom",tablet,,12558|12563|12564|12568|12565|46606|47804|1257...,191|189|170|136|160|131|91|168|128|221|170|113...
47,1YXNJH0CVWD1,957660f1348c1,1541459188,5,clickout item,,BR,"São Paulo, Brazil",mobile,Sort by Price,6703430|8136104|2707078|6935282|3145604|450574...,11|12|14|16|17|17|17|17|17|17|18|18|18|18|18|1...
51,2GUABJ97DL44,e7ae0d9b9548b,1541457420,4,clickout item,,RU,"Rybinsk, Russia",mobile,,2852220|1710581|2526176|4969070|2037901|321307...,76|19|29|28|37|56|36|31|20|23|32|34|19|40|133|...
52,2HMIJDJ4O1SM,10f6a151699ca,1541457984,1,clickout item,,IE,"Dublin, Ireland",tablet,,12509|12510|46119|16674|46149|640521|895195|12...,479|429|243|265|269|187|224|464|210|224|228|38...
78,2I3VJP0O4QN9,f700004a9608f,1541456853,26,clickout item,,EC,"Fort Myers, USA",desktop,Sort by Price,63381|1416578|6174504|2518114|6297618|6946984|...,36|36|42|43|44|44|44|44|45|45|46|51|51|52|52|5...
90,2I49QL4EG4ZC,b46b69b053ea4,1541456935,12,clickout item,,US,"Nashville, USA",mobile,,75291|9293590|59561|78737|72439|9112834|956532...,155|259|117|57|220|175|350|210|249|119|284|219...
110,2N2RU0NDDZ88,9b028a29d99b1,1541456396,20,clickout item,,HR,"City of Sarajevo, Bosnia and Herzegovina",mobile,,6329130|409456|1555533|4503730|4133960|3480472...,15|72|72|101|30|89|79|15|93|75|86|70|58|30|40|...


In [27]:
def string_to_array(s):
    """Convert pipe separated string to array."""

    if isinstance(s, str):
        out = s.split("|")
    elif math.isnan(s):
        out = []
    else:
        raise ValueError("Value must be either string of nan")
    return out

In [41]:
def order_impressions_by_prediction(action):
    user_id = action['user_id']
    impressions = string_to_array(action['impressions'])
    predictions = {item_id: algo.predict(user_id, item_id) for item_id in impressions}
    print('predictions: {}'.format(predictions))
    sorted_impressions = sorted(impressions, key=lambda item_id: predictions[item_id], reverse=True)
    return ' '.join(sorted_impressions)

In [42]:
df_target.head().apply(order_impressions_by_prediction, axis=1)

predictions: {'76028': Prediction(uid='0BO8K084XKYE', iid='76028', r_ui=None, est=0.9937836129143864, details={'was_impossible': False}), '70388': Prediction(uid='0BO8K084XKYE', iid='70388', r_ui=None, est=0.99886107756529, details={'was_impossible': False}), '1336053': Prediction(uid='0BO8K084XKYE', iid='1336053', r_ui=None, est=1, details={'was_impossible': False}), '317126': Prediction(uid='0BO8K084XKYE', iid='317126', r_ui=None, est=1, details={'was_impossible': False}), '79506': Prediction(uid='0BO8K084XKYE', iid='79506', r_ui=None, est=0.9927604886895387, details={'was_impossible': False}), '9979396': Prediction(uid='0BO8K084XKYE', iid='9979396', r_ui=None, est=1, details={'was_impossible': False}), '7127238': Prediction(uid='0BO8K084XKYE', iid='7127238', r_ui=None, est=1, details={'was_impossible': False}), '318466': Prediction(uid='0BO8K084XKYE', iid='318466', r_ui=None, est=1, details={'was_impossible': False}), '2420616': Prediction(uid='0BO8K084XKYE', iid='2420616', r_ui=Non

4     9979396 9633230 893845 892191 8723862 8485592 ...
11    9620324 8332666 8288320 7361580 5502372 427907...
38    9367854 351361 28947 28944 28940 28939 28936 2...
42    899371 6609210 634641 47804 46606 3981996 2435...
47    9587086 9490460 9300048 9236964 8769448 839380...
dtype: object

In [34]:
df_target.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
4,0BO8K084XKYE,ad89aba9bb002,1541458805,5,clickout item,,US,"Ledyard, USA",tablet,,76028|70388|1336053|317126|79506|9979396|71272...,81|70|104|153|100|86|121|113|113|94|39|94|75|4...
11,0CI40GAHP96Q,74986cb514e04,1541456884,7,clickout item,,BR,"Rio Quente, Brazil",mobile,,1668387|2260908|3926372|2343438|2571941|257499...,49|30|31|60|354|280|33|50|307|56|64|273|52|27|40
38,0OQU0SD8LPT3,0570c0ad1a064,1541455563,27,clickout item,,IT,"Leukerbad, Switzerland",mobile,,28940|351361|28939|28930|28947|28926|28927|289...,225|177|424|161|201|194|153|175|157|204|159|33...
42,101QC20YANDM,86052262c2b01,1541456080,31,clickout item,,UK,"Bristol, United Kingdom",tablet,,12558|12563|12564|12568|12565|46606|47804|1257...,191|189|170|136|160|131|91|168|128|221|170|113...
47,1YXNJH0CVWD1,957660f1348c1,1541459188,5,clickout item,,BR,"São Paulo, Brazil",mobile,Sort by Price,6703430|8136104|2707078|6935282|3145604|450574...,11|12|14|16|17|17|17|17|17|17|18|18|18|18|18|1...
