In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
items1 = pd.read_csv('../data/item_properties_part1.csv')
items2 = pd.read_csv('../data/item_properties_part2.csv')

In [5]:
items = pd.concat([items1, items2])
items.head(10)

In [6]:
import datetime
times =[]
for i in items['timestamp']:
    times.append(datetime.datetime.fromtimestamp(i//1000.0)) 
items['timestamp'] = times
items.head(10)

Unnamed: 0,timestamp,itemid,property,value
0,2015-06-28 05:00:00,460429,categoryid,1338
1,2015-09-06 05:00:00,206783,888,1116713 960601 n277.200
2,2015-08-09 05:00:00,395014,400,n552.000 639502 n720.000 424566
3,2015-05-10 05:00:00,59481,790,n15360.000
4,2015-05-17 05:00:00,156781,917,828513
5,2015-07-05 05:00:00,285026,available,0
6,2015-06-14 05:00:00,89534,213,1121373
7,2015-05-17 05:00:00,264312,6,319724
8,2015-06-07 05:00:00,229370,202,1330310
9,2015-06-14 05:00:00,98113,451,1141052 n48.000


In [8]:
events = pd.read_csv('../data/events.csv')
events.head(10)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
5,1433224086234,972639,view,22556,
6,1433221923240,810725,view,443030,
7,1433223291897,794181,view,439202,
8,1433220899221,824915,view,428805,
9,1433221204592,339335,view,82389,


In [10]:
category_tree = pd.read_csv('../data/category_tree.csv')
category_tree.head(10)

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0
5,231,
6,542,378.0
7,1146,542.0
8,1140,542.0
9,1479,1537.0


In [11]:
events = events.assign(date=pd.Series(datetime.datetime.fromtimestamp(i/1000).date() for i in events.timestamp))
events = events.sort_values('date').reset_index(drop=True)
events = events[['visitorid','itemid','event', 'date']]
events.head(5)

Unnamed: 0,visitorid,itemid,event,date
0,1155627,61279,view,2015-05-03
1,53842,12217,view,2015-05-03
2,710338,81856,view,2015-05-03
3,807228,395446,view,2015-05-03
4,190672,259357,transaction,2015-05-03


In [12]:
start_date = '2015-5-3'
end_date = '2015-5-18'
fd = lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date()
events = events[(events.date >= fd(start_date)) & (events.date <= fd(end_date))]

In [48]:
split_point = np.int(np.round(events.shape[0]*0.8))
events_train = events.iloc[0:split_point]
events_test = events.iloc[split_point::]
#Get only visitors and items that are in train as well
events_test = events_test[
    (events_test['visitorid'].isin(events_train['visitorid'])) & 
    (events_test['itemid'].isin(events_train['itemid']))
]
events_test.shape
# (64019, 4)
# (10510, 4)

(10510, 4)

In [20]:
from lightfm import LightFM
from lightfm.evaluation import auc_score
from scipy.sparse import coo_matrix
from sklearn import preprocessing

In [15]:
id_cols=['visitorid','itemid']
trans_cat_train=dict()
trans_cat_test=dict()

for k in id_cols:
    cate_enc=preprocessing.LabelEncoder()
    trans_cat_train[k]=cate_enc.fit_transform(events_train[k].values)
    trans_cat_test[k]=cate_enc.transform(events_test[k].values)

In [45]:
trans_cat_train.

{'visitorid': array([111231,   5243,  68449, ..., 113575, 102010,  13908]),
 'itemid': array([ 9359,  1871, 12526, ..., 45843,   186,  9951])}

In [21]:
ratings = dict()
cate_enc=preprocessing.LabelEncoder()
ratings['train'] = cate_enc.fit_transform(events_train.event)
ratings['test'] = cate_enc.transform(events_test.event)


<lightfm.lightfm.LightFM at 0x1acb4dd860>

In [49]:
ratings['train']

array([2, 2, 2, ..., 2, 2, 2])

In [25]:
n_users=len(np.unique(trans_cat_train['visitorid']))
n_items=len(np.unique(trans_cat_train['itemid']))
rate_matrix = dict()
rate_matrix['train'] = coo_matrix((ratings['train'], (trans_cat_train['visitorid'], trans_cat_train['itemid'])), shape=(n_users,n_items))
rate_matrix['test'] = coo_matrix((ratings['test'], (trans_cat_test['visitorid'], trans_cat_test['itemid'])), shape=(n_users,n_items))

<135523x71818 sparse matrix of type '<class 'numpy.float32'>'
	with 256076 stored elements in COOrdinate format>

In [None]:
model = LightFM(no_components=10, loss='warp')
model.fit(rate_matrix['train'], epochs=100, num_threads=8)

In [22]:
print(auc_score(model, rate_matrix['train'], num_threads=8).mean())
print(auc_score(model, rate_matrix['test'], num_threads=10).mean())

0.998681
0.82616794
