<h1>EDA and graph construction</h1>

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import networkx as nx
from networkx.convert_matrix import from_pandas_edgelist
from networkx import DiGraph, Graph
from networkx.drawing.nx_pylab import draw_networkx_nodes
import matplotlib.pyplot as plt
from networkx.algorithms.link_analysis.hits_alg import hits
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import os
import networkx as nx
import numpy as np
import pandas as pd
from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph
from stellargraph import datasets
from IPython.display import display, HTML
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split



<b>Create start timestamp to calculate notebook runtime at bottom</b>

In [2]:
start = datetime.now()
print(start)

2021-07-08 16:44:09.231185


<b>Read in data for November 2019. We will sample this down further later on.</b>

In [3]:
df = pd.read_csv('../archive/2019-Nov.csv')

In [4]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-11-01 00:00:00 UTC,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
1,2019-11-01 00:00:00 UTC,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2,2019-11-01 00:00:01 UTC,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387
3,2019-11-01 00:00:01 UTC,view,3601530,2053013563810775923,appliances.kitchen.washer,lg,712.87,518085591,3bfb58cd-7892-48cc-8020-2f17e6de6e7f
4,2019-11-01 00:00:01 UTC,view,1004775,2053013555631882655,electronics.smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2


In [5]:
df.size

607517811

In [6]:
df.drop_duplicates().size

606613140

In [7]:
df = df.drop_duplicates()

In [8]:
df.count()

event_time       67401460
event_type       67401460
product_id       67401460
category_id      67401460
category_code    45530037
brand            58186451
price            67401460
user_id          67401460
user_session     67401450
dtype: int64

In [9]:
df.describe()

Unnamed: 0,product_id,category_id,price,user_id
count,67401460.0,67401460.0,67401460.0,67401460.0
mean,12520920.0,2.057901e+18,292.4819,538630700.0
std,17261990.0,2.013233e+16,355.7358,22884430.0
min,1000365.0,2.053014e+18,0.0,10300220.0
25%,1305996.0,2.053014e+18,69.24,516473500.0
50%,5100571.0,2.053014e+18,165.77,535039400.0
75%,17300760.0,2.053014e+18,360.34,561068600.0
max,100028600.0,2.187708e+18,2574.07,579969900.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67401460 entries, 0 to 67501978
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   event_time     object 
 1   event_type     object 
 2   product_id     int64  
 3   category_id    int64  
 4   category_code  object 
 5   brand          object 
 6   price          float64
 7   user_id        int64  
 8   user_session   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.0+ GB


Drop null values and split category code into high-level and lower-leval category codes for use as node attributes

In [11]:
df = df.dropna()

In [12]:
def split_on_dot(stringy):
    return stringy.split('.')

df['event_time'] = pd.to_datetime(df['event_time'])
df['h_lvl'] = df['category_code'].apply(lambda x: split_on_dot(x)[0])
df['l_lvl'] = df['category_code'].apply(lambda x: split_on_dot(x)[1])

In [13]:
df.shape

(42018766, 11)

In [14]:
print(f'number unique category codes: {df.category_code.nunique()}')
print(f'number unique high level categories: {df.h_lvl.nunique()}')
print(f'number unique brands: {df.brand.nunique()}')
print(f'number unique event types: {df.event_type.nunique()}')
print(f'unique event types: {df.event_type.unique()}')
print(f'unique user ids: {df.user_id.nunique()}')
print(f'unique product ids: {df.product_id.nunique()}')

number unique category codes: 129
number unique high level categories: 13
number unique brands: 1987
number unique event types: 3
unique event types: ['view' 'cart' 'purchase']
unique user ids: 2862935
unique product ids: 69773


In [15]:
df['event_type'].value_counts()

view        39314217
cart         2045298
purchase      659251
Name: event_type, dtype: int64

In [16]:
df[df['user_session'] == '4d3b30da-a5e4-49df-b1a8-ba5943f1dd33']

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,h_lvl,l_lvl
0,2019-11-01 00:00:00+00:00,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33,electronics,smartphone
95,2019-11-01 00:00:36+00:00,view,1004184,2053013555631882655,electronics.smartphone,xiaomi,463.15,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33,electronics,smartphone
121,2019-11-01 00:00:47+00:00,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33,electronics,smartphone
131,2019-11-01 00:00:50+00:00,view,1005234,2053013555631882655,electronics.smartphone,xiaomi,398.72,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33,electronics,smartphone
263,2019-11-01 00:01:48+00:00,view,1003898,2053013555631882655,electronics.smartphone,oneplus,540.3,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33,electronics,smartphone
329,2019-11-01 00:02:17+00:00,view,1003898,2053013555631882655,electronics.smartphone,oneplus,540.3,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33,electronics,smartphone
352,2019-11-01 00:02:24+00:00,view,1003499,2053013555631882655,electronics.smartphone,oneplus,461.61,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33,electronics,smartphone
397,2019-11-01 00:02:41+00:00,view,1003499,2053013555631882655,electronics.smartphone,oneplus,461.61,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33,electronics,smartphone
416,2019-11-01 00:02:48+00:00,view,1003898,2053013555631882655,electronics.smartphone,oneplus,540.3,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33,electronics,smartphone


Aggregate data into list of sequences for use in the creation of our graph database

In [17]:
df2 = df.groupby('user_id').agg(list)

In [18]:
df2.head()

Unnamed: 0_level_0,event_time,event_type,product_id,category_id,category_code,brand,price,user_session,h_lvl,l_lvl
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
29515875,"[2019-11-10 02:14:40+00:00, 2019-11-12 03:45:0...","[view, view, view, view, view, view, view]","[13201002, 13200026, 1802034, 1802034, 1801638...","[2053013557192163841, 2053013557192163841, 205...","[furniture.bedroom.bed, furniture.bedroom.bed,...","[stendmebel, sv, kivi, kivi, harper, harper, h...","[176.04, 100.36, 115.04, 115.04, 153.67, 153.6...","[eb6882fb-ead2-47a5-aa86-d171a2c3b6ff, 643be73...","[furniture, furniture, electronics, electronic...","[bedroom, bedroom, video, video, video, video,..."
31198833,"[2019-11-08 02:09:45+00:00, 2019-11-08 02:10:3...","[view, view, view, view, view, view, view, vie...","[1005158, 1003551, 1005158, 1004870, 1004873, ...","[2053013555631882655, 2053013555631882655, 205...","[electronics.smartphone, electronics.smartphon...","[xiaomi, xiaomi, xiaomi, samsung, samsung, sam...","[302.45, 437.59, 302.45, 267.42, 360.07, 334.3...","[b9f5a88d-09a1-4327-a129-5e4425952f71, b9f5a88...","[electronics, electronics, electronics, electr...","[smartphone, smartphone, smartphone, smartphon..."
34916060,[2019-11-24 07:43:33+00:00],[view],[12600007],[2053013554751078769],[appliances.kitchen.grill],[tefal],[295.94],[4c2709a8-e61b-4d09-a0bf-b4d8b4923d00],[appliances],[kitchen]
41798457,[2019-11-26 08:33:16+00:00],[view],[100017960],[2053013555631882655],[electronics.smartphone],[huawei],[945.97],[7acfc025-a748-4dec-a2e0-5d4c54c8d8ca],[electronics],[smartphone]
62336140,"[2019-11-08 15:57:37+00:00, 2019-11-08 16:00:0...","[view, view, view, view, view, view]","[28718004, 28718694, 28720700, 28718348, 28718...","[2053013565639492569, 2053013565639492569, 205...","[apparel.shoes, apparel.shoes, apparel.shoes, ...","[respect, respect, respect, respect, respect, ...","[93.44, 89.84, 102.45, 84.43, 80.83, 124.84]","[393afdaf-b61d-4dda-87ff-b8467a4201fa, 393afda...","[apparel, apparel, apparel, apparel, apparel, ...","[shoes, shoes, shoes, shoes, shoes, shoes]"


In [19]:
df2.shape

(2862935, 10)

<h2>Create training and validation sets</h2>
Training data will be split into training and test sets and will build the graph from which embeddings are created. Validation data will be kept separate from the training of graph embeddings and serve as completely unseen data for predicting future events.

In [20]:
df2_sample = df2.sample(frac=0.2, random_state=42)
df2_sample['seq_num'] = df2_sample['event_type'].apply(lambda x: len(x))
df2_sample = df2_sample[df2_sample.seq_num > 6]


training_data, validation_data = train_test_split(df2_sample, test_size=0.3, random_state=42)

In [21]:
# distinct_product_ids = []
# for i in range(len(training_data)):
#     for j in training_data.iloc[i]['product_id']:
#         if j in distinct_product_ids:
#             pass
#         else:
#             distinct_product_ids.append(j)
            
            
distinct_h_lvl_ids = []
for i in range(len(training_data)):
    for j in training_data.iloc[i]['h_lvl']:
        if j in distinct_h_lvl_ids:
            pass
        else:
            distinct_h_lvl_ids.append(j)

In [22]:
# len(distinct_product_ids)

In [23]:
# product_map= \
# {distinct_product_ids[i]:[x for x in range(0,len(distinct_product_ids))][i] for i in range(len(distinct_product_ids))}

h_lvl_map= \
{distinct_h_lvl_ids[i]:[x for x in range(0,len(distinct_h_lvl_ids))][i] for i in range(len(distinct_h_lvl_ids))}

In [24]:
h_lvl_map

{'electronics': 0,
 'apparel': 1,
 'computers': 2,
 'appliances': 3,
 'furniture': 4,
 'auto': 5,
 'accessories': 6,
 'sport': 7,
 'construction': 8,
 'kids': 9,
 'stationery': 10,
 'medicine': 11,
 'country_yard': 12}

In [25]:
training_data.shape

(171245, 11)

Define function to convert time sequences into length of time between one event and another as possible edge weights

In [26]:
def time_list(listy):
    new_listy = []
    for i, j in enumerate(listy):
        if i == 0:
            pass
        else:
            new_listy.append((j-(listy[i-1])).total_seconds())
    return new_listy

In [27]:
(training_data.iloc[1]['event_time'][1] - training_data.iloc[1]['event_time'][0]).total_seconds()

129.0

In [28]:
def last_char(listy):
    return listy[-1]

def return_all_but_last(listy):
    return listy[:-1]

In [29]:
training_data.head(2)

Unnamed: 0_level_0,event_time,event_type,product_id,category_id,category_code,brand,price,user_session,h_lvl,l_lvl,seq_num
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
567069554,"[2019-11-08 09:42:27+00:00, 2019-11-08 09:42:4...","[view, view, view, view, view, view, view, car...","[1005115, 1005115, 1004249, 1004249, 1004250, ...","[2053013555631882655, 2053013555631882655, 205...","[electronics.smartphone, electronics.smartphon...","[apple, apple, apple, apple, apple, apple, app...","[915.08, 915.08, 739.79, 739.79, 814.56, 814.5...","[629faf75-5107-41d6-9806-123eb20676f9, 629faf7...","[electronics, electronics, electronics, electr...","[smartphone, smartphone, smartphone, smartphon...",32
559077481,"[2019-11-12 07:48:18+00:00, 2019-11-12 07:50:2...","[view, view, view, view, view, view, cart, pur...","[4804660, 28721804, 28717827, 28703606, 287177...","[2053013554658804075, 2053013565069067197, 205...","[electronics.audio.headphone, apparel.shoes.ke...","[xiaomi, adidas, adidas, reebok, puma, reebok,...","[23.09, 102.71, 102.71, 118.15, 84.69, 118.15,...","[2fedbf73-0fcd-4c54-95b7-0fd85cb17d67, 2fedbf7...","[electronics, apparel, apparel, apparel, appar...","[audio, shoes, shoes, shoes, shoes, shoes, sho...",11


<h2>Feature engineering function</h2>

In [30]:
def feature_engineer(df):    
    df['time_between'] = ''

    df['next_event'] = df['event_type'].apply(lambda x: last_char(x))
#     df['next_product_id'] = df['product_id'].apply(lambda x: last_char(x))
    df['next_time_between'] = df['product_id'].apply(lambda x: last_char(x))
    # df['time_on_page'] = df['event_type'].apply(lambda x: time_list(x))

    for i in range(len(df)):
    #     if i % 50000 == 0:
    #         print(i)
        df['time_between'].iloc[i] = time_list(df.iloc[i]['event_time'])

    #create RML (remove last) columns in which the last event from each sequence has been removed    
    df['rml_event_time'] = df['event_time'].apply(lambda x: return_all_but_last(x))
    df['rml_event_type'] = df['event_type'].apply(lambda x: return_all_but_last(x))
#     df['rml_product_id'] = df['product_id'].apply(lambda x: return_all_but_last(x))
    df['rml_category_id'] = df['category_id'].apply(lambda x: return_all_but_last(x))
    df['rml_category_code'] = df['category_code'].apply(lambda x: return_all_but_last(x))
    df['rml_brand'] = df['brand'].apply(lambda x: return_all_but_last(x))
    df['rml_user_session'] = df['user_session'].apply(lambda x: return_all_but_last(x))
    df['rml_event_time'] = df['event_time'].apply(lambda x: return_all_but_last(x))
    df['rml_h_lvl'] = df['h_lvl'].apply(lambda x: return_all_but_last(x))
    df['rml_l_lvl'] = df['l_lvl'].apply(lambda x: return_all_but_last(x))
    df['rml_time_between'] = df['time_between'].apply(lambda x: return_all_but_last(x))


    def map_values(listy, label_map):
        listy2 = []
        for i in listy:
            listy2.append(int(label_map[i]))
        return listy2

#     df['product_id_new_labels'] = df['product_id'].apply(lambda x: map_values(x, product_map))
    df['h_lvl_id_new_labels'] = df['h_lvl'].apply(lambda x: map_values(x, h_lvl_map))
    df['next_h_lvl'] = df['h_lvl_id_new_labels'].apply(lambda x: last_char(x))
    df['rml_h_lvl_id_new_labels'] = df['h_lvl_id_new_labels'].apply(lambda x: return_all_but_last(x))
    
    return df
    
    
training_data = feature_engineer(training_data)
validation_data = feature_engineer(validation_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_between'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['next_event'] = df['event_type'].apply(lambda x: last_char(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['next_time_between'] = df['product_id'].apply(lambda x: last_char(x))
A value is trying to be set on a copy of a

In [31]:
training_data.head()

Unnamed: 0_level_0,event_time,event_type,product_id,category_id,category_code,brand,price,user_session,h_lvl,l_lvl,...,rml_category_id,rml_category_code,rml_brand,rml_user_session,rml_h_lvl,rml_l_lvl,rml_time_between,h_lvl_id_new_labels,next_h_lvl,rml_h_lvl_id_new_labels
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
567069554,"[2019-11-08 09:42:27+00:00, 2019-11-08 09:42:4...","[view, view, view, view, view, view, view, car...","[1005115, 1005115, 1004249, 1004249, 1004250, ...","[2053013555631882655, 2053013555631882655, 205...","[electronics.smartphone, electronics.smartphon...","[apple, apple, apple, apple, apple, apple, app...","[915.08, 915.08, 739.79, 739.79, 814.56, 814.5...","[629faf75-5107-41d6-9806-123eb20676f9, 629faf7...","[electronics, electronics, electronics, electr...","[smartphone, smartphone, smartphone, smartphon...",...,"[2053013555631882655, 2053013555631882655, 205...","[electronics.smartphone, electronics.smartphon...","[915.08, 915.08, 739.79, 739.79, 814.56, 814.5...","[629faf75-5107-41d6-9806-123eb20676f9, 629faf7...","[electronics, electronics, electronics, electr...","[smartphone, smartphone, smartphone, smartphon...","[14.0, 30.0, 8.0, 11.0, 10.0, 10.0, 8.0, 6.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
559077481,"[2019-11-12 07:48:18+00:00, 2019-11-12 07:50:2...","[view, view, view, view, view, view, cart, pur...","[4804660, 28721804, 28717827, 28703606, 287177...","[2053013554658804075, 2053013565069067197, 205...","[electronics.audio.headphone, apparel.shoes.ke...","[xiaomi, adidas, adidas, reebok, puma, reebok,...","[23.09, 102.71, 102.71, 118.15, 84.69, 118.15,...","[2fedbf73-0fcd-4c54-95b7-0fd85cb17d67, 2fedbf7...","[electronics, apparel, apparel, apparel, appar...","[audio, shoes, shoes, shoes, shoes, shoes, sho...",...,"[2053013554658804075, 2053013565069067197, 205...","[electronics.audio.headphone, apparel.shoes.ke...","[23.09, 102.71, 102.71, 118.15, 84.69, 118.15,...","[2fedbf73-0fcd-4c54-95b7-0fd85cb17d67, 2fedbf7...","[electronics, apparel, apparel, apparel, appar...","[audio, shoes, shoes, shoes, shoes, shoes, sho...","[129.0, 44.0, 22.0, 77.0, 23.0, 30.0, 137.0, 3...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1,"[0, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
513711707,"[2019-11-13 10:32:21+00:00, 2019-11-15 10:04:2...","[view, view, view, view, view, view, view, view]","[1005284, 1801766, 1801631, 1801631, 1801785, ...","[2053013555631882655, 2053013554415534427, 205...","[electronics.smartphone, electronics.video.tv,...","[samsung, artel, artel, artel, lg, apacer, apa...","[2562.49, 154.19, 334.11, 334.11, 447.63, 33.2...","[88c7c83c-e255-49ed-b707-f73c7d737e88, 2168a15...","[electronics, electronics, electronics, electr...","[smartphone, video, video, video, video, compo...",...,"[2053013555631882655, 2053013554415534427, 205...","[electronics.smartphone, electronics.video.tv,...","[2562.49, 154.19, 334.11, 334.11, 447.63, 33.2...","[88c7c83c-e255-49ed-b707-f73c7d737e88, 2168a15...","[electronics, electronics, electronics, electr...","[smartphone, video, video, video, video, compo...","[171121.0, 5863.0, 33.0, 294.0, 138645.0, 20.0]","[0, 0, 0, 0, 0, 2, 2, 0]",0,"[0, 0, 0, 0, 0, 2, 2]"
523379089,"[2019-11-02 17:05:40+00:00, 2019-11-02 17:06:4...","[view, view, view, view, view, view, view, vie...","[28719634, 28717035, 28715765, 28719086, 28714...","[2053013565069067197, 2053013565069067197, 205...","[apparel.shoes.keds, apparel.shoes.keds, appar...","[fassen, strobbs, nexpero, fassen, nike, nike,...","[44.79, 48.39, 75.42, 44.79, 55.39, 55.39, 44....","[69cfd1b9-ea8a-4fb6-bc10-caae9928ad4c, 69cfd1b...","[apparel, apparel, apparel, apparel, apparel, ...","[shoes, shoes, shoes, shoes, shoes, shoes, sho...",...,"[2053013565069067197, 2053013565069067197, 205...","[apparel.shoes.keds, apparel.shoes.keds, appar...","[44.79, 48.39, 75.42, 44.79, 55.39, 55.39, 44....","[69cfd1b9-ea8a-4fb6-bc10-caae9928ad4c, 69cfd1b...","[apparel, apparel, apparel, apparel, apparel, ...","[shoes, shoes, shoes, shoes, shoes, shoes, sho...","[63.0, 22.0, 67.0, 97.0, 26.0, 29.0, 9.0, 35.0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3]",3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3]"
515367279,"[2019-11-12 19:14:01+00:00, 2019-11-12 19:14:2...","[view, view, view, view, view, view, view, vie...","[13201323, 13201323, 13201315, 13201315, 10001...","[2053013557192163841, 2053013557192163841, 205...","[furniture.bedroom.bed, furniture.bedroom.bed,...","[stendmebel, stendmebel, stendmebel, stendmebe...","[222.37, 222.37, 161.91, 161.91, 58.4, 95.79, ...","[873d5eff-3160-4633-bdbe-0c8c7d7f4cb1, 873d5ef...","[furniture, furniture, furniture, furniture, f...","[bedroom, bedroom, bedroom, bedroom, bedroom, ...",...,"[2053013557192163841, 2053013557192163841, 205...","[furniture.bedroom.bed, furniture.bedroom.bed,...","[222.37, 222.37, 161.91, 161.91, 58.4, 95.79, ...","[873d5eff-3160-4633-bdbe-0c8c7d7f4cb1, 873d5ef...","[furniture, furniture, furniture, furniture, f...","[bedroom, bedroom, bedroom, bedroom, bedroom, ...","[20.0, 24.0, 328315.0, 1095684.0, 33.0, 29.0, ...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]",4,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]"


In [32]:
# training_data.to_csv('sample.csv', index=False)
# table = pa.Table.from_pandas(training_data)
# pq.write_to_dataset(table, 'sample.parquet')

Functions for creating graphs only by user_id (for graph2vec)

In [33]:
def edge_maker_sm(df, iloc_num, node_category):
    product_id_list = df.iloc[iloc_num][node_category]
    time_on_page = df.iloc[iloc_num]['time_between']
#     event_type = df.iloc[iloc_num]['event_type']
#     source_target_dict = {'source':[], 'target':[], 'weight':[0] + time_on_page, 'attributes':event_type}
    source_target_dict = {'source':[], 'target':[], 'weight':time_on_page, 'event_type':[]}
    
    for i, j in enumerate(product_id_list):
        if i == 0:
#             source_target_dict['source'].append(f'customer_{iloc_num}')
#             source_target_dict['target'].append(f'{product_id_list[i+1]}')
            pass
        else:
            source_target_dict['source'].append(f'{product_id_list[i-1]}')
            source_target_dict['target'].append(f'{j}')
            source_target_dict['event_type'].append(df.iloc[iloc_num]['event_type'][i])
    return source_target_dict
  

def node_maker_sm(df, iloc_num, node_category):
    node_category = df.iloc[iloc_num][node_category]
#     time_on_page = df.iloc[iloc_num]['category_id']
#     h_lvl = df.iloc[iloc_num]['h_lvl']
#     l_lvl = df.iloc[iloc_num]['l_lvl']
#     user_session = df.iloc[iloc_num]['user_session']
    
    distinct_node_categories = [str(i) for n, i in enumerate(node_category) if i not in node_category[:n]]
#     distinct_category_ids = [i for n, i in enumerate(category_id) if i not in category_id[:n]]
#     distinct_h_lvls = [i for n, i in enumerate(h_lvl) if i not in h_lvl[:n]]
#     distinct_l_lvls = [i for n, i in enumerate(l_lvl) if i not in l_lvl[:n]]
#     distinct_user_sessions = [i for n, i in enumerate(user_session) if i not in user_session[:n]]
    
#     node_dict = {'id':[f'customer_{iloc_num}'] + distinct_product_ids}
    node_dict = {'id':distinct_product_ids}
    return node_dict

In [34]:
edge_maker_sm(training_data, 1, 'l_lvl')

{'source': ['audio',
  'shoes',
  'shoes',
  'shoes',
  'shoes',
  'shoes',
  'shoes',
  'shoes',
  'shoes',
  'shoes'],
 'target': ['shoes',
  'shoes',
  'shoes',
  'shoes',
  'shoes',
  'shoes',
  'shoes',
  'shoes',
  'shoes',
  'shoes'],
 'weight': [129.0,
  44.0,
  22.0,
  77.0,
  23.0,
  30.0,
  137.0,
  370.0,
  217.0,
  967902.0],
 'event_type': ['view',
  'view',
  'view',
  'view',
  'view',
  'cart',
  'purchase',
  'view',
  'view',
  'view']}

In [35]:
def edge_maker(df, node_category):
    source_target_dict = {'source':[], 'target':[], 'weight':[], 'event_type':[]}
    for x in range(len(df)):
        product_id_list = df.iloc[x][node_category]
        for q in df.iloc[x]['time_between']:
            source_target_dict['weight'].append(q)
    #     source_target_dict = {'source':[], 'target':[], 'weight':[0] + time_on_page, 'attributes':event_type}
        
        for i, j in enumerate(product_id_list):
            if i == 0:
    #             source_target_dict['source'].append(f'customer_{iloc_num}')
    #             source_target_dict['target'].append(f'{product_id_list[i+1]}')
                pass
            else:
                source_target_dict['source'].append(f'{product_id_list[i-1]}')
                source_target_dict['target'].append(f'{j}')
                source_target_dict['event_type'].append(df.iloc[x]['event_type'][i])
    return source_target_dict


def node_maker(df):
    product_id = df.iloc[iloc_num]['product_id']
#     time_on_page = df.iloc[iloc_num]['category_id']
#     h_lvl = df.iloc[iloc_num]['h_lvl']
#     l_lvl = df.iloc[iloc_num]['l_lvl']
#     user_session = df.iloc[iloc_num]['user_session']
    
    distinct_product_ids = [str(i) for n, i in enumerate(product_id) if i not in product_id[:n]]
#     distinct_category_ids = [i for n, i in enumerate(category_id) if i not in category_id[:n]]
#     distinct_h_lvls = [i for n, i in enumerate(h_lvl) if i not in h_lvl[:n]]
#     distinct_l_lvls = [i for n, i in enumerate(l_lvl) if i not in l_lvl[:n]]
#     distinct_user_sessions = [i for n, i in enumerate(user_session) if i not in user_session[:n]]
    
#     node_dict = {'id':[f'customer_{iloc_num}'] + distinct_product_ids}
    node_dict = {'id':distinct_product_ids}
    return node_dict

In [36]:
ex_dict = edge_maker(training_data, 'h_lvl_id_new_labels')

In [37]:
len(ex_dict['source'])

5131363

In [38]:
len(ex_dict['target'])

5131363

In [39]:
len(ex_dict['weight'])

5131363

In [40]:
len(ex_dict['event_type'])

5131363

In [41]:
edges = pd.DataFrame(edge_maker(training_data, 'h_lvl_id_new_labels'))
edges

Unnamed: 0,source,target,weight,event_type
0,0,0,14.0,view
1,0,0,30.0,view
2,0,0,8.0,view
3,0,0,11.0,view
4,0,0,10.0,view
...,...,...,...,...
5131358,0,0,247386.0,view
5131359,0,0,74.0,view
5131360,0,0,32.0,view
5131361,0,0,345713.0,view


In [42]:
# nodes = pd.DataFrame(node_maker(1), index=[node_maker(1)['id']]).drop(['id'], axis=1)
# nodes

In [43]:
end = datetime.now()
print(end)
print(end-start)

2021-07-08 17:36:26.354152
0:52:17.122967


In [48]:
def pack_dataset(df): 
    for i, j in enumerate(df.columns):
        if type(df.iloc[0][j]) == list:
            df[j] = df[j].apply(lambda x: ','.join([str(i) for i in x]))
        else:
            pass
    return df
                                
training_save = pack_dataset(training_data)
validation_save = pack_dataset(validation_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[j] = df[j].apply(lambda x: ','.join([str(i) for i in x]))


In [49]:
training_save.head()

Unnamed: 0_level_0,event_time,event_type,product_id,category_id,category_code,brand,price,user_session,h_lvl,l_lvl,...,rml_category_id,rml_category_code,rml_brand,rml_user_session,rml_h_lvl,rml_l_lvl,rml_time_between,h_lvl_id_new_labels,next_h_lvl,rml_h_lvl_id_new_labels
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
567069554,"2019-11-08 09:42:27+00:00,2019-11-08 09:42:41+...","view,view,view,view,view,view,view,cart,view,c...","1005115,1005115,1004249,1004249,1004250,100425...","2053013555631882655,2053013555631882655,205301...","electronics.smartphone,electronics.smartphone,...","apple,apple,apple,apple,apple,apple,apple,appl...","915.08,915.08,739.79,739.79,814.56,814.56,1091...","629faf75-5107-41d6-9806-123eb20676f9,629faf75-...","electronics,electronics,electronics,electronic...","smartphone,smartphone,smartphone,smartphone,sm...",...,"2053013555631882655,2053013555631882655,205301...","electronics.smartphone,electronics.smartphone,...","915.08,915.08,739.79,739.79,814.56,814.56,1091...","629faf75-5107-41d6-9806-123eb20676f9,629faf75-...","electronics,electronics,electronics,electronic...","smartphone,smartphone,smartphone,smartphone,sm...","14.0,30.0,8.0,11.0,10.0,10.0,8.0,6.0,7.0,15238...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",0,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
559077481,"2019-11-12 07:48:18+00:00,2019-11-12 07:50:27+...","view,view,view,view,view,view,cart,purchase,vi...","4804660,28721804,28717827,28703606,28717791,28...","2053013554658804075,2053013565069067197,205301...","electronics.audio.headphone,apparel.shoes.keds...","xiaomi,adidas,adidas,reebok,puma,reebok,reebok...","23.09,102.71,102.71,118.15,84.69,118.15,118.15...","2fedbf73-0fcd-4c54-95b7-0fd85cb17d67,2fedbf73-...","electronics,apparel,apparel,apparel,apparel,ap...","audio,shoes,shoes,shoes,shoes,shoes,shoes,shoe...",...,"2053013554658804075,2053013565069067197,205301...","electronics.audio.headphone,apparel.shoes.keds...","23.09,102.71,102.71,118.15,84.69,118.15,118.15...","2fedbf73-0fcd-4c54-95b7-0fd85cb17d67,2fedbf73-...","electronics,apparel,apparel,apparel,apparel,ap...","audio,shoes,shoes,shoes,shoes,shoes,shoes,shoe...","129.0,44.0,22.0,77.0,23.0,30.0,137.0,370.0,217.0",01111111111,1,0111111111
513711707,"2019-11-13 10:32:21+00:00,2019-11-15 10:04:22+...","view,view,view,view,view,view,view,view","1005284,1801766,1801631,1801631,1801785,680107...","2053013555631882655,2053013554415534427,205301...","electronics.smartphone,electronics.video.tv,el...","samsung,artel,artel,artel,lg,apacer,apacer,huawei","2562.49,154.19,334.11,334.11,447.63,33.21,33.2...","88c7c83c-e255-49ed-b707-f73c7d737e88,2168a15a-...","electronics,electronics,electronics,electronic...","smartphone,video,video,video,video,components,...",...,"2053013555631882655,2053013554415534427,205301...","electronics.smartphone,electronics.video.tv,el...","2562.49,154.19,334.11,334.11,447.63,33.21,33.21","88c7c83c-e255-49ed-b707-f73c7d737e88,2168a15a-...","electronics,electronics,electronics,electronic...","smartphone,video,video,video,video,components,...","171121.0,5863.0,33.0,294.0,138645.0,20.0",00000220,0,0000022
523379089,"2019-11-02 17:05:40+00:00,2019-11-02 17:06:43+...","view,view,view,view,view,view,view,view,view,p...","28719634,28717035,28715765,28719086,28714060,2...","2053013565069067197,2053013565069067197,205301...","apparel.shoes.keds,apparel.shoes.keds,apparel....","fassen,strobbs,nexpero,fassen,nike,nike,fassen...","44.79,48.39,75.42,44.79,55.39,55.39,44.79,44.7...","69cfd1b9-ea8a-4fb6-bc10-caae9928ad4c,69cfd1b9-...","apparel,apparel,apparel,apparel,apparel,appare...","shoes,shoes,shoes,shoes,shoes,shoes,shoes,shoe...",...,"2053013565069067197,2053013565069067197,205301...","apparel.shoes.keds,apparel.shoes.keds,apparel....","44.79,48.39,75.42,44.79,55.39,55.39,44.79,44.7...","69cfd1b9-ea8a-4fb6-bc10-caae9928ad4c,69cfd1b9-...","apparel,apparel,apparel,apparel,apparel,appare...","shoes,shoes,shoes,shoes,shoes,shoes,shoes,shoe...","63.0,22.0,67.0,97.0,26.0,29.0,9.0,35.0,112.0,5...",1111111111100333,3,111111111110033
515367279,"2019-11-12 19:14:01+00:00,2019-11-12 19:14:21+...","view,view,view,view,view,view,view,view,view,v...","13201323,13201323,13201315,13201315,100016372,...","2053013557192163841,2053013557192163841,205301...","furniture.bedroom.bed,furniture.bedroom.bed,fu...","stendmebel,stendmebel,stendmebel,stendmebel,ik...","222.37,222.37,161.91,161.91,58.4,95.79,105.28,...","873d5eff-3160-4633-bdbe-0c8c7d7f4cb1,873d5eff-...","furniture,furniture,furniture,furniture,furnit...","bedroom,bedroom,bedroom,bedroom,bedroom,bedroo...",...,"2053013557192163841,2053013557192163841,205301...","furniture.bedroom.bed,furniture.bedroom.bed,fu...","222.37,222.37,161.91,161.91,58.4,95.79,105.28,...","873d5eff-3160-4633-bdbe-0c8c7d7f4cb1,873d5eff-...","furniture,furniture,furniture,furniture,furnit...","bedroom,bedroom,bedroom,bedroom,bedroom,bedroo...","20.0,24.0,328315.0,1095684.0,33.0,29.0,60.0,64...",44444444444,4,4444444444


In [50]:
edges.to_csv('../h_lvl_graph.csv', index=False)

In [51]:
validation_save.to_csv('../validation_data.csv', index=False)

In [52]:
training_save.to_csv('../training_data.csv', index=False)