In [1]:
import os
import pickle
import pandas as pd

In [2]:
# Input: raw October 2019 e-commerce event log (CSV), relative to this notebook.
raw_csv = "../../data/eComm-behavior/eCommerce-behavior-data-2019-Oct.csv"
# Output directory for the sampled CSV and the id-mapping pickles.
data_dir = "../dataset"

In [3]:
# Read the full raw event log into memory, then show (rows, columns).
raw_df = pd.read_csv(filepath_or_buffer=raw_csv)
raw_df.shape

(42448764, 9)

In [4]:
# Peek at the first two rows to check the column layout.
raw_df.head(n=2)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc


In [5]:
# Discard every row that has a missing value in any column.
raw_df = raw_df.dropna(axis="index", how="any")
raw_df.shape

(26560620, 9)

In [6]:
# Count the events recorded for each user ...
temp = (
    raw_df.groupby("user_id", as_index=False)["product_id"]
    .count()
    .rename(columns={"product_id": "total_interactions"})
)

# ... and attach that count to every event row of the same user.
raw_df = raw_df.merge(temp, on="user_id", how="left")
print(raw_df.shape)
raw_df.head(n=2)

(26560620, 10)


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,total_interactions
0,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,3
1,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,10


In [7]:
# Free the per-user count frame; it is no longer referenced after this point.
del temp

In [8]:
# Keep only events from users with 200 to 600 interactions (both bounds inclusive).
sample_df = raw_df.loc[lambda frame: frame["total_interactions"].between(200, 600)]
print(sample_df.shape)

(1527951, 10)


In [9]:
# df1: every cart / purchase event of the selected users.
df1 = sample_df[sample_df.event_type.isin(['cart', 'purchase'])]

# df2: a seeded random sample of view events, sized to match df1.
# The sample size is capped at the number of available view rows because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling is without replacement).
is_view = sample_df.event_type.isin(['view'])
df2 = sample_df[is_view].sample(
    n=min(len(df1), int(is_view.sum())), random_state=1)

In [10]:
# Stack cart/purchase events (df1) on top of the sampled views (df2), then drop
# repeated (event_time, user_id, product_id) triples. drop_duplicates keeps the
# first occurrence, so a df1 row wins over a df2 row with the same key.
sample_df = (
    pd.concat([df1, df2])
    .drop_duplicates(subset=['event_time', 'user_id', 'product_id'])
    .reset_index(drop=True)
)

# Convert the timestamp strings to integer nanoseconds since the Unix epoch in
# one vectorized pass (instead of parsing row by row). Dividing the offset from
# the epoch by a 1 ns Timedelta yields nanoseconds regardless of the datetime
# resolution pandas picks while parsing.
unix_epoch = pd.Timestamp('1970-01-01', tz='UTC')
sample_df['event_time'] = (
    (pd.to_datetime(sample_df['event_time'], utc=True) - unix_epoch)
    // pd.Timedelta(1, unit='ns')
)

print(sample_df.shape)
sample_df.head(2)

(126332, 10)


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,total_interactions
0,1569888134000000000,purchase,1004856,2053013555631882655,electronics.smartphone,samsung,130.76,543272936,8187d148-3c41-46d4-b0c0-9c08cd9dc564,205
1,1569896511000000000,cart,1005135,2053013555631882655,electronics.smartphone,apple,1747.79,515384420,7f82b450-6c45-4346-96fb-ecf4ab25779c,475


In [11]:
# Free the two intermediate event frames; they are not referenced after this point.
del df1, df2

In [12]:
# Break the dotted category path (e.g. "appliances.environment.water_heater")
# into its levels and keep the first two as cat_1 / cat_2.
# Splitting into lists and indexing with .str[i] avoids passing `n` positionally
# (keyword-only in pandas 2.x) and yields NaN for cat_2 when a code has no
# second level, instead of failing on a column-count mismatch.
category_levels = sample_df['category_code'].str.split('.')
sample_df['cat_1'] = category_levels.str[0]
sample_df['cat_2'] = category_levels.str[1]

In [13]:
# Check that the cat_1 / cat_2 columns look right on the first two rows.
sample_df.head(n=2)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,total_interactions,cat_1,cat_2
0,1569888134000000000,purchase,1004856,2053013555631882655,electronics.smartphone,samsung,130.76,543272936,8187d148-3c41-46d4-b0c0-9c08cd9dc564,205,electronics,smartphone
1,1569896511000000000,cart,1005135,2053013555631882655,electronics.smartphone,apple,1747.79,515384420,7f82b450-6c45-4346-96fb-ecf4ab25779c,475,electronics,smartphone


In [14]:
# Re-type user_id as categorical and expose its integer category code
# as user_id_num.
sample_df = sample_df.assign(
    user_id=lambda frame: frame['user_id'].astype('category'),
    user_id_num=lambda frame: frame['user_id'].cat.codes,
)
# Lookup table: integer code -> original user_id.
user_num_to_id = {
    code: original
    for code, original in zip(sample_df['user_id_num'], sample_df['user_id'])
}

In [15]:
# Re-type product_id as categorical and expose its integer category code
# as product_id_num.
sample_df = sample_df.assign(
    product_id=lambda frame: frame['product_id'].astype('category'),
    product_id_num=lambda frame: frame['product_id'].cat.codes,
)
# Lookup table: integer code -> original product_id.
item_num_to_id = {
    code: original
    for code, original in zip(sample_df['product_id_num'], sample_df['product_id'])
}

In [16]:
# Report the row count and the sizes of the two id lookup tables.
print(
    f"Sample data shape {len(sample_df)}",
    f"Unique users : {len(user_num_to_id)}, Unique items : {len(item_num_to_id)}",
    sep="\n",
)

Sample data shape 126332
Unique users : 5380, Unique items : 15286


In [17]:
# Make sure the output directory exists; otherwise open() below raises
# FileNotFoundError on a fresh checkout.
os.makedirs(data_dir, exist_ok=True)

# Persist both code -> original-id lookup tables as pickles.
for pickle_name, mapping in (
    ('user_num_to_id.pkl', user_num_to_id),
    ('item_num_to_id.pkl', item_num_to_id),
):
    with open(os.path.join(data_dir, pickle_name), 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the sampled, feature-augmented events without the positional index.
sample_df.to_csv(os.path.join(data_dir, 'sample_data.csv'), index=False)