In [1]:
from tqdm.notebook import tqdm 
import os

import time
import random
import numpy as np
import pandas as pd
import datetime as dt

from multiprocessing import Pool

In [2]:
train = pd.read_csv('./data/train.csv')
train.head()

Unnamed: 0,session_id,dt,reference
0,aff3928535f48,2018-11-01 01:58:42,666856
1,aff3928535f48,2018-11-01 01:58:42,666856
2,aff3928535f48,2018-11-01 01:58:52,666856
3,aff3928535f48,2018-11-01 01:58:52,109038
4,aff3928535f48,2018-11-01 01:58:52,666856


In [3]:
# session_id 숫자화
f = train.groupby('session_id')['dt'].agg([('x','count')]).reset_index()
f = f.reset_index()
f = f.rename(columns={'index':'SessionId'})
f = f.drop(columns = ['x'], axis=1)
train = pd.merge(train,f, on='session_id')
train = train.drop(columns = ['session_id'], axis=1)
train 

Unnamed: 0,dt,reference,SessionId
0,2018-11-01 01:58:42,666856,283862
1,2018-11-01 01:58:42,666856,283862
2,2018-11-01 01:58:52,666856,283862
3,2018-11-01 01:58:52,109038,283862
4,2018-11-01 01:58:52,666856,283862
...,...,...,...
6214447,2018-11-03 13:23:17,360506,364959
6214448,2018-11-03 21:11:08,8260058,358614
6214449,2018-11-03 21:13:05,1105742,358614
6214450,2018-11-03 23:00:37,1131953,313064


In [4]:
test = pd.read_csv('./data/test.csv')
test.head() 

Unnamed: 0,session_id,dt,reference
0,c65d7bed8c83c,2018-11-04 05:50:32,1934379
1,c65d7bed8c83c,2018-11-04 05:52:56,8830114
2,c65d7bed8c83c,2018-11-04 05:52:56,8830114
3,c65d7bed8c83c,2018-11-04 05:53:06,8830114
4,c65d7bed8c83c,2018-11-04 05:53:06,8830114


In [5]:
# session_id 숫자화
f = test.groupby('session_id')['dt'].agg([('x','count')]).reset_index()
f = f.reset_index()
f = f.rename(columns={'index':'SessionId'})
f = f.drop(columns = ['x'], axis=1)
test = pd.merge(test,f, on='session_id')
test = test.drop(columns = ['session_id'], axis=1)
test 

Unnamed: 0,dt,reference,SessionId
0,2018-11-04 05:50:32,1934379,70359
1,2018-11-04 05:52:56,8830114,70359
2,2018-11-04 05:52:56,8830114,70359
3,2018-11-04 05:53:06,8830114,70359
4,2018-11-04 05:53:06,8830114,70359
...,...,...,...
2218770,2018-11-04 08:08:19,103651,15079
2218771,2018-11-04 08:18:54,103651,15079
2218772,2018-11-04 08:19:19,103651,15079
2218773,2018-11-04 08:19:30,103651,15079


In [6]:
print('train SessionId min {} max {}'.format(train['SessionId'].min(), train['SessionId'].max()))

test['SessionId'] = test['SessionId'] + (train['SessionId'].max() + 1)
print('test SessionId min {} max {}'.format(test['SessionId'].min(), test['SessionId'].max()))

train SessionId min 0 max 412726
test SessionId min 412727 max 503460


In [7]:
## parameters
sampling= False
sample_rate = 0.1 
single_process=True
file_type = "sample" if sampling==True else "full"
file_type 

'full'

In [8]:
# ## sampling 
# ### raw data의 수가 많으므로 tutorial을 원활히 수행하기 위해,
# ### sessionId 기준으로 샘플링을 수행한다.
# random.seed(1050)
# if sampling:
#     u_sessid = raw_data.SessionId.unique()
#     s_sessid = random.sample(u_sessid.tolist(), int(len(u_sessid)*sample_rate))
#     raw_data = raw_data[np.in1d(raw_data.SessionId, s_sessid)]
# raw_data.shape

In [9]:
## multi processing: transpose timestr to timestamp
num_cores = 8

def timestr_to_timestamp(df):
    df['timestamp'] = df['dt'].apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').timestamp())
    del(df['dt'])
    return df

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_cores)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df 

In [11]:
train.shape, test.shape

((6214452, 3), (2218775, 3))

In [12]:
data = pd.concat([train,test])
data.shape

(8433227, 3)

In [13]:
## transpose timestr to timestamp
if single_process:
    ## single processing
    %time data['timestamp'] = data['dt'].apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').timestamp())
    data = data
    del(data['dt']) 
else:
    ## multi processing
    %time data = parallelize_dataframe(data, timestr_to_timestamp)
    del(data['dt'])

CPU times: user 1min 21s, sys: 582 ms, total: 1min 21s
Wall time: 1min 22s


In [14]:
data

Unnamed: 0,reference,SessionId,timestamp
0,666856,283862,1.541005e+09
1,666856,283862,1.541005e+09
2,666856,283862,1.541005e+09
3,109038,283862,1.541005e+09
4,666856,283862,1.541005e+09
...,...,...,...
2218770,103651,427806,1.541286e+09
2218771,103651,427806,1.541287e+09
2218772,103651,427806,1.541287e+09
2218773,103651,427806,1.541287e+09


In [15]:
data = data.rename(columns = {'reference':'ItemId'})
data = data.iloc[:,[1,0,2]]
data.head() 

Unnamed: 0,SessionId,ItemId,timestamp
0,283862,666856,1541005000.0
1,283862,666856,1541005000.0
2,283862,666856,1541005000.0
3,283862,109038,1541005000.0
4,283862,666856,1541005000.0


In [16]:
## sorted by sessionid, timestamp
%time data = data.sort_values(['SessionId','timestamp'])
data[:5]

CPU times: user 12.9 s, sys: 324 ms, total: 13.2 s
Wall time: 13.3 s


Unnamed: 0,SessionId,ItemId,timestamp
4614337,0,7281198,1541032000.0
2365167,1,979325,1541065000.0
3176102,2,445081,1541070000.0
3176103,2,445081,1541070000.0
3176104,2,445081,1541070000.0


In [17]:
## data length by sessionid
session_lengths = data.groupby('SessionId').size()
print("length:", len(session_lengths))
print("min length", min(session_lengths))
print("max length", max(session_lengths))

length: 503461
min length 1
max length 3328


In [18]:
## filter by session length
### session이 2이상인 데이터만 필터한다.
data = data[np.in1d(data.SessionId, session_lengths[session_lengths>=2].index)]

In [19]:
## data length by itemid
item_supports = data.groupby('ItemId').size()
print("length:", len(item_supports))
print("min length", min(item_supports))
print("max length", max(item_supports))

length: 238052
min length 1
max length 4814


In [20]:
## filter by item length
### item이 5이상인 데이터만 필터한다.
data = data[np.in1d(data.ItemId, item_supports[item_supports>=5].index)]

In [21]:
## filter by session length
### item에 의해 session length가 1인 id생길 수 있으므로 한번더 수행한다.
session_lengths = data.groupby('SessionId').size()
data = data[np.in1d(data.SessionId, session_lengths[session_lengths>=2].index)]

In [31]:
## split train & test set
### train index max 기준으로 분리 
train_max_ss = train['SessionId'].max()

### 학습 & 테스트 데이터 구성, 테스트 데이터의 아이템은 학습데이터에 있는 아이템만을 선택
train = data.loc[data['SessionId']<train_max_ss]
test = data.loc[data['SessionId']>train_max_ss]
print('train shape {} test shape {}'.format(train.shape, test.shape))

train shape (5900211, 3) test shape (2166566, 3)


In [32]:
## filter by session length
### test data의 item 필터 후 session length가 1이 될 수 있으므로, 필터링을 한번 더 수행
tslength = test.groupby('SessionId').size()
test = test[np.in1d(test.SessionId, tslength[tslength>=2].index)]

In [35]:
## save processed data
PATH_TO_PROCESSED_DATA = './data/processed/'
if not os.path.exists(PATH_TO_PROCESSED_DATA):
    os.mkdir(PATH_TO_PROCESSED_DATA)

print('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train), 
        train.SessionId.nunique(), train.ItemId.nunique()))
train.to_csv(PATH_TO_PROCESSED_DATA + 'trivago_train_{}.txt'.format(file_type), 
             sep='\t', index=False)
print(PATH_TO_PROCESSED_DATA + 'trivago_train_{}.txt'.format(file_type))

print('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(test), 
        test.SessionId.nunique(), test.ItemId.nunique()))
test.to_csv(PATH_TO_PROCESSED_DATA + 'trivago_test_{}.txt'.format(file_type), 
            sep='\t', index=False)
print(PATH_TO_PROCESSED_DATA + 'trivago_test_{}.txt'.format(file_type))

Full train set
	Events: 5900211
	Sessions: 225223
	Items: 128580
./data/processed/trivago_train_full.txt
Test set
	Events: 2166566
	Sessions: 68863
	Items: 78682
./data/processed/trivago_test_full.txt


In [36]:
## make validation set
### 동일한 과정으로 train data에서 validation data를 분리한다.
tmax = train.timestamp.max() 
session_max_times = train.groupby('SessionId').timestamp.max()
session_train = session_max_times[session_max_times < tmax-86400].index
session_valid = session_max_times[session_max_times >= tmax-86400].index
train_tr = train[np.in1d(train.SessionId, session_train)]
valid = train[np.in1d(train.SessionId, session_valid)]
valid = valid[np.in1d(valid.ItemId, train_tr.ItemId)]
tslength = valid.groupby('SessionId').size()
valid = valid[np.in1d(valid.SessionId, tslength[tslength>=2].index)]

print('Train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train_tr), 
        train_tr.SessionId.nunique(), train_tr.ItemId.nunique()))
train_tr.to_csv(PATH_TO_PROCESSED_DATA + 'trivago_train_{}_trn.txt'.format(file_type), 
                sep='\t', index=False)
print(PATH_TO_PROCESSED_DATA + 'trivago_train_{}_trn.txt'.format(file_type))
print('Validation set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(valid), 
        valid.SessionId.nunique(), valid.ItemId.nunique()))
valid.to_csv(PATH_TO_PROCESSED_DATA + 'trivago_train_{}_valid.txt'.format(file_type), 
             sep='\t', index=False)
print(PATH_TO_PROCESSED_DATA + 'trivago_train_{}_valid.txt'.format(file_type))

Train set
	Events: 3997279
	Sessions: 153508
	Items: 112364
./data/processed/trivago_train_full_trn.txt
Validation set
	Events: 1627990
	Sessions: 65630
	Items: 61992
./data/processed/trivago_train_full_valid.txt


In [37]:
train_tr

Unnamed: 0,SessionId,ItemId,timestamp
3176102,2,445081,1.541070e+09
3176103,2,445081,1.541070e+09
3176104,2,445081,1.541070e+09
3176105,2,445081,1.541070e+09
3176106,2,445081,1.541070e+09
...,...,...,...
5349866,412720,1668601,1.541128e+09
5349867,412720,1668601,1.541128e+09
5349868,412720,1668601,1.541128e+09
5349869,412720,509846,1.541128e+09


In [38]:
valid

Unnamed: 0,SessionId,ItemId,timestamp
2173184,3,673981,1.541208e+09
2173185,3,673981,1.541208e+09
2173186,3,673981,1.541208e+09
2173187,3,10369176,1.541209e+09
2173188,3,10369176,1.541209e+09
...,...,...,...
3376372,412722,104632,1.541245e+09
3376373,412722,104632,1.541245e+09
3376374,412722,104632,1.541245e+09
3376375,412722,1244689,1.541245e+09
