### Pre-processing steps based on EDA

In [1]:
import pandas as pd
from datetime import datetime

### 0 Loading data

In [2]:
# Load the raw training users table; the trailing expression displays its (rows, columns) shape.
df = pd.read_csv('../data/original/train_users_2.csv')
df.shape

(213451, 16)

In [3]:
# Parse the date columns of the training frame with the vectorized pandas parser
# instead of calling datetime.strptime once per row.
df['date_account_created'] = pd.to_datetime(df['date_account_created'], format='%Y-%m-%d')
# timestamp_first_active is cast to str first so it can be parsed as YYYYmmddHHMMSS.
df['timestamp_first_active'] = pd.to_datetime(
    df['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S'
)
# Missing booking dates become NaT, so no explicit null check is needed.
df['date_first_booking'] = pd.to_datetime(df['date_first_booking'], format='%Y-%m-%d')

In [4]:
# Preview the first rows of the training frame after date parsing.
df.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,2009-03-19 04:32:55,NaT,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,2009-05-23 17:48:09,NaT,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,2009-06-09 23:12:47,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,2009-10-31 06:01:29,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,2009-12-08 06:11:05,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [5]:
# Load the raw test users table; the trailing expression displays its (rows, columns) shape.
# NOTE(review): unlike `df`, the date columns of `test` are never parsed in this
# notebook, so they are written to parquet in their raw CSV form — confirm that
# downstream code expects the train/test dtypes to differ.
test = pd.read_csv('../data/original/test_users.csv')
test.shape

(62096, 15)

In [6]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


In [7]:
# Load the raw sessions log; the trailing expression displays its (rows, columns) shape.
ss = pd.read_csv('../data/original/sessions.csv')
ss.shape

(10567737, 6)

In [8]:
# Preview the first rows of the sessions log.
ss.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


### 1 Checking whether sessions are present for all users. 
### It appears that sessions data is available for only around 34% of users

In [9]:
# Training users that also appear in the sessions log.
train_user_ids = set(df.id.tolist())
session_user_ids = set(ss.user_id.tolist())
ids = list(train_user_ids & session_user_ids)
len(ids)

73815

In [10]:
# Share of distinct training users that have sessions, in percent.
100 * len(ids) / df.id.nunique()

34.581707277079985

### 2 Checking whether sessions are present for all **TEST** users. 
### It appears that sessions data is available for almost all test users

In [11]:
# Test users that also appear in the sessions log.
test_user_ids = set(test.id.tolist())
session_user_ids = set(ss.user_id.tolist())
ids = list(test_user_ids & session_user_ids)
len(ids)

61668

In [12]:
# Share of distinct test users that have sessions, in percent.
100 * len(ids) / test.id.nunique()

99.31074465343984

### 3 Dropping redundant data

### 3.1 We need to drop users with no sessions

In [13]:
# Keep only training users that have at least one row in the sessions log,
# then renumber the index from zero.
ids = set(df.id.tolist()) & set(ss.user_id.tolist())
has_sessions = df.id.isin(ids)
df = df.loc[has_sessions].reset_index(drop=True)
df.shape

(73815, 16)

### 3.2 We need to drop date_first_booking, as it is null for the entire test set

In [14]:
# Compare the number of null booking dates in the test set with its row count.
(test['date_first_booking'].isna().sum(), len(test))

(62096, 62096)

In [15]:
# Remove the booking-date column from both user tables and show the new shapes.
df = df.drop(columns='date_first_booking')
test = test.drop(columns='date_first_booking')
(df.shape, test.shape)

((73815, 15), (62096, 14))

### 3.3 We need to drop sessions without any user_ids

In [16]:
# Number of session rows that carry no user id.
ss['user_id'].isna().sum()

34496

In [17]:
# Discard session rows without a user id and renumber the index from zero.
has_user_id = ss.user_id.notna()
ss = ss.loc[has_user_id].reset_index(drop=True)
ss.shape

(10533241, 6)

### 3.4 Checking whether all action, action_types etc are present in both datasets

### 3.4.1 Dropping all actions that are not present in both datasets

In [36]:
# Combine action, action_type and action_detail into one "<action>_<type>_<detail>" key.
# Iterating the three columns in lockstep avoids a row-wise apply (very slow on
# millions of rows) and the deprecated positional x[0] lookup on a label-indexed
# row. Missing values are still rendered as the literal text "nan".
ss['action_info'] = [
    f'{action}_{action_type}_{action_detail}'
    for action, action_type, action_detail in zip(
        ss['action'], ss['action_type'], ss['action_detail']
    )
]

In [47]:
# Spot-check 20 random session rows; no random_state is passed, so the rows differ between runs.
ss.sample(20)

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed,action_info
8087529,3v9l3i09mw,show,,,Mac Desktop,48.0,show_nan_nan
5952163,masrzmahd8,ajax_refresh_subtotal,click,change_trip_characteristics,Mac Desktop,286.0,ajax_refresh_subtotal_click_change_trip_charac...
6341514,y3wl0scef6,similar_listings_v2,,,Mac Desktop,353.0,similar_listings_v2_nan_nan
7998812,teucumzg02,search,click,view_search_results,iPhone,7909.0,search_click_view_search_results
2939320,5kpw1g13hn,show,view,p3,Windows Desktop,850.0,show_view_p3
2350888,az6ws02xkg,notifications,data,notifications,iPhone,3159.0,notifications_data_notifications
3092163,hg0wsu7lly,personalize,data,wishlist_content_update,Windows Desktop,2636.0,personalize_data_wishlist_content_update
1880801,j03vciujqh,show,,,iPad Tablet,780.0,show_nan_nan
8981781,ribag1qz1a,set_user,submit,create_listing,Mac Desktop,723.0,set_user_submit_create_listing
3496326,dk6u4oi2p3,show,,,Android Phone,1271.0,show_nan_nan


In [48]:
# Sessions belonging to the remaining training users, with a fresh zero-based index.
belongs_to_train = ss.user_id.isin(df.id.tolist())
ss_train = ss.loc[belongs_to_train].reset_index(drop=True)
ss_train.shape

(5537957, 7)

In [49]:
# Sessions belonging to test users, with a fresh zero-based index.
belongs_to_test = ss.user_id.isin(test.id.tolist())
ss_test = ss.loc[belongs_to_test].reset_index(drop=True)
ss_test.shape

(4995284, 7)

In [50]:
# Distinct-value count per column for the training sessions.
ss_train.nunique()

user_id           73815
action              331
action_type           9
action_detail       128
device_type          14
secs_elapsed     256593
action_info         401
dtype: int64

In [51]:
# Distinct-value count per column for the test sessions.
ss_test.nunique()

user_id           61668
action              311
action_type          10
action_detail       153
device_type          14
secs_elapsed     223462
action_info         392
dtype: int64

In [52]:
# action_info values that occur in both the training and the test sessions.
action_info_set = set(ss_train.action_info.unique()).intersection(set(ss_test.action_info.unique()))
# Report the size of the set built above. The previous `actions_set` name is not
# defined anywhere in this notebook and raises NameError on a fresh kernel.
len(action_info_set)

336

In [53]:
# Keep only training session rows whose action_info also occurs in the test sessions.
is_shared_action = ss_train.action_info.isin(action_info_set)
ss_train = ss_train.loc[is_shared_action].reset_index(drop=True)
ss_train.shape

(5494799, 7)

In [54]:
# Keep only test session rows whose action_info also occurs in the training sessions.
is_shared_action = ss_test.action_info.isin(action_info_set)
ss_test = ss_test.loc[is_shared_action].reset_index(drop=True)
ss_test.shape

(4934245, 7)

### 4. Saving data

In [60]:
# Persist the filtered user tables (sessionless training users removed,
# date_first_booking dropped from both) as parquet.
df.to_parquet('../data/processed/train_users_2.parquet')
test.to_parquet('../data/processed/test_users.parquet')

In [61]:
# Persist the per-split session logs, restricted to action_info values present in both splits.
ss_train.to_parquet('../data/processed/sessions_train.parquet')
ss_test.to_parquet('../data/processed/sessions_test.parquet')