### Pre-processing steps based on EDA

In [1]:
import pandas as pd
from datetime import datetime
from tqdm.notebook import tqdm

tqdm.pandas()

### 0 Loading data

In [2]:
# Read the raw train and test user tables from the original-data folder,
# then display both shapes as a quick sanity check.
train_users, test_users = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/original/train_users_2.csv',
        '../data/original/test_users.csv',
    )
)
train_users.shape, test_users.shape

((213451, 16), (62096, 15))

In [5]:
# Stack train and test users into a single frame so they are cleaned together.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the test rows reuse index labels already taken by train rows, and those
# duplicate labels would be carried into every later step and the saved file.
users = pd.concat([train_users, test_users], axis=0, ignore_index=True)
users.shape

(275547, 16)

In [6]:
# Mark the origin of every row: 1 for train users, 0 when the id is listed in
# the test file. The mask is converted to a plain array so the assignment is
# purely positional.
is_test_user = users.id.isin(test_users.id)
users['train_flag'] = (~is_test_user).astype('int64').to_numpy()
users.train_flag.value_counts()

1    213451
0     62096
Name: train_flag, dtype: int64

In [7]:
# Parse the date columns with vectorized pd.to_datetime rather than calling
# datetime.strptime once per row; the explicit formats are the same as before.
users['date_account_created'] = pd.to_datetime(users['date_account_created'], format='%Y-%m-%d')
# timestamp_first_active holds digit strings such as 20140701000006 that are
# not stored as text, so cast to str before parsing.
users['timestamp_first_active'] = pd.to_datetime(users['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S')
# Missing booking dates are converted to NaT by pd.to_datetime.
users['date_first_booking'] = pd.to_datetime(users['date_first_booking'], format='%Y-%m-%d')

In [8]:
# Preview the first rows to confirm the date columns were parsed.
users.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,train_flag
0,gxn3p5htnn,2010-06-28,2009-03-19 04:32:55,NaT,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF,1
1,820tgsjxq7,2011-05-25,2009-05-23 17:48:09,NaT,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF,1
2,4ft3gnwmtx,2010-09-28,2009-06-09 23:12:47,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US,1
3,bjjt8pjhuk,2011-12-05,2009-10-31 06:01:29,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other,1
4,87mebub9p4,2010-09-14,2009-12-08 06:11:05,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US,1


In [9]:
# Load the raw per-action session log and display its shape.
sessions = pd.read_csv('../data/original/sessions.csv')
sessions.shape

(10567737, 6)

In [10]:
# Preview the first rows of the session log.
sessions.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


### 1 Checking whether sessions are present for all users. 
### It appears that sessions data is available for only about 34% of train users

In [11]:
# Train user ids that also occur in the session log, plus their share (in %)
# of all distinct train ids.
ids = set(train_users.id) & set(sessions.user_id)
len(ids), 100 * len(ids) / train_users.id.nunique()

(73815, 34.581707277079985)

### 2 Checking whether sessions are present for all **TEST** users. 
### It appears that sessions data is available for almost all test users

In [12]:
# Test user ids that also occur in the session log, plus their share (in %)
# of all distinct test ids.
ids = set(test_users.id).intersection(sessions.user_id)
len(ids), 100 * len(ids) / test_users.id.nunique()

(61668, 99.31074465343984)

### 3 Dropping redundant data

### 3.1 We need to drop users with no sessions

In [13]:
# Shape of the combined user table before any columns are dropped.
users.shape

(275547, 17)

In [14]:
# Number of distinct test users that have no row in the session log.
test_users.loc[~test_users.id.isin(sessions.user_id), 'id'].nunique()

428

### 3.1.1 Decided to keep users without sessions, as dropping them would have removed 428 test users

In [15]:
# users = users[users.id.isin(sessions.user_id)]
# users.reset_index(drop=True, inplace=True)
# users.shape

### 3.2 We need to drop date_first_booking, as it is null for test set

In [16]:
# Remove date_first_booking by rebinding `users` to the reduced frame
# (no in-place mutation), then show the new shape.
users = users.drop(columns=['date_first_booking'])
users.shape

(275547, 16)

### 3.2.1 Need to update train_users and test_users dataframes after cleaning
No longer needed, as we did not drop any users

In [17]:
# train_users.shape, test_users.shape

In [15]:
# train_users = train_users[train_users.id.isin(sessions.user_id)]
# train_users.reset_index(drop=True, inplace=True)
# train_users.shape

In [None]:
# test_users = test_users[test_users.id.isin(sessions.user_id)]
# test_users.reset_index(drop=True, inplace=True)
# test_users.shape

### 3.3 We need to drop sessions without any user_ids

In [20]:
# Two counts over the session rows: rows whose user_id is missing, and rows
# whose user_id is not found in the combined user table.
missing_user_id = sessions['user_id'].isna()
unknown_user_id = ~sessions['user_id'].isin(users['id'])
missing_user_id.sum(), unknown_user_id.sum()

(34496, 34496)

In [21]:
# Discard session rows that have no user_id and renumber the remaining rows
# from zero, in one chained expression.
sessions = sessions.dropna(subset=['user_id']).reset_index(drop=True)
sessions.shape

(10533241, 6)

### 3.4 Checking whether all action, action_types etc are present in both datasets

### 3.4.1 Concatenating action, action_type, action_detail, and splitting into train and test sessions set

In [23]:
# Build a combined key "<action>_<action_type>_<action_detail>".
# Each part is cast to str first, so a missing value contributes the text 'nan'.
action_parts = [sessions[part].astype(str) for part in ('action', 'action_type', 'action_detail')]
sessions['action_info'] = action_parts[0].str.cat(action_parts[1:], sep='_')

In [24]:
# Preview the session log with the new action_info column.
sessions.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed,action_info
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0,lookup_nan_nan
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0,search_results_click_view_search_results
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0,lookup_nan_nan
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0,search_results_click_view_search_results
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0,lookup_nan_nan


In [25]:
# Distinct values per column over all remaining sessions (nunique ignores NaN).
sessions.nunique()

user_id          135483
action              359
action_type          10
action_detail       155
device_type          14
secs_elapsed     337188
action_info         457
dtype: int64

In [None]:
# Split the cleaned session log by the user table each user_id belongs to.
# sessions_train is created here because later cells (its nunique summary and
# the train/test category comparison) read it, and no other visible cell
# defines it -- on a fresh kernel those cells would raise NameError.
sessions_train = sessions[sessions.user_id.isin(train_users.id)].reset_index(drop=True)
sessions_test = sessions[sessions.user_id.isin(test_users.id)].reset_index(drop=True)
sessions_test.shape

(4995284, 7)

In [None]:
# Distinct values per column in sessions_train (nunique ignores NaN).
sessions_train.nunique()

user_id           73815
action              331
action_type           9
action_detail       128
device_type          14
secs_elapsed     256593
action_info         401
dtype: int64

In [21]:
# Distinct values per column in sessions_test (nunique ignores NaN).
sessions_test.nunique()

user_id           61668
action              311
action_type          10
action_detail       153
device_type          14
secs_elapsed     223462
action_info         392
dtype: int64

### 3.5 Checking action, action_type, action_detail, device_type, action_info unique values for test and train users

In [26]:
# Categorical session columns whose value sets are compared between train and test.
cols = ['action', 'action_type', 'action_detail', 'device_type', 'action_info']

In [27]:
# For every categorical column, compare the values seen in the train sessions
# with those seen in the test sessions and report what each side is missing.
# The sets come from .unique(), which keeps NaN as a value.
for col in cols:
    train_set, test_set = (
        set(subset[col].unique()) for subset in (sessions_train, sessions_test)
    )
    if train_set == test_set:
        continue  # same values on both sides, nothing to report
    print(f'Discrepancy found for: {col}')
    print(f'train size: {len(train_set)}, test size: {len(test_set)}')
    print(f'Present in train but missing in test:\n{train_set - test_set}')
    print(f'Present in test but missing in train:\n{test_set - train_set}')

Discrepancy found for: action
train size: 332, test size: 312
Present in train but missing in test:
{'multi', 'plaxo_cb', 'concierge', 'confirmation', 'multi_message_attributes', 'onenight', 'sldf', 'reactivate', 'friend_listing', 'ajax_photo_widget', 'airbrb', 'ajax_send_message', 'travel', 'use_mobile_site', 'deauthorize', 'host_2013', 'views_campaign', 'faq_experiment_ids', 'update_message', 'ajax_get_referrals_amt', 'unread', 'ajax_get_results', 'views_campaign_rules', 'google_importer', 'nyan', 'pricing', 'locale_from_host', 'wishlists', 'apply', 'ajax_referral_banner_type', 'disaster_action', 'tos_2014', 'recommended_listings', 'desks', 'press_content', 'message', 'popular_listing', 'press_release', 'ajax_referral_banner_experiment_type', 'hosting_social_proof', 'multi_message', 'relationship', 'new_host', 'widget', 'ajax_ldp', 'hospitality_standards', 'rentals', 'deactivate'}
Present in test but missing in train:
{'report', 'host_cancel', 'view', 'acculynk_pin_pad_error', 'busin

### 3.5.1 Because discrepancies were found, we initially kept only those sessions whose action_info appears in both sets
Not doing that any longer, as it might drop all sessions of 4 users, resulting in a total decrease of 432 users (428 + 4)

In [28]:
# action_info_set = set(sessions_train.action_info.unique()).intersection(set(sessions_test.action_info.unique()))
# len(action_info_set)

In [29]:
# unique_user_ids = sessions.user_id.nunique()
# unique_user_ids

In [30]:
# sessions = sessions[sessions.action_info.isin(action_info_set)]
# sessions.reset_index(drop=True, inplace=True)
# sessions.shape

### 3.5.3 Checking if the submission files are missing 4 test_users

In [38]:
# Load a previously generated submission file to check which test users it covers.
submission = pd.read_csv('../data/results/submission1.csv')
submission.shape

(61664, 2)

In [41]:
# Preview the submission rows (id and predicted country).
submission.head()

Unnamed: 0,id,country
0,5uwns89zht,NDF
1,jtl0dijy2j,NDF
2,xx0ulgorjt,NDF
3,6c6puo6ix0,NDF
4,czqhjk3yfe,US


In [39]:
# Re-read the untouched test user file (same path as test_users above) to
# compare its ids against the submission.
original_test_users = pd.read_csv('../data/original/test_users.csv')
original_test_users.shape

(62096, 15)

In [42]:
# Preview the raw test user rows.
original_test_users.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


### It appears that there are 432 test users for whom we did not submit any results
Conclusion: do not drop non-intersecting action_info values or users without any sessions

In [43]:
# len(set(original_test_users.id.unique()) - set(submission.id.unique()))

432

In [44]:
# len(set(submission.id.unique()) - set(original_test_users.id.unique()))

0

In [53]:
# ss_train = ss_train[ss_train.action_info.isin(action_info_set)]
# ss_train.reset_index(drop=True, inplace=True)
# ss_train.shape

(5494799, 7)

In [54]:
# ss_test = ss_test[ss_test.action_info.isin(action_info_set)]
# ss_test.reset_index(drop=True, inplace=True)
# ss_test.shape

(4934245, 7)

### 4. Saving data

In [26]:
# Persist the cleaned session log (null user_ids removed, action_info added).
sessions.to_parquet('../data/processed/sessions.parquet')

In [27]:
# Persist the combined user table (train_flag added, date_first_booking dropped).
users.to_parquet('../data/processed/users.parquet')