In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw train and test user tables from the local data directory.
train_users, test_users = [
    pd.read_csv(csv_path)
    for csv_path in ('./data/train_users_2.csv', './data/test_users.csv')
]

In [3]:
# Sanity check: compare the number of distinct ids with each table's shape.
(
    len(train_users['id'].unique()),
    len(test_users['id'].unique()),
    train_users.shape,
    test_users.shape,
)

(213451, 62096, (213451, 16), (62096, 15))

In [4]:
# Stack train and test users into one frame so every cleaning and
# feature-engineering step below is applied to both sets identically.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of the two inputs are kept as-is and can repeat, which makes any
# later label-based lookup or index-aligned assignment ambiguous.
users = pd.concat((train_users, test_users), axis=0, ignore_index=True)
users.head()

Unnamed: 0,affiliate_channel,affiliate_provider,age,country_destination,date_account_created,date_first_booking,first_affiliate_tracked,first_browser,first_device_type,gender,id,language,signup_app,signup_flow,signup_method,timestamp_first_active
0,direct,direct,,NDF,2010-06-28,,untracked,Chrome,Mac Desktop,-unknown-,gxn3p5htnn,en,Web,0,facebook,20090319043255
1,seo,google,38.0,NDF,2011-05-25,,untracked,Chrome,Mac Desktop,MALE,820tgsjxq7,en,Web,0,facebook,20090523174809
2,direct,direct,56.0,US,2010-09-28,2010-08-02,untracked,IE,Windows Desktop,FEMALE,4ft3gnwmtx,en,Web,3,basic,20090609231247
3,direct,direct,42.0,other,2011-12-05,2012-09-08,untracked,Firefox,Mac Desktop,FEMALE,bjjt8pjhuk,en,Web,0,facebook,20091031060129
4,direct,direct,41.0,US,2010-09-14,2010-02-18,untracked,Chrome,Mac Desktop,-unknown-,87mebub9p4,en,Web,0,basic,20091208061105


In [5]:
# Treat the '-unknown-' placeholder in the gender column as a missing value.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the attribute-accessed Series: the in-place form
# mutates an intermediate object and is not guaranteed to write through to
# `users` (it silently does nothing under pandas Copy-on-Write).
users['gender'] = users['gender'].replace('-unknown-', np.nan)

In [6]:
# Parse the two date-like columns into pandas datetimes, each with its own
# explicit format string.
for date_col, date_fmt in (('date_account_created', '%Y-%m-%d'),
                           ('timestamp_first_active', '%Y%m%d%H%M%S')):
    users[date_col] = pd.to_datetime(users[date_col], format=date_fmt)
# NOTE(review): the disabled line below refers to `df_all`, which is not
# defined anywhere in the visible notebook -- fix the name before re-enabling.
#users['date_account_created'].fillna(df_all.timestamp_first_active, inplace=True)

In [7]:
# Nullify outliers: ages above 90 or below 15 are treated as missing.
# np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
users.loc[users['age'] > 90, 'age'] = np.nan
users.loc[users['age'] < 15, 'age'] = np.nan

# Verify the result: the first two counts should now be 0, the third is the
# total number of missing ages. print(...) with a single argument behaves the
# same on Python 2 and Python 3, unlike the Python-2-only print statement.
print(len(users[users['age'] > 90]))
print(len(users[users['age'] < 15]))
print(len(users[users['age'].isnull()]))

0
0
119853


In [8]:
# Derive calendar features from the two datetime columns.
# NOTE(review): the `dt` module alias imported here is not referenced in this
# cell; every `.dt` below is the pandas datetime accessor on a Series.
import datetime as dt

created = users['date_account_created']
first_active = users['timestamp_first_active']

# (new column, datetime-accessor attribute) pairs; weekday is stored as "day".
for column, part in (
    ('day_account_created', 'weekday'),
    ('month_account_created', 'month'),
    ('quarter_account_created', 'quarter'),
    ('year_account_created', 'year'),
):
    users[column] = getattr(created.dt, part)

for column, part in (
    ('hour_first_active', 'hour'),
    ('day_first_active', 'weekday'),
    ('month_first_active', 'month'),
    ('quarter_first_active', 'quarter'),
    ('year_first_active', 'year'),
):
    users[column] = getattr(first_active.dt, part)

# Whole-day component of (account creation date - first activity timestamp).
users['created_less_active'] = (created - first_active).dt.days

In [9]:
# Mark missing ages (including the outliers nullified earlier) with -1.
age_sentinel = {'age': -1}
users = users.fillna(value=age_sentinel)

In [10]:
# Index the combined users by id; this works on a copy so `users` is untouched.
t_users = users.copy().set_index('id')

# Column groups referenced when assembling model inputs.
cat_features = [
    'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked',
    'first_browser', 'first_device_type', 'gender', 'language',
    'signup_app', 'signup_flow', 'signup_method',
]
num_features = [
    'day_account_created', 'month_account_created',
    'quarter_account_created', 'year_account_created',
    'hour_first_active', 'day_first_active', 'month_first_active',
    'quarter_first_active', 'year_first_active',
    'created_less_active', 'age',
]
target = ['country_destination']

In [11]:
# Categorical columns to encode. Gender is left out here because it is carried
# separately (see other_features) until its missing values have been imputed.
nogender_cat_features = (
    ['affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked']
    + ['first_browser', 'first_device_type']
    + ['language']
    + ['signup_app', 'signup_flow', 'signup_method']
)

# Engineered date features and age, plus the raw gender column.
other_features = (
    ['day_account_created', 'month_account_created',
     'quarter_account_created', 'year_account_created']
    + ['hour_first_active', 'day_first_active', 'month_first_active',
       'quarter_first_active', 'year_first_active']
    + ['created_less_active', 'age']
    + ['gender']
)

In [12]:
# One-hot encode the non-gender categorical columns.
# NOTE(review): signup_flow appears un-expanded in later output, so it seems to
# pass through as a plain numeric column -- confirm that is intended.
categorical_frame = t_users[nogender_cat_features]
encode_users = pd.get_dummies(categorical_frame)

In [13]:
# Compare the encoded frame's shape with the numeric feature slice.
(
    encode_users.shape,
    t_users[num_features].shape,
)

((275547, 132), (275547, 11))

In [14]:
# Combine the encoded categoricals with the date features, age and raw gender,
# matching rows on the shared id index.
dates_age_gender = t_users[other_features]
df_users = pd.merge(
    encode_users,
    dates_age_gender,
    how='inner',
    left_index=True,
    right_index=True,
)

In [15]:
# Check the merged frame's dimensions (rows, columns).
df_users.shape

(275547, 144)

In [16]:
# Load the two extra per-user tables that are joined onto the user features.
devices_csv = './data/user_devices.csv'
session_csv = './data/user_session_detail.csv'
user_devices = pd.read_csv(devices_csv)
user_session = pd.read_csv(session_csv)

In [17]:
# Move the id index back into a regular column so it can be used as a join key.
# NOTE: not idempotent -- re-running this cell alone adds an extra index column.
df_users = df_users.reset_index(drop=False)

In [18]:
# Left-join both extra tables so every user row is kept even when there is no
# matching device/session record. Both tables carry a user_id column, so the
# second merge produces the suffixed user_id_x / user_id_y pair.
left_join_on_user = dict(how='left', left_on='id', right_on='user_id')
with_devices = pd.merge(df_users, user_devices, **left_join_on_user)
all_users = pd.merge(with_devices, user_session, **left_join_on_user)

In [19]:
# Check dimensions after the left joins; the row count should be unchanged.
all_users.shape

(275547, 316)

In [20]:
# Partition users by whether gender is known: rows with a gender train the
# imputation model, rows without one are the rows to predict.
gender_missing = all_users['gender'].isnull()
X_gender_users = all_users[~gender_missing]
X_pred_gender_users = all_users[gender_missing]

In [21]:
# Training labels for the gender model: the known gender values.
y_gender_users = X_gender_users['gender']

In [22]:
# Keep the user ids of both partitions before the id column is dropped.
id_X = X_gender_users['id']
id_X_pred = X_pred_gender_users['id']

In [23]:
# Remove the label and the identifier/join-key columns from both feature matrices.
non_feature_cols = ['gender', 'user_id_x', 'user_id_y', 'id']
X_gender_users = X_gender_users.drop(non_feature_cols, axis=1)
X_pred_gender_users = X_pred_gender_users.drop(non_feature_cols, axis=1)

In [24]:
# Replace remaining missing feature values with 0 in both matrices
# (users without a match in the left-joined tables have NaN there).
X_gender_users, X_pred_gender_users = [
    frame.fillna(0) for frame in (X_gender_users, X_pred_gender_users)
]

In [25]:
# Encode the gender labels as integers for the classifier; `le` is kept so the
# predictions can be mapped back to the original labels with inverse_transform.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le_y_gender_users = le.fit_transform(y_gender_users)

In [26]:
# Both matrices must have the same number of feature columns.
(
    X_gender_users.shape,
    X_pred_gender_users.shape,
)

((146067, 312), (129480, 312))

In [27]:
# Number of training rows per encoded gender class.
np.bincount(le_y_gender_users)

array([77524, 68209,   334])

In [28]:
# Fit a 5-nearest-neighbours classifier on the users whose gender is known.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5).fit(X_gender_users, le_y_gender_users)
neigh

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')

In [29]:
# Predict an encoded gender class for every user whose gender is missing.
pred_gender = neigh.predict(X_pred_gender_users)

In [30]:
# Number of predictions per encoded gender class.
np.bincount(pred_gender)

array([78235, 51245])

In [31]:
# Map the encoded predictions back to their original gender labels.
# NOTE: this adds a non-feature column to X_pred_gender_users, so the predict
# cell must not be re-run on this frame without rebuilding it first.
predicted_labels = le.inverse_transform(pred_gender)
X_pred_gender_users['predicted_gender'] = predicted_labels

In [32]:
# Single-column frame of predicted genders, keyed by the rows' original index.
impute_gender = X_pred_gender_users[['predicted_gender']]

In [33]:
# Preview the first few imputed gender values.
impute_gender.head()

Unnamed: 0,predicted_gender
0,FEMALE
4,MALE
5,FEMALE
9,FEMALE
12,FEMALE


In [34]:
# Attach the predictions to the full user table by row index; users whose
# gender was already known get NaN in predicted_gender.
new_all_users = pd.merge(
    all_users,
    impute_gender,
    how='left',
    left_index=True,
    right_index=True,
)

In [35]:
# The merge should add exactly one column and no rows.
(
    new_all_users.shape,
    all_users.shape,
)

((275547, 317), (275547, 316))

In [36]:
# Preview the merged table, including the new predicted_gender column.
new_all_users.head()

Unnamed: 0,id,signup_flow,affiliate_channel_api,affiliate_channel_content,affiliate_channel_direct,affiliate_channel_other,affiliate_channel_remarketing,affiliate_channel_sem-brand,affiliate_channel_sem-non-brand,affiliate_channel_seo,...,view_search_results,view_security_checks,view_user_real_names,wishlist,wishlist_content_update,wishlist_note,your_listings,your_reservations,your_trips,predicted_gender
0,gxn3p5htnn,0,0,0,1,0,0,0,0,0,...,,,,,,,,,,FEMALE
1,820tgsjxq7,0,0,0,0,0,0,0,0,1,...,,,,,,,,,,
2,4ft3gnwmtx,3,0,0,1,0,0,0,0,0,...,,,,,,,,,,
3,bjjt8pjhuk,0,0,0,1,0,0,0,0,0,...,,,,,,,,,,
4,87mebub9p4,0,0,0,1,0,0,0,0,0,...,,,,,,,,,,MALE


In [37]:
# Fill only the missing gender values with the model's predictions.
needs_gender = new_all_users['gender'].isnull()
new_all_users.loc[needs_gender, 'gender'] = new_all_users.loc[needs_gender, 'predicted_gender']

In [38]:
# Drop the duplicated join keys and the now-merged prediction column.
helper_cols = ['user_id_x', 'user_id_y', 'predicted_gender']
new_all_users = new_all_users.drop(helper_cols, axis=1)

In [39]:
# Check dimensions after dropping the helper columns.
new_all_users.shape

(275547, 314)

In [40]:
# One-hot encode the (now fully populated) gender column. The id is moved to
# the index first so it is not treated as a categorical column to encode.
new_all_users = new_all_users.set_index(keys='id')
df_all_users = pd.get_dummies(data=new_all_users)

In [41]:
# Two-column (id, country_destination) frame holding the prediction target.
destination = t_users[['country_destination']].reset_index()

In [42]:
# Preview the id -> country_destination table.
destination.head()

Unnamed: 0,id,country_destination
0,gxn3p5htnn,NDF
1,820tgsjxq7,NDF
2,4ft3gnwmtx,US
3,bjjt8pjhuk,other
4,87mebub9p4,US


In [43]:
# Replace any remaining missing feature values with 0.
df_all_users = df_all_users.fillna(value=0)

In [44]:
# Preview the final feature table, including the gender_* indicator columns.
df_all_users.head()

Unnamed: 0_level_0,signup_flow,affiliate_channel_api,affiliate_channel_content,affiliate_channel_direct,affiliate_channel_other,affiliate_channel_remarketing,affiliate_channel_sem-brand,affiliate_channel_sem-non-brand,affiliate_channel_seo,affiliate_provider_baidu,...,view_user_real_names,wishlist,wishlist_content_update,wishlist_note,your_listings,your_reservations,your_trips,gender_FEMALE,gender_MALE,gender_OTHER
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gxn3p5htnn,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
820tgsjxq7,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
4ft3gnwmtx,3,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
bjjt8pjhuk,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
87mebub9p4,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0


In [45]:
# Bring id back as a column and attach the target to each user's features.
features_with_id = df_all_users.reset_index()
df_all_users = pd.merge(features_with_id, destination, on='id', how='inner')

In [46]:
# Check dimensions after attaching the target column.
df_all_users.shape

(275547, 317)

In [47]:
# Split back into train and test: rows without a country_destination are
# treated as test users, all remaining rows as training users.
destination_missing = df_all_users['country_destination'].isnull()
df_test = df_all_users[destination_missing]
df_train = df_all_users[~destination_missing]

In [48]:
# Row counts should match the sizes of the original test and train tables.
(
    df_test.shape,
    df_train.shape,
)

((62096, 317), (213451, 317))

In [50]:
# Export the feature-ready datasets (without the row index) for modelling.
for frame, out_path in (
    (df_train, './data/train_features_ready.csv'),
    (df_test, './data/test_features_ready.csv'),
):
    frame.to_csv(out_path, index=False)