In [59]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb

%matplotlib inline 

In [2]:
! ls ./data

sample_submission.csv test.csv              train_sample.csv


In [3]:
dtypes = {
    'ip': 'uint16',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'int8'
}

In [4]:
data = pd.read_csv('./data/train_sample.csv'
                   , low_memory=False
                   , parse_dates=['click_time', 'attributed_time']
                   , dtype=dtypes)



test = pd.read_csv('./data/test.csv'
                   , low_memory=False
                   , parse_dates=['click_time']
                   , dtype=dtypes)

In [5]:
data.shape

(100000, 8)

In [6]:
data.dtypes

ip                         uint16
app                        uint16
device                     uint16
os                         uint16
channel                    uint16
click_time         datetime64[ns]
attributed_time    datetime64[ns]
is_attributed                int8
dtype: object

#### Simple EDA

In [7]:
# is the data balanced?
pos_ratio = np.sum(data['is_attributed']) * 1.0 / data.shape[0]
pos_ratio

0.00227

It might seem data is highly unbalanced since there are only 0.23 % of samples appeared to download the APP. However, low success rate is highly possible in click rate and fraud detection case in the real world.

In [8]:
def null_ratio(df): return round(np.sum(df.isna() * 1.0 / df.shape[0]), 3)

null_ratio(data)

ip                 0.000
app                0.000
device             0.000
os                 0.000
channel            0.000
click_time         0.000
attributed_time    0.998
is_attributed      0.000
dtype: float64

It seems attributed_time is the only one contains null.

In [9]:
data.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time',
       'is_attributed'],
      dtype='object')

In [10]:
def unique_cat(df): return [(x, df[x].nunique(dropna=False)) for x in df.columns]

unique_cat(data)

[('ip', 28470),
 ('app', 161),
 ('device', 100),
 ('os', 130),
 ('channel', 161),
 ('click_time', 80350),
 ('attributed_time', 228),
 ('is_attributed', 2)]

It is natural to drop ip since it states that every three click is from the same ip. And a different ip address will be assigned to a device once the device is logged in.

### Define data set

In [83]:
X = data[['app','device', 'os', 'channel']]
y = data['is_attributed']
test_X = test[['app','device', 'os', 'channel']]
sample = pd.read_csv('./data/sample_submission.csv')  # load sample submission 

### Baseline Model - Logistic Regression

In [None]:
lr = LogisticRegression() # define logstic regression 
lr.fit(X,y)               # train lr

y_pred = lr.predict_proba(test_X)[:,1] # get soft predicitons

In [None]:
sample['is_attributed'] = y_pred                      # insert SOFT prediction as target variable
sample.to_csv('sub_0.csv', index=False)               # save prediction to csv to submit 

After submitting to kaggle, the area under the ROC curve is 0.6084. We can improve this using feature engineering and other techniques. But first, lets see if rf using the same train data will lead to a better score.

### Baseline Model - Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators = 100, min_samples_leaf=50) # define Random Forest with 100 trees and 50 or more samples on each leaf node 
rf.fit(X,y)                                                          # train rf 

In [None]:
y_pred_rf = rf.predict_proba(test_X)[:,1]

In [None]:
sample['is_attributed'] = y_pred_rf
sample.to_csv('sub_1_rf.csv', index=False)

### Baseline Model - XGBoost

https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [30]:
xgb_model = xgb.XGBClassifier(
                learning_rate =0.1,
                n_estimators=500,
                max_depth=5,
                min_child_weight=1,
                gamma=0,
                subsample=0.8,
                colsample_bytree=0.8,
                objective= 'binary:logistic',
                nthread=4,
                scale_pos_weight=1,
                seed=228)

In [31]:
xgb_model.fit(X,y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=228,
       silent=True, subsample=0.8)

In [71]:
batch_size = 100000
N_batch = int(test_X.shape[0]/batch_size)
i = N_batch

In [72]:
batch_X = test_X.iloc[i*batch_size : (i+1)*batch_size] 

In [73]:
len(batch_X)

90469

In [79]:
def batch_pred(model, X, batch_size=100000):
    
    N_batch = int(X.shape[0]/batch_size)
    pred = []
    
    for i in range(N_batch+1):
        if i%10 == 0:
            print("finished batch %d/%d" %(i+1, N_batch+1))
        
        batch_X = X.iloc[i*batch_size : (i+1)*batch_size] 
        batch_pred = model.predict_proba(batch_X)[:,1]
        pred.extend(list(batch_pred))
    
    return pred

In [80]:
pred_xgb = batch_pred(xgb_model, test_X)

finished batch 1/188
finished batch 11/188
finished batch 21/188
finished batch 31/188
finished batch 41/188
finished batch 51/188
finished batch 61/188
finished batch 71/188
finished batch 81/188
finished batch 91/188
finished batch 101/188
finished batch 111/188
finished batch 121/188
finished batch 131/188
finished batch 141/188
finished batch 151/188
finished batch 161/188
finished batch 171/188
finished batch 181/188


In [91]:
sample['is_attributed'] = pred

In [92]:
sample.to_csv('./output/sub_2_xgb.csv', index=False)