In [81]:
#!pip install dask-ml[xgboost]    # also install xgboost and dask-xgboost
#!pip install dask-ml[tensorflow]
#!pip install dask-ml[complete]   # install all optional dependencies

**ip:** ip address of click  
**app:** app id for marketing   
**device:** device type id of user mobile phone (e.g., iphone 6 plus, iphone 7, huawei mate 7, etc.)  
**os:** os version id of user mobile phone  
**channel:** channel id of mobile ad publisher  
**click_time:** timestamp of click (UTC)  
**attributed_time:** if user download the app for after clicking an ad, this is the time of the app download  
**is_attributed:** the target that is to be predicted, indicating the app was downloaded  

In [1]:
import pandas as pd
import time
import numpy as np
import pyarrow as pa
import dask.dataframe as dd
from dask_ml import xgboost#,tensorflow
from datetime import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
!ls data

In [9]:
path = 'data/'
start_time = time.time()

dtype={"ip": "category", 
       "app": "category",
       "device" : "category",
       "os":"category",
       "channel":"category",
       "click_time":"str",
       "attributed_time":"str",
       "is_attributed":"category"
      }

parse_dates = ['click_time', 'attributed_time']
train_data = dd.read_csv(path+"train.csv",dtype=dtype,parse_dates=parse_dates) #184.903.891
train_data.columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time', 'is_attributed']


test = dd.read_csv(path+"test.csv")
print('[{}] Finished to load data'.format(time.time() - start_time))

[0.08521819114685059] Finished to load data


In [10]:
train_data.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,NaT,0
1,17357,3,1,19,379,2017-11-06 14:33:34,NaT,0
2,35810,3,1,13,379,2017-11-06 14:34:12,NaT,0
3,45745,14,1,13,478,2017-11-06 14:34:52,NaT,0
4,161007,3,1,13,379,2017-11-06 14:35:08,NaT,0


In [87]:
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [91]:
delayed_by_ip = train.groupby(['ip','is_attributed']).count().compute()

In [93]:
delayed_by_ip

Unnamed: 0_level_0,Unnamed: 1_level_0,app,device,os,channel,click_time,attributed_time
ip,is_attributed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100002,0,2401.0,2401.0,2401.0,2401.0,2401.0,0.0
100002,1,3.0,3.0,3.0,3.0,3.0,3.0
100009,0,8041.0,8041.0,8041.0,8041.0,8041.0,0.0
100009,1,13.0,13.0,13.0,13.0,13.0,13.0
100013,0,8693.0,8693.0,8693.0,8693.0,8693.0,0.0
100013,1,5.0,5.0,5.0,5.0,5.0,5.0
100020,0,1227.0,1227.0,1227.0,1227.0,1227.0,0.0
100020,1,2.0,2.0,2.0,2.0,2.0,2.0
100032,0,1564.0,1564.0,1564.0,1564.0,1564.0,0.0
100032,1,2.0,2.0,2.0,2.0,2.0,2.0


In [None]:
# Define some aggregations to plot
aggregations = (train.groupby(['ip','is_attributed']).count()
#                 train.groupby('channel').is_attributed.count(),
#                 train.groupby('is_attributed').device.count(),
#                 train.groupby('is_attributed').os.count(),
#                 train.groupby('is_attributed').count()
               )

# Compute them all in a single pass over the data
(delayed_by_ip          
# ,delayed_by_channel,
# delayed_by_device,
# delayed_by_os,
# delayed_by_attributed
) = dd.compute(*aggregations)

In [71]:
from bokeh.plotting import figure, output_file, show

delayed_by_channel

is_attributed
0    184447044
1       456846
Name: channel, dtype: int64

In [18]:
ip_test = set(test.ip)
print('Number of distinct IPs in train: ',len(ip_train))
print('Number of distinct IPs in test: ',len(ip_test))
print('% IPs in test that are in train as well: ',round(len([i for i in ip_test if i in ip_train])*100/len(ip_test),2))
print('% IPs in train that are in test as well: ',round(len([i for i in ip_train if i in ip_test])*100/len(ip_train),2))


<bound method _Frame.all of Dask Series Structure:
npartitions=118
    int64
      ...
    ...  
      ...
      ...
Name: ip, dtype: int64
Dask Name: getitem, 590 tasks>

In [8]:
#--- memory consumed by train dataframe ---
mem = train.memory_usage(index=True).sum()
print("Memory consumed by training set  :   {} MB" .format(mem/ 1024**2))
 
#--- memory consumed by test dataframe ---
mem = test.memory_usage(index=True).sum()
print("Memory consumed by test set      :   {} MB" .format(mem/ 1024**2))

Memory consumed by training set  :   dd.Scalar<truediv..., dtype=float64> MB
Memory consumed by test set      :   dd.Scalar<truediv..., dtype=float64> MB


In [5]:
def change_datatype(df):
    float_cols = list(df.select_dtypes(include=['int']).columns)
    for col in float_cols:
        if ((np.max(df[col]) <= 127) and(np.min(df[col] >= -128))):
            df[col] = df[col].astype(np.int8)
        elif ((np.max(df[col]) <= 32767) and(np.min(df[col] >= -32768))):
            df[col] = df[col].astype(np.int16)
        elif ((np.max(df[col]) <= 2147483647) and(np.min(df[col] >= -2147483648))):
            df[col] = df[col].astype(np.int32)
        else:
            df[col] = df[col].astype(np.int64)
    

change_datatype(train)
change_datatype(test)

#--- Converting columns from 'float64' to 'float32' ---
def change_datatype_float(df):
    float_cols = list(df.select_dtypes(include=['float']).columns)
    for col in float_cols:
        df[col] = df[col].astype(np.float32)
        
change_datatype_float(train)
change_datatype_float(test)


In [6]:
#--- memory consumed by train dataframe ---
mem = train.memory_usage(index=True).sum()
print("Memory consumed by training set  :   {} MB" .format(mem/ 1024**2))
 
#--- memory consumed by test dataframe ---
mem = test.memory_usage(index=True).sum()
print("Memory consumed by test set      :   {} MB" .format(mem/ 1024**2))

Memory consumed by training set  :   3595.352249145508 MB
Memory consumed by test set      :   430.0797805786133 MB


In [None]:
train.head()

In [None]:
test.head()

In [9]:
def dataPreProcessTime(df):
    df['click_time'] = pd.to_datetime(df['click_time']).dt.date
    df['click_time'] = df['click_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    return df

train = dataPreProcessTime(train)
test = dataPreProcessTime(test)


In [4]:
#label = train['is_attributed']
#train = train.drop(['is_attributed', 'attributed_time'], axis=1)

sub = pd.DataFrame()
sub = test['click_id']
test = test.drop('click_id', axis=1)

In [5]:
#from sklearn.datasets import dump_svmlight_file
#from sklearn.metrics import precision_score
import xgboost as xgb

#from sklearn.model_selection import StratifiedKFold
#from sklearn.model_selection import GridSearchCV

 
params = {'eta': 0.02, 
          'max_depth': 4, 
          'subsample': 0.9, 
          'colsample_bytree': 0.9, 
          'colsample_bylevel':0.9,
          'min_child_weight':100,
          'alpha':4,
          'objective': 'binary:logistic', 
          'eval_metric': 'auc', 
          'random_state': 99, 
          'silent': True}
        

In [11]:
from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.3, random_state=42)
from dask_ml.xgboost import train

X_train, X_test = train_data.random_split([0.8, 0.2])

y_train = X_train['is_attributed']
y_test = X_test['is_attributed']

del X_train['is_attributed']
del X_test['is_attributed']


In [10]:
#!pip install git+https://github.com/dask/distributed.git --upgrade

!cat /etc/hosts

127.0.0.1	localhost
::1	localhost ip6-localhost ip6-loopback
fe00::0	ip6-localnet
ff00::0	ip6-mcastprefix
ff02::1	ip6-allnodes
ff02::2	ip6-allrouters
172.17.0.2	ad9f51a0ecb4


In [11]:
from dask.distributed import Client, LocalCluster
#cluster = LocalCluster(n_workers=1)
client = Client('172.17.0.2:8786')

distributed.utils - ERROR - Timed out trying to connect to 'tcp://172.17.0.2:8786' after 5 s: in <distributed.comm.tcp.TCPConnector object at 0x7f73b7108c88>: ConnectionRefusedError: [Errno 111] Connection refused
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/distributed/comm/core.py", line 185, in connect
    quiet_exceptions=EnvironmentError)
  File "/opt/conda/lib/python3.6/site-packages/tornado/gen.py", line 1055, in run
    value = future.result()
  File "/opt/conda/lib/python3.6/site-packages/tornado/concurrent.py", line 238, in result
    raise_exc_info(self._exc_info)
  File "<string>", line 4, in raise_exc_info
tornado.gen.TimeoutError: Timeout

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/distributed/utils.py", line 229, in f
    result[0] = yield make_coro()
  File "/opt/conda/lib/python3.6/site-packages/tornado/gen.py", line 1055, in 

OSError: Timed out trying to connect to 'tcp://172.17.0.2:8786' after 5 s: in <distributed.comm.tcp.TCPConnector object at 0x7f73b7108c88>: ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
train(client=client,params=params, data=X_train, labels=y_train)  

In [None]:
print('[{}] Start XGBoost Training'.format(time.time() - start_time))

d_train = xgb.DMatrix(X_train, y_train)
d_test  = xgb.DMatrix(X_test, y_test)


#watchlist = [(d_train, 'train'), (d_test, 'valid')]

#model = xgboost.train(params, d_train, 250, watchlist, maximize=True, verbose_eval=10)
est.fit(train, train_labels)

print('[{}] Finish XGBoost Training'.format(time.time() - start_time))


[130.0817587375641] Start XGBoost Training


In [14]:
#dtest = xgb.DMatrix(X_test, label=y_test)

sub['is_attributed'] = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)
sub.to_csv('xgb_sub.csv',index=False)

In [None]:
t = model.get_fscore

In [None]:
gc.collect()

In [None]:
################## TEST 

In [None]:
#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {#'nthread':[8], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              #'scoring':['roc_auc'],
              #'eval_metric': ['auc'],
              'learning_rate': np.arange(0.02,0.05), #so called `eta` value
              'max_depth': np.arange(3,6),
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.9],
              'colsample_bytree': [0.9],
              'n_estimators': [5], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}

# params = {'eta': 0.02, 
#           'max_depth': 4, 
#           'subsample': 0.9, 
#           'colsample_bytree': 0.7, 
#           'colsample_bylevel':0.7,
#           'min_child_weight':100,
#           'alpha':4,
#           'objective': 'binary:logistic', 
#           'eval_metric': 'auc', 
#           'random_state': 99, 
#           'silent': True}

kfold = StratifiedKFold(n_splits=5, shuffle=False, random_state=42)

clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=kfold, 
                   scoring='roc_auc',
                   verbose=2, refit=True)

clf.fit(train, label)

#trust your CV!
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

test_probs = clf.predict_proba(test)[:,1]
test_probs