In [1]:
from rocketml.ensemble import GradientBoostingClassifier
from rocketml.io import LIBSVM,CSV
from time import time
import bokeh
import subprocess
import os
from glob import glob
from sklearn.metrics import f1_score, confusion_matrix,roc_auc_score
from sklearn.datasets import load_svmlight_file
import numpy as np

## Connect to the Dask scheduler for data processing

In [2]:
from dask.distributed import Client

# Resolve this machine's hostname as text. universal_newlines=True makes
# check_output return str instead of bytes, so the concatenation below also
# works on Python 3 (where bytes + str raises TypeError).
hostname = subprocess.check_output('hostname', universal_newlines=True).strip()

# Attach to the Dask scheduler on port 8786 of this host; the trailing bare
# expression displays the client/cluster summary.
client = Client(hostname+":8786")
client

0,1
Client  Scheduler: tcp://ip-172-31-17-80:8786  Dashboard: http://ip-172-31-17-80:8787/status,Cluster  Workers: 4  Cores: 64  Memory: 130.65 GB


## Read Data File and Inspect its contents

In [36]:
# Load the CSV into a Dask DataFrame. header=None tells the reader the file has
# no header row, so the columns end up labelled with integers.
# NOTE(review): the outputs recorded further down (29 integer-labelled numeric
# columns, 11,000,000 rows, a 0/1 first column) do not look like they came from
# this taxi-trip file, and the execution counts jump from 2 to 36. The path was
# probably edited after those outputs were produced -- confirm which file the
# later cells actually expect.
import dask.dataframe as dd
filename = "s3://nyc-tlc/trip+data/yellow_tripdata_2017-01.csv"
df = dd.read_csv(filename,header=None)

## Check data statistics

In [37]:
# Show the column labels and the first few rows of the frame.
print(df.columns)
print(df.head())
# describe() only builds the summary-statistics task graph; the compute() call
# on the next line is what actually runs it on the cluster.
stats = df.describe()
print(stats.compute())

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28],
           dtype='int64')
    0         1         2         3         4         5         6         7   \
0  1.0  0.869293 -0.635082  0.225690  0.327470 -0.689993  0.754202 -0.248573   
1  1.0  0.907542  0.329147  0.359412  1.497970 -0.313010  1.095531 -0.557525   
2  1.0  0.798835  1.470639 -1.635975  0.453773  0.425629  1.104875  1.282322   
3  0.0  1.344385 -0.876626  0.935913  1.992050  0.882454  1.786066 -1.646778   
4  1.0  1.105009  0.321356  1.522401  0.882808 -1.205349  0.681466 -1.070464   

         8         9     ...           19        20        21        22  \
0 -1.092064  0.000000    ...    -0.010455 -0.045767  3.101961  1.353760   
1 -1.588230  2.173076    ...    -1.138930 -0.000819  0.000000  0.302220   
2  1.381664  0.000000    ...     1.128848  0.900461  0.000000  0.909753   
3 -0.942383  0.000000    ...    -0.678379 -1.360356 

In [42]:
# Count the non-null entries of column 0 and use that as the dataset's row count
# (equals the true row count only if column 0 has no missing values).
rows = df[0].count().compute()
print(rows)

11000000


## Create test and train data for Gradient Boosted Trees

In [120]:
# Fixed seed so the 95/5 train/test partition is reproducible across kernel
# restarts; without random_state every fresh run draws a different split.
SPLIT_SEED = 42
df_train,df_test = df.random_split([0.95,0.05],random_state=SPLIT_SEED)

In [121]:
# Preview a few held-out rows. head() pulls only the requested rows into the
# notebook, whereas calling .compute() on both splits materialises every row in
# client memory just to display it (and the training result was discarded).
df_test.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
29,0.0,1.250867,-0.750010,1.090164,0.461849,-0.380657,0.677161,0.339623,-0.711208,0.000000,...,-0.907405,0.278959,3.101961,0.848267,0.842342,1.021281,0.665904,0.410987,0.606533,0.638353
104,0.0,1.177847,-1.467825,-0.930746,1.112738,0.176547,0.710414,-1.096210,0.586136,2.173076,...,-0.379395,-1.623385,3.101961,0.736332,0.788293,1.333251,0.960399,0.921212,0.798359,0.688752
120,1.0,0.817685,-0.867860,0.518657,0.705481,0.775555,1.202070,0.950595,-1.057693,2.173076,...,0.288529,0.974819,0.000000,0.720033,0.694720,0.993414,1.427427,0.932708,0.971303,0.842326
129,1.0,1.215180,0.744058,0.536967,0.533705,-1.341770,1.030856,0.398047,-1.498975,2.173076,...,-2.331533,-0.709444,0.000000,0.617484,1.543184,1.107504,0.994369,0.706062,1.187867,1.067938
148,0.0,1.974485,-0.520153,-0.273788,1.046293,-0.317087,1.239995,0.975351,1.140511,1.086538,...,0.333501,-1.273235,3.101961,0.970771,1.056947,1.002480,0.898361,0.881328,1.491169,1.227005
151,1.0,1.031805,-0.977919,0.677902,0.603602,-1.026682,0.742843,-1.006099,-0.793810,1.086538,...,1.251273,-0.015247,0.000000,0.570243,1.008914,1.092595,0.820011,1.297997,0.991746,0.883162
154,0.0,1.212435,0.265839,0.342211,0.967790,0.933620,0.797532,-0.126775,-1.372023,2.173076,...,-0.763327,1.364367,0.000000,0.984775,0.905101,0.992753,0.854621,0.506261,0.796949,0.703231
162,0.0,0.871855,0.716787,1.067415,0.424898,-1.152868,0.856344,-1.056601,0.203617,0.000000,...,0.193587,0.694034,0.000000,0.906827,0.986163,0.986507,1.742203,1.465180,1.548729,1.311926
222,0.0,2.057571,0.190844,1.474128,0.653487,-0.063609,0.813930,1.759613,-0.064804,0.000000,...,1.335388,0.524230,3.101961,0.877582,0.838931,1.578759,0.960481,0.453235,0.625721,0.858794
232,1.0,0.586361,-1.273031,1.393673,0.039736,0.717229,0.991557,-0.297094,0.132102,2.173076,...,0.129460,0.964276,3.101961,0.857004,0.843783,0.906702,0.845508,0.868141,0.890055,0.756254


In [122]:
# Write the test split as header-less, index-less CSV part files under
# /shared/data/test_data/. header=False is the documented way to suppress the
# header row (None only worked because it happens to be falsy).
df_test.to_csv("/shared/data/test_data/",header=False,index=False)

['/shared/data/test_data/000.part',
 '/shared/data/test_data/001.part',
 '/shared/data/test_data/002.part',
 '/shared/data/test_data/003.part',
 '/shared/data/test_data/004.part',
 '/shared/data/test_data/005.part',
 '/shared/data/test_data/006.part',
 '/shared/data/test_data/007.part',
 '/shared/data/test_data/008.part',
 '/shared/data/test_data/009.part',
 '/shared/data/test_data/010.part',
 '/shared/data/test_data/011.part',
 '/shared/data/test_data/012.part',
 '/shared/data/test_data/013.part',
 '/shared/data/test_data/014.part',
 '/shared/data/test_data/015.part',
 '/shared/data/test_data/016.part',
 '/shared/data/test_data/017.part',
 '/shared/data/test_data/018.part',
 '/shared/data/test_data/019.part',
 '/shared/data/test_data/020.part',
 '/shared/data/test_data/021.part',
 '/shared/data/test_data/022.part',
 '/shared/data/test_data/023.part',
 '/shared/data/test_data/024.part',
 '/shared/data/test_data/025.part',
 '/shared/data/test_data/026.part',
 '/shared/data/test_data/027

In [123]:
# Stitch the per-partition part files into a single CSV.
# sorted(): glob() returns names in arbitrary order; the zero-padded part names
# sort back into partition order, which makes test.csv deterministic.
test_filenames = sorted(glob('/shared/data/test_data/*.part'))
with open('/shared/data/test.csv','w') as fp:
    for fn in test_filenames:
        with open(fn) as f:
            # Stream line by line instead of f.read() so a whole part file is
            # never held in memory at once.
            fp.writelines(f)

In [124]:
# Write the training split as header-less, index-less CSV part files under
# /shared/data/train_data/. header=False is the documented way to suppress the
# header row (None only worked because it happens to be falsy).
df_train.to_csv('/shared/data/train_data/',header=False,index=False)

['/shared/data/train_data/000.part',
 '/shared/data/train_data/001.part',
 '/shared/data/train_data/002.part',
 '/shared/data/train_data/003.part',
 '/shared/data/train_data/004.part',
 '/shared/data/train_data/005.part',
 '/shared/data/train_data/006.part',
 '/shared/data/train_data/007.part',
 '/shared/data/train_data/008.part',
 '/shared/data/train_data/009.part',
 '/shared/data/train_data/010.part',
 '/shared/data/train_data/011.part',
 '/shared/data/train_data/012.part',
 '/shared/data/train_data/013.part',
 '/shared/data/train_data/014.part',
 '/shared/data/train_data/015.part',
 '/shared/data/train_data/016.part',
 '/shared/data/train_data/017.part',
 '/shared/data/train_data/018.part',
 '/shared/data/train_data/019.part',
 '/shared/data/train_data/020.part',
 '/shared/data/train_data/021.part',
 '/shared/data/train_data/022.part',
 '/shared/data/train_data/023.part',
 '/shared/data/train_data/024.part',
 '/shared/data/train_data/025.part',
 '/shared/data/train_data/026.part',
 

In [125]:
# Stitch the per-partition part files into a single CSV.
# sorted(): glob() returns names in arbitrary order; the zero-padded part names
# sort back into partition order, which makes train.csv deterministic.
train_filenames = sorted(glob('/shared/data/train_data/*.part'))
with open('/shared/data/train.csv','w') as fp:
    for fn in train_filenames:
        with open(fn) as f:
            # Stream line by line instead of f.read() so a whole part file
            # (the training split is large) is never held in memory at once.
            fp.writelines(f)

In [None]:
# Train a gradient-boosted tree classifier on the LIBSVM training file, then
# score it against the validation file with F1 and a confusion matrix.
# NOTE(review): this cell trains on the avazu-app files, not on the
# /shared/data/train.csv and test.csv written by the cells above -- confirm
# that is intended.

# --- Training (the timer covers opening the data file and fitting) ---
start = time()
data_file = LIBSVM(filename='/home/ubuntu/rocketml-dev/data/avazu-app.tr')
gbdt_classifier = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=255,
    min_child_samples=0,
    min_sum_hessian_in_leaf=100,
    tree_learner="voting",
)
gbdt_classifier.fit(data_file)
print("Time to build a model = %0.2f" % (time() - start))

# --- Prediction on the validation file ---
test_file_name = '/home/ubuntu/rocketml-dev/data/avazu-app.val'
test_file = LIBSVM(filename=test_file_name)
y_predict = gbdt_classifier.predict(test_file)

# --- Evaluation: the labels are re-read from the same file via scikit-learn ---
_, y_true = load_svmlight_file(test_file_name)
f1 = f1_score(y_true, y_predict)
cm = confusion_matrix(y_true, y_predict)
print("F1 score = %0.2f" % f1)
print("Confusion matrix")
print(cm)

------------------
Cluster Resources
------------------
Nodes        = 4
Sockets/node = 1
Cores/socket = 8
--------------------------
Total Cores    = 32
Total Memory(GB)   = 121.68
--------------------------


In [3]:
# Display the fitted model's feature-importance array (presumably one entry per
# input feature -- confirm against the rocketml documentation).
gbdt_classifier.feature_importance()

array([0, 0, 0, ..., 0, 0, 0])

In [16]:
# Score the validation set again. Reuse test_file_name (the path y_true was
# loaded from in the training cell) instead of repeating the literal, so the
# predictions and the labels cannot drift onto different files.
test_file = LIBSVM(filename=test_file_name)
y_predict = gbdt_classifier.predict(test_file)

------------------
Cluster Resources
------------------
Nodes        = 4
Sockets/node = 1
Cores/socket = 8
--------------------------
Total Cores    = 32
Total Memory(GB)   = 121.68
--------------------------


In [18]:
# Inspect the ground-truth labels loaded from the validation file.
y_true

array([1., 1., 0., ..., 0., 0., 1.])

In [15]:
# Sanity check: positions of any labels below zero (an empty result means none).
np.argwhere(y_true < 0.)

array([], shape=(0, 1), dtype=int64)

In [20]:
# Confusion matrix from scikit-learn: rows are true classes, columns are
# predicted classes.
print(cm)

[[1697067    3895]
 [ 246726    6263]]


In [22]:
# Inspect the predicted labels for the validation file.
y_predict

array([0, 0, 0, ..., 0, 0, 0])