# RocketML tutorial on binary classification using gradient boosted trees

## Import Libraries

In [19]:
from rocketml.ensemble import GradientBoostingClassifier
from rocketml.io import LIBSVM,CSV
from time import time
import subprocess
import os
from sklearn.metrics import f1_score, confusion_matrix,roc_auc_score
from sklearn.datasets import load_svmlight_file
import numpy as np

## Connect to the Dask cluster for data processing

In [20]:
# Ask the OS for this machine's hostname. universal_newlines=True makes
# check_output return text instead of bytes on Python 3 (Python 2 already
# returns str), so the concatenation with ":8786" below works on both.
hostname = subprocess.check_output('hostname', universal_newlines=True).strip()
from dask.distributed import Client
# Connect to the scheduler reachable on this host at port 8786.
client = Client(hostname + ":8786")
# Bare last expression so the notebook renders the client summary.
client

0,1
Client  Scheduler: tcp://ip-172-31-24-238:8786  Dashboard: http://ip-172-31-24-238:8787/status,Cluster  Workers: 4  Cores: 64  Memory: 130.65 GB


## Read Data File and Inspect its contents

In [21]:
import dask.dataframe as dd

# Location of the SUSY CSV on this machine.
filename = "/home/ubuntu/rocketml-dev/data/SUSY.csv"
# header=None: the first line is read as data, so columns get integer labels.
df = dd.read_csv(filename, header=None)

## Check data statistics

In [22]:
# Column labels, then the first few rows, as a quick look at the data.
print(df.columns)
print(df.head())
# Build the summary statistics, evaluate them with compute(), then print.
stats = df.describe()
stats_computed = stats.compute()
print(stats_computed)

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], dtype='int64')
    0         1         2         3         4         5         6         7   \
0  0.0  0.972861  0.653855  1.176225  1.157156 -1.739873 -0.874309  0.567765   
1  1.0  1.667973  0.064191 -1.225171  0.506102 -0.338939  1.672543  3.475464   
2  1.0  0.444840 -0.134298 -0.709972  0.451719 -1.613871 -0.768661  1.219918   
3  1.0  0.381256 -0.976145  0.693152  0.448959  0.891753 -0.677328  2.033060   
4  1.0  1.309996 -0.690089 -0.676259  1.589283 -0.693326  0.622907  1.087562   

         8         9         10        11        12        13        14  \
0 -0.175000  0.810061 -0.252552  1.921887  0.889637  0.410772  1.145621   
1 -1.219136  0.012955  3.775174  1.045977  0.568051  0.481928  0.000000   
2  0.504026  1.831248 -0.431385  0.526283  0.941514  1.587535  2.024308   
3  1.533041  3.046260 -1.005285  0.569386  1.015211  1.582217  1.551914   
4 -0.381742  0.589204  1.365479  1.179295  0.96821

## Create test and train data for Gradient Boosted Trees

In [7]:
# Hold out 5% of the rows for testing. A fixed random_state makes the split
# repeatable, so restarting the kernel and re-running the notebook yields the
# same train/test rows instead of a fresh random split every time.
SPLIT_SEED = 42
df_train, df_test = df.random_split([0.95, 0.05], random_state=SPLIT_SEED)

## Run Compute() on the dataframes 

In [8]:
# Only the test labels (column 0) are needed locally, for scoring later on.
# Do not call compute() on the full splits here: that would pull both
# dataframes to the client only to throw the results away. The splits stay
# lazy and are evaluated when they are written out with to_csv.
y_true = df_test[0].compute().values

## Write Train and Test Data to files

In [4]:
def mkdir_p(path):
    """Create directory ``path`` and any missing parents, like ``mkdir -p``.

    An already-existing directory is not an error. Any other failure from
    ``os.makedirs`` (including ``path`` existing as a non-directory) is
    re-raised as ``OSError``.
    """
    # Imported locally because the notebook's import cell does not provide
    # errno; without it the except branch fails with NameError instead of
    # tolerating an existing directory.
    import errno

    try:
        os.makedirs(path)
    except OSError as exc:
        # Swallow only "already exists", and only when it really is a directory.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise

In [11]:
# Directory that receives the test-split CSV output.
test_file_dir = "/shared/data/test_data/"
# mkdir_p(test_file_dir)  # directory creation is left disabled
# Write the test split without header or index column; the return value is
# kept because the next cell iterates over it as a list of file paths.
test_filenames = df_test.to_csv(test_file_dir, header=None, index=False)

In [12]:
# Stitch the per-partition test files, in order, into one CSV file.
with open('/shared/data/test.csv', 'w') as merged:
    for part_path in test_filenames:
        with open(part_path) as part:
            merged.write(part.read())

In [13]:
# Directory that receives the train-split CSV output.
train_file_dir = '/shared/data/train_data/'
# Write the train split without header or index column; the return value is
# kept because the next cell iterates over it as a list of file paths.
train_filenames = df_train.to_csv(train_file_dir, header=None, index=False)

In [15]:
# Stitch the per-partition train files, in order, into one CSV file.
with open('/shared/data/train.csv', 'w') as merged:
    for part_path in train_filenames:
        with open(part_path) as part:
            merged.write(part.read())

## Train using RocketML Gradient Boosting Classifier

In [26]:
# Time everything from wrapping the training file to the end of fit().
start = time()
data_file = CSV(filename='/shared/data/train.csv')
# Hyper-parameters gathered in one place so they are easy to read and tweak.
gbdt_params = dict(
    n_estimators=20,
    learning_rate=0.01,
    num_leaves=255,
    min_child_samples=0,
    min_sum_hessian_in_leaf=100,
)
gbdt_classifier = GradientBoostingClassifier(**gbdt_params)
gbdt_classifier.fit(data_file)
elapsed = time() - start
print("Time to build a model = %0.2f" % elapsed)

------------------
Cluster Resources
------------------
Nodes        = 4
Sockets/node = 1
Cores/socket = 8
--------------------------
Total Cores    = 32
Total Memory(GB)   = 121.68
--------------------------
Time to build a model = 7.25


## Predict and compute evaluation scores (F1 and ROC AUC)

In [27]:
# Merged test CSV produced by the concatenation cell above.
test_file_name = '/shared/data/test.csv'
# Wrap the path in RocketML's CSV reader, the same way the training file was.
test_file = CSV(filename=test_file_name)
# Predictions for the test file; compared against y_true in the next cell.
y_predict = gbdt_classifier.predict(test_file)

------------------
Cluster Resources
------------------
Nodes        = 4
Sockets/node = 1
Cores/socket = 8
--------------------------
Total Cores    = 32
Total Memory(GB)   = 121.68
--------------------------


In [28]:
# Score the predictions against the held-out labels.
f1 = f1_score(y_true, y_predict)
# NOTE(review): roc_auc_score receives the same y_predict as f1_score. If these
# are hard class labels rather than scores/probabilities, the AUC reflects a
# single operating point only — confirm what predict() returns.
roc_score = roc_auc_score(y_true, y_predict)
for template, value in (("F1 score = %f", f1), ("ROC Score = %f", roc_score)):
    print(template % value)

F1 score = 0.760618
ROC Score = 0.789323
