# RocketML tutorial on binary classification using gradient boosted trees

## Import Libraries

In [None]:
from rocketml.ensemble import GradientBoostingClassifier
from rocketml.io import LIBSVM,CSV
from time import time
import subprocess
import os
from sklearn.metrics import f1_score, confusion_matrix,roc_auc_score
from sklearn.datasets import load_svmlight_file
import numpy as np

## Start dask for data processing

In [None]:
# Ask the OS for this machine's hostname. check_output() returns bytes on
# Python 3, so decode it before building the "<host>:<port>" address string;
# concatenating bytes with str would raise TypeError.
hostname = subprocess.check_output('hostname').decode('utf-8').strip()
from dask.distributed import Client

# Connect to the dask scheduler at <hostname>:8786.
# NOTE(review): assumes a scheduler is already listening on that port.
client = Client(hostname + ":8786")
client

## Read Data File and Inspect its contents

In [None]:
import dask.dataframe as dd

# Location of the input CSV; header=None tells dask the file has no header
# row, so the columns are addressed by integer position.
filename = "/home/ubuntu/rocketml-dev/data/SUSY.csv"
df = dd.read_csv(
    filename,
    header=None,
)

## Check data statistics

In [None]:
# Quick look at the column labels and the first few rows.
print(df.columns)
print(df.head())

# describe() only builds a lazy summary; compute() evaluates it so that the
# actual statistics can be printed.
stats = df.describe()
print(stats.compute())

## Create test and train data for Gradient Boosted Trees

In [None]:
# Randomly partition the rows: 95% for training, 5% held out for testing.
df_train, df_test = df.random_split(
    [0.95, 0.05]
)

## Materialize the train and test dataframes

In [None]:
# Materialize both splits once and keep them in the cluster's memory.
# The previous bare `.compute()` calls pulled each full split into the client
# process and immediately discarded the result; persist() keeps the partitions
# on the workers so the later to_csv() and label extraction reuse them.
df_train = df_train.persist()
df_test = df_test.persist()

# Column 0 of the held-out split is used as the ground-truth labels when
# scoring the predictions.
y_true = df_test[0].compute().values

## Write Train and Test Data to files

In [None]:
# Write the held-out split as headerless, index-free CSV partition files and
# keep whatever to_csv() returns for the merge step.
test_filenames = df_test.to_csv(
    "/shared/data/test_data/",
    header=None,
    index=False,
)

In [None]:
# Concatenate the per-partition test files, in the order listed in
# test_filenames, into one CSV file.
with open('/shared/data/test.csv', 'w') as merged:
    for part_path in test_filenames:
        with open(part_path) as part:
            contents = part.read()
            merged.write(contents)

In [None]:
# Write the training split as headerless, index-free CSV partition files.
train_filenames = df_train.to_csv(
    '/shared/data/train_data/',
    header=None,
    index=False,
)

In [None]:
# glob() is not brought into scope by the notebook's import cell, so import it
# here; without this the cell fails with NameError.
from glob import glob

# Collect the partition files in the training output directory. glob() returns
# matches in arbitrary order, so sort them to make the merged file's row order
# reproducible from run to run.
train_filenames = sorted(glob('/shared/data/train_data/*.part'))

# Concatenate every partition into a single training CSV.
with open('/shared/data/train.csv','w') as fp:
    for fn in train_filenames:
        with open(fn) as f:
            fp.write(f.read())

## Train using RocketML Gradient Boosting Classifier

In [None]:
# Time everything from wrapping the training file through the end of fit().
start = time()

# Point RocketML at the merged training CSV.
data_file = CSV(filename='/shared/data/train.csv')

# Configure the gradient boosted trees model, then train it on the file.
gbdt_classifier = GradientBoostingClassifier(
    n_estimators=10,
    learning_rate=0.01,
    num_leaves=255,
    min_child_samples=0,
    min_sum_hessian_in_leaf=100,
)
gbdt_classifier.fit(data_file)

print("Time to build a model = %0.2f" % (time() - start))

## Predict and compute evaluation scores

In [None]:
# Wrap the merged test CSV and run the trained model over it.
test_file_name = '/shared/data/test.csv'
test_file = CSV(
    filename=test_file_name,
)
y_predict = gbdt_classifier.predict(test_file)


In [None]:
# Score the predictions against the held-out labels.
# NOTE(review): roc_auc_score is usually given scores or probabilities rather
# than hard class labels - confirm what predict() returns.
f1 = f1_score(y_true, y_predict)
roc_score = roc_auc_score(y_true, y_predict)

print("F1 score = %f" % (f1,))
print("ROC Score = %f" % (roc_score,))