# Random Forest Bench


In [1]:
import cudf
import cuml
import sklearn as skl 
from cuml import RandomForestClassifier as cuRF
from sklearn.ensemble import RandomForestClassifier as sklRF
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
from urllib.request import urlretrieve
import gzip
from cuml import ForestInference
import time
from numba import cuda


In [2]:
print("cudf version: ", cudf.__version__)
print("cuml version: ", cuml.__version__)
print("skl version: ", skl.__version__)

cudf version:  0.10.0a+810.g50a9dc7
cuml version:  0.9.0a1+1074.g42be3ac.dirty
skl version:  0.21.2


## Main benchmarkingfunction

In [3]:
def start_bench(run_cuml, run_skl, skip_test, estimator_array, stream_array, depth_array, csv_path, X_train, y_train, X_train_np, y_train_np, X_test_np, y_test_np):
    results = []
    for n_estimators in estimator_array:
        for n_streams in stream_array:
            for max_depth in depth_array:
                # cuml Random Forest params
                cu_rf_params = {
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'n_bins': 16,
                    'split_algo': 1,
                    'n_streams': n_streams
                }

                cu_fit_time = 0
                skl_fit_time = 0
                acc_score_cuml = 0
                acc_score_skl = 0

                if run_cuml:
                    print("====>cuml====")
                    cuml_params = cu_rf_params.copy()
                    print("    cuml params: ", str(cuml_params))
                    cu_rf = cuRF(**cu_rf_params)
                    print("    cuml model: ", str(cu_rf.get_params()))

                    t0 = time.time()
                    cu_rf.fit(X_train, y_train)
                    cu_fit_time = time.time() - t0

                    print("    cuml fits RF: ", cu_fit_time)

                    if not skip_test:
                        # use a subset of test data to inference 
                        cu_rf_predicted = cu_rf.predict(X_test_np[:1000, :])
                        acc_score_cuml = accuracy_score(cu_rf_predicted, y_test_np[:1000])
                        print("    cuml total time: ", time.time() - t0)
                        print("    cuml acc: ", acc_score_cuml)

                if run_skl and n_streams == 8:
                    print("====>sklearn====")
                    sk_params = cu_rf_params.copy()
                    print("    skl params: ", str(cuml_params))

                    sk_params['n_jobs'] = -1
                    del sk_params['n_bins']
                    del sk_params['split_algo']
                    if 'n_streams' in sk_params:
                        del sk_params['n_streams']                
                    rfc = sklRF(**sk_params)

                    t0 = time.time()
                    rfc.fit(X_train_np, y_train_np)
                    skl_fit_time = time.time() - t0

                    print("    skl fits RF: ", skl_fit_time)
                    
                    if not skip_test:
                        skl_predicted = rfc.predict(X_test_np[:1000, :])
                        acc_score_skl = accuracy_score(skl_predicted, y_test_np[:1000])                
                        print("    skl total time: ", time.time() - t0)
                        print("    skl acc: ", acc_score_skl)

                pd.set_option('display.max_colwidth', 300)
                results.append(dict(cu_fit_time=cu_fit_time, acc_score_cuml=acc_score_cuml, skl_fit_time=skl_fit_time, acc_score_skl=acc_score_skl))
                df = pd.DataFrame(results)
                print(df.to_string())
                df.to_csv(csv_path, mode='a')

## Helper functions

In [4]:
def download_higgs(compressed_filepath, decompressed_filepath):
    higgs_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
    if not os.path.isfile(compressed_filepath):
        urlretrieve(higgs_url, compressed_filepath)
    if not os.path.isfile(decompressed_filepath):
        cf = gzip.GzipFile(compressed_filepath)
        with open(decompressed_filepath, 'wb') as df:
            df.write(cf.read())

## Higgs

In [5]:
# compressed_filepath = 'HIGGS.csv.gz' # Set this as path for gzipped Higgs data file, if you already have
decompressed_filepath = './HIGGS.csv' # Set this as path for decompressed Higgs data file, if you already have
# download_higgs(compressed_filepath, decompressed_filepath)

col_names = ['label'] + ["col-{}".format(i) for i in range(2, 30)] # Assign column names
dtypes_ls = ['int32'] + ['float32' for _ in range(2, 30)] # Assign dtypes to each column
data = cudf.read_csv(decompressed_filepath, names=col_names, dtype=dtypes_ls)

y_cudf = data['label']
X_cudf = data.drop('label')
from cuml.preprocessing.model_selection import train_test_split
# train_size is the ratio of the entire dataset to be split into training data
X_train, X_test, y_train, y_test = train_test_split(X_cudf, y_cudf, train_size=0.80)

print("Shape of the training data : ", X_train.shape)
print("Shape of the ground truth data used for training : ", y_train.shape)
print("Shape of the testing data : ", X_test.shape)
print("Shape of the ground truth data used for testing : ",y_test.shape)

X_train_np = X_train.as_matrix()
y_train_np = y_train.to_array()
X_test_np = X_test.as_matrix()
y_test_np = y_test.to_array()

Shape of the training data :  (8800000, 28)
Shape of the ground truth data used for training :  (8800000,)
Shape of the testing data :  (2200000, 28)
Shape of the ground truth data used for testing :  (2200000,)


In [6]:
data.head().to_pandas()

Unnamed: 0,label,col-2,col-3,col-4,col-5,col-6,col-7,col-8,col-9,col-10,...,col-20,col-21,col-22,col-23,col-24,col-25,col-26,col-27,col-28,col-29
0,1,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282323,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.108331,0.985692,0.951331,0.803251,0.865924,0.780118
3,0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808486


In [None]:
estimator_array = [100, 500, 1000]
stream_array = [8, 10]
depth_array = [8, 12, 16]
run_cuml = True
run_skl = True 
skip_test = True 
csv_path = './rf_bench_results/rf_bench_higgs.csv'

start_bench(run_cuml, run_skl, skip_test, estimator_array, stream_array, depth_array, csv_path, X_train, y_train, X_train_np, y_train_np, X_test_np, y_test_np)

====>cuml====
    cuml params:  {'n_estimators': 100, 'max_depth': 8, 'n_bins': 16, 'split_algo': 1, 'n_streams': 8}
    cuml model:  {'n_estimators': 100, 'max_depth': 8, 'handle': <cuml.common.handle.Handle object at 0x7f18241dad20>, 'max_features': 1.0, 'n_bins': 16, 'split_algo': 1, 'split_criterion': 0, 'min_rows_per_node': 2, 'bootstrap': True, 'bootstrap_features': False, 'verbose': False, 'rows_sample': 1.0, 'max_leaves': -1, 'quantile_per_tree': False}




    cuml fits RF:  10.489315032958984
====>sklearn====
    skl params:  {'n_estimators': 100, 'max_depth': 8, 'n_bins': 16, 'split_algo': 1, 'n_streams': 8}
    skl fits RF:  199.35952138900757
   acc_score_cuml  acc_score_skl  cu_fit_time  skl_fit_time
0               0              0    10.489315    199.359521
====>cuml====
    cuml params:  {'n_estimators': 100, 'max_depth': 12, 'n_bins': 16, 'split_algo': 1, 'n_streams': 8}
    cuml model:  {'n_estimators': 100, 'max_depth': 12, 'handle': <cuml.common.handle.Handle object at 0x7f1821e59c30>, 'max_features': 1.0, 'n_bins': 16, 'split_algo': 1, 'split_criterion': 0, 'min_rows_per_node': 2, 'bootstrap': True, 'bootstrap_features': False, 'verbose': False, 'rows_sample': 1.0, 'max_leaves': -1, 'quantile_per_tree': False}




    cuml fits RF:  21.808729887008667
====>sklearn====
    skl params:  {'n_estimators': 100, 'max_depth': 12, 'n_bins': 16, 'split_algo': 1, 'n_streams': 8}


## Airline

In [None]:
from datasets import prepare_dataset

data = prepare_dataset('./data/', 'airline', 115000000)

X_train_np = data.X_train.to_numpy(np.float32)
X_test_np = data.X_test.to_numpy(np.float32)
y_train_np = data.y_train.to_numpy(np.int32)
y_test_np = data.y_test.to_numpy(np.int32)

print("Shape of the training data : ", X_train_np.shape)
print("Shape of the ground truth data used for training : ", y_train_np.shape)
print("Shape of the testing data : ", X_test_np.shape)
print("Shape of the ground truth data used for testing : ",y_test_np.shape)

In [None]:
estimator_array = [100, 500, 1000]
stream_array = [8, 10]
depth_array = [8, 12, 16]
run_cuml = True
run_skl = True 
skip_test = True 
csv_path = './rf_bench_results/rf_bench_airline.csv'

start_bench(run_cuml, run_skl, skip_test, estimator_array, stream_array, depth_array, csv_path, X_train_np, y_train_np, X_train_np, X_test_np, X_test_np, y_test_np)

## Epsilon

In [None]:
from datasets import prepare_dataset

data = prepare_dataset('./data/', 'epsilon', 500000)

X_train_np = data.X_train.astype(np.float32)
X_test_np = data.X_test.astype(np.float32)
y_train_np = data.y_train.astype(np.int32)
y_test_np = data.y_test.astype(np.int32)

print("Shape of the training data : ", X_train_np.shape)
print("Shape of the ground truth data used for training : ", y_train_np.shape)
print("Shape of the testing data : ", X_test_np.shape)
print("Shape of the ground truth data used for testing : ",y_test_np.shape)

# X_train_g = cuda.to_device(np.ascontiguousarray(X_train_np))
# X_train = cudf.DataFrame.from_gpu_matrix(X_train_g)
# y_train_g = cuda.to_device(np.ascontiguousarray(y_train_np))
# y_train = cudf.Series(y_train_g)

In [None]:
estimator_array = [100, 500, 1000]
stream_array = [8, 10]
depth_array = [8, 12, 16]
run_cuml = True
run_skl = True 
skip_test = True 
csv_path = './rf_bench_results/rf_bench_epsilon.csv'

start_bench(run_cuml, run_skl, skip_test, estimator_array, stream_array, depth_array, csv_path, X_train_np, y_train_np, X_train_np, y_train_np, X_test_np, y_test_np)

## Bosch

In [None]:
from datasets import prepare_dataset

data = prepare_dataset('./data/', 'bosch', 1184000)

X_train_np = data.X_train.to_numpy(np.float32)
X_test_np = data.X_test.to_numpy(np.float32)
y_train_np = data.y_train.to_numpy(np.int32)
y_test_np = data.y_test.to_numpy(np.int32)

print("Shape of the training data : ", X_train_np.shape)
print("Shape of the ground truth data used for training : ", y_train_np.shape)
print("Shape of the testing data : ", X_test_np.shape)
print("Shape of the ground truth data used for testing : ",y_test_np.shape)

# X_train_g = cuda.to_device(np.ascontiguousarray(X_train_np))
# X_train = cudf.DataFrame.from_gpu_matrix(X_train_g)
# y_train_g = cuda.to_device(np.ascontiguousarray(y_train_np))
# y_train = cudf.Series(y_train_g)

In [None]:
estimator_array = [100, 500, 1000]
stream_array = [8, 10]
depth_array = [8, 12, 16]
run_cuml = True
run_skl = True 
skip_test = True 
csv_path = './rf_bench_results/rf_bench_bosch.csv'

start_bench(run_cuml, run_skl, skip_test, estimator_array, stream_array, depth_array, csv_path, X_train_np, y_train_np, X_train_np, y_train_np, X_test_np, y_test_np)