# Multioutput-Multiclass Random Forest Blosc

## Objetivos
* Crear un algoritmo de arboles de decisión basado en bosques aleatorios utilizando scikit-learn.

In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

%load_ext autoreload
%autoreload 2

%load_ext version_information
%version_information numpy, scipy, matplotlib, pandas

Software,Version
Python,3.5.2 64bit [MSC v.1900 64 bit (AMD64)]
IPython,5.3.0
OS,Windows 10 10.0.14393 SP0
numpy,1.11.3
scipy,0.19.0
matplotlib,2.0.0
pandas,0.19.2
Thu Mar 30 10:14:06 2017 Hora de verano romance,Thu Mar 30 10:14:06 2017 Hora de verano romance


In [2]:
import os
import sys
sys.path.append("../src/")

from IPython.display import display
import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from sklearn.preprocessing import binarize 
from sklearn.preprocessing import OneHotEncoder

pd.options.display.float_format = '{:,.3f}'.format
matplotlib.rcParams.update({'font.size': 12})

## Importando los datos de entramiento
Para ver como se crearon los datos de entrenamiento ir a [Training data generator](../deliver/training_data_generator.ipynb)

In [3]:
df = pd.read_csv('../data/training_data.csv', sep='\t')

## Preprocesamiento entradas (extraer en training data generator o antes)

In [4]:
df = df.assign(is_Table=binarize(df['Table'].values.reshape(-1,1), 0), 
               is_Columnar=binarize(df['Table'].values.reshape(-1,1), 1),
               is_Int=df['DType'].str.contains('int').astype(int),
               is_Float=df['DType'].str.contains('float').astype(int),
               is_String=(df['DType'].str.contains('S') | df['DType'].str.contains('U')).astype(int))
def aux_func(n):
    if n == 32 or n == 64:
        return n // 8
    else:
        return n
df['Type_Size'] = [aux_func(int(s)) for s in df['DType'].str[-2:]]

## Preprocesamiento salidas

In [5]:
df = df.assign(Blosclz=(df['Codec'] == 'blosclz').astype(int),
               Lz4=(df['Codec'] == 'lz4').astype(int),
               Lz4hc=(df['Codec'] == 'lz4hc').astype(int),
               Snappy=(df['Codec'] == 'snappy').astype(int),
               Zstd=(df['Codec'] == 'zstd').astype(int),
               Shuffle=(df['Filter'] == 'shuffle').astype(int),
               Bitshuffle=(df['Filter'] == 'bitshuffle').astype(int))
enc_cl = OneHotEncoder()
enc_cl.fit(df['CL'].values.reshape(-1, 1))
new_cls = enc_cl.transform(df['CL'].values.reshape(-1, 1)).toarray()
enc_block = OneHotEncoder()
enc_block.fit(df['Block_Size'].values.reshape(-1, 1))
new_blocks = enc_block.transform(df['Block_Size'].values.reshape(-1, 1)).toarray()
block_sizes = [0, 8, 16, 32, 64, 128, 256, 512, 1024]
for i in range(9):
    cl_label = 'CL' + str(i+1)
    block_label = 'Block_' + str(block_sizes[i])
    df[cl_label] = new_cls[:, i]
    df[block_label] = new_blocks[:, i]
df['Block_2048'] = new_blocks[:, 9]

In [6]:
IN_OPTIONS = ['IN_CR', 'IN_CS', 'IN_DS', 'is_Table', 'is_Columnar', 'is_Int', 'is_Float', 'is_String', 'Type_Size', 'Chunk_Size',
              'Mean', 'Median', 'Sd', 'Skew', 'Kurt', 'Min', 'Max', 'Q1', 'Q3', 'BLZ_CRate', 'BLZ_CSpeed', 'BLZ_DSpeed', 'LZ4_CRate',
              'LZ4_CSpeed', 'LZ4_DSpeed']
OUT_OPTIONS = ['Block_0', 'Block_8', 'Block_16', 'Block_32', 'Block_64', 'Block_128', 'Block_256', 'Block_512', 'Block_1024', 'Block_2048',
               'Blosclz', 'Lz4', 'Lz4hc', 'Snappy', 'Zstd', 'Shuffle', 'Bitshuffle',
               'CL1', 'CL2', 'CL3', 'CL4', 'CL5', 'CL6', 'CL7', 'CL8', 'CL9']

In [7]:
X, Y = df[IN_OPTIONS].values, df[OUT_OPTIONS].values

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
# [DIFFER] thresholds randomness instead of most discriminative
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(Xtrain, Ytrain)
Ypred = clf.predict(Xtest)

In [9]:
from sklearn import metrics
print(metrics.classification_report(Ytest, Ypred, digits=3, target_names=OUT_OPTIONS))

             precision    recall  f1-score   support

    Block_0      0.639     0.167     0.264       138
    Block_8      0.658     0.333     0.442        75
   Block_16      0.333     0.057     0.097        88
   Block_32      0.300     0.029     0.053       103
   Block_64      0.532     0.141     0.223       177
  Block_128      0.478     0.080     0.137       138
  Block_256      0.688     0.176     0.280       125
  Block_512      0.500     0.081     0.139        62
 Block_1024      0.778     0.135     0.230        52
 Block_2048      0.962     0.805     0.876       220
    Blosclz      0.959     0.858     0.906       275
        Lz4      0.961     0.923     0.942       455
      Lz4hc      0.950     0.835     0.889       115
     Snappy      0.000     0.000     0.000         0
       Zstd      0.991     0.973     0.982       333
    Shuffle      0.946     0.934     0.940       409
 Bitshuffle      0.802     0.775     0.789        89
        CL1      0.823     0.710     0.762   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [10]:
clf.score(Xtest, Ytest)

0.17996604414261461

In [11]:
count = 0
for i in range(Ytest.shape[0]):
    if (Ytest[i,:] == Ypred[i,:]).all():
        count += 1
print(count/Ytest.shape[0])

0.1799660441426146


In [12]:
from sklearn.metrics import f1_score
f1_score(Ytest, Ypred, average='weighted')

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.63001015846405417

In [13]:
OUT_OPTIONS

['Block_0',
 'Block_8',
 'Block_16',
 'Block_32',
 'Block_64',
 'Block_128',
 'Block_256',
 'Block_512',
 'Block_1024',
 'Block_2048',
 'Blosclz',
 'Lz4',
 'Lz4hc',
 'Snappy',
 'Zstd',
 'Shuffle',
 'Bitshuffle',
 'CL1',
 'CL2',
 'CL3',
 'CL4',
 'CL5',
 'CL6',
 'CL7',
 'CL8',
 'CL9']

In [14]:
def my_score(Yreal, Ypred):
    score = 0
    for i in range(Yreal.shape[0]):
        if (Ytest[i,0:10] == Ypred[i,0:10]).all():
            score += 0.2
        if (Ytest[i,10:15] == Ypred[i,10:15]).all():
            score += 0.4
        if (Ytest[i,15:17] == Ypred[i,15:17]).all():
            score += 0.2
        if (Ytest[i,17:26] == Ypred[i,17:26]).all():
            score += 0.2
    return score/Yreal.shape[0]
my_score(Ytest, Ypred)

0.6889643463497516

Demasiada buena puntuación, busquemos algo más exigente.

In [15]:
def my_score2(Yreal, Ypred):
    score = 0
    for i in range(Yreal.shape[0]):
        if (Ytest[i,0:10] == Ypred[i,0:10]).all() and (Ytest[i,17:26] == Ypred[i,17:26]).all():
            score += 0.5
        if (Ytest[i,10:15] == Ypred[i,10:15]).all() and (Ytest[i,15:17] == Ypred[i,15:17]).all():
            score += 0.5
    return score/Yreal.shape[0]
my_score2(Ytest, Ypred)

0.5292869269949066

In [16]:
def my_score3(Yreal, Ypred):
    score = 0
    for i in range(Yreal.shape[0]):
        score += (Ytest[i,:] == Ypred[i,:]).astype(int).sum()/26
    return score/Yreal.shape[0]
my_score3(Ytest, Ypred)

0.93470027425884161

In [21]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
X, Y = df[IN_OPTIONS].values, df[OUT_OPTIONS].values

param_grid = {'n_estimators': [100, 500],
              'criterion': ['gini', 'entropy'],
              'bootstrap': [True, False],
              'max_features': [1, 5, 10],
              'min_samples_leaf': [1, 5],
              'class_weight': [None, 'balanced']}

param_dist = {'n_estimators': [100, 200, 500],
              'criterion': ['gini', 'entropy'],
              'bootstrap': [True, False],
              'max_features': sp_randint(1, 10),
              'min_samples_leaf': sp_randint(1, 5),
              'class_weight': [None, 'balanced']}
ss = ShuffleSplit(n_splits=1, test_size=0.5)
rfc = RandomForestClassifier(n_jobs=-1)
grid_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=ss, verbose=10, n_jobs=-1)
rgrid_rfc = RandomizedSearchCV(estimator=rfc, param_distributions=param_dist, cv=ss, verbose=10, n_jobs=-1)

In [22]:
grid_rfc.fit(X, Y)

Fitting 1 folds for each of 96 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed: 12.1min finished


Wall time: 12min 36s


GridSearchCV(cv=ShuffleSplit(n_splits=1, random_state=None, test_size=0.5, train_size=None),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': [1, 5, 10], 'n_estimators': [100, 500], 'class_weight': [None, 'balanced'], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 5], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [69]:
from IPython.display import HTML, display
score_param = []
for i in range(len(grid_rfc.cv_results_['mean_test_score'])):
    if grid_rfc.cv_results_['mean_test_score'][i] > 0.19:
        tup = (grid_rfc.cv_results_['mean_test_score'][i], grid_rfc.cv_results_['params'][i])
        score_param += [tup]
display(HTML(
    '<table><tr>{}</tr></table>'.format(
        '</tr><tr>'.join(
            '<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in score_param)
        )
 ))

0,1
0.192699490662,"{'max_features': 10, 'n_estimators': 100, 'class_weight': None, 'criterion': 'gini', 'min_samples_leaf': 1, 'bootstrap': True}"
0.195246179966,"{'max_features': 10, 'n_estimators': 500, 'class_weight': None, 'min_samples_leaf': 1, 'criterion': 'gini', 'bootstrap': True}"
0.199066213922,"{'max_features': 10, 'n_estimators': 100, 'class_weight': None, 'min_samples_leaf': 1, 'criterion': 'entropy', 'bootstrap': True}"
0.196095076401,"{'max_features': 10, 'n_estimators': 500, 'class_weight': None, 'criterion': 'entropy', 'min_samples_leaf': 1, 'bootstrap': True}"
0.227504244482,"{'max_features': 10, 'n_estimators': 100, 'class_weight': None, 'criterion': 'gini', 'min_samples_leaf': 1, 'bootstrap': False}"
0.227504244482,"{'max_features': 10, 'n_estimators': 500, 'class_weight': None, 'min_samples_leaf': 1, 'criterion': 'gini', 'bootstrap': False}"
0.230050933786,"{'max_features': 10, 'n_estimators': 100, 'class_weight': None, 'min_samples_leaf': 1, 'criterion': 'entropy', 'bootstrap': False}"
0.227928692699,"{'max_features': 10, 'n_estimators': 500, 'class_weight': None, 'criterion': 'entropy', 'min_samples_leaf': 1, 'bootstrap': False}"
0.212224108659,"{'max_features': 10, 'n_estimators': 100, 'class_weight': 'balanced', 'criterion': 'gini', 'min_samples_leaf': 1, 'bootstrap': False}"
0.216468590832,"{'max_features': 10, 'n_estimators': 500, 'class_weight': 'balanced', 'min_samples_leaf': 1, 'criterion': 'gini', 'bootstrap': False}"


In [102]:
dtc = grid_rfc.best_estimator_.estimators_[0]

In [None]:
import pydotplus
from IPython.display import Image
from sklearn import tree
dot_data = tree.export_graphviz(dtc, out_file=None, 
                         feature_names=IN_OPTIONS,  
                         class_names=OUT_OPTIONS,  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())

In [71]:
rgrid_rfc.fit(X, Y)

Fitting 1 folds for each of 10 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   59.2s remaining:   59.2s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  1.2min remaining:   29.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.5min finished


RandomizedSearchCV(cv=ShuffleSplit(n_splits=1, random_state=None, test_size=0.5, train_size=None),
          error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001989D374B70>, 'n_estimators': [100, 200, 500], 'class_weight': [None, 'balanced'], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001989D374D68>, 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=None, refi