In [3]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score

In [5]:
def grid(estimator, scoring, cv, X, y):
    return GridSearchCV(
    estimator = estimator,
    param_grid = params,
    cv=cv,
    verbose=True,
    n_jobs=-3
    ).fit(X, y)

In [6]:
params = {
    "n_estimators": [100, 200, 300, 400, 500],
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 4, 5, 6, 7],
    "max_features": ["auto", "sqrt", "log2"],
    "bootstrap": [True, False],
    "warm_start": [True, False]
  },

In [7]:
data = pd.read_csv('dataset/ionosphere/data.csv')
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,b
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g


In [8]:
data['1'].value_counts()

0    351
Name: 1, dtype: int64

In [9]:
data.drop(['1'], axis=1, inplace=True)

In [10]:
X = data.drop('34', axis=1)
y = data['34']

In [11]:
len(X.columns)

33

In [12]:
X.head()

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,24,25,26,27,28,29,30,31,32,33
0,1,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,0.85243,...,0.56811,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453
1,1,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,0.50874,...,-0.20332,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447
2,1,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,0.73082,...,0.57528,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238
3,1,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,0.0,...,1.0,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0
4,1,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,0.52798,...,0.03286,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697


In [13]:
from discretization.chi_merge import *
chi_merge = ChiMerge(con_features=X.columns, significance_level=0.1, n_jobs=-3)

In [19]:
%%time
X_dis = chi_merge.fit_transform(X, y)

Wall time: 13.4 s


In [20]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categories='auto')

In [21]:
X_dis = ohe.fit_transform(X_dis)

In [22]:
estimator = RandomForestClassifier()

In [23]:
y.value_counts()

g    225
b    126
Name: 34, dtype: int64

In [24]:
grid_search = grid(estimator, 'f1', 10, X, y)
grid_search.best_score_

Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-3)]: Done  30 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-3)]: Done 180 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-3)]: Done 430 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-3)]: Done 780 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-3)]: Done 1230 tasks      | elapsed:   43.3s
[Parallel(n_jobs=-3)]: Done 1780 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-3)]: Done 2430 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-3)]: Done 3180 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-3)]: Done 4030 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-3)]: Done 4980 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  4.4min finished


0.9430199430199431

In [25]:
grid_search = grid(estimator, 'f1', 10, X_dis, y)
grid_search.best_score_

[Parallel(n_jobs=-3)]: Using backend LokyBackend with 10 concurrent workers.


Fitting 10 folds for each of 600 candidates, totalling 6000 fits


[Parallel(n_jobs=-3)]: Done  60 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-3)]: Done 322 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-3)]: Done 572 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-3)]: Done 922 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-3)]: Done 1372 tasks      | elapsed:   42.7s
[Parallel(n_jobs=-3)]: Done 1922 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-3)]: Done 2572 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-3)]: Done 3322 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-3)]: Done 4172 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-3)]: Done 5122 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-3)]: Done 6000 out of 6000 | elapsed:  3.1min finished


0.9401709401709402