In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, plot_roc_curve, plot_precision_recall_curve

In [4]:
df = pd.read_csv('data/californiabin.csv', header=None)

In [3]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,4.1685,20.0,6.700637,1.178344,745.0,2.372611,38.80,-121.15,1.0
1,2.8194,24.0,4.762500,1.020833,608.0,2.533333,36.75,-119.85,0.0
2,2.4250,46.0,4.927711,1.018072,772.0,2.325301,36.33,-119.31,0.0
3,3.1205,16.0,3.728477,1.101545,1260.0,2.781457,33.87,-117.99,0.0
4,4.3889,41.0,5.741007,1.199041,837.0,2.007194,34.15,-118.40,1.0
...,...,...,...,...,...,...,...,...,...
2995,4.0050,7.0,4.966767,1.021148,1143.0,3.453172,36.84,-121.44,1.0
2996,1.0560,24.0,4.462069,1.112644,1277.0,2.935632,35.36,-119.01,0.0
2997,3.2824,52.0,4.356984,1.064302,794.0,1.760532,34.08,-118.36,1.0
2998,2.3056,36.0,5.268657,0.970149,792.0,2.955224,38.14,-122.23,0.0


In [5]:
arr = df.to_numpy()

In [7]:
x = arr[:, :-1]
y = arr[:, -1]

In [8]:
x

array([[   4.1685    ,   20.        ,    6.70063694, ...,    2.37261146,
          38.8       , -121.15      ],
       [   2.8194    ,   24.        ,    4.7625    , ...,    2.53333333,
          36.75      , -119.85      ],
       [   2.425     ,   46.        ,    4.92771084, ...,    2.3253012 ,
          36.33      , -119.31      ],
       ...,
       [   3.2824    ,   52.        ,    4.35698448, ...,    1.76053215,
          34.08      , -118.36      ],
       [   2.3056    ,   36.        ,    5.26865672, ...,    2.95522388,
          38.14      , -122.23      ],
       [   5.6683    ,   24.        ,    6.08088235, ...,    3.47610294,
          37.41      , -121.87      ]])

In [11]:
x_trn, x_tst, y_trn, y_tst = train_test_split(x, y, test_size=0.3, random_state=42)

In [12]:
cv_kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [17]:
svc_hiperparameters = {
    'SVC__gamma': 2 ** np.arange(-15, 4, 2, dtype=float),
    'SVC__C': 2 ** np.arange(-5, 16, 2, dtype=float)
}

In [19]:
svc_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('SVC', SVC(kernel='rbf'))
    ]
)
svc_gs = GridSearchCV(svc_pipeline, svc_hiperparameters, cv=cv_kfold, return_train_score=True, n_jobs=-1)

In [21]:
svc_gs.fit(x_trn, y_trn)

GridSearchCV(cv=KFold(n_splits=10, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('SVC', SVC())]),
             n_jobs=-1,
             param_grid={'SVC__C': array([3.1250e-02, 1.2500e-01, 5.0000e-01, 2.0000e+00, 8.0000e+00,
       3.2000e+01, 1.2800e+02, 5.1200e+02, 2.0480e+03, 8.1920e+03,
       3.2768e+04]),
                         'SVC__gamma': array([3.05175781e-05, 1.22070312e-04, 4.88281250e-04, 1.95312500e-03,
       7.81250000e-03, 3.12500000e-02, 1.25000000e-01, 5.00000000e-01,
       2.00000000e+00, 8.00000000e+00])},
             return_train_score=True)

In [25]:
rf_hiperparameters = {
	'RF__n_estimators': np.arange(10, 210, 10),
	'RF__max_depth': [2, 4, 6, 8, None]
}

In [28]:
rf_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('RF', RandomForestClassifier())
    ]
)
rf_gs = GridSearchCV(rf_pipeline, rf_hiperparameters, cv=cv_kfold, return_train_score=True, n_jobs=-1)

In [29]:
rf_gs.fit(x_trn, y_trn)

GridSearchCV(cv=KFold(n_splits=10, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('RF', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'RF__max_depth': [2, 4, 6, 8, None],
                         'RF__n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200])},
             return_train_score=True)

In [30]:
best_svc = svc_gs.best_estimator_
svc_prediction = best_svc.predict(x_tst)
print(f"""SVM:

Hiperparâmetros escolhidos:
{svc_gs.best_params_}

Métricas:
Acurácia: { accuracy_score(svc_prediction, y_tst) }
F1-score: { f1_score(svc_prediction, y_tst) }
Precisão: { precision_score(svc_prediction, y_tst) }
Revocação: { recall_score(svc_prediction, y_tst) }
""")

SVM:

Hiperparâmetros escolhidos:
{'SVC__C': 32.0, 'SVC__gamma': 0.125}

Métricas:
Acurácia: 0.8444444444444444
F1-score: 0.8484848484848484
Precisão: 0.8376068376068376
Revocação: 0.8596491228070176



In [31]:
best_rf = rf_gs.best_estimator_
rf_prediction = best_rf.predict(x_tst)
print(f"""Random Forest:

Hiperparâmetros escolhidos:
{rf_gs.best_params_}

Métricas:
Acurácia: { accuracy_score(rf_prediction, y_tst) }
F1-score: { f1_score(rf_prediction, y_tst) }
Precisão: { precision_score(rf_prediction, y_tst) }
Revocação: { recall_score(rf_prediction, y_tst) }
""")

Random Forest:

Hiperparâmetros escolhidos:
{'RF__max_depth': None, 'RF__n_estimators': 140}

Métricas:
Acurácia: 0.8744444444444445
F1-score: 0.878101402373247
Precisão: 0.8696581196581197
Revocação: 0.8867102396514162

