In [89]:
import pandas as pd
import pickle
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import classification_report, accuracy_score
from autosklearn.classification import AutoSklearnClassifier

In [92]:
def get_random_samples(base, n_samples, label):
    """Return rows of ``base`` whose last column equals ``label``.

    Args:
        base: 2-D NumPy array; the last column is compared against ``label``.
        n_samples: Number of matching rows to draw without replacement.
            A falsy value (``None`` or ``0``) returns every matching row.
        label: Value the last column must equal for a row to be kept.

    Returns:
        A 2-D array containing the selected rows (all columns preserved).

    Raises:
        ValueError: Propagated from ``np.random.choice`` when ``n_samples``
            exceeds the number of matching rows.
    """
    label_mask = base[:, -1] == label
    matching_rows = base[label_mask]

    # No sample size requested: hand back every row carrying this label.
    if not n_samples:
        return matching_rows

    # Draw distinct row positions from NumPy's global random state.
    picked = np.random.choice(matching_rows.shape[0], n_samples, replace=False)
    return matching_rows[picked]

# Location of the feature matrix; its last column is used as the class label
# (the code below selects labels -1 and 1 from it).
DATA_PATH = "output/v2/bins_30.npy"
# Rows drawn per class so the working set is balanced.
SAMPLES_PER_CLASS = 10000
SEED = 42

# Seed NumPy's global RNG: both get_random_samples (np.random.choice) and the
# shuffle below draw from it, so without a seed every run of this cell builds
# a different data set and downstream results cannot be reproduced.
np.random.seed(SEED)

base = np.load(DATA_PATH)

negatives = get_random_samples(base=base, n_samples=SAMPLES_PER_CLASS, label=-1)
positives = get_random_samples(base=base, n_samples=SAMPLES_PER_CLASS, label=1)

# Stack both classes and shuffle rows in place so the classes are interleaved.
base_ready = np.vstack((positives, negatives))
np.random.shuffle(base_ready)

# Last column is the target, everything before it is the feature matrix.
y = base_ready[:, -1]
X = base_ready[:, :-1]

In [None]:
# Hold out 20% of the sample for a final check. Stratifying on y keeps the
# class ratio the same in both partitions, and the fixed random_state makes
# the split repeatable between runs (it was previously unseeded).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# NOTE(review): tmp_folder is an absolute path on one specific machine; it
# presumably has to be adjusted (or made configurable) to run anywhere else.
automl = AutoSklearnClassifier(
    time_left_for_this_task=3600,
    tmp_folder='/home/renan/Área de Trabalho/research-project-2/autosklearn_classification_example/',
    n_jobs=-1,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
)
automl.fit(X_train, y_train)
print(automl.leaderboard())

# Score the fitted search on the held-out partition.
predictions = automl.predict(X_test)
print("Accuracy score:", accuracy_score(y_test, predictions))

# Persist the fitted object so a later cell can reload it from 'automl.pkl'
# instead of repeating the search.
with open('automl.pkl', 'wb') as f:
    pickle.dump(automl, f)

In [None]:
# This does not make sense; I need to take the models that are in the leaderboard.

In [126]:
# NOTE(review): pickle.load can execute arbitrary code; only load a file that
# was produced locally by this notebook.
with open('automl.pkl', 'rb') as f:
    automl = pickle.load(f)

# The remaining steps only need the unpickled object, not the open file handle.
print(automl.leaderboard())

# NOTE(review): the traceback recorded later in this notebook ("'dict' object
# has no attribute 'fit'") suggests this lookup yields a description dict
# rather than a fittable estimator; confirm before handing `model` to code
# that calls fit/predict.
model = automl.show_models().get(105)

          rank  ensemble_weight                type      cost    duration
model_id                                                                 
100          1             0.12       liblinear_svc  0.005187    6.014990
105          3             0.02                 mlp  0.005250   28.000155
130          2             0.02                 mlp  0.005250  136.142972
175          4             0.10                 mlp  0.005375   52.626909
181          5             0.02                 mlp  0.005562  100.574493
112          6             0.10       liblinear_svc  0.005688    6.664892
39           7             0.02  passive_aggressive  0.005688   10.506690
7            8             0.02                 mlp  0.005750  176.469227
167          9             0.14                 mlp  0.005875  104.483589
192         10             0.08          libsvm_svc  0.006375   32.711950
29          11             0.02  passive_aggressive  0.006500    7.716079
41          12             0.02       

In [None]:
# Support vector classifier with an RBF kernel and fixed hyper-parameters;
# this rebinds `model` for the cells that follow.
model = SVC(
    kernel='rbf',
    C=1,
    gamma=0.001,
)

In [115]:
def get_report(y, y_pred, all=False, fold_reports=None):
    """Summarise predictions as ``[accuracy, sensitivity, specificity]``.

    Sensitivity is the recall of class ``1.0`` and specificity the recall of
    class ``-1.0``, both read from ``classification_report``'s dict output.

    Args:
        y: True labels for the samples being scored.
        y_pred: Predicted labels, aligned with ``y``.
        all: When False, return the three metrics as numbers. When True,
            return strings of the form ``"<metric>+-<std>"`` where the
            standard deviation is taken over the per-fold rows. (The name
            shadows the builtin but is kept so existing keyword calls work.)
        fold_reports: Per-fold ``[accuracy, sensitivity, specificity]`` rows
            used for the standard deviations when ``all`` is True. Defaults
            to the module-level ``report_all`` list, which is what the
            function read implicitly before this parameter existed.

    Returns:
        A three-element list: floats when ``all`` is False, strings otherwise.
    """
    # Score the labels that were actually passed in. The previous version
    # ignored `y` and read the global `y_test_fold` instead.
    report = classification_report(y, y_pred, output_dict=True)
    metrics = (
        report['accuracy'],
        report['1.0']['recall'],
        report['-1.0']['recall'],
    )

    if not all:
        return list(metrics)

    if fold_reports is None:
        fold_reports = report_all

    # Spread of each metric across folds, rounded to four decimals.
    spreads = [
        np.array([row[column] for row in fold_reports]).std().round(4)
        for column in range(3)
    ]
    return [f"{metric}+-{spread}" for metric, spread in zip(metrics, spreads)]


skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

report_all = []
y_test_all = np.array([])
y_pred_all = np.array([])

for train_index, test_index in skf.split(X, y):
    x_train_fold, x_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    model.fit(x_train_fold, y_train_fold)
    y_pred = model.predict(x_test_fold)

    # Build the fold report once and reuse it for both storage and display.
    fold_report = get_report(y_test_fold, y_pred)
    report_all.append(fold_report)
    print(fold_report)

    # Pool labels and predictions across folds for the overall summary.
    y_test_all = np.concatenate((y_test_all, y_test_fold))
    y_pred_all = np.concatenate((y_pred_all, y_pred))

# Summary row: metrics over the pooled predictions of every fold, with the
# per-fold standard deviation attached. Previously this used only the last
# fold's labels and predictions.
report_all.append(
    get_report(y=y_test_all, y_pred=y_pred_all, all=True, fold_reports=report_all)
)

AttributeError: 'dict' object has no attribute 'fit'

In [59]:
# One row label per cross-validation fold, followed by the summary row that
# the cross-validation cell appends last. The labels are derived from
# report_all so they stay in sync if the number of folds changes.
indexes = list(range(1, len(report_all)))
indexes.append('Average')
pd.DataFrame(report_all, columns=['Accuracy', 'Sensibility', 'Specificity'], index=indexes)

Unnamed: 0,Accuracy,Sensibility,Specificity
1,0.994907,0.994685,0.99535
2,0.99546,0.995349,0.995682
3,0.994907,0.994436,0.995849
4,0.994907,0.994186,0.996347
5,0.995017,0.994352,0.996347
6,0.995128,0.994269,0.996845
7,0.995516,0.994767,0.997011
8,0.995073,0.9951,0.995019
9,0.995239,0.99485,0.996015
10,0.995405,0.994186,0.997841


In [60]:
# Confusion matrix over the pooled cross-validation predictions, with totals.
pd.crosstab(
    index=y_test_all,
    columns=y_pred_all,
    rownames=['real'],
    colnames=['predict'],
    margins=True,
)

predict,-1.0,1.0,All
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,59998,227,60225
1.0,648,119755,120403
All,60646,119982,180628
