## Pré-processamento

base: https://archive-beta.ics.uci.edu/dataset/174/parkinsons

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame

# Notebook-wide settings shared by the cells below.
rs = 1              # Random state (seed) passed to the splitters, SMOTE, SHAP sampling and the SVC grid
folds = 10          # Number of splits used by the StratifiedKFold splitter
test_split = 0.25   # Fraction of the rows held out by train_test_split
exec_time = 5       # Time of each AutoML (minutes) -- NOTE(review): not referenced in the visible cells, which set `minutos` instead

In [None]:
# The file is read with header=None, so the CSV header line is loaded as data row 0
# and the columns get integer labels. The path is relative to the notebook's directory.
park_data = pd.read_csv('../bases/test_bases/parkinsons.data', header=None)

park_data.head(5)

In [None]:
# Row 0 of the raw frame holds the column titles (the file was read with header=None).
feature_names = park_data.iloc[0].tolist()

# Build the numeric frame in one chain: skip the title row, discard column 0
# (the 'name' identifier), cast everything to float and map the integer column
# labels to their titles.
df = (
    park_data.iloc[1:]
    .drop(columns=0)
    .astype(float)
    .rename(columns=dict(enumerate(feature_names)))
)

# 'name' is no longer a column of df, so it is dropped from the title list too.
feature_names.remove('name')

df.head(5)

In [None]:
# Class balance of the target: number of rows per "status" value
sns.countplot(x="status", data= df)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Separate the predictors from the "status" target.
X_raw = df.drop(columns="status").values
y = df["status"].values
# Derive the predictor names from the frame instead of calling
# feature_names.remove("status"): the result is the same list, but re-running
# this cell no longer raises ValueError on the second removal.
feature_names = df.drop(columns="status").columns.tolist()

# Split BEFORE scaling so that the held-out rows never influence the scaler's
# mean/std (fitting on the full matrix leaks test-set statistics into training).
# stratify=y keeps the class ratio equal in both parts, matching the stratified
# cross-validation splitter defined below.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=test_split, random_state=rs, stratify=y
)

std = StandardScaler()
X_train = std.fit_transform(X_train_raw)   # statistics learned from the training rows only
X_test = std.transform(X_test_raw)         # test rows reuse the training statistics
# Full matrix on the same scale, kept under the original name `X`.
X = std.transform(X_raw)

kfold = StratifiedKFold(n_splits=folds, random_state=rs, shuffle=True)


In [None]:
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
# Oversampling the unbalanced training split with SMOTE (the test split is left untouched)
sm = SMOTE(random_state=rs)
X_over, y_over = sm.fit_resample(X_train , y_train)

# Undersampling the unbalanced training split with TomekLinks
# NOTE(review): in the visible cells X_under / y_under are only used by the class-count print below
tl = TomekLinks(sampling_strategy="auto")
X_under, y_under = tl.fit_resample(X_train, y_train)

In [None]:
# Class labels and per-class row counts: first line for the oversampled target,
# second line for the undersampled target.
print(np.unique(y_over, return_counts=True),'\n',
      np.unique(y_under, return_counts = True))

In [None]:
from sklearn.metrics import accuracy_score
def cross_val(classifier, X_train, y_train, cv=None) -> np.ndarray:
    """Cross-validate ``classifier`` and return the accuracy obtained on each fold.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit`` and ``predict``. It is refitted in place on
        every fold, so after the call it holds the fit from the last fold.
    X_train, y_train
        Arrays that can be indexed with the integer index arrays produced by
        the splitter.
    cv
        Optional splitter exposing ``split(X, y)``. When omitted, the
        notebook-level ``kfold`` splitter is used.

    Returns
    -------
    np.ndarray
        One accuracy score per fold, in the order the splitter yields them.
    """
    # Resolve the splitter at call time so a caller can override the global one.
    splitter = kfold if cv is None else cv

    model_results = []

    for train_index, test_index in splitter.split(X_train, y_train):

        X_fold_train, y_fold_train = X_train[train_index], y_train[train_index]
        X_fold_test, y_fold_test = X_train[test_index], y_train[test_index]

        classifier.fit(X_fold_train, y_fold_train)
        y_pred = classifier.predict(X_fold_test)
        model_results.append(accuracy_score(y_fold_test, y_pred))

    return np.array(model_results)

In [None]:
import shap

def compute_shap(model, feature_names, X_train, explainer_ = 'kernel', num_samples = 50) -> list:
    """Rank the feature columns by mean absolute SHAP value and plot the summary.

    Parameters
    ----------
    model
        Fitted estimator. With the kernel explainer only ``model.predict`` is
        used; with ``explainer_='tree'`` the model itself is handed to
        ``shap.TreeExplainer``.
    feature_names
        One name per column of ``X_train``; used for the importance table and
        for the summary plot.
    X_train
        Data from which ``num_samples`` rows are drawn (seeded with the
        notebook-level ``rs``). The same sample is used as the explainer
        background and as the rows being explained.
    explainer_
        ``'tree'`` selects ``shap.TreeExplainer``; any other value selects
        ``shap.KernelExplainer``.
    num_samples
        Number of rows drawn from ``X_train``.

    Returns
    -------
    list
        Column positions (integers), ordered from most to least important.
        Note that this is a list of positions, not a DataFrame.
    """
    shap.initjs()

    # Defining and executing explainer
    sample = shap.sample(X_train, num_samples, random_state=rs)
    explainer = shap.TreeExplainer(model) if explainer_ == 'tree' else shap.KernelExplainer(model.predict, sample)
    shap_values = explainer.shap_values(sample)

    # Reduce the SHAP values to a (n_samples, n_features) matrix of magnitudes.
    # The kernel explainer on `model.predict` already yields that 2-D shape; a
    # tree explainer on a classifier may instead return one array per class
    # (a list) or a 3-D array -- assumed to be (samples, features, outputs),
    # TODO confirm against the installed shap version. Both are averaged over
    # the class/output dimension so the per-feature reduction below stays 1-D.
    if isinstance(shap_values, list):
        abs_values = np.mean([np.abs(class_values) for class_values in shap_values], axis=0)
    else:
        abs_values = np.abs(shap_values)
        if abs_values.ndim == 3:
            abs_values = abs_values.mean(axis=2)

    # Mean absolute impact of each feature over the sampled rows
    shap_sum = abs_values.mean(axis=0)

    # One row per feature; the row label is the feature's column position
    importance_df = pd.DataFrame([feature_names, shap_sum.tolist()]).T
    importance_df.columns = ['feature', 'importance']
    importance_df = importance_df.sort_values('importance', ascending=False)

    # After sorting, the index holds the column positions in importance order
    features = importance_df['feature'].index.to_list()

    shap.summary_plot(shap_values, sample, feature_names=feature_names)

    return features

In [None]:
from sklearn.base import clone

def feature_selection(model_, ordered_features, X_train = X_train, X_test = X_test, y_train = y_train, y_test = y_test, n_min = -1, threshold=0.03):
    """Backward elimination over an importance-ordered list of column positions.

    Starting from the candidate list, the least important feature is removed at
    each step (the list is walked from its tail; its first entry is never
    removed), a fresh copy of ``model_`` is fitted on the remaining columns and
    its accuracy is measured on ``X_test``/``y_test``. A smaller subset is
    accepted while its accuracy stays within ``threshold`` (relative) of the
    highest accuracy seen so far.

    Parameters
    ----------
    model_
        Estimator template; it is cloned for every fit and never modified.
    ordered_features
        List of column positions, most important first.
    X_train, X_test, y_train, y_test
        Data used to fit and score each subset. The defaults are bound to the
        notebook globals when this ``def`` cell is executed, so re-run this cell
        after changing the split.
    n_min
        When ``>= 0`` only the first ``n_min`` entries of ``ordered_features``
        are considered; negative means all of them.
    threshold
        Relative accuracy loss tolerated when accepting a smaller subset.

    Returns
    -------
    list or None
        The last accepted subset of column positions, or ``None`` when no
        subset was evaluated/accepted (e.g. fewer than two candidates).

    Notes
    -----
    NOTE(review): with the default arguments the subsets are scored on the same
    held-out split that the notebook later uses for the final evaluation.
    """
    features_list = ordered_features[:n_min].copy() if n_min >= 0 else ordered_features.copy()

    best_features = None
    best_acc = 0      # accuracy of the currently selected subset
    peak_acc = 0      # highest accuracy seen so far; the tolerance is anchored to it
    current = features_list.copy()

    params = model_.get_params()

    for feature in features_list[1:][::-1]:
        current.remove(feature)

        # Stop once the subset is as small as the model's own max_features setting
        if 'max_features' in params and len(current) == params['max_features']:
            print('Limite do modelo.')
            break

        X_train_shap = X_train[:, current]
        X_test_shap = X_test[:, current]

        # Always fit a clone: refitting `model_` itself would leave the caller's
        # estimator trained on whatever subset happened to be evaluated last.
        model = clone(model_)

        model.fit(X_train_shap, y_train)
        y_pred = model.predict(X_test_shap)
        test_acc = accuracy_score(y_test, y_pred)

        # Compare against the peak, not against the previously accepted value;
        # otherwise the accepted accuracy could slide down by `threshold` at
        # every step and still be reported as the best.
        if test_acc > peak_acc-peak_acc*threshold:
            best_acc = test_acc
            peak_acc = max(peak_acc, test_acc)
            best_features = current.copy()
            print(f'Best accuracy: {best_acc} with {len(best_features)} features')

    print(f'Results: Best accuracy: {best_acc} with {best_features}')

    return best_features

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def run_test(models: list, X_train_ = X_train, y_train_= y_train ,X_test_ = X_test, y_test_ = y_test, cm = False, new_features = None):
    """Fit a fresh clone of each model and print its accuracy on the test data.

    Parameters
    ----------
    models
        Estimators to evaluate; each one is cloned, so the originals are left
        untouched.
    X_train_, y_train_
        Data the clones are fitted on. NOTE: these are the 2nd and 3rd
        positional parameters, so ``run_test(models, A, b)`` trains on ``A, b``.
    X_test_, y_test_
        Data the clones are scored on.
    cm
        When true, a confusion matrix is plotted for every model.
    new_features
        Optional sequence of column positions; when given, both matrices are
        restricted to those columns before fitting.

    The array defaults are bound to the notebook globals when this ``def`` cell
    is executed. Nothing is returned; results are printed/plotted.
    """
    # Identity test instead of `!= None`: comparing a NumPy array with `!=`
    # is element-wise and cannot be used as an `if` condition.
    if new_features is not None:
        print("New features: ", new_features, end="\n\n")
        X_train_ = X_train_[:,new_features]
        X_test_ = X_test_[:,new_features]

    for model_ in models:

        model = clone(model_)
        model.fit(X_train_, y_train_)
        y_pred = model.predict(X_test_)
        test_acc = accuracy_score(y_test_, y_pred)

        print(model)
        print("Accuracy: ", test_acc, end="\n\n")

        if cm:
            matrix = ConfusionMatrixDisplay(confusion_matrix(y_test_, y_pred))
            matrix.plot()


# Teste com SVM

In [None]:
from sklearn.svm import SVC

# Baseline classifier: SVC constructed with no arguments (library defaults)
model = SVC()

In [None]:
# Baseline SVC trained on the original training split (run_test defaults).
# The 2nd and 3rd positional parameters of run_test are the TRAINING data, so
# passing X_test, y_test there trained and scored the model on the same
# held-out rows.
run_test([model])
# Baseline SVC trained on the SMOTE-oversampled training data; still scored on X_test.
run_test([model], X_over, y_over)

## Tuning

In [None]:
# Hyperparameter tuning: search space for the SVC grid search

from sklearn.model_selection import GridSearchCV

# 4 (C) x 3 (kernel) x 4 (gamma) x 3 (coef0) x 2 (shrinking) = 288 candidates.
# NOTE(review): gamma and coef0 are also varied for kernel='linear' (and coef0 for 'rbf');
# if SVC ignores them for those kernels the corresponding fits are duplicates -- confirm.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 1],
    'coef0': [0.0, 0.5, 1.0],
    'shrinking': [True, False],
    'probability': [True],
    'random_state': [rs]
}

In [None]:
# Exhaustive search over param_grid with the stratified k-fold splitter,
# fitted on the original training split.
print("Grid Search...")
grid_search = GridSearchCV(model, param_grid, n_jobs=4, cv= kfold)
grid_search.fit(X_train, y_train)

# Same search on the SMOTE-oversampled training data.
# NOTE(review): X_over was resampled before cross-validation, so synthetic rows
# derived from a fold's validation rows can end up in its training part;
# the CV scores of this search are presumably optimistic -- confirm.
print("Grid Search (over)...")
grid_search_over = GridSearchCV(model, param_grid, n_jobs=4, cv= kfold)
grid_search_over.fit(X_over, y_over)

In [None]:
# Best parameter sets: first line from the search on X_train, second line from the search on X_over
print("Melhores parametros: ", grid_search.best_params_)
print("Melhores parametros: ", grid_search_over.best_params_)

# Keep the best estimator of each search for the comparisons below
model_gs = grid_search.best_estimator_
model_gs_over = grid_search_over.best_estimator_

In [None]:
# Default vs. tuned SVC, both trained on the original training split (run_test defaults)
run_test([model, model_gs])
# Default vs. tuned SVC, both trained on the SMOTE-oversampled training data; scored on X_test
run_test([model, model_gs_over], X_over, y_over)

## Feature Selection

In [None]:
# Importance-ordered column positions for the tuned SVC (default explainer and sample size)
shap_svm = compute_shap(model_gs, feature_names, X_train)

In [None]:
# Backward elimination over the SVC ranking with a 1% relative tolerance
svm_features = feature_selection(model_gs, shap_svm, threshold=0.01)

In [None]:
# Refit the tuned SVC on the selected columns only and plot its confusion matrix
run_test([model_gs], cm = True, new_features=svm_features)

# Auto Sklearn

In [None]:
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import balanced_accuracy, precision, recall, f1

# Search budget in minutes for this run.
# NOTE(review): the config cell defines exec_time for this purpose but it is not used here.
minutos = 1

autosk = AutoSklearnClassifier(
    # Restrict the search to these classifier families and skip feature preprocessing
    include = {
        "classifier": ["random_forest", "decision_tree", "extra_trees",
                        'liblinear_svc', 'libsvm_svc','k_nearest_neighbors'],
        "feature_preprocessor":["no_preprocessing"]
    },
    time_left_for_this_task= minutos*60,   # total budget, converted from minutes to seconds
    per_run_time_limit= 30,
    # Extra metrics passed as scoring functions alongside the default metric
    scoring_functions= [balanced_accuracy, precision, recall, f1],
    # NOTE(review): confirm the installed auto-sklearn accepts the string 'none' for this argument
    ensemble_class= 'none',
    #ensemble_nbest = 25,
    n_jobs = 4
)

autosk.fit(X_train, y_train, dataset_name= 'Parkinson Disease Prediction')

In [None]:
from sklearn.base import clone

# NOTE(review): the first key returned by show_models() is taken as the best model --
# confirm that the returned dict is ordered by rank.
models = autosk.show_models()
best_model = list(models.keys())[0]

# Unfitted copy of that entry's scikit-learn estimator, refitted on the training split
sklearn_classifier = clone(models[best_model]["sklearn_classifier"])
sklearn_classifier.fit(X_train, y_train)

print(sklearn_classifier, '\n', autosk.sprint_statistics())

In [None]:
# Hyperparameters of the extracted estimator.
# NOTE(review): this global is not read by any later visible cell.
params = sklearn_classifier.get_params()

## Feature Selection

In [None]:
# Importance-ordered column positions for the auto-sklearn model, computed on only 10 sampled rows
shap_autosk = compute_shap(sklearn_classifier, feature_names, X_train, num_samples=10)

In [None]:
# Backward elimination over the auto-sklearn ranking with a 3% relative tolerance
autosk_features = feature_selection(sklearn_classifier, shap_autosk, threshold=0.03)

In [None]:
# Refit the auto-sklearn model on the selected columns only and plot its confusion matrix
run_test([sklearn_classifier], cm = True, new_features= autosk_features)

# FLAML

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

In [None]:
from flaml import AutoML

# Search budget in minutes for this run
minutos = 1

fla_automl = AutoML()

fla_automl_settings = {
    "time_budget": minutos*60,   # seconds
    "metric": 'accuracy',
    "n_jobs": -1,
    "ensemble": False,
    "verbose": 0,
    "n_splits": 5
}

fla_automl.fit(X_train, y_train, task= "classification", **fla_automl_settings)

# The cells below evaluate `flaml_classifier`, which was never assigned in the
# notebook (NameError on Restart & Run All). Expose the underlying trained
# estimator of FLAML's best model under that name.
flaml_classifier = fla_automl.model.estimator

In [None]:
# FLAML model trained on the original training split (run_test defaults), with confusion matrix
run_test([flaml_classifier], cm = True)
# FLAML model trained on the SMOTE-oversampled training data; still scored on X_test
run_test([flaml_classifier], X_over, y_over, cm = True)

## Feature Selection

In [None]:
# Importance-ordered column positions for the FLAML model (default explainer and sample size)
shap_flaml = compute_shap(flaml_classifier, feature_names, X_train)

In [None]:
# Backward elimination over the FLAML ranking with a 1% relative tolerance
flaml_features = feature_selection(flaml_classifier, shap_flaml, threshold=0.01)

In [None]:
# FLAML model restricted to the selected columns, trained on the original training split
run_test([flaml_classifier], cm = True, new_features= flaml_features)
# Same column subset, trained on the SMOTE-oversampled training data
run_test([flaml_classifier], X_over, y_over, cm = True, new_features= flaml_features)

# Análise

In [None]:
from matplotlib_venn import venn3_unweighted

# Selected column positions per model, as sets
set1 = set(svm_features)       # SVM
set2 = set(autosk_features)    # auto-sklearn
set3 = set(flaml_features)     # FLAML

# The seven regions of a three-set diagram, written with explicit grouping:
# exclusive to one model, shared by exactly two, shared by all three.
# The actual sets (not their sizes) are passed on to the plotting call;
# presumably they are rendered as the region labels -- confirm.
subsets = (
    set1 - (set2 | set3),      # SVM only
    set2 - (set1 | set3),      # auto-sklearn only
    (set1 & set2) - set3,      # SVM and auto-sklearn only
    set3 - (set1 | set2),      # FLAML only
    (set1 & set3) - set2,      # SVM and FLAML only
    (set2 & set3) - set1,      # auto-sklearn and FLAML only
    set1 & set2 & set3,        # all three
)

venn3_unweighted(subsets, set_labels=('SVM', 'AutoSklearn', 'FlaML'))

plt.show()


In [None]:
# Selected column positions per model, in this order: auto-sklearn, SVM, FLAML
print(autosk_features)
print(svm_features)
print(flaml_features)