In [3]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import warnings
import time
import os
import json
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
    StratifiedKFold,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    classification_report,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.manifold import TSNE
from imblearn.base import BaseSampler
from imblearn.over_sampling.base import BaseOverSampler
from sklearn.compose import ColumnTransformer, make_column_transformer


from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    classification_report,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)
from autorank import autorank, plot_stats, create_report, latex_table

import sdgym
from sdv.metadata import SingleTableMetadata
from sdv.single_table import (
    CTGANSynthesizer,
    TVAESynthesizer,
    GaussianCopulaSynthesizer,
    CopulaGANSynthesizer,
)
from sdv.lite import SingleTablePreset
from sdv.evaluation.single_table import (
    evaluate_quality,
    get_column_plot,
    get_column_pair_plot,
    run_diagnostic,
)
from sdv.sampling import Condition
from imblearn.utils import check_target_type
from scipy import sparse

warnings.filterwarnings("ignore")
sns.set(style="darkgrid", font_scale=0.5)
custom_palette = ["#8b4513", "#90ee90", "#545454", "#6a287e", "#f0be00"]
sns.set_palette(custom_palette)

## Functions

In [4]:
def dummy(df, columns):
    dummy_variables = []
    for column in columns:
        dummies = pd.get_dummies(df[column], prefix=column)
        dummy_variables.append(dummies)
    return dummy_variables

In [5]:
def dummy_transform(df, columns):
    dummy_variables = dummy(df, cat_features)
    df = pd.concat([df] + dummy_variables, axis=1)
    df = df.drop(cat_features, axis=1)
    return df

In [6]:
def label_encode_columns(df, columns_to_encode):
    label_encoder = LabelEncoder()

    for column in columns_to_encode:
        if column in df.columns:
            df[column] = label_encoder.fit_transform(df[column])

    return df

In [7]:
def mean_std_results(mean_df, std_df):
    concatenated_results = []

    for index, row_mean in mean_df.iterrows():
        classificador = row_mean['Classification']
        row_std = std_df.loc[std_df['Classification'] == classificador].squeeze()

        accuracy = f"{row_mean['Accuracy']:.4f} +- {row_std['Accuracy']:.2f}"
        precision = f"{row_mean['Precision']:.4f} +- {row_std['Precision']:.2f}"
        recall = f"{row_mean['Recall']:.4f} +- {row_std['Recall']:.2f}"
        f1 = f"{row_mean['F1 Score']:.4f} +- {row_std['F1 Score']:.2f}"
        roc_auc = f"{row_mean['ROC/AUC']:.4f} +- {row_std['ROC/AUC']:.2f}"

        concatenated_results.append({
            "Classification": classificador,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC/AUC": roc_auc
        })

    return pd.DataFrame(concatenated_results)

In [8]:
def evaluation_pipeline(X, y, num_features: list , cat_features: list, sampling_strategy: str, metadata=None, target=None):
    print(f"Sampling Strategy: {sampling_strategy}")
    np.random.seed(42)

    num_outer_loop_folds = 5
    num_inner_loop_folds = 5
    results = []
    
    models = [
        (
            "SVM",
            SVC(),
            {"model__C": [0.1, 0.5, 1, 5, 10], "model__kernel": ["linear", "rbf"]},
        ),
        (
            "Decision Tree",
            DecisionTreeClassifier(),
            {
                "model__max_depth": [None, 1, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "KNN",
            KNeighborsClassifier(),
            {
                "model__n_neighbors": [1, 3, 5, 7, 10],
                "model__weights": ["uniform", "distance"],
            },
        ),
        (
            "Random Forest",
            RandomForestClassifier(),
            {
                "model__n_estimators": [100, 200, 300],
                "model__max_depth": [None, 5, 10],
                "model__min_samples_split": [2, 5],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "Logistic Regression",
            LogisticRegression(max_iter=1000),
            {
                "model__C": [0.1, 0.5, 1, 5, 10],
                "model__solver": ["liblinear", "sag", "saga"],
            },
        ),
        (
            "MLP",
            MLPClassifier(max_iter=1000),
            {
                "model__hidden_layer_sizes": [(100,), (100, 50)],
                "model__alpha": [0.0001, 0.001, 0.01],
            },
        ),
    ]
    
    standard_scaler = ColumnTransformer(
        transformers=[("numerical_standard_scaler", StandardScaler(), num_features)],
        remainder="passthrough",
    )
    
    for name, model, param_grid in models:
        print(f"\nTraining Model: {model}")
        folds = KFold(n_splits=num_outer_loop_folds, shuffle=True).split(X, y)
        for i, (train_index, test_index) in enumerate(folds):
            print(f"Training fold {i+1}...")
       
            X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]
            
            X_train.to_csv("X_train.csv", index = False)
            X_test.to_csv("X_test.csv", index = False)
              
            normalization_step = ("normalization", standard_scaler)
            model_step = ("model", model)
            
            if sampling_strategy == "TBTR":
                balance_classes_step = ("resampling_tbtr", SDVPipelineTBTR(metadata = metadata, target = target, num_features = num_features, cat_features = cat_features))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "TSTR":
                balance_classes_step = ("resampling_tstr", SDVPipelineTSTR(metadata = metadata, target = target, num_features = num_features, cat_features = cat_features))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "SMOTE":
                balance_classes_step = ("resampling_smote", SMOTE(random_state=42))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "BASELINE":
                steps = [normalization_step, model_step]
                
            pipeline = Pipeline(steps = steps)

            clf = GridSearchCV(pipeline, param_grid, cv=num_inner_loop_folds, n_jobs=-1, scoring='accuracy') # scoring = accuracy ou roc_auc?
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)
            f1_test = f1_score(y_test, y_pred)
            precision_test = precision_score(y_test, y_pred)
            recall_test = recall_score(y_test, y_pred)
            roc_auc_test = roc_auc_score(y_test, y_pred)
    
            results.append(
                {
                    "Classification": name,
                    "Accuracy": round(accuracy_test, 4),
                    "Precision": round(precision_test, 4),
                    "Recall": round(recall_test, 4),
                    "F1 Score": round(f1_test, 4),
                    "ROC/AUC": round(roc_auc_test, 4),
                }
            )
        
            X_balanced = clf.best_estimator_.named_steps['resampling_tbtr'].X_balanced
            y_balanced = clf.best_estimator_.named_steps['resampling_tbtr'].y_balanced
                
            with open('debug_test.txt', 'w') as f:
                f.write(f'X_balanced: {X_balanced.shape}\n')
                f.write(f'y_balanced: {y_balanced.shape}\n')
                f.write(f'X_test: \n{X_test.shape}\n')
                f.write(f'y_test: {y_test.shape}\n')
                f.write(f'y_test distribution: \n{y_test.value_counts()}\n')
                f.write(f'y_balanced distribution: \n{y_balanced.value_counts()}\n')
        
    mean_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).mean()
    std_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).std()
    results_df = mean_std_results(mean_df, std_df)
    return results_df

In [16]:
def evaluation_pipeline(X, y, num_features: list , cat_features: list, sampling_strategy: str, metadata=None, target=None):
    print(f"Sampling Strategy: {sampling_strategy}")
    np.random.seed(42)

    num_outer_loop_folds = 5
    num_inner_loop_folds = 5
    results = []
    
    models = [
        (
            "SVM",
            SVC(),
            {"model__C": [0.1, 0.5, 1, 5, 10], "model__kernel": ["linear", "rbf"]},
        ),
        (
            "Decision Tree",
            DecisionTreeClassifier(),
            {
                "model__max_depth": [None, 1, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "KNN",
            KNeighborsClassifier(),
            {
                "model__n_neighbors": [1, 3, 5, 7, 10],
                "model__weights": ["uniform", "distance"],
            },
        ),
        (
            "Random Forest",
            RandomForestClassifier(),
            {
                "model__n_estimators": [100, 200, 300],
                "model__max_depth": [None, 5, 10],
                "model__min_samples_split": [2, 5],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "Logistic Regression",
            LogisticRegression(max_iter=1000),
            {
                "model__C": [0.1, 0.5, 1, 5, 10],
                "model__solver": ["liblinear", "sag", "saga"],
            },
        ),
        (
            "MLP",
            MLPClassifier(max_iter=1000),
            {
                "model__hidden_layer_sizes": [(100,), (100, 50)],
                "model__alpha": [0.0001, 0.001, 0.01],
            },
        ),
    ]
    
    standard_scaler = ColumnTransformer(
        transformers=[("numerical_standard_scaler", StandardScaler(), num_features)],
        remainder="passthrough",
    )
    
    categorical_encoder = ColumnTransformer(
        transformers=[("categorical_onehotencoder", OneHotEncoder(), cat_features)],
        remainder="passthrough",
    )
    

    for name, model, param_grid in models:
        print(f"\nTraining Model: {model}")
        folds = KFold(n_splits=num_outer_loop_folds, shuffle=True).split(X, y)
        for i, (train_index, test_index) in enumerate(folds):
            print(f"Training fold {i+1}...")
       
            X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]
                     
            onehotencoder_step = ("encoder", categorical_encoder)
            normalization_step = ("normalization", standard_scaler)
            model_step = ("model", model)
            
            if sampling_strategy == "TBTR":
                balance_classes_step = ("resampling_tbtr", SDVPipelineTBTR(metadata = metadata, target = target, num_features = num_features, cat_features = cat_features))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "TSTR":
                balance_classes_step = ("resampling_tstr", SDVPipelineTSTR(metadata = metadata, target = target, num_features = num_features, cat_features = cat_features))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "SMOTE":
                balance_classes_step = ("resampling_smote", SMOTE(random_state=42))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "BASELINE":
                steps = [onehotencoder_step, normalization_step, model_step]
                
            pipeline = Pipeline(steps = steps)

            clf = GridSearchCV(pipeline, param_grid, cv=num_inner_loop_folds, n_jobs=-1, scoring='accuracy') # scoring = accuracy ou roc_auc?
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)
            f1_test = f1_score(y_test, y_pred)
            precision_test = precision_score(y_test, y_pred)
            recall_test = recall_score(y_test, y_pred)
            roc_auc_test = roc_auc_score(y_test, y_pred)
    
            results.append(
                {
                    "Classification": name,
                    "Accuracy": round(accuracy_test, 4),
                    "Precision": round(precision_test, 4),
                    "Recall": round(recall_test, 4),
                    "F1 Score": round(f1_test, 4),
                    "ROC/AUC": round(roc_auc_test, 4),
                }
            )
                
    mean_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).mean()
    std_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).std()
    results_df = mean_std_results(mean_df, std_df)
    return results_df

In [None]:
def evaluation_pipeline(X, y, num_features: list , cat_features: list, sampling_strategy: str, metadata=None, target=None):
    print(f"Sampling Strategy: {sampling_strategy}")
    np.random.seed(42)

    num_outer_loop_folds = 5
    num_inner_loop_folds = 5
    results = []
    
    models = [
        (
            "SVM",
            SVC(),
            {"model__C": [0.1, 0.5, 1, 5, 10], "model__kernel": ["linear", "rbf"]},
        ),
    ]
    
    standard_scaler = ColumnTransformer(
        transformers=[("numerical_standard_scaler", StandardScaler(), num_features)],
        remainder="passthrough",
    )
    
    categorical_encoder = ColumnTransformer(
        transformers=[("categorical_onehotencoder", OneHotEncoder(), cat_features)],
        remainder="passthrough",
    )
    

    for name, model, param_grid in models:
        print(f"\nTraining Model: {model}")
        folds = KFold(n_splits=num_outer_loop_folds, shuffle=True).split(X, y)
        for i, (train_index, test_index) in enumerate(folds):
            print(f"Training fold {i+1}...")
       
            X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]
                     
            onehotencoder_step = ("encoder", categorical_encoder)
            normalization_step = ("normalization", standard_scaler)
            model_step = ("model", model)
            
            if sampling_strategy == "TBTR":
                balance_classes_step = ("resampling_tbtr", SDVPipelineTBTR(metadata = metadata, target = target, num_features = num_features, cat_features = cat_features))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "TSTR":
                balance_classes_step = ("resampling_tstr", SDVPipelineTSTR(metadata = metadata, target = target, num_features = num_features, cat_features = cat_features))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "SMOTE":
                balance_classes_step = ("resampling_smote", SMOTE(random_state=42))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "BASELINE":
                steps = [onehotencoder_step, normalization_step, model_step]
                
            pipeline = Pipeline(steps = steps)

            clf = GridSearchCV(pipeline, param_grid, cv=num_inner_loop_folds, n_jobs=-1, scoring='accuracy') # scoring = accuracy ou roc_auc?
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)
            f1_test = f1_score(y_test, y_pred)
            precision_test = precision_score(y_test, y_pred)
            recall_test = recall_score(y_test, y_pred)
            roc_auc_test = roc_auc_score(y_test, y_pred)
    
            results.append(
                {
                    "Classification": name,
                    "Accuracy": round(accuracy_test, 4),
                    "Precision": round(precision_test, 4),
                    "Recall": round(recall_test, 4),
                    "F1 Score": round(f1_test, 4),
                    "ROC/AUC": round(roc_auc_test, 4),
                }
            )
                
    mean_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).mean()
    std_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).std()
    results_df = mean_std_results(mean_df, std_df)
    return results_df

In [18]:
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

def evaluation_pipeline(X, y, num_features: list, cat_features: list, sampling_strategy: str, metadata=None, target=None):
    print(f"Sampling Strategy: {sampling_strategy}")
    np.random.seed(42)

    num_outer_loop_folds = 5
    num_inner_loop_folds = 5
    results = []

    models = [
        (
            "SVM",
            SVC(),
            {"model__C": [0.1, 0.5, 1, 5, 10], "model__kernel": ["linear", "rbf"]},
        ),
        (
            "Decision Tree",
            DecisionTreeClassifier(),
            {
                "model__max_depth": [None, 1, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "KNN",
            KNeighborsClassifier(),
            {
                "model__n_neighbors": [1, 3, 5, 7, 10],
                "model__weights": ["uniform", "distance"],
            },
        ),
        (
            "Random Forest",
            RandomForestClassifier(),
            {
                "model__n_estimators": [100, 200, 300],
                "model__max_depth": [None, 5, 10],
                "model__min_samples_split": [2, 5],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "Logistic Regression",
            LogisticRegression(max_iter=1000),
            {
                "model__C": [0.1, 0.5, 1, 5, 10],
                "model__solver": ["liblinear", "sag", "saga"],
            },
        ),
        (
            "MLP",
            MLPClassifier(max_iter=1000),
            {
                "model__hidden_layer_sizes": [(100,), (100, 50)],
                "model__alpha": [0.0001, 0.001, 0.01],
            },
        ),
    ]

    num_transformer = ("numerical_standard_scaler", StandardScaler(), num_features)
    cat_transformer = ("categorical_onehot", OneHotEncoder(), cat_features)
    transformers = [num_transformer, cat_transformer]

    for name, model, param_grid in models:
        print(f"\nTraining Model: {model}")
        folds = KFold(n_splits=num_outer_loop_folds, shuffle=True).split(X, y)
        for i, (train_index, test_index) in enumerate(folds):
            print(f"Training fold {i + 1}...")

            X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]

            normalization_step = ("normalization", ColumnTransformer(transformers=transformers, remainder="passthrough"))
            model_step = ("model", model)

            if sampling_strategy == "TBTR":
                balance_classes_step = ("resampling_tbtr", SDVPipelineTBTR(metadata=metadata, target=target, num_features=num_features, cat_features=cat_features))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "TSTR":
                balance_classes_step = ("resampling_tstr", SDVPipelineTSTR(metadata=metadata, target=target, num_features=num_features, cat_features=cat_features))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "SMOTE":
                balance_classes_step = ("resampling_smote", SMOTE(random_state=42))
                steps = [balance_classes_step, normalization_step, model_step]
            elif sampling_strategy == "BASELINE":
                steps = [normalization_step, model_step]

            pipeline = Pipeline(steps=steps)
            
            X_train_transformed = pipeline.named_steps['normalization'].fit_transform(X_train)
            print(f"Transformed data before training:\n{X_train_transformed}")

            clf = GridSearchCV(pipeline, param_grid, cv=num_inner_loop_folds, n_jobs=-1, scoring='accuracy')  # scoring = accuracy ou roc_auc?
            clf.fit(X_train, y_train)
            
            X_test_transformed = pipeline.named_steps['normalization'].fit_transform(X_test)
            print(f"Transformed data after training:\n{X_test_transformed}")

            y_pred = clf.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)
            f1_test = f1_score(y_test, y_pred)
            precision_test = precision_score(y_test, y_pred)
            recall_test = recall_score(y_test, y_pred)
            roc_auc_test = roc_auc_score(y_test, y_pred)

            results.append(
                {
                    "Classification": name,
                    "Accuracy": round(accuracy_test, 4),
                    "Precision": round(precision_test, 4),
                    "Recall": round(recall_test, 4),
                    "F1 Score": round(f1_test, 4),
                    "ROC/AUC": round(roc_auc_test, 4),
                }
            )

            with open('debug_test.txt', 'w') as f:
                f.write(f'X_test: \n{X_test.shape}\n')
                f.write(f'y_test: {y_test.shape}\n')
                f.write(f'y_test distribution: \n{y_test.value_counts()}\n')

    mean_df = pd.DataFrame(results).groupby(["Classification"], as_index=False).mean()
    std_df = pd.DataFrame(results).groupby(["Classification"], as_index=False).std()
    results_df = mean_std_results(mean_df, std_df)
    return results_df


## Init

In [19]:
cat_features = ["repeat_retailer", "used_chip", "used_pin_number", "online_order"]
num_features = [
    "distance_from_home",
    "distance_from_last_transaction",
    "ratio_to_median_purchase_price",
]
target = "fraud"

df = pd.read_csv("data/card_fraud/card_fraud_original.csv").sample(1000) # apenas para analisar se o código funciona
# df = dummy_transform(df, cat_features)

y = df[target]
X = df.drop(target, axis=1)

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df)
metadata.update_column(
    column_name="distance_from_home",
    sdtype="numerical",
    computer_representation="Float",
)

metadata.update_column(
    column_name="distance_from_last_transaction",
    sdtype="numerical",
    computer_representation="Float",
)

metadata.update_column(
    column_name="ratio_to_median_purchase_price",
    sdtype="numerical",
    computer_representation="Float",
)

metadata.update_column(
    column_name="ratio_to_median_purchase_price",
    sdtype="numerical",
    computer_representation="Float",
)

metadata.update_column(
    column_name="fraud",
    sdtype="categorical"
)

metadata.update_column(
    column_name="online_order",
    sdtype="categorical",
)

metadata.update_column(
    column_name="used_pin_number",
    sdtype="categorical",
)

metadata.update_column(
    column_name="used_chip",
    sdtype="categorical",
)

metadata.update_column(
    column_name="repeat_retailer",
    sdtype="categorical",
)

## Baseline

In [20]:
baseline_df = evaluation_pipeline(X = X, y = y, num_features = num_features, cat_features = cat_features, sampling_strategy = "BASELINE")
baseline_df

Sampling Strategy: BASELINE

Training Model: SVC()
Training fold 1...
Transformed data before training:
[[-0.23842126 -0.24237446  0.68180842 ...  0.          0.
   1.        ]
 [-0.27230101  0.21117599  1.07425408 ...  0.          0.
   1.        ]
 [-0.41270388 -0.21448779 -0.31162592 ...  0.          1.
   0.        ]
 ...
 [ 0.16862596 -0.18749428 -0.57620135 ...  0.          0.
   1.        ]
 [ 0.04337145 -0.25388811  0.38075105 ...  0.          0.
   1.        ]
 [ 0.8061746  -0.26400213 -0.18894016 ...  0.          1.
   0.        ]]
Transformed data after training:
[[-0.21166344 -0.33762947 -0.49941286 ...  0.          0.
   1.        ]
 [-0.21278516 -0.34067606 -0.40718626 ...  0.          0.
   1.        ]
 [ 0.9442208  -0.29182961  0.40032815 ...  0.          1.
   0.        ]
 ...
 [ 2.78030359  1.71907593 -0.45520773 ...  0.          0.
   1.        ]
 [-0.48150074 -0.2481489   0.08994409 ...  0.          0.
   1.        ]
 [-0.25192411 -0.30419872  2.19945639 ...  0.    

KeyboardInterrupt: 

In [8]:
baseline_df = evaluation_pipeline(X = X, y = y, num_features = num_features, cat_features = cat_features, sampling_strategy = "BASELINE")
baseline_df

Sampling Strategy: BASELINE

Training Model: SVC()
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...

Training Model: DecisionTreeClassifier()
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...

Training Model: KNeighborsClassifier()
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...

Training Model: RandomForestClassifier()
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...

Training Model: LogisticRegression(max_iter=1000)
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...

Training Model: MLPClassifier(max_iter=1000)
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...


Unnamed: 0,Classification,Accuracy,Precision,Recall,F1 Score,ROC/AUC
0,Decision Tree,0.9780 +- 0.01,0.8639 +- 0.14,0.9025 +- 0.03,0.8786 +- 0.08,0.9442 +- 0.02
1,KNN,0.9740 +- 0.01,0.8225 +- 0.04,0.9322 +- 0.05,0.8727 +- 0.02,0.9556 +- 0.03
2,Logistic Regression,0.9600 +- 0.01,0.8942 +- 0.04,0.6439 +- 0.04,0.7481 +- 0.04,0.8181 +- 0.02
3,MLP,0.9830 +- 0.01,0.9214 +- 0.09,0.8992 +- 0.08,0.9048 +- 0.05,0.9452 +- 0.04
4,Random Forest,0.9850 +- 0.01,0.9770 +- 0.03,0.8560 +- 0.16,0.9050 +- 0.10,0.9269 +- 0.08
5,SVM,0.9760 +- 0.01,0.8906 +- 0.06,0.8335 +- 0.07,0.8596 +- 0.05,0.9118 +- 0.03


## TBTR

In [21]:
# TBTR (Treinar com Dados Balanceados [SDV + Dados Reais] , Testar com Dados Reais)
class SDVPipelineTBTR(BaseSampler):
    _sampling_type = "over-sampling"
    
    _parameter_constraints = {
        "X": [pd.DataFrame],
        "y": [pd.DataFrame, pd.Series],
        "metadata": [object],
        "target": [str],
        "num_features": [list],
        "cat_features": [list],
    }
    
    def __init__(self, metadata, target, num_features, cat_features):
        self.metadata = metadata
        self.target = target
        self.num_features = num_features
        self.cat_features = cat_features
        self.synthesizer = SingleTablePreset(self.metadata, name="FAST_ML")
        super().__init__()

    def _fit_resample(self, X, y):
        X = pd.DataFrame(X, columns=self.num_features + self.cat_features).reset_index(drop=True)
        y = pd.DataFrame(y, columns=[self.target]).reset_index(drop=True)
        df_train = pd.merge(X, y, left_index=True, right_index=True)
        
        X.to_csv("X.csv", index = False)
        y.to_csv("y.csv", index = False)
        df_train.to_csv("df_train.csv", index = False)
        
        for col in self.cat_features:
            df_train[col] = df_train[col].astype(int)
        
        self.synthesizer.fit(df_train)

        class_counts = y[self.target].value_counts()
        minority_class = class_counts.idxmin()
        synthetic_samples_needed = class_counts.max() - class_counts.min()

        if minority_class == 0:
            balanced_conditions_0 = Condition(
                num_rows=synthetic_samples_needed,
                column_values={self.target: 0},
            )
            df_synth = self.synthesizer.sample_from_conditions(
                conditions=[balanced_conditions_0]
            )
        elif minority_class == 1:
            balanced_conditions_1 = Condition(
                num_rows=synthetic_samples_needed,
                column_values={self.target: 1},
            )
            df_synth = self.synthesizer.sample_from_conditions(
                conditions=[balanced_conditions_1]
            )
        
        # X_balanced = pd.concat([X, df_synth.drop(self.target, axis=1)], axis=0, ignore_index=True)
        # y_balanced = pd.concat([y, df_synth[[self.target]]], axis=0, ignore_index=True)
        
        X_balanced = pd.concat([X.reset_index(drop=True), df_synth.drop(self.target, axis=1).reset_index(drop=True)], axis=0, ignore_index=True)
        y_balanced = pd.concat([y.reset_index(drop=True), df_synth[[self.target]].reset_index(drop=True)], axis=0, ignore_index=True)
        
        X_balanced.to_csv("X_balanced.csv", index = False)
        y_balanced.to_csv("y_balanced.csv", index = False)
              
        if sparse.issparse(X):
            X_balanced = sparse.vstack([X_balanced], format=X.format)
        else:
            X_balanced = np.vstack([X_balanced])
            
        y_balanced = pd.Series(np.ravel(y_balanced))
        
        self.X_balanced = X_balanced
        self.y_balanced = y_balanced
        
        X = X_balanced
        y = y_balanced
             
        with open('debug.txt', 'w') as f:
            f.write(f'X: {X.shape}\n')
            f.write(f'y: {y.shape}\n')
            f.write(f'y: \n{y.value_counts()}\n')
            f.write(f'X_balanced: {X_balanced.shape}\n')
            f.write(f'y_balanced: {y_balanced.shape}\n')
            f.write(f'y_balanced: \n{y_balanced.value_counts()}\n')
        
        return X_balanced, y_balanced

In [22]:
tbtr_df = evaluation_pipeline(X = X, y = y, num_features = num_features, cat_features = cat_features, sampling_strategy = "TBTR", metadata = metadata, target = target)
tbtr_df

Sampling Strategy: TBTR

Training Model: SVC()
Training fold 1...
Transformed data before training:
[[-0.23842126 -0.24237446  0.68180842 ...  0.          0.
   1.        ]
 [-0.27230101  0.21117599  1.07425408 ...  0.          0.
   1.        ]
 [-0.41270388 -0.21448779 -0.31162592 ...  0.          1.
   0.        ]
 ...
 [ 0.16862596 -0.18749428 -0.57620135 ...  0.          0.
   1.        ]
 [ 0.04337145 -0.25388811  0.38075105 ...  0.          0.
   1.        ]
 [ 0.8061746  -0.26400213 -0.18894016 ...  0.          1.
   0.        ]]


Sampling conditions: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 658/658 [00:00<00:00, 4386.67it/s]


Transformed data after training:
[[-0.21166344 -0.33762947 -0.49941286 ...  0.          0.
   1.        ]
 [-0.21278516 -0.34067606 -0.40718626 ...  0.          0.
   1.        ]
 [ 0.9442208  -0.29182961  0.40032815 ...  0.          1.
   0.        ]
 ...
 [ 2.78030359  1.71907593 -0.45520773 ...  0.          0.
   1.        ]
 [-0.48150074 -0.2481489   0.08994409 ...  0.          0.
   1.        ]
 [-0.25192411 -0.30419872  2.19945639 ...  0.          0.
   1.        ]]
Training fold 2...
Transformed data before training:
[[-0.30871038  0.24065804  1.23567589 ...  0.          0.
   1.        ]
 [-0.09375205 -0.25392454 -0.57004662 ...  0.          1.
   0.        ]
 [-0.30492228  0.23768361 -0.38786858 ...  0.          1.
   0.        ]
 ...
 [ 0.18233431 -0.19541242 -0.62724469 ...  0.          0.
   1.        ]
 [-0.2376048  -0.24849916  2.24467356 ...  0.          0.
   1.        ]
 [ 0.89234954 -0.27909764 -0.19013089 ...  0.          1.
   0.        ]]


Sampling conditions: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 646/646 [00:00<00:00, 4682.47it/s]


Transformed data after training:
[[-0.18619523 -0.25735474  0.49667146 ...  0.          0.
   1.        ]
 [-0.32641519 -0.23377648 -0.26529448 ...  0.          1.
   0.        ]
 [-0.22203803 -0.2550756  -0.31675633 ...  0.          0.
   1.        ]
 ...
 [-0.13531351 -0.27104167 -0.42803875 ...  0.          1.
   0.        ]
 [ 0.05357682 -0.23702423 10.96802688 ...  0.          0.
   1.        ]
 [ 0.04052252 -0.26708957  0.26575991 ...  0.          0.
   1.        ]]
Training fold 3...
Transformed data before training:
[[-0.23440643 -0.25518547  0.70753327 ...  0.          0.
   1.        ]
 [-0.26680071  0.18822119  1.10053872 ...  0.          0.
   1.        ]
 [-0.40104756 -0.2279225  -0.28731813 ...  0.          1.
   0.        ]
 ...
 [ 0.15479367 -0.2015327  -0.55227095 ...  0.          0.
   1.        ]
 [ 0.03503099 -0.26644162  0.40604646 ...  0.          0.
   1.        ]
 [-0.20575186 -0.24898092  1.99573582 ...  0.          0.
   1.        ]]


KeyboardInterrupt: 

In [11]:
tbtr_df = evaluation_pipeline(X = X, y = y, num_features = num_features, cat_features = cat_features, sampling_strategy = "TBTR", metadata = metadata, target = target)
tbtr_df

Sampling Strategy: TBTR

Training Model: SVC()
Training fold 1...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 666/666 [00:00<00:00, 11288.45it/s]


Training fold 2...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 640/640 [00:00<00:00, 10847.15it/s]


Training fold 3...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 656/656 [00:00<00:00, 11714.24it/s]


Training fold 4...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 642/642 [00:00<00:00, 11068.63it/s]


Training fold 5...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 644/644 [00:00<00:00, 11103.39it/s]



Training Model: DecisionTreeClassifier()
Training fold 1...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 644/644 [00:00<00:00, 11103.29it/s]


Training fold 2...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 652/652 [00:00<00:00, 11241.04it/s]


Training fold 3...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 658/658 [00:00<00:00, 11749.30it/s]


Training fold 4...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 640/640 [00:00<00:00, 11227.90it/s]


Training fold 5...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 654/654 [00:00<00:00, 11473.51it/s]



Training Model: KNeighborsClassifier()
Training fold 1...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 644/644 [00:00<00:00, 11291.74it/s]


Training fold 2...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 656/656 [00:00<00:00, 11508.64it/s]


Training fold 3...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 652/652 [00:00<00:00, 11847.34it/s]


Training fold 4...


Sampling conditions: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 662/662 [00:00<00:00, 8486.89it/s]


Training fold 5...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 634/634 [00:00<00:00, 10220.42it/s]



Training Model: RandomForestClassifier()
Training fold 1...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 652/652 [00:00<00:00, 11437.94it/s]


Training fold 2...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 646/646 [00:00<00:00, 11333.63it/s]


Training fold 3...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 660/660 [00:00<00:00, 11186.48it/s]


Training fold 4...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 648/648 [00:00<00:00, 10124.26it/s]


Training fold 5...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 642/642 [00:00<00:00, 11457.56it/s]



Training Model: LogisticRegression(max_iter=1000)
Training fold 1...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 642/642 [00:00<00:00, 11264.45it/s]


Training fold 2...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 650/650 [00:00<00:00, 11206.93it/s]


Training fold 3...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 640/640 [00:00<00:00, 11227.94it/s]


Training fold 4...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 652/652 [00:00<00:00, 11051.07it/s]


Training fold 5...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 664/664 [00:00<00:00, 11857.35it/s]



Training Model: MLPClassifier(max_iter=1000)
Training fold 1...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 642/642 [00:00<00:00, 11263.13it/s]


Training fold 2...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 644/644 [00:00<00:00, 10566.98it/s]


Training fold 3...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 670/670 [00:00<00:00, 12408.09it/s]


Training fold 4...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 648/648 [00:00<00:00, 11571.18it/s]


Training fold 5...


Sampling conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 644/644 [00:00<00:00, 11298.21it/s]


Unnamed: 0,Classification,Accuracy,Precision,Recall,F1 Score,ROC/AUC
0,Decision Tree,0.9100 +- 0.02,0.5200 +- 0.11,0.9172 +- 0.04,0.6569 +- 0.07,0.9134 +- 0.01
1,KNN,0.9390 +- 0.02,0.6091 +- 0.08,0.9627 +- 0.06,0.7433 +- 0.07,0.9498 +- 0.04
2,Logistic Regression,0.9390 +- 0.01,0.6148 +- 0.10,0.8579 +- 0.09,0.7154 +- 0.10,0.9019 +- 0.05
3,MLP,0.9430 +- 0.01,0.6193 +- 0.12,0.9548 +- 0.05,0.7470 +- 0.09,0.9483 +- 0.03
4,Random Forest,0.9510 +- 0.02,0.6741 +- 0.13,0.8774 +- 0.13,0.7619 +- 0.13,0.9172 +- 0.07
5,SVM,0.9440 +- 0.02,0.6473 +- 0.16,0.9088 +- 0.07,0.7439 +- 0.11,0.9287 +- 0.03


## TSTR

In [13]:
# TSTR (Treinar com Dados Sintéticos, Testar com Dados Reais)
# Dados Sintéticos Balanceados
class SDVPipelineTSTR(BaseSampler):
    _sampling_type = "over-sampling"
    
    _parameter_constraints = {
        "X": [pd.DataFrame],
        "y": [pd.DataFrame, pd.Series],
        "metadata": [object],
        "target": [str],
        "num_features": [list],
        "cat_features": [list],
    }
    
    def __init__(self, metadata, target, num_features, cat_features):
        self.metadata = metadata
        self.target = target
        self.num_features = num_features
        self.cat_features = cat_features
        self.synthesizer = SingleTablePreset(self.metadata, name="FAST_ML")
        super().__init__()
        
    def _fit_resample(self, X, y):
        X = pd.DataFrame(X, columns=self.num_features + self.cat_features).reset_index(drop=True)
        y = pd.DataFrame(y, columns=[self.target]).reset_index(drop=True)
        df_train = pd.merge(X, y, left_index=True, right_index=True)
        for col in self.cat_features:
            df_train[col] = df_train[col].astype(int)
        
        self.synthesizer.fit(df_train)
        
        class_counts = y.value_counts()
        majority_class = class_counts.idxmax()
        synthetic_samples_needed = class_counts.max()

        balanced_conditions_0 = Condition(
            num_rows=synthetic_samples_needed,
            column_values={target: 0},
        )

        balanced_conditions_1 = Condition(
            num_rows=synthetic_samples_needed,
            column_values={target: 1},
        )

        df_synth = self.synthesizer.sample_from_conditions(
            conditions=[balanced_conditions_0, balanced_conditions_1]
        )
        
        X_train_synth = df_synth.drop(target, axis=1)
        y_train_synth = df_synth[target]
        
        if sparse.issparse(X):
            X_train_synth = sparse.vstack([X_train_synth], format=X.format)
        else:
            X_train_synth = np.vstack([X_train_synth])
            
        y_train_synth = pd.Series(np.ravel(y_train_synth))

        return X_train_synth, y_train_synth

In [14]:
tstr_df = evaluation_pipeline(X = X, y = y, num_features = num_features, cat_features = cat_features, sampling_strategy = "TSTR", metadata = metadata, target = target)
tstr_df

Sampling Strategy: TSTR

Training Model: SVC()
Training fold 1...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1466/1466 [00:00<00:00, 9773.34it/s]


Training fold 2...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1440/1440 [00:00<00:00, 9200.37it/s]


Training fold 3...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1456/1456 [00:00<00:00, 9215.16it/s]


Training fold 4...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1442/1442 [00:00<00:00, 7589.46it/s]


Training fold 5...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1444/1444 [00:00<00:00, 7977.85it/s]



Training Model: DecisionTreeClassifier()
Training fold 1...


Sampling conditions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1444/1444 [00:00<00:00, 10241.13it/s]


Training fold 2...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1452/1452 [00:00<00:00, 9810.76it/s]


Training fold 3...


Sampling conditions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1458/1458 [00:00<00:00, 10120.27it/s]


Training fold 4...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1440/1440 [00:00<00:00, 8372.11it/s]


Training fold 5...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1454/1454 [00:00<00:00, 5769.83it/s]



Training Model: KNeighborsClassifier()
Training fold 1...


Sampling conditions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1444/1444 [00:00<00:00, 10348.50it/s]


Training fold 2...


Sampling conditions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1456/1456 [00:00<00:00, 10181.84it/s]


Training fold 3...


Sampling conditions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1452/1452 [00:00<00:00, 10014.12it/s]


Training fold 4...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1462/1462 [00:00<00:00, 7735.45it/s]


Training fold 5...


Sampling conditions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1434/1434 [00:00<00:00, 10170.18it/s]



Training Model: RandomForestClassifier()
Training fold 1...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1452/1452 [00:00<00:00, 9810.81it/s]


Training fold 2...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1446/1446 [00:00<00:00, 8262.85it/s]


Training fold 3...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1460/1460 [00:00<00:00, 8110.90it/s]


Training fold 4...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1448/1448 [00:00<00:00, 6188.00it/s]


Training fold 5...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1442/1442 [00:00<00:00, 6801.91it/s]



Training Model: LogisticRegression(max_iter=1000)
Training fold 1...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1448/1448 [00:00<00:00, 6830.20it/s]


Training fold 2...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1458/1458 [00:00<00:00, 6749.51it/s]


Training fold 3...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1442/1442 [00:00<00:00, 3525.67it/s]


Training fold 4...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1452/1452 [00:00<00:00, 6340.50it/s]


Training fold 5...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1448/1448 [00:00<00:00, 6766.39it/s]



Training Model: MLPClassifier(max_iter=1000)
Training fold 1...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1448/1448 [00:00<00:00, 6766.36it/s]


Training fold 2...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1444/1444 [00:00<00:00, 6747.57it/s]


Training fold 3...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1450/1450 [00:00<00:00, 6223.17it/s]


Training fold 4...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1458/1458 [00:00<00:00, 6657.53it/s]


Training fold 5...


Sampling conditions: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1448/1448 [00:00<00:00, 6552.07it/s]


Unnamed: 0,Classification,Accuracy,Precision,Recall,F1 Score,ROC/AUC
0,Decision Tree,0.8990 +- 0.02,0.4655 +- 0.16,0.5953 +- 0.19,0.5121 +- 0.16,0.7624 +- 0.09
1,KNN,0.9140 +- 0.03,0.5393 +- 0.08,0.7173 +- 0.13,0.6096 +- 0.07,0.8271 +- 0.07
2,Logistic Regression,0.9440 +- 0.02,0.6958 +- 0.10,0.7546 +- 0.12,0.7181 +- 0.08,0.8584 +- 0.06
3,MLP,0.9490 +- 0.01,0.7265 +- 0.04,0.7287 +- 0.13,0.7212 +- 0.07,0.8500 +- 0.06
4,Random Forest,0.9440 +- 0.03,0.7022 +- 0.18,0.7120 +- 0.18,0.7029 +- 0.17,0.8394 +- 0.10
5,SVM,0.9440 +- 0.01,0.6799 +- 0.09,0.7419 +- 0.11,0.7042 +- 0.07,0.8539 +- 0.06


## SMOTE

In [15]:
def evaluate_pipeline_smote(X, y, num_features: list , cat_features: list):
    np.random.seed(42)

    num_outer_loop_folds = 5
    num_inner_loop_folds = 5
    results = []
    
    models = [
        (
            "SVM",
            SVC(),
            {"model__C": [0.1, 0.5, 1, 5, 10], "model__kernel": ["linear", "rbf"]},
        ),
        (
            "Decision Tree",
            DecisionTreeClassifier(),
            {
                "model__max_depth": [None, 1, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "KNN",
            KNeighborsClassifier(),
            {
                "model__n_neighbors": [1, 3, 5, 7, 10],
                "model__weights": ["uniform", "distance"],
            },
        ),
        (
            "Random Forest",
            RandomForestClassifier(),
            {
                "model__n_estimators": [100, 200, 300],
                "model__max_depth": [None, 5, 10],
                "model__min_samples_split": [2, 5],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "Logistic Regression",
            LogisticRegression(max_iter=1000),
            {
                "model__C": [0.1, 0.5, 1, 5, 10],
                "model__solver": ["liblinear", "sag", "saga"],
            },
        ),
        (
            "MLP",
            MLPClassifier(max_iter=1000),
            {
                "model__hidden_layer_sizes": [(100,), (100, 50)],
                "model__alpha": [0.0001, 0.001, 0.01],
            },
        ),
    ]
    
    standard_scaler = ColumnTransformer(
        transformers=[("numerical_standard_scaler", StandardScaler(), num_features)],
        remainder="passthrough",
    )
        
    for name, model, param_grid in models:
        print(f"\nTraining Model: {model}")
        folds = KFold(n_splits=num_outer_loop_folds, shuffle=True).split(X, y)
        for i, (train_index, test_index) in enumerate(folds):
            print(f"Training fold {i+1}...")
       
            X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]
          
            balance_classes_step = ("balacing", SMOTE(random_state=42))
            normalization_step = ("normalization", standard_scaler)
            model_step = ("model", model)
            steps = [balance_classes_step, normalization_step, model_step]
            pipeline = Pipeline(steps = steps)

            clf = GridSearchCV(pipeline, param_grid, cv=num_inner_loop_folds, n_jobs=-1)
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)
            f1_test = f1_score(y_test, y_pred)
            precision_test = precision_score(y_test, y_pred)
            recall_test = recall_score(y_test, y_pred)
            roc_auc_test = roc_auc_score(y_test, y_pred)
    
            results.append(
                {
                    "Classification": name,
                    "Accuracy": round(accuracy_test, 4),
                    "Precision": round(precision_test, 4),
                    "Recall": round(recall_test, 4),
                    "F1 Score": round(f1_test, 4),
                    "ROC/AUC": round(roc_auc_test, 4),
                }
            )
    mean_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).mean()
    std_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).std()
    results_df = mean_std_results(mean_df, std_df)
    return results_df

In [16]:
smote_df = evaluation_pipeline(X = X, y = y, num_features = num_features, cat_features = cat_features, sampling_strategy = "SMOTE")
smote_df

Sampling Strategy: SMOTE

Training Model: SVC()
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...

Training Model: DecisionTreeClassifier()
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...

Training Model: KNeighborsClassifier()
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...

Training Model: RandomForestClassifier()
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...

Training Model: LogisticRegression(max_iter=1000)
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...

Training Model: MLPClassifier(max_iter=1000)
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...


Unnamed: 0,Classification,Accuracy,Precision,Recall,F1 Score,ROC/AUC
0,Decision Tree,0.9780 +- 0.01,0.8481 +- 0.10,0.9337 +- 0.05,0.8862 +- 0.06,0.9580 +- 0.03
1,KNN,0.9690 +- 0.02,0.7811 +- 0.11,0.9513 +- 0.03,0.8549 +- 0.07,0.9613 +- 0.02
2,Logistic Regression,0.9410 +- 0.02,0.6308 +- 0.05,0.9169 +- 0.08,0.7454 +- 0.05,0.9302 +- 0.04
3,MLP,0.9790 +- 0.01,0.8520 +- 0.09,0.9415 +- 0.04,0.8911 +- 0.04,0.9625 +- 0.02
4,Random Forest,0.9820 +- 0.01,0.9058 +- 0.10,0.9318 +- 0.10,0.9117 +- 0.05,0.9592 +- 0.04
5,SVM,0.9660 +- 0.01,0.7533 +- 0.10,0.9148 +- 0.07,0.8229 +- 0.08,0.9426 +- 0.04


In [None]:
# ## debugs

# with open('debug.txt', 'w') as f:
#     f.write(f'synthetic_samples_needed: {synthetic_samples_needed}\n')
#     f.write(f'df_synth: {df_synth.shape}\n')
#     f.write(f'X: {X.shape}\n')
#     f.write(f'df_synth.drop(self.target, axis=1): {df_synth.drop(self.target, axis=1).shape}\n')
#     f.write(f'y: {y.shape}\n')
#     f.write(f'df_synth[self.target]: {df_synth[self.target].shape}\n')
#     f.write(f'y_balanced: {y_balanced.shape}\n')

In [None]:
onehotencoder = make_column_transformer(
    (OneHotEncoder(), cat_features),
    remainder='passthrough',
)

onehotencoder_step = ("encoder", onehotencoder)
[onehotencoder_step, normalization_step, model_step]

In [None]:
# ver a questão do pq do X estar indo pra float, é pra ficar int

# Colocar um step do pipeline a mais para o Encoder, tem alguns datasets que eu preciso transformar dados categóricos em numéricos antes da avaliação dos modelos
# Consertar um erro aparente sobre os dados inteiros estão virando float, mas devem ser mantidos inteiros
# Debugar o código no sentido de analisar se as classes estão sendo realmente equilibradas
# o SDV recebe os dados ainda splitados por conta do gridsearchcv que divide os dados em treino e val