In [6]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import warnings
import time
import os
import json
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
    StratifiedKFold,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    classification_report,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.manifold import TSNE
from imblearn.base import BaseSampler
from sklearn.compose import ColumnTransformer


from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    classification_report,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)
from autorank import autorank, plot_stats, create_report, latex_table

import sdgym
from sdv.metadata import SingleTableMetadata
from sdv.single_table import (
    CTGANSynthesizer,
    TVAESynthesizer,
    GaussianCopulaSynthesizer,
    CopulaGANSynthesizer,
)
from sdv.lite import SingleTablePreset
from sdv.evaluation.single_table import (
    evaluate_quality,
    get_column_plot,
    get_column_pair_plot,
    run_diagnostic,
)
from sdv.sampling import Condition

warnings.filterwarnings("ignore")
sns.set(style="darkgrid", font_scale=0.5)
custom_palette = ["#8b4513", "#90ee90", "#545454", "#6a287e", "#f0be00"]
sns.set_palette(custom_palette)

## Functions

In [7]:
def dummy(df, columns):
    dummy_variables = []
    for column in columns:
        dummies = pd.get_dummies(df[column], prefix=column)
        dummy_variables.append(dummies)
    return dummy_variables

def dummy_transform(df, columns):
    dummy_variables = dummy(df, cat_features)
    df = pd.concat([df] + dummy_variables, axis=1)
    df = df.drop(cat_features, axis=1)
    return df

In [7]:
# def dummy_transform(df, columns):
#     dummy_variables = []
    
#     for column in columns:
#         dummies = pd.get_dummies(X[column], prefix=column)
#         dummy_variables.append(dummies)
#     result = pd.concat([df] + dummy_variables, axis=1)
#     result = result.drop(columns, axis=1)
    
#     return result

In [8]:
def label_encode_columns(df, columns_to_encode):
    label_encoder = LabelEncoder()

    for column in columns_to_encode:
        if column in df.columns:
            df[column] = label_encoder.fit_transform(df[column])

    return df

In [9]:
def mean_std_results(mean_df, std_df):
    concatenated_results = []

    for index, row_mean in mean_df.iterrows():
        classificador = row_mean['Classification']
        row_std = std_df.loc[std_df['Classification'] == classificador].squeeze()

        accuracy = f"{row_mean['Accuracy']:.4f} +- {row_std['Accuracy']:.2f}"
        precision = f"{row_mean['Precision']:.4f} +- {row_std['Precision']:.2f}"
        recall = f"{row_mean['Recall']:.4f} +- {row_std['Recall']:.2f}"
        f1 = f"{row_mean['F1 Score']:.4f} +- {row_std['F1 Score']:.2f}"
        roc_auc = f"{row_mean['ROC/AUC']:.4f} +- {row_std['ROC/AUC']:.2f}"

        concatenated_results.append({
            "Classification": classificador,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "ROC/AUC": roc_auc
        })

    return pd.DataFrame(concatenated_results)

In [10]:
class _ParamsValidationMixin:
    """Mixin class to validate parameters."""

    def _validate_params(self):
        """Validate types and values of constructor parameters.

        The expected type and values must be defined in the `_parameter_constraints`
        class attribute, which is a dictionary `param_name: list of constraints`. See
        the docstring of `validate_parameter_constraints` for a description of the
        accepted constraints.
        """
        if hasattr(self, "_parameter_constraints"):
            validate_parameter_constraints(
                self._parameter_constraints,
                self.get_params(deep=False),
                caller_name=self.__class__.__name__,
            )

class SDVPipelineTBTR(_ParamsValidationMixin, BaseSampler):
    _sampling_type = "bypass"

    _parameter_constraints = {
        "metadata": [object],
        "target": [str],
        "num_features": [list],
        "cat_features": [list],
    }
    
    def __init__(self, metadata, target, num_features, cat_features):
        print("Init")
        self.metadata = metadata
        self.target = target
        self.num_features = num_features
        self.cat_features = cat_features
        self.synthesizer = SingleTablePreset(self.metadata, name="FAST_ML")
        super().__init__()

    def _fit_resample(self, X, y):
        print("Fit")
        df_train = pd.concat([X, y], axis=1)
        self.synthesizer.fit(df_train)

        class_counts = y.value_counts()
        minority_class = class_counts.idxmin()
        synthetic_samples_needed = class_counts.max() - class_counts.min()

        if minority_class == 0:
            balanced_conditions_0 = Condition(
                num_rows=synthetic_samples_needed,
                column_values={self.target: 0},
            )
            df_synth = self.synthesizer.sample_from_conditions(
                conditions=[balanced_conditions_0]
            )
        elif minority_class == 1:
            balanced_conditions_1 = Condition(
                num_rows=synthetic_samples_needed,
                column_values={self.target: 1},
            )
            df_synth = self.synthesizer.sample_from_conditions(
                conditions=[balanced_conditions_1]
            )
        
        # onehotencoder
        # dummy_variables = dummy(df_synth, self.cat_features)
        # df_synth = pd.concat([df_synth] + dummy_variables, axis=1)
        # df_synth = df_synth.drop(self.cat_features, axis=1)
            
        # X_balanced = pd.concat([X, df_synth.drop(self.target, axis=1)])
        # y_balanced = pd.concat([y, df_synth[self.target]])
        
        X_df = pd.DataFrame(X, columns=self.num_features + self.cat_features)
        y_df = pd.Series(y, name=self.target)
        
        X_balanced = pd.concat([X_df, df_synth.drop(self.target, axis=1)], axis=0, ignore_index=True)
        y_balanced = pd.concat([y_df, df_synth[self.target]], axis=0, ignore_index=True)

        return X_balanced, y_balanced
    
    def fit_resample(self, X, y):
        print("Fit")
        output = self._fit_resample(X, y)
        return output

In [21]:
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class SDVPipelineTBTR(BaseSampler):
    _sampling_type = "bypass"
    
    def __init__(self, metadata, target, num_features, cat_features):
        self.metadata = metadata
        self.target = target
        self.num_features = num_features
        self.cat_features = cat_features
        self.synthesizer = SingleTablePreset(self.metadata, name="FAST_ML")
        super().__init__()

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        
        df_train = pd.concat([X, pd.Series(y, name=self.target)], axis=1)
        self.synthesizer.fit(df_train)
        
        self.class_counts_ = pd.Series(y).value_counts()
        self.minority_class_ = self.class_counts_.idxmin()
        self.synthetic_samples_needed_ = self.class_counts_.max() - self.class_counts_.min()

        return self

    def _fit_resample(self, X, y):
        check_is_fitted(self, "synthesizer")
        
        if self.minority_class_ == 0:
            balanced_conditions_0 = Condition(
                num_rows=self.synthetic_samples_needed_,
                column_values={self.target: 0},
            )
            df_synth = self.synthesizer.sample_from_conditions(
                conditions=[balanced_conditions_0]
            )
        elif self.minority_class_ == 1:
            balanced_conditions_1 = Condition(
                num_rows=self.synthetic_samples_needed_,
                column_values={self.target: 1},
            )
            df_synth = self.synthesizer.sample_from_conditions(
                conditions=[balanced_conditions_1]
            )

        X_df = pd.DataFrame(X, columns=self.num_features + self.cat_features)
        y_df = pd.Series(y, name=self.target)

        X_balanced = pd.concat([X_df, df_synth.drop(self.target, axis=1)], axis=0, ignore_index=True)
        y_balanced = pd.concat([y_df, df_synth[self.target]], axis=0, ignore_index=True)

        return X_balanced, y_balanced

In [25]:
def evaluate_pipeline_tbtr(X, y, num_features: list, cat_features: list, metadata, target):
    np.random.seed(42)

    num_outer_loop_folds = 5
    num_inner_loop_folds = 5
    results = []

    models = [
        (
            "SVM",
            SVC(),
            {"model__C": [0.1, 0.5, 1, 5, 10], "model__kernel": ["linear", "rbf"]},
        ),
        (
            "Decision Tree",
            DecisionTreeClassifier(),
            {
                "model__max_depth": [None, 1, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "KNN",
            KNeighborsClassifier(),
            {
                "model__n_neighbors": [1, 3, 5, 7, 10],
                "model__weights": ["uniform", "distance"],
            },
        ),
        (
            "Random Forest",
            RandomForestClassifier(),
            {
                "model__n_estimators": [100, 200, 300],
                "model__max_depth": [None, 5, 10],
                "model__min_samples_split": [2, 5],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "Logistic Regression",
            LogisticRegression(max_iter=1000),
            {
                "model__C": [0.1, 0.5, 1, 5, 10],
                "model__solver": ["liblinear", "sag", "saga"],
            },
        ),
        (
            "MLP",
            MLPClassifier(max_iter=1000),
            {
                "model__hidden_layer_sizes": [(100,), (100, 50)],
                "model__alpha": [0.0001, 0.001, 0.01],
            },
        ),
    ]

    standard_scaler = ColumnTransformer(
        transformers=[("numerical_standard_scaler", StandardScaler(), num_features)],
        remainder="passthrough",
    )
    
    def create_dummy_transformer(columns):
        return FunctionTransformer(dummy_transform, kw_args={'columns': columns})

    dummy_transform_cats = ColumnTransformer(
        transformers=[
            ('one_hot_encoding', create_dummy_transformer(cat_features), cat_features),
        ],
        remainder='passthrough'
    )

    for name, model, param_grid in models:
        print(f"\nTraining Model: {model}")
        folds = KFold(n_splits=num_outer_loop_folds, shuffle=True).split(X, y)
        for i, (train_index, test_index) in enumerate(folds):
            print(f"Training fold {i+1}...")

            X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]

            print(
                f"X_train: {X_train.shape} \nX_test: {X_test.shape} \ny_train: {y_train.shape} \ny_test: {y_test.shape}"
            )

            balance_classes_step = ("balancing", SDVPipelineTBTR(metadata, target, num_features, cat_features))
            one_hot_encoder_step = ("one_hot_encoding", dummy_transform_cats, cat_features)
            normalization_step = ("normalization", standard_scaler, num_features)
            model_step = ("model", model)

            steps = [balance_classes_step, one_hot_encoder_step, normalization_step, model_step]
            pipeline = Pipeline(steps)

            clf = GridSearchCV(pipeline, param_grid, cv=num_inner_loop_folds, n_jobs=-1, error_score='raise')
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)
            f1_test = f1_score(y_test, y_pred)
            precision_test = precision_score(y_test, y_pred)
            recall_test = recall_score(y_test, y_pred)
            roc_auc_test = roc_auc_score(y_test, y_pred)

            results.append(
                {
                    "Classification": name,
                    "Accuracy": round(accuracy_test, 4),
                    "Precision": round(precision_test, 4),
                    "Recall": round(recall_test, 4),
                    "F1 Score": round(f1_test, 4),
                    "ROC/AUC": round(roc_auc_test, 4),
                }
            )

    mean_df = pd.DataFrame(results).groupby(["Classification"], as_index=False).mean()
    std_df = pd.DataFrame(results).groupby(["Classification"], as_index=False).std()
    results_df = mean_std_results(mean_df, std_df)
    return results_df

In [13]:
# # TBTR (Treinar com Dados Balanceados [SDV + Dados Reais], Testar com Dados Reais)
# class SDVPipelineTBTR(BaseEstimator, TransformerMixin):
#     def __init__(self, metadata, target):
#         self.metadata = metadata
#         self.target = target
#         self.synthesizer = SingleTablePreset(self.metadata, name="FAST_ML")

#     def fit(self, X_train, y_train):
#         df_train = pd.concat([X_train, y_train], axis=1)
#         self.synthesizer.fit(df_train)
#         return self

#     def resample(self, X_train, y_train):
#         class_counts = y_train.value_counts()
#         minority_class = class_counts.idxmin()
#         synthetic_samples_needed = class_counts.max() - class_counts.min()

#         if minority_class == 0:
#             balanced_conditions_0 = Condition(
#                 num_rows=synthetic_samples_needed,
#                 column_values={self.target: 0},
#             )
#             df_synth = self.synthesizer.sample_from_conditions(
#                 conditions=[balanced_conditions_0]
#             )
#         elif minority_class == 1:
#             balanced_conditions_1 = Condition(
#                 num_rows=synthetic_samples_needed,
#                 column_values={self.target: 1},
#             )
#             df_synth = self.synthesizer.sample_from_conditions(
#                 conditions=[balanced_conditions_1]
#             )
            
#         X_train_balanced = pd.concat([X_train, df_synth.drop(self.target, axis=1)])
#         y_train_balanced = pd.concat([y_train, df_synth[self.target]])

#         return X_train_balanced, y_train_balanced

In [14]:
# TSTR (Treinar com Dados Sintéticos, Testar com Dados Reais)
# Dados Sintéticos Balanceados
class SDVPipelineTSTR(BaseEstimator, TransformerMixin):
    def __init__(self, metadata, target):
        self.metadata = metadata
        self.synthesizer = SingleTablePreset(self.metadata, name="FAST_ML")

    def fit(self, X_train, y_train):
        df_train = pd.concat([X_train, y_train], axis=1)
        self.synthesizer.fit(df_train)
        return self

    def resample(self, X_train, y_train, target):
        class_counts = y_train.value_counts()
        majority_class = class_counts.idxmax()
        synthetic_samples_needed = class_counts.max()

        balanced_conditions_0 = Condition(
            num_rows=synthetic_samples_needed,
            column_values={target: 0},
        )

        balanced_conditions_1 = Condition(
            num_rows=synthetic_samples_needed,
            column_values={target: 1},
        )

        df_synth = self.synthesizer.sample_from_conditions(
            conditions=[balanced_conditions_0, balanced_conditions_1]
        )

        # onehotencoder
        # dummy_variables = dummy(df_synth, cat_features)
        # df_synth = pd.concat([df_synth] + dummy_variables, axis=1)
        # df_synth = df_synth.drop(cat_features, axis=1)

        X_train_synth = df_synth.drop(target, axis=1)
        y_train_synth = df_synth[target]

        return X_train_synth, y_train_synth

NameError: name 'BaseEstimator' is not defined

In [15]:
def evaluate_pipeline_tbtr(X, y, num_features: list , cat_features: list, metadata, target):
    np.random.seed(42)

    num_outer_loop_folds = 5
    num_inner_loop_folds = 5
    results = []
    
    models = [
        (
            "SVM",
            SVC(),
            {"C": [0.1, 0.5, 1, 5, 10], "kernel": ["linear", "rbf"]},
        ),
        (
            "Decision Tree",
            DecisionTreeClassifier(),
            {
                "model__max_depth": [None, 1, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "KNN",
            KNeighborsClassifier(),
            {
                "model__n_neighbors": [1, 3, 5, 7, 10],
                "model__weights": ["uniform", "distance"],
            },
        ),
        (
            "Random Forest",
            RandomForestClassifier(),
            {
                "model__n_estimators": [100, 200, 300],
                "model__max_depth": [None, 5, 10],
                "model__min_samples_split": [2, 5],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "Logistic Regression",
            LogisticRegression(max_iter=1000),
            {
                "model__C": [0.1, 0.5, 1, 5, 10],
                "model__solver": ["liblinear", "sag", "saga"],
            },
        ),
        (
            "MLP",
            MLPClassifier(max_iter=1000),
            {
                "model__hidden_layer_sizes": [(100,), (100, 50)],
                "model__alpha": [0.0001, 0.001, 0.01],
            },
        ),
    ]
    
    standard_scaler = ColumnTransformer(
        transformers=[("numerical_standard_scaler", StandardScaler(), num_features)],
        remainder="passthrough",
    )
    

    for name, model, param_grid in models:
        print(f"\nTraining Model: {model}")
        folds = KFold(n_splits=num_outer_loop_folds, shuffle=True).split(X, y)
        for i, (train_index, test_index) in enumerate(folds):
            print(f"Training fold {i+1}...")
       
            X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]
            
            print(
                f"X_train: {X_train.shape} \nX_test: {X_test.shape} \ny_train: {y_train.shape} \ny_test: {y_test.shape}"
            )
             
            balance_classes_step = ("balacing", SDVPipelineTBTR(metadata, target, num_features, cat_features))
            normalization_step = ("normalization", standard_scaler)
            model_step = ("model", model)
            steps = [balance_classes_step, normalization_step, model_step]
            pipeline = Pipeline(steps)

            clf = GridSearchCV(pipeline, param_grid, cv=num_inner_loop_folds, n_jobs=-1, error_score='raise')
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)
            f1_test = f1_score(y_test, y_pred)
            precision_test = precision_score(y_test, y_pred)
            recall_test = recall_score(y_test, y_pred)
            roc_auc_test = roc_auc_score(y_test, y_pred)
    
            results.append(
                {
                    "Classification": name,
                    "Accuracy": round(accuracy_test, 4),
                    "Precision": round(precision_test, 4),
                    "Recall": round(recall_test, 4),
                    "F1 Score": round(f1_test, 4),
                    "ROC/AUC": round(roc_auc_test, 4),
                }
            )
        
    mean_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).mean()
    std_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).std()
    results_df = mean_std_results(mean_df, std_df)
    return results_df

In [16]:
def evaluate_pipeline_smote(X, y, num_features: list , cat_features: list):
    np.random.seed(42)

    num_outer_loop_folds = 5
    num_inner_loop_folds = 5
    results = []
    
    models = [
        (
            "SVM",
            SVC(),
            {"model__C": [0.1, 0.5, 1, 5, 10], "model__kernel": ["linear", "rbf"]},
        ),
        (
            "Decision Tree",
            DecisionTreeClassifier(),
            {
                "model__max_depth": [None, 1, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "KNN",
            KNeighborsClassifier(),
            {
                "model__n_neighbors": [1, 3, 5, 7, 10],
                "model__weights": ["uniform", "distance"],
            },
        ),
        (
            "Random Forest",
            RandomForestClassifier(),
            {
                "model__n_estimators": [100, 200, 300],
                "model__max_depth": [None, 5, 10],
                "model__min_samples_split": [2, 5],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "Logistic Regression",
            LogisticRegression(max_iter=1000),
            {
                "model__C": [0.1, 0.5, 1, 5, 10],
                "model__solver": ["liblinear", "sag", "saga"],
            },
        ),
        (
            "MLP",
            MLPClassifier(max_iter=1000),
            {
                "model__hidden_layer_sizes": [(100,), (100, 50)],
                "model__alpha": [0.0001, 0.001, 0.01],
            },
        ),
    ]
    
    standard_scaler = ColumnTransformer(
        transformers=[("numerical_standard_scaler", StandardScaler(), num_features)],
        remainder="passthrough",
    )
        
    for name, model, param_grid in models:
        print(f"\nTraining Model: {model}")
        folds = KFold(n_splits=num_outer_loop_folds, shuffle=True).split(X, y)
        for i, (train_index, test_index) in enumerate(folds):
            print(f"Training fold {i+1}...")
       
            X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]
          
            balance_classes_step = ("balacing", SMOTE(random_state=42))
            normalization_step = ("normalization", standard_scaler)
            model_step = ("model", model)
            steps = [normalization_step, balance_classes_step, model_step]
            pipeline = Pipeline(steps)

            clf = GridSearchCV(pipeline, param_grid, cv=num_inner_loop_folds, n_jobs=-1)
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)
            f1_test = f1_score(y_test, y_pred)
            precision_test = precision_score(y_test, y_pred)
            recall_test = recall_score(y_test, y_pred)
            roc_auc_test = roc_auc_score(y_test, y_pred)
    
            results.append(
                {
                    "Classification": name,
                    "Accuracy": round(accuracy_test, 4),
                    "Precision": round(precision_test, 4),
                    "Recall": round(recall_test, 4),
                    "F1 Score": round(f1_test, 4),
                    "ROC/AUC": round(roc_auc_test, 4),
                }
            )
    mean_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).mean()
    std_df = pd.DataFrame(results).groupby(["Classification"], as_index = False).std()
    results_df = mean_std_results(mean_df, std_df)
    return results_df

## Init

In [26]:
cat_features = ["Education", "Gender", "City", "EverBenched", "PaymentTier", "ExperienceInCurrentDomain"]
num_features = ["Age", "JoiningYear"]
target = "LeaveOrNot"

df = pd.read_csv("data/employee/employee.csv").sample(50)

# df = dummy_transform(df, cat_features)

y = df[target]
X = df.drop(target, axis=1)

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df)

## TBTR

In [27]:
results_df = evaluate_pipeline_tbtr(X, y, num_features, cat_features, metadata, target)
results_df


Training Model: SVC()
Training fold 1...
X_train: (40, 8) 
X_test: (10, 8) 
y_train: (40,) 
y_test: (10,)


ValueError: Invalid parameter 'model' for estimator Pipeline(steps=[('balancing',
                 SDVPipelineTBTR(cat_features=['Education', 'Gender', 'City',
                                               'EverBenched', 'PaymentTier',
                                               'ExperienceInCurrentDomain'],
                                 metadata={
    "columns": {
        "Education": {
            "sdtype": "categorical"
        },
        "JoiningYear": {
            "sdtype": "numerical"
        },
        "City": {
            "sdtype": "categorical"
        },
        "PaymentTier": {
            "sdtype": "numerical"
        },
        "Age": {
            "sdtype": "numerical"
        },
        "Gender": {
            "sd...
                                                  ['Education', 'Gender',
                                                   'City', 'EverBenched',
                                                   'PaymentTier',
                                                   'ExperienceInCurrentDomain'])]),
                 ['Education', 'Gender', 'City', 'EverBenched', 'PaymentTier',
                  'ExperienceInCurrentDomain']),
                ('normalization',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numerical_standard_scaler',
                                                  StandardScaler(),
                                                  ['Age', 'JoiningYear'])]),
                 ['Age', 'JoiningYear']),
                ('model', SVC())]). Valid parameters are: ['memory', 'steps', 'verbose'].

## SMOTE

In [19]:
results_df = evaluate_pipeline_smote(X, y, num_features, cat_features)
results_df


Training Model: SVC()
Training fold 1...
X_train: (40, 22) 
X_test: (10, 22) 
y_train: (40,) 
y_test: (10,)
Training fold 2...
X_train: (40, 22) 
X_test: (10, 22) 
y_train: (40,) 
y_test: (10,)
Training fold 3...
X_train: (40, 22) 
X_test: (10, 22) 
y_train: (40,) 
y_test: (10,)
Training fold 4...
X_train: (40, 22) 
X_test: (10, 22) 
y_train: (40,) 
y_test: (10,)
Training fold 5...
X_train: (40, 22) 
X_test: (10, 22) 
y_train: (40,) 
y_test: (10,)

Training Model: DecisionTreeClassifier()
Training fold 1...
X_train: (40, 22) 
X_test: (10, 22) 
y_train: (40,) 
y_test: (10,)
Training fold 2...
X_train: (40, 22) 
X_test: (10, 22) 
y_train: (40,) 
y_test: (10,)
Training fold 3...
X_train: (40, 22) 
X_test: (10, 22) 
y_train: (40,) 
y_test: (10,)
Training fold 4...
X_train: (40, 22) 
X_test: (10, 22) 
y_train: (40,) 
y_test: (10,)
Training fold 5...
X_train: (40, 22) 
X_test: (10, 22) 
y_train: (40,) 
y_test: (10,)

Training Model: KNeighborsClassifier()
Training fold 1...
X_train: (40, 22

Unnamed: 0,Classification,Accuracy,Precision,Recall,F1 Score,ROC/AUC
0,Decision Tree,0.6400 +- 0.05,0.3800 +- 0.22,0.3500 +- 0.25,0.3467 +- 0.21,0.5456 +- 0.09
1,KNN,0.6200 +- 0.13,0.5238 +- 0.28,0.5833 +- 0.30,0.4543 +- 0.08,0.6274 +- 0.10
2,Logistic Regression,0.7000 +- 0.16,0.5167 +- 0.39,0.5333 +- 0.45,0.4289 +- 0.29,0.6816 +- 0.21
3,MLP,0.6800 +- 0.11,0.5133 +- 0.37,0.4067 +- 0.38,0.3857 +- 0.25,0.6070 +- 0.16
4,Random Forest,0.7400 +- 0.11,0.4667 +- 0.38,0.4667 +- 0.36,0.4400 +- 0.31,0.6673 +- 0.18
5,SVM,0.7400 +- 0.15,0.7133 +- 0.28,0.5667 +- 0.25,0.5698 +- 0.10,0.6917 +- 0.14
