In [1]:
from autorank import autorank, plot_stats, create_report, latex_table

import sdgym
from sdv.metadata import SingleTableMetadata
from sdv.single_table import (
    CTGANSynthesizer,
    TVAESynthesizer,
    GaussianCopulaSynthesizer,
    CopulaGANSynthesizer,
)
from sdv.lite import SingleTablePreset
from sdv.evaluation.single_table import (
    evaluate_quality,
    get_column_plot,
    get_column_pair_plot,
    run_diagnostic,
)
from sdv.sampling import Condition
from imblearn.over_sampling import SMOTE

import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import warnings
import time
import os
import json
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
    StratifiedKFold,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    classification_report,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.manifold import TSNE
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

warnings.filterwarnings("ignore")
sns.set(style="darkgrid", font_scale=0.5)
custom_palette = ["#8b4513", "#90ee90", "#545454", "#6a287e", "#f0be00"]
sns.set_palette(custom_palette)

In [2]:
def dummy(df, columns):
    dummy_variables = []
    for column in columns:
        dummies = pd.get_dummies(df[column], prefix=column)
        dummy_variables.append(dummies)
    return dummy_variables

def label_encode_columns(df, columns_to_encode):
    label_encoder = LabelEncoder()

    for column in columns_to_encode:
        if column in df.columns:
            df[column] = label_encoder.fit_transform(df[column])

    return df

In [2]:
# TBTR (Treinar com Dados Balanceados [SDV + Dados Reais], Testar com Dados Reais)
class SDVPipelineBalanced(BaseEstimator, TransformerMixin):
    def __init__(self, metadata, name):
        self.synthesizer = SingleTablePreset(metadata, name="FAST_ML")

    def fit(self, X_train, y_train):
        df_train = pd.concat([X_train, y_train], axis=1)
        self.synthesizer.fit(df_train)
        return self

    def transform(self, X_train, y_train, target, cat_features):
        class_counts = y_train.value_counts()
        minority_class = class_counts.idxmin()
        synthetic_samples_needed = class_counts.max() - class_counts.min()

        if minority_class == 0:
            balanced_conditions_0 = Condition(
                num_rows=synthetic_samples_needed,
                column_values={target: 0},
            )
            df_synth = self.synthesizer.sample_from_conditions(
                conditions=[balanced_conditions_0]
            )
        elif minority_class == 1:
            balanced_conditions_1 = Condition(
                num_rows=synthetic_samples_needed,
                column_values={target: 1},
            )
            df_synth = self.synthesizer.sample_from_conditions(
                conditions=[balanced_conditions_1]
            )
            
        # onehotencoder
        # dummy_variables = dummy(df_synth, cat_features)
        # df_synth = pd.concat([df_synth] + dummy_variables, axis=1)
        # df_synth = df_synth.drop(cat_features, axis=1)

        X_train_balanced = pd.concat([X_train, df_synth.drop(target, axis=1)])
        y_train_balanced = pd.concat([y_train, df_synth[target]])

        return X_train_balanced, y_train_balanced


# TSTR (Treinar com Dados Sintéticos, Testar com Dados Reais)
# Dados Sintéticos Balanceados
class SDVPipeline(BaseEstimator, TransformerMixin):
    def __init__(self, metadata, name):
        self.synthesizer = SingleTablePreset(metadata, name="FAST_ML")

    def fit(self, X_train, y_train):
        df_train = pd.concat([X_train, y_train], axis=1)
        self.synthesizer.fit(df_train)
        return self

    def transform(self, X_train, y_train, target):
        class_counts = y_train.value_counts()
        majority_class = class_counts.idxmax()
        synthetic_samples_needed = class_counts.max()

        balanced_conditions_0 = Condition(
            num_rows=synthetic_samples_needed,
            column_values={target: 0},
        )

        balanced_conditions_1 = Condition(
            num_rows=synthetic_samples_needed,
            column_values={target: 1},
        )

        df_synth = self.synthesizer.sample_from_conditions(
            conditions=[balanced_conditions_0, balanced_conditions_1]
        )

        # onehotencoder
        # dummy_variables = dummy(df_synth, cat_features)
        # df_synth = pd.concat([df_synth] + dummy_variables, axis=1)
        # df_synth = df_synth.drop(cat_features, axis=1)

        X_train_synth = df_synth.drop(target, axis=1)
        y_train_synth = df_synth[target]

        return X_train_synth, y_train_synth

NameError: name 'BaseEstimator' is not defined

In [1]:
def pipeline_evaluation(X, y):
    np.random.seed(12345)

    results = []

    models = [
        (
            "SVM",
            SVC(),
            {"model__C": [0.1, 0.5, 1, 5, 10], "model__kernel": ["linear", "rbf"]},
        ),
        (
            "Decision Tree",
            DecisionTreeClassifier(),
            {
                "model__max_depth": [None, 1, 2, 5, 10],
                "model__min_samples_split": [2, 5, 10],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "KNN",
            KNeighborsClassifier(),
            {
                "model__n_neighbors": [1, 3, 5, 7, 10],
                "model__weights": ["uniform", "distance"],
            },
        ),
        (
            "Random Forest",
            RandomForestClassifier(),
            {
                "model__n_estimators": [100, 200, 300],
                "model__max_depth": [None, 5, 10],
                "model__min_samples_split": [2, 5],
                "model__min_samples_leaf": [1, 5],
            },
        ),
        (
            "Logistic Regression",
            LogisticRegression(max_iter=1000),
            {
                "model__C": [0.1, 0.5, 1, 5, 10],
                "model__solver": ["liblinear", "sag", "saga"],
            },
        ),
        (
            "MLP",
            MLPClassifier(max_iter=1000),
            {
                "model__hidden_layer_sizes": [(100,), (100, 50)],
                "model__alpha": [0.0001, 0.001, 0.01],
            },
        ),
    ]

    # Normalização somente nos dados numericos
    standard_scaler = ColumnTransformer(
        transformers=[("numerical_standard_scaler", StandardScaler(), num_features)],
        remainder="passthrough",
    )
    normalization_step = ("normalization", standard_scaler)

    stratified_cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    for name, model, param_grid in models:
        for train_index, test_index in stratified_cv.split(X, y):
            X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model_step = ("model", model)
            steps = [normalization_step, model_step]
            pipeline = Pipeline(steps)
            pipeline.fit(X_train, y_train)

            clf = GridSearchCV(
                pipeline, param_grid, cv=stratified_cv, refit=True, scoring="accuracy"
            )
            clf.fit(X_train, y_train)

            best_model = clf.best_estimator_
            best_params = clf.best_params_

            y_pred = best_model.predict(X_test)
            accuracy_test = accuracy_score(y_test, y_pred)
            f1_test = f1_score(y_test, y_pred)
            precision_test = precision_score(y_test, y_pred)
            recall_test = recall_score(y_test, y_pred)
            roc_auc_test = roc_auc_score(y_test, y_pred)
            results.append(
                {
                    "Classificador": name,
                    "Acurácia": round(accuracy_test, 4),
                    "Precisão": round(precision_test, 4),
                    "Recall": round(recall_test, 4),
                    "F1 Score": round(f1_test, 4),
                    "ROC/AUC": round(roc_auc_test, 4),
                }
            )

    return pd.DataFrame(results)

## Baseline

In [11]:
### OBS ###
# mudar o path

df = pd.read_csv("data/employee/employee.csv")

In [12]:
df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1
...,...,...,...,...,...,...,...,...,...
2759,Masters,2017,Pune,2,31,Female,No,2,0
2760,Bachelors,2016,Pune,3,30,Male,No,2,0
2761,Masters,2013,Pune,2,37,Male,No,2,1
2762,Masters,2018,New Delhi,3,27,Male,No,5,1


In [13]:
cat_features = ["Education", "Gender", "City", "EverBenched", "PaymentTier", "ExperienceInCurrentDomain"]
num_features = ["JoiningYear"]
target = "LeaveOrNot"

In [14]:
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df)

In [15]:
dummy_variables = dummy(df, cat_features)
df = pd.concat([df] + dummy_variables, axis=1)
df = df.drop(cat_features, axis=1)

In [16]:
y = df[target]
X = df.drop(target, axis=1)

In [17]:
baseline_df = pipeline_evaluation(X, y)
baseline_df

Unnamed: 0,Classificador,Acurácia,Precisão,Recall,F1 Score,ROC/AUC
0,SVM,0.6492,0.6018,0.3134,0.4121,0.5897
1,SVM,0.6691,0.6606,0.3303,0.4404,0.6099
2,SVM,0.6637,0.6429,0.3303,0.4364,0.6054
3,SVM,0.6474,0.6186,0.2752,0.381,0.5824
4,SVM,0.6594,0.6408,0.3041,0.4125,0.5968
5,Decision Tree,0.8047,0.8114,0.6544,0.7245,0.7781
6,Decision Tree,0.7884,0.8098,0.6055,0.6929,0.7565
7,Decision Tree,0.8156,0.8718,0.6239,0.7273,0.7821
8,Decision Tree,0.7902,0.8036,0.6193,0.6995,0.7604
9,Decision Tree,0.8134,0.8434,0.6452,0.7311,0.7838


## TSTR

## TBTR

## SMOTE