## Analise comparativa de Modelos

In [10]:
from IPython.display import display, Markdown
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Obtenção de dados 

In [11]:
df = pd.read_csv("../data/raw/data.csv")
df_dict = pd.read_csv("../data/external/dictionary.csv")
df_dict

Unnamed: 0,variavel,descricao,tipo,subtipo
0,RowNumber,"Número da linha, usado como identificador sequ...",quantitativa,discreta
1,CustomerId,Identificador único do cliente,quantitativa,discreta
2,Surname,Sobrenome do cliente,qualitativa,nominal
3,CreditScore,Pontuação de crédito do cliente,quantitativa,discreta
4,Geography,País de residência do cliente,qualitativa,nominal
5,Gender,Gênero do cliente (Male/Female),qualitativa,nominal
6,Age,Idade do cliente,quantitativa,discreta
7,Tenure,Tempo de permanência do cliente no banco (em a...,quantitativa,discreta
8,Balance,Saldo da conta do cliente,quantitativa,contínua
9,NumOfProducts,Número de produtos que o cliente possui no banco,quantitativa,discreta


## Preparação de dados 

In [12]:
# Definindo a coluna alvo
target_column = 'Exited'

# Definindo as colunas nominais e contínuas com base na imagem
nominal_columns = [
    'Surname', 'Geography', 'Gender', 'HasCrCard', 'IsActiveMember'
]
continuous_columns = [
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary'
]

# Separando as features (X) e o alvo (y)
X = df.drop(columns=[target_column], axis=1)
y = df[target_column]

# Pré-processamento
nominal_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), # tratamento de dados faltantes
    ('encoding', OneHotEncoder(sparse_output=False)), # codificação de variáveis nominais sem remover nenhuma categoria
])

continuous_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='mean')), # tratamento de dados faltantes para variáveis contínuas
    ('normalization', StandardScaler()) # normalização de dados contínuos
])

preprocessor = ColumnTransformer([
    ('nominal', nominal_preprocessor, nominal_columns),
    ('continuous', continuous_preprocessor, continuous_columns)
])

# Definindo o modelo
model = LogisticRegression()


## Preparação de Modelos

In [17]:
from IPython.display import display, Markdown
import joblib
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Verifique as colunas do DataFrame
print("Colunas do df:", df.columns)
print("Colunas do df_dict:", df_dict.columns)

# 3. Seleção de modelos
n_splits_comparative_analysis = 5  # Reduzindo o número de splits para economizar memória
n_folds_grid_search = 3  # Reduzindo o número de folds para economizar memória
test_size = .2
random_state = 42
scoring = 'accuracy'
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

# Modelos a serem testados
max_iter = 1000
models = [
    ('K-Nearest Neighbors', KNeighborsClassifier(), {"n_neighbors": range(3, 10, 2), 'weights': ['uniform', 'distance']}),  # Reduzindo o range de n_neighbors
    ('Support Vector Machines', SVC(random_state=random_state, max_iter=max_iter), {"kernel": ["linear"], 'C':[1,10], 'gamma':[0.001]}),  # Simplificando os parâmetros
    ('Decision Tree', DecisionTreeClassifier(random_state=random_state), {'criterion':['gini'],'max_depth': [3, 6]}),  # Simplificando os parâmetros
    ('Random Forest', RandomForestClassifier(random_state=random_state), {'criterion':['gini'],'max_depth': [3, 6], 'n_estimators': [10]}),  # Simplificando os parâmetros
]

results = pd.DataFrame({})
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(n_splits=n_splits_comparative_analysis, test_size=test_size, random_state=random_state)

# Loop para treinar e validar cada modelo
for model_name, model_object, model_parameters in models:
    print(f"running {model_name}...")
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=-1,
        cv=cross_validate_grid_search
    )
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])
    try:
        scores = cross_validate(
            estimator=approach,
            X=X,
            y=y,
            cv=cross_validate_comparative_analysis,
            n_jobs=-1,
            scoring=metrics
        )
        scores_df = pd.DataFrame(scores)
        scores_df['model_name'] = model_name  # Adiciona a coluna 'model_name'
        display(scores_df.agg(['mean', 'std']))
        results = pd.concat([results, scores_df], ignore_index=True)
    except Exception as e:
        print(f"Erro ao treinar o modelo {model_name}: {e}")

# 3.1 Resultados gerais
def highlight_best(s, props=''):
    if s.name[1] != 'std':
        if s.name[0].endswith('time'):
            return np.where(s == np.nanmin(s.values), props, '')
        return np.where(s == np.nanmax(s.values), props, '')

display(Markdown("### 3.1 Resultados gerais"))
(
    results
    .groupby('model_name')
    .agg(['mean', 'std']).T
    .style
    .apply(highlight_best, props='color:white;background-color:gray;font-weight: bold;', axis=1)
    .set_table_styles([{
        'selector': 'thead th',
        'props': 'background-color: #333; color: white;'
    }])
)

# Persistência dos modelos
for model_name, model_object, model_parameters in models:
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=-1,
        cv=cross_validate_grid_search
    )
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])
    try:
        approach.fit(X, y)
        joblib.dump(approach, f"{model_name.replace(' ', '_').lower()}_model.joblib")
        print(f"Modelo {model_name} salvo como {model_name.replace(' ', '_').lower()}_model.joblib")
    except Exception as e:
        print(f"Erro ao salvar o modelo {model_name}: {e}")


Colunas do df: Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')
Colunas do df_dict: Index(['variavel', 'descricao', 'tipo', 'subtipo'], dtype='object')
running K-Nearest Neighbors...
Erro ao treinar o modelo K-Nearest Neighbors: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}
running Support Vector Machines...
Erro ao treinar o modelo Support Vector Machines: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes

### 3.1 Resultados gerais

KeyError: 'model_name'