# Modeling

In [2]:
import os
from datetime import datetime
from hashlib import sha256

import pandas as pd
import numpy as np

# -------------------------
# model libs
from pycaret.classification import *
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier

# -------------------------
# model validation and hyperparameter tuning libs
from sklearn.model_selection import (cross_val_score, RepeatedStratifiedKFold,
                                     RandomizedSearchCV, GridSearchCV, train_test_split)

from sklearn.metrics import (precision_recall_curve, average_precision_score, classification_report, roc_curve)

from yellowbrick import ROCAUC

# -------------------------
# graph libs
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import joblib

# Switch the working directory to the source tree before importing the
# project-local helpers below (presumably so `utils` resolves from the cwd).
# NOTE: every relative path defined afterwards is resolved against this new cwd.
os.chdir("../src/")
from utils.data_describe import DataDescribe as dd
from utils.classification_model_evaluation import ClassificationModelEvaluation as cme

# Data and report directories, expressed relative to the working directory set above.
raw_path = "../data/raw/"
external_path = "../data/external/"
interim_path = "../data/interim/"
path_processed = "../data/processed/"
reports_path = "../reports/"

# Directory for model artifacts (NOTE(review): naming differs from the `*_path` pattern above).
path_model = "../models/"

# Remove pandas' display limits so frames are rendered with every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Load IPython's autoreload extension.
%load_ext autoreload

# Mode 2: reload all imported modules before each cell execution, so edits to
# the local helper modules are picked up without restarting the kernel.
%autoreload 2

## Carregando dataframes da preparação de dados

In [3]:
def _report_shape(label, frame):
    """Print how many records and attributes ``frame`` holds, tagged with ``label``."""
    n_records, n_attributes = frame.shape
    print(f"""O dataframe {label} possui:
- {n_records} registros; e
- {n_attributes} atributos, SEM a variável resposta ("Survived").
""")


# Encoded feature matrices written by the data-preparation step.
X_train_encoded = pd.read_parquet(f"{interim_path}X_train_encoded_v1.pqt")
X_validation_encoded = pd.read_parquet(f"{interim_path}X_validation_encoded_v1.pqt")

# Target frames stored alongside the features.
y_train = pd.read_parquet(f"{interim_path}y_train.pqt")
y_validation = pd.read_parquet(f"{interim_path}y_validation.pqt")

_report_shape("X_train", X_train_encoded)
_report_shape("X_validation", X_validation_encoded)

O dataframe X_train possui:
- 623 registros; e
- 11 atributos, SEM a variável resposta ("Survived").

O dataframe X_validation possui:
- 268 registros; e
- 11 atributos, SEM a variável resposta ("Survived").



## Definindo a estratégia de validação cruzada sobre os dados de treino

In [4]:
# Evaluation protocol shared by the model cells: metric, seed and fold layout.
scoring = "accuracy"
random_state = 42
n_splits, n_repeats = 5, 3

# Stratified k-fold repeated `n_repeats` times, seeded for repeatable splits.
cv = RepeatedStratifiedKFold(
    n_repeats=n_repeats,
    n_splits=n_splits,
    random_state=random_state,
)

## Modelo baseline ("Random Forest")

In [32]:
# Baseline: a random forest with default hyperparameters, scored with the
# shared `cv` splitter. The seed comes from the notebook-wide `random_state`
# instead of a second hard-coded literal, so it only has to be changed once.
model_rf = RandomForestClassifier(random_state=random_state)

# `y_train` is read from parquet as a DataFrame; squeeze a single-column frame
# down to a 1-D Series so scikit-learn does not receive a column-vector target.
# (A frame with more than one column is passed through unchanged.)
n_scores = cross_val_score(
    model_rf,
    X_train_encoded,
    y_train.squeeze(axis="columns"),
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    error_score='raise',  # surface fit/score failures instead of recording NaN
)

print(f"{scoring}: média: {n_scores.mean():.3f}  desvio padrão: {n_scores.std():.3f}")

accuracy: média: 0.805  desvio padrão: 0.024


## Comparação de modelos usando o PyCaret

In [8]:
# Initialise the PyCaret experiment on the training features joined with the
# target. `session_id` pins PyCaret's random seed (reusing the notebook-wide
# `random_state`) so the model ranking below is repeatable across runs.
clf1 = setup(
    data=pd.concat([X_train_encoded, y_train], axis=1),
    target='Survived',
    session_id=random_state,
)

# Train the candidate classifiers and keep the one with the best accuracy.
best = compare_models(sort='Accuracy')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8347,0.8517,0.7,0.8578,0.7679,0.6416,0.6522,0.928
gbc,Gradient Boosting Classifier,0.814,0.824,0.6647,0.8287,0.7337,0.5942,0.6056,0.024
lr,Logistic Regression,0.8071,0.8458,0.7118,0.7784,0.7417,0.5884,0.5917,0.026
ridge,Ridge Classifier,0.8048,0.0,0.7059,0.7778,0.737,0.5827,0.5874,0.007
lda,Linear Discriminant Analysis,0.8047,0.8447,0.7059,0.7776,0.737,0.5826,0.5872,0.007
lightgbm,Light Gradient Boosting Machine,0.8005,0.8623,0.6882,0.7831,0.7286,0.5723,0.5789,0.012
rf,Random Forest Classifier,0.7982,0.8365,0.7,0.7674,0.7288,0.5691,0.5736,0.066
xgboost,Extreme Gradient Boosting,0.789,0.8353,0.6765,0.7633,0.7135,0.548,0.5538,0.069
et,Extra Trees Classifier,0.7867,0.8101,0.7,0.7459,0.7182,0.5473,0.5517,0.06
ada,Ada Boost Classifier,0.7863,0.8195,0.7235,0.7297,0.7246,0.5503,0.5522,0.026
