# Modeling

In [6]:
import os
from datetime import datetime
from hashlib import sha256

import pandas as pd
import numpy as np

# -------------------------
# model libs
from pycaret.classification import *
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier

# -------------------------
# model validation and hyperparameter tunning libs
from sklearn.model_selection import (cross_val_score, RepeatedStratifiedKFold,
                                     RandomizedSearchCV, GridSearchCV, train_test_split, cross_validate)

from sklearn.metrics import (precision_recall_curve, average_precision_score, classification_report, roc_curve)

from yellowbrick import ROCAUC

# -------------------------
# graph libs
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import joblib

os.chdir("../src/")
from utils.data_describe import DataDescribe as dd
from utils.classification_model_evaluation import ClassificationModelEvaluation as cme

raw_path = "../data/raw/"
external_path = "../data/external/"
interim_path = "../data/interim/"
path_processed = "../data/processed/"
reports_path = "../reports/"

path_model = "../models/"

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# load the autoreload extension
%load_ext autoreload

# Set extension to reload modules every time before executing code
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Carregando dataframes da preparação de dados

In [2]:
X_train_encoded = pd.read_parquet(interim_path+'X_train_encoded_v1.pqt')
X_validation_encoded = pd.read_parquet(interim_path+'X_validation_encoded_v1.pqt')

y_train = pd.read_parquet(interim_path+'y_train.pqt')
y_validation = pd.read_parquet(interim_path+'y_validation.pqt')

print(f"""O dataframe X_train possui:
- {X_train_encoded.shape[0]} registros; e
- {X_train_encoded.shape[1]} atributos, SEM a variável resposta ("Survived").
""")

print(f"""O dataframe X_validation possui:
- {X_validation_encoded.shape[0]} registros; e
- {X_validation_encoded.shape[1]} atributos, SEM a variável resposta ("Survived").
""")

O dataframe X_train possui:
- 623 registros; e
- 11 atributos, SEM a variável resposta ("Survived").

O dataframe X_validation possui:
- 268 registros; e
- 11 atributos, SEM a variável resposta ("Survived").



## Fazendo o split treinamento/validação no df_train

In [19]:
n_splits = 10
n_repeats = 3
random_state = 42
scoring = "accuracy"

cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

## Modelo baseline ("Random Forest")

In [20]:
model_rf = RandomForestClassifier(random_state=42, n_estimators=1000)

n_scores = cross_val_score(model_rf, X_train_encoded, y_train, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')

print(f"{scoring}: média: {n_scores.mean():.3f}  desvio padrão: {n_scores.std():.3f}")

accuracy: média: 0.809  desvio padrão: 0.039


In [21]:
cross = cross_validate(model_rf, X_train_encoded, y_train, cv=cv, scoring="accuracy", return_train_score=True, return_estimator=True, n_jobs=-1)

for key in ['test_score', 'train_score']:
    print(f"""{key}: {round(-cross[key].mean(), 3)}""")

# print("\n")

# dct_importance = {}

# for i, model in enumerate(cross['estimator']):
#     dct_importance[i] = model.feature_importances_

# print("Feature importance:")
# dct_importance = pd.DataFrame.from_dict(dct_importance, orient='index', columns=X_train_encoded)
# dct_importance.loc["mean", :] = dct_importance.apply(np.mean)
# dct_importance

test_score: -0.809
train_score: -0.982


## Usando comparação de modelos usando o pycaret

In [5]:
clf1 = setup(data = pd.concat([X_train_encoded, y_train], axis=1), target = 'Survived')
best = compare_models(sort = 'Accuracy')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8235,0.8509,0.6835,0.8249,0.7412,0.61,0.6204,0.015
catboost,CatBoost Classifier,0.8098,0.8424,0.6474,0.8212,0.7149,0.5765,0.5919,0.884
lr,Logistic Regression,0.7981,0.8274,0.6654,0.772,0.7123,0.5582,0.5638,0.588
gbc,Gradient Boosting Classifier,0.7959,0.8139,0.6654,0.7694,0.7044,0.5514,0.5615,0.027
ridge,Ridge Classifier,0.7912,0.0,0.6651,0.7526,0.704,0.5439,0.5478,0.008
lda,Linear Discriminant Analysis,0.789,0.8288,0.6651,0.7471,0.702,0.5396,0.543,0.007
xgboost,Extreme Gradient Boosting,0.78,0.8444,0.7015,0.7194,0.6999,0.5276,0.5367,0.315
ada,Ada Boost Classifier,0.7706,0.7786,0.6772,0.7074,0.6876,0.5069,0.511,0.03
rf,Random Forest Classifier,0.7684,0.816,0.6967,0.7088,0.6947,0.5091,0.5169,0.072
dt,Decision Tree Classifier,0.7571,0.7506,0.6904,0.6864,0.6775,0.4842,0.4944,0.008
