# Modeling

In [25]:
import os
from datetime import datetime
from hashlib import sha256

import pandas as pd
import numpy as np

# -------------------------
# model libs
from pycaret.classification import *
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier

# -------------------------
# model validation and hyperparameter tuning libs
from sklearn.model_selection import (cross_val_score, RepeatedStratifiedKFold,
                                     RandomizedSearchCV, GridSearchCV, train_test_split)

from sklearn.metrics import (precision_recall_curve, average_precision_score, classification_report, roc_curve)

from yellowbrick import ROCAUC

# -------------------------
# graph libs
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import joblib

os.chdir("../src/")
from utils.data_describe import DataDescribe as dd
from utils.classification_model_evaluation import ClassificationModelEvaluation as cme

raw_path = "../data/raw/"
external_path = "../data/external/"
interim_path = "../data/interim/"
path_processed = "../data/processed/"
reports_path = "../reports/"

path_model = "../models/"

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# load the autoreload extension
%load_ext autoreload

# Set extension to reload modules every time before executing code
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Carregando dataframes da preparação de dados

In [27]:
# Load the encoded feature matrices and the targets produced by the
# data-preparation step.
X_train_encoded = pd.read_parquet(f"{interim_path}X_train_encoded_v1.pqt")
X_validation_encoded = pd.read_parquet(f"{interim_path}X_validation_encoded_v1.pqt")

y_train = pd.read_parquet(f"{interim_path}y_train.pqt")
y_validation = pd.read_parquet(f"{interim_path}y_validation.pqt")

# Report the size of each feature matrix (messages are emitted in Portuguese).
for split_name, split_frame in (
    ("X_train", X_train_encoded),
    ("X_validation", X_validation_encoded),
):
    n_rows, n_cols = split_frame.shape
    print(f"""O dataframe {split_name} possui:
- {n_rows} registros; e
- {n_cols} atributos, SEM a variável resposta ("Survived").
""")

O dataframe X_train possui:
- 623 registros; e
- 11 atributos, SEM a variável resposta ("Survived").

O dataframe X_validation possui:
- 268 registros; e
- 11 atributos, SEM a variável resposta ("Survived").



## Configurando a validação cruzada (RepeatedStratifiedKFold) sobre o conjunto de treinamento

In [30]:
# Cross-validation setup: metric, seed and fold layout used by the
# model-evaluation code that follows.
scoring = "accuracy"
random_state = 42
n_splits, n_repeats = 5, 3

# Stratified k-fold repeated `n_repeats` times, seeded for reproducibility.
cv = RepeatedStratifiedKFold(
    n_repeats=n_repeats,
    n_splits=n_splits,
    random_state=random_state,
)

## Modelo baseline ("Random Forest")

In [32]:
# Baseline: a Random Forest with default hyperparameters, evaluated with the
# cross-validation splitter (`cv`) and metric (`scoring`) configured earlier.
# Reuse the notebook-wide seed instead of repeating the literal 42.
model_rf = RandomForestClassifier(random_state=random_state)

# `y_train` was read with pd.read_parquet and is therefore a DataFrame.
# Squeezing the column axis turns a single-column target into a 1-D Series,
# which avoids scikit-learn's DataConversionWarning ("A column-vector y was
# passed when a 1d array was expected") on every fit. `y_train` itself is
# left untouched for later cells.
n_scores = cross_val_score(
    model_rf,
    X_train_encoded,
    y_train.squeeze("columns"),
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    error_score='raise',  # fail loudly instead of recording NaN for a broken fold
)

print(f"{scoring}: média: {n_scores.mean():.3f}  desvio padrão: {n_scores.std():.3f}")

accuracy: média: 0.805  desvio padrão: 0.024
