# Modeling

In [1]:
import os
from datetime import datetime
from hashlib import sha256

import pandas as pd
import numpy as np

# -------------------------
# model libs
from pycaret.classification import *
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier

# -------------------------
# model validation and hyperparameter tunning libs
from sklearn.model_selection import (cross_val_score, RepeatedStratifiedKFold,
                                     RandomizedSearchCV, GridSearchCV, train_test_split, cross_validate)

from sklearn.metrics import (precision_recall_curve, average_precision_score, classification_report, roc_curve)

from yellowbrick import ROCAUC

# -------------------------
# graph libs
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import joblib

os.chdir("../src/")
from utils.data_describe import DataDescribe as dd
from utils.classification_model_evaluation import ClassificationModelEvaluation as cme

raw_path = "../data/raw/"
external_path = "../data/external/"
interim_path = "../data/interim/"
processed_path = "../data/processed/"
reports_path = "../reports/"

path_model = "../models/"

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# load the autoreload extension
%load_ext autoreload

# Set extension to reload modules every time before executing code
%autoreload 2

## Carregando dataframes da preparação de dados

In [2]:
X_train_encoded = pd.read_parquet(interim_path+'X_train_encoded_v1.pqt')
X_validation_encoded = pd.read_parquet(interim_path+'X_validation_encoded_v1.pqt')

y_train = pd.read_parquet(interim_path+'y_train.pqt')
y_validation = pd.read_parquet(interim_path+'y_validation.pqt')

X_test = pd.read_parquet(interim_path+'df_test_encoded.pqt')

print(f"""O dataframe X_train possui:
- {X_train_encoded.shape[0]} registros; e
- {X_train_encoded.shape[1]} atributos, SEM a variável resposta ("Survived").
""")

print(f"""O dataframe X_validation possui:
- {X_validation_encoded.shape[0]} registros; e
- {X_validation_encoded.shape[1]} atributos, SEM a variável resposta ("Survived").
""")

print(f"""O dataframe X_test possui:
- {X_test.shape[0]} registros.
- {X_test.shape[1]} atributos, SEM a variável resposta ("Survived").""")

O dataframe X_train possui:
- 623 registros; e
- 11 atributos, SEM a variável resposta ("Survived").

O dataframe X_validation possui:
- 268 registros; e
- 11 atributos, SEM a variável resposta ("Survived").

O dataframe X_test possui:
- 418 registros.
- 11 atributos, SEM a variável resposta ("Survived").


## Fazendo o split treinamento/validação no df_train

In [3]:
n_splits = 6
n_repeats = 3
random_state = 42
scoring = "accuracy"

cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

### Quantos splits usar?

In [4]:
model_rf = RandomForestClassifier(random_state=random_state)

dct_splits = {}

for n_split in range(2,10):
    cv = RepeatedStratifiedKFold(n_splits=n_split, n_repeats=n_repeats, random_state=random_state)

    cross = cross_validate(model_rf, X_train_encoded, y_train, cv=cv, scoring="accuracy", return_train_score=True, return_estimator=True, n_jobs=-1)

    dct_splits[n_split]={'train_score': round(-cross['train_score'].mean(), 3), 'test_score': round(-cross['test_score'].mean(), 3)}

pd.DataFrame.from_dict(dct_splits, orient='index')

Unnamed: 0,train_score,test_score
2,-0.989,-0.795
3,-0.985,-0.803
4,-0.984,-0.802
5,-0.983,-0.805
6,-0.983,-0.808
7,-0.983,-0.803
8,-0.982,-0.804
9,-0.982,-0.806


## Modelo baseline ("Random Forest")

In [None]:
model_rf = RandomForestClassifier(random_state=random_state, n_estimators=1000, n_jobs=-1, max_depth=int((len(X_train_encoded.columns))**0.5))

n_scores = cross_val_score(estimator=model_rf, X=X_train_encoded, y=y_train, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')

print(f"{scoring}: média: {n_scores.mean():.3f}  desvio padrão: {n_scores.std():.3f}")

In [None]:
cross = cross_validate(model_rf, X_train_encoded, y_train, cv=cv, scoring="accuracy", return_train_score=True, return_estimator=True, n_jobs=-1)

for key in ['test_score', 'train_score']:
    print(f"""{key}: {round(-cross[key].mean(), 3)}""")

print("\n")

dct_importance = {}

for i, model in enumerate(cross['estimator']):
    dct_importance[i] = model.feature_importances_

print("Feature importance:")
dct_importance = pd.DataFrame.from_dict(dct_importance, orient='index', columns=X_train_encoded.columns)
np.mean(dct_importance).sort_values(ascending=False)

## Gerando a primeira submissão

Usando o modelo baseline, geraremos a primeira submissão para ver como será a acurácia com os dados de teste.

In [None]:
X = pd.concat([X_train_encoded, X_validation_encoded], axis=0)
y = pd.concat([y_train, y_validation], axis=0)

model_rf.fit(X, y)

y_pred = model_rf.predict(X_test) 

y_pred = pd.DataFrame(y_pred, index=X_test.index, columns=["Survived"])

y_pred.to_csv(processed_path+"y_pred.csv")

## Usando comparação de modelos usando o pycaret

In [None]:
clf1 = setup(data = pd.concat([X_train_encoded, y_train], axis=1), target = 'Survived')
best = compare_models(sort = 'Accuracy')