In [45]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import (cross_val_score, RepeatedStratifiedKFold,
                                     RandomizedSearchCV, GridSearchCV, train_test_split)

from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from feature_engine import categorical_encoders as ce

from sklearn.feature_selection import RFE

import matplotlib.pyplot as plt
import seaborn as sns

os.chdir("../src/")
from utils.data_describe import DataDescribe as dd

raw_path = "../data/raw/"
external_path = "../data/external/"
interim_path = "../data/interim/"
path_processed = "../data/processed/"
reports_path = "../reports/"

path_model = "../models/"

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# load the autoreload extension
%load_ext autoreload

# Set extension to reload modules every time before executing code
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Carregando dataframes de raw

Aqui pegaremos os insights de [1_business_data_understanding_suto](1_business_data_understanding_suto.ipynb) e faremos as transformações propostas.

In [46]:
# Read both raw CSV files, indexed by the "PassengerId" column.
df_train, df_test = (
    pd.read_csv(f"{raw_path}{csv_name}", index_col="PassengerId")
    for csv_name in ('train.csv', 'test.csv')
)

# Report the size of each frame (rows = records, columns = attributes).
n_train_rows, n_train_cols = df_train.shape
n_test_rows, n_test_cols = df_test.shape

print(f"""O dataframe df_train possui:
- {n_train_rows} registros; e
- {n_train_cols} atributos, incluindo a variável resposta ("Survived").
""")

print(f"""O dataframe df_test possui:
- {n_test_rows} registros; e
- {n_test_cols} atributos, SEM a variável resposta ("Survived").
""")

O dataframe df_train possui:
- 891 registros; e
- 11 atributos, incluindo a variável resposta ("Survived").

O dataframe df_test possui:
- 418 registros; e
- 10 atributos, SEM a variável resposta ("Survived").



## Fazendo o split treinamento/validação no df_train

In [43]:
# "Name", "Cabin" and "Ticket" are being dropped temporarily.
dropped_columns = ["Name", "Cabin", "Ticket"]

# Features are everything except the target and the temporarily dropped columns.
X = df_train.drop(columns=["Survived", *dropped_columns])
y = df_train["Survived"]

# Rebind instead of mutating in place, and tolerate columns that are already
# gone: the previous `inplace=True` drop raised KeyError whenever this cell was
# executed a second time on the same kernel.
df_test = df_test.drop(columns=dropped_columns, errors="ignore")

# Hold out 30% of the labelled data for validation.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.3, random_state=random_state
)

## Tratando nulos:

In [48]:
# Impute missing values.
#
# Fill values are learned from X_train only and then applied unchanged to the
# validation and test frames:
#   - numeric columns     -> median of the training column
#   - object/string cols  -> the literal category 'não informado'
#
# The frames are rebound with DataFrame.fillna(dict) instead of calling
# `frame[col].fillna(..., inplace=True)`: the chained in-place form operates on
# a column selection of frames returned by train_test_split, which triggers
# chained-assignment warnings and does not update the parent frame under pandas
# copy-on-write.
fill_values = {}

for feature in X_train.columns:
    print(feature)
    dtype = X_train[feature].dtype

    # Use dtype predicates rather than matching dtype names, so every numeric
    # width is covered. Booleans are excluded, as they were in the original
    # name list.
    if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
        print("Numéricos: ", feature)
        # Series.median skips NaN, matching np.median over the non-null values.
        fill_values[feature] = X_train[feature].median()

    # The previous test `str(dtype) in 'object'` was a substring check against
    # the word 'object'; test the dtype itself instead.
    elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
        print("Não numéricos: ", feature)
        fill_values[feature] = 'não informado'

# Columns without an entry in fill_values (any other dtype) are left untouched.
X_train = X_train.fillna(fill_values)
X_validation = X_validation.fillna(fill_values)
df_test = df_test.fillna(fill_values)

Pclass
Numéricos:  Pclass
Sex
Não numéricos:  Sex
Age
Numéricos:  Age
SibSp
Numéricos:  SibSp
Parch
Numéricos:  Parch
Fare
Numéricos:  Fare
Embarked
Não numéricos:  Embarked


## One-Hot Encoding (OHE)

In [49]:
# Names of the object-dtype columns of the training features; these are the
# variables handed to the one-hot encoder.
lst_categoricas = X_train.select_dtypes(include='object').columns.tolist()
print(lst_categoricas)

['Sex', 'Embarked']


In [50]:
# One-hot encode the categorical variables. The encoder is fitted on the
# training split only and then reused, unchanged, on validation and test.
#
# NOTE(review): the displayed output keeps both Sex_male and Sex_female, so
# drop_last=True appears to have no effect while top_categories is set —
# confirm against the feature_engine documentation before relying on it.
ohe_hot_encoder = ce.OneHotCategoricalEncoder(
    top_categories=5,
    variables=lst_categoricas,
    drop_last=True,
)
ohe_hot_encoder.fit(X_train)

# Apply the fitted encoder to each split, in the same order as before.
X_train_encoded, X_validation_encoded, df_test_encoded = (
    ohe_hot_encoder.transform(split) for split in (X_train, X_validation, df_test)
)

X_train_encoded.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,Embarked_não informado
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
446,1,4.0,0,2,81.8583,1,0,1,0,0,0
651,3,28.0,0,0,7.8958,1,0,1,0,0,0
173,3,1.0,1,1,11.1333,0,1,1,0,0,0
451,2,36.0,1,2,27.75,1,0,1,0,0,0
315,2,43.0,1,1,26.25,1,0,1,0,0,0


## Exportação das bases para próxima etapa

In [53]:
# The targets are wrapped in DataFrames (and the names rebound) before being
# written with DataFrame.to_parquet.
y_train = pd.DataFrame(y_train)
y_validation = pd.DataFrame(y_validation)

# (file name, frame) pairs, written to interim_path in this order.
interim_exports = (
    ('X_train_encoded_v1.pqt', X_train_encoded),
    ('X_validation_encoded_v1.pqt', X_validation_encoded),
    ('y_train.pqt', y_train),
    ('y_validation.pqt', y_validation),
    ('df_test_encoded.pqt', df_test_encoded),
)

for file_name, frame in interim_exports:
    frame.to_parquet(interim_path + file_name)