In [163]:
import pandas as pd
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
X = train[list(test.columns)]
y = train[train.columns[~train.columns.isin(test.columns)]]


In [164]:
from sklearn.base import BaseEstimator, TransformerMixin

def extraiPronome(nome):
    titulo = nome.split(',')[1].split('.')[0].strip()
    if (titulo in ['Mille','Ms','Lady', 'Miss']):
        titulo = 'Miss'
    elif (titulo in ['Mme', 'Sir']):
        titulo = 'Mrs'
    elif (titulo in ['Master', 'Mr', 'Mrs']):
        titulo = 'Other'
    return titulo


class AtributosDesejados(BaseEstimator, TransformerMixin):
    def __init__(self, excluirName=True, cabin=True):
        self.excluirName = excluirName
        self.cabin = cabin
        
    def fit(self, X, y=None):
        self.colunasIndesejadas = ['PassengerId', 'Ticket', 'Cabin']
        if self.excluirName:
            self.colunasIndesejadas.append('Name')
        if not self.cabin:
            self.colunasIndesejadas.append('Cabin')

        return self
    def transform(self, X, y=None):
        Xdrop = X.drop(self.colunasIndesejadas,axis=1)
        if 'Name' not in self.colunasIndesejadas:
            Xdrop['Name'] = Xdrop['Name'].apply(extraiPronome)
        if 'Cabin' not in self.colunasIndesejadas:
            full['CabinType'] = full['Cabin'].astype(str).str[0]
        return Xdrop
    

In [165]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasNumericas = X.select_dtypes(include='number').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasNumericas].to_numpy()


In [166]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasCategoricas = X.select_dtypes(include='object').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasCategoricas].to_numpy()

In [167]:
class AtributoIdade(BaseEstimator, TransformerMixin):
    def __init__(self, categorizar=True):
        self.categorizar = categorizar
    def fit(self, X, y=None):
        self.idade = 1
        return self
    def transform(self, X, y=None):
         if self.categorizar:
            idades = X[:, self.idade]
            cat_idades = pd.cut(idades,[0,9,18,30,40,50,100], labels=[9,18,30,40,50,100]).astype(int)
            X[:, self.idade] = cat_idades
         return X

In [168]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

trataAtributos = Pipeline([
    ('unecaracteristicas', FeatureUnion([
        ('pipenum', Pipeline([
            ('atributos_numericos', AtributosNumericos()),
            ('imputer', SimpleImputer(strategy='median')),
            ('idade', AtributoIdade()),
            ('scaler', StandardScaler())
        ])),
        ('pipecat', Pipeline([
            ('atributos_categoricos', AtributosCategoricos()),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]))
    ])),
])


In [173]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
import numpy as np
from datetime import datetime

pipetotal = Pipeline([
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    ('classificador', RandomForestClassifier(n_jobs=-1))
])

parametros = {
    'atributosDesejados__excluirName': [True, False],
    'atributosDesejados__cabin': [True, False],
    'classificador__max_depth': [None, 6],
}

# scorings = ["accuracy","balanced_accuracy","average_precision","f1","f1_micro","f1_macro","f1_weighted","precision","roc_auc","roc_auc_ovr","roc_auc_ovo","roc_auc_ovr_weighted","roc_auc_ovo_weighted"]

scorings = ['roc_auc']

for scoring in sorted(scorings, reverse=True): 
    start = datetime.now()

    modelo = GridSearchCV(pipetotal, param_grid=parametros, n_jobs=-1, scoring=scoring)   
    scores = cross_validate(modelo, X, y, cv=RepeatedKFold())
    mean_scores, std_scores = np.mean(scores['test_score']), np.std(scores['test_score'])

    end = datetime.now()
    total = (end - start)
    print(f"{scoring} Total {total} - Mean scores {mean_scores}, Std {std_scores}")
    print("----------------------------")

roc_auc Total 0:05:10.539376 - Mean scores 0.8652927388858126, Std 0.02999732009231759
----------------------------


In [171]:
modelo.fit(X,y)
y_pred = modelo.predict(test)
result = test[['PassengerId']]
result['Survived'] = y_pred
result.to_csv('submission.csv',index=False)