## Biblioteca

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

import warnings
warnings.filterwarnings("ignore")

## Função

In [2]:
def cross_validation_func(db, model, k=10, random_state=None):
    """Estimate accuracy and ROC AUC of ``model`` with k-fold cross-validation.

    Rows of ``db`` are shuffled into ``k`` folds whose sizes differ by at most
    one row. For each fold the model is fitted on all other rows and scored on
    the held-out rows.

    Parameters
    ----------
    db : pandas.DataFrame
        Must contain the columns ``"id"`` and ``"target"``. A column named
        ``"seed"`` is overwritten with the fold labels on an internal copy;
        ``db`` itself is not modified. All remaining columns are used as
        features.
    model : estimator
        Object exposing ``fit(X, y)`` (which must return the fitted
        estimator), ``predict(X)`` and ``predict_proba(X)``. The same object
        is refitted on every fold.
    k : int, default 10
        Number of folds.
    random_state : int, numpy.random.Generator or None, default None
        Seed for the fold assignment. ``None`` draws a fresh random split on
        every call; pass an integer for a reproducible split.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns ``seed`` (fold number, 1..k),
        ``accuracy_score`` and ``auc``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or ``db`` has fewer than ``k`` rows.
    """
    # Imported once per call (not once per fold) and kept local so the
    # function does not rely on another cell having imported the metrics.
    from sklearn.metrics import accuracy_score, roc_auc_score

    n_rows = db.shape[0]
    if k < 2:
        raise ValueError("k must be at least 2, got %r" % (k,))
    if n_rows < k:
        raise ValueError("cannot split %d rows into %d folds" % (n_rows, k))

    # Labels 1..k repeated cyclically, then shuffled: every fold is non-empty
    # and fold sizes are balanced, unlike independent random draws per row.
    rng = np.random.default_rng(random_state)
    fold_ids = rng.permutation(np.arange(n_rows) % k) + 1

    data = db.assign(seed=fold_ids)
    # Keyword `columns=`: pandas 2.0 no longer accepts `axis` positionally.
    features = data.drop(columns=["id", "target", "seed"])
    # 1-D target, the shape estimators expect (a one-column frame triggers
    # a conversion warning in scikit-learn).
    target = data["target"]

    records = []
    for fold in range(1, k + 1):
        # Positional boolean mask: safe even when the index has duplicates.
        is_test = fold_ids == fold
        X_train, y_train = features.loc[~is_test], target.loc[~is_test]
        X_test, y_test = features.loc[is_test], target.loc[is_test]

        fitted = model.fit(X_train, y_train)

        records.append({
            "seed": fold,
            "accuracy_score": accuracy_score(y_test, fitted.predict(X_test)),
            # Column 1 of predict_proba is the score of the positive class.
            "auc": roc_auc_score(y_test, fitted.predict_proba(X_test)[:, 1]),
        })

    # Build the result once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=["seed", "accuracy_score", "auc"])

## Data

In [3]:
# Read the train ("treino") and test ("teste") CSV files, in that order,
# from the current working directory.
dados_treino, dados_teste = map(
    pd.read_csv,
    ("porto_seguro_limpo_treino.csv", "porto_seguro_limpo_teste.csv"),
)

In [4]:
# Stack both splits row-wise (test rows first) into one frame for
# cross-validation; each split keeps its own row labels.
data = pd.concat(objs=[dados_teste, dados_treino], axis=0)

## Logistic Regression

In [5]:
# Frame used only by the logistic regression: the two "stepwise_reagrupado"
# CSVs stacked train-first, minus one column.
# `drop(columns=...)` replaces `drop([...], 1)`: pandas 2.0 removed the
# positional `axis` argument, so the old call raises a TypeError there.
data_stepwise = pd.concat([pd.read_csv("porto_seguro_stepwise_reagrupado_treino.csv"),
                           pd.read_csv("porto_seguro_stepwise_reagrupado_teste.csv")]).drop(columns=["ps_car_05_cat_0_1"])

from sklearn.linear_model import LogisticRegression
# C is the inverse regularisation strength, so 1e9 makes the penalty negligible.
model = LogisticRegression(C=1e9)
k = 10

cv_reglog = cross_validation_func(data_stepwise, model, k)
# Mean over the folds, with twice the fold-to-fold standard deviation as spread.
print("accuracy_score: %0.4f (+/- %0.4f)" % (cv_reglog.accuracy_score.mean(), cv_reglog.accuracy_score.std() * 2))
print("auc: %0.4f (+/- %0.4f)" % (cv_reglog.auc.mean(), cv_reglog.auc.std() * 2))

accuracy_score: 0.9636 (+/- 0.0014)
auc: 0.6290 (+/- 0.0135)


## Decision Tree Classifier

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Tree limited to depth 5; a node needs at least 7 samples to be split.
k = 10
model = DecisionTreeClassifier(min_samples_split=7, max_depth=5)

cv_tree = cross_validation_func(db=data, model=model, k=k)
# Mean over the folds, with twice the fold-to-fold standard deviation as spread.
print("accuracy_score: %0.4f (+/- %0.4f)" % (cv_tree["accuracy_score"].mean(), 2 * cv_tree["accuracy_score"].std()))
print("auc: %0.4f (+/- %0.4f)" % (cv_tree["auc"].mean(), 2 * cv_tree["auc"].std()))

accuracy_score: 0.9635 (+/- 0.0018)
auc: 0.6027 (+/- 0.0081)


## Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees of depth at most 3, each split considering 5 candidate features.
k = 10
model = RandomForestClassifier(max_features=5, max_depth=3, n_estimators=100)

cv_rf = cross_validation_func(db=data, model=model, k=k)
# Mean over the folds, with twice the fold-to-fold standard deviation as spread.
print("accuracy_score: %0.4f (+/- %0.4f)" % (cv_rf["accuracy_score"].mean(), 2 * cv_rf["accuracy_score"].std()))
print("auc: %0.4f (+/- %0.4f)" % (cv_rf["auc"].mean(), 2 * cv_rf["auc"].std()))

accuracy_score: 0.9636 (+/- 0.0013)
auc: 0.6218 (+/- 0.0153)


## XGBoost Classifier

In [8]:
from xgboost import XGBClassifier

# 100 boosting rounds with trees of depth at most 3.
# NOTE: the former `max_features=5` argument was removed. It is a scikit-learn
# forest option, not an XGBoost parameter; XGBoost forwards unknown keywords
# to the booster, which does not use them. Feature subsampling in XGBoost is
# expressed as a fraction via `colsample_bytree` / `colsample_bynode`.
model = XGBClassifier(n_estimators=100, max_depth=3)
k = 10

cv_xgb = cross_validation_func(data, model, k)
# Mean over the folds, with twice the fold-to-fold standard deviation as spread.
print("accuracy_score: %0.4f (+/- %0.4f)" % (cv_xgb.accuracy_score.mean(), cv_xgb.accuracy_score.std() * 2))
print("auc: %0.4f (+/- %0.4f)" % (cv_xgb.auc.mean(), cv_xgb.auc.std() * 2))

accuracy_score: 0.9636 (+/- 0.0018)
auc: 0.6375 (+/- 0.0130)
