# Biblioteca

In [65]:
import pandas as pd
import sklearn
import numpy as np
import statsmodels 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [66]:
# Plot defaults: apply the seaborn theme with a 12x7 figure size, then switch
# to the "whitegrid" style and the "husl" colour palette.
figure_rc = {'figure.figsize': (12, 7)}
sns.set(rc=figure_rc)
sns.set_style("whitegrid")
sns.set_palette("husl")

# Let pandas show up to 200 rows before truncating a displayed frame.
pd.options.display.max_rows = 200

# Dados

In [109]:
# Read the training and test sets from CSV files located relative to the
# notebook's working directory.
arquivo_treino = "porto_seguro_stepwise_reagrupado_treino.csv"
arquivo_teste = "porto_seguro_stepwise_reagrupado_teste.csv"

dados_treino = pd.read_csv(arquivo_treino)
dados_teste = pd.read_csv(arquivo_teste)

In [110]:
# Columns removed from the design matrix: `id`, the response `target`, and
# `ps_car_05_cat_0_1` (dropped for lack of statistical significance, as noted
# in the "Resultados" section of this notebook).
colunas_excluidas = ["id", "target", "ps_car_05_cat_0_1"]

# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
X_treino = dados_treino.drop(columns=colunas_excluidas)
# The target is kept as a single-column DataFrame, exactly as before.
y_treino = dados_treino.loc[:, ["target"]]

X_teste = dados_teste.drop(columns=colunas_excluidas)
y_teste = dados_teste.loc[:, ["target"]]

# Previsão

In [111]:
# The array-based model classes (`Logit`, `OLS`, ...) are exposed by
# `statsmodels.api`. `statsmodels.formula.api` is the formula interface
# (lowercase `logit`), and newer statsmodels releases no longer expose the
# uppercase `Logit` class there, so importing it from the formula module fails.
import statsmodels.api as sm

# Fit a logistic regression by maximum likelihood on the training data.
# NOTE(review): no constant is added here; confirm that X_treino already
# contains an intercept column if one is intended.
model = sm.Logit(y_treino, X_treino)
result = model.fit()

# Predicted probabilities for the test set.
predictions = result.predict(X_teste)

Optimization terminated successfully.
         Current function value: 0.153099
         Iterations 8


# Resultados

A variável `ps_car_05_cat_0_1` foi excluída do modelo por falta de significância estatística (valor-p > 0,80).

In [112]:
from scipy import stats


def _chisqprob(chisq, df):
    """Return the chi-square survival function evaluated at `chisq` with `df` degrees of freedom."""
    return stats.chi2.sf(chisq, df)


# Compatibility shim: (re)define `scipy.stats.chisqprob` before building the
# summary. Presumably the installed statsmodels still looks this name up while
# the installed SciPy no longer provides it -- TODO confirm.
stats.chisqprob = _chisqprob
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:               416648
Model:                          Logit   Df Residuals:                   416613
Method:                           MLE   Df Model:                           34
Date:                Sun, 12 May 2019   Pseudo R-squ.:                 0.02604
Time:                        14:48:49   Log-Likelihood:                -63789.
converged:                       True   LL-Null:                       -65494.
                                        LLR p-value:                     0.000
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
ps_ind_01                    0.0123      0.004      2.769      0.006       0.004       0.021
ps_ind_03                    0.0257      0.003      7.580      0.000       0.019       0.

In [115]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve of the fitted `result` model on the test set.
prob_teste_sm = result.predict(X_teste)
acc = roc_auc_score(y_teste, prob_teste_sm)  # note: `acc` holds an AUC, not an accuracy
print("AUC na base de teste: %.4f" % acc)

AUC na base de teste: 0.6270


# Sklearn AUC e Cross Validation

In [121]:
from sklearn.linear_model import LogisticRegression

# scikit-learn expects a 1-d target. `y_treino` is a single-column DataFrame,
# so it is flattened here; passing the column vector triggers a
# DataConversionWarning that the notebook's global warning filter hides.
y_treino_1d = y_treino.values.ravel()

# C is the inverse regularisation strength, so C=1e9 makes the penalty
# negligible. Note that `model` and `predictions` rebind the names used by the
# statsmodels cell above.
model = LogisticRegression(C=1e9).fit(X_treino, y_treino_1d)

# Hard class labels and the probability of the second class (model.classes_[1]).
predictions = model.predict(X_teste)
predictions_proba = model.predict_proba(X_teste)[:, 1]

In [118]:
from sklearn.model_selection import cross_val_score

# cross_val_score clones the estimator and refits it on every fold, so it is
# given an unfitted LogisticRegression; fitting on the whole training set
# beforehand was discarded work. The target is flattened because scikit-learn
# expects a 1-d y.
# NOTE(review): this estimator uses the default C, not the C=1e9 used for the
# model evaluated elsewhere in this notebook -- confirm that is intended.
# NOTE(review): the default score for a classifier is accuracy, and the
# recorded result shows no spread across folds; scoring="roc_auc" would be
# comparable with the AUC figures reported in the other cells.
scores = cross_val_score(LogisticRegression(), X_treino, y_treino.values.ravel(), cv=10)
print("Cross Validation - Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Cross Validation - Accuracy: 0.9633 (+/- 0.0000)


In [122]:
from sklearn.metrics import roc_auc_score

# AUC of `model` on the test set, scored from the second predict_proba column.
prob_teste_sk = model.predict_proba(X_teste)[:, 1]
acc = roc_auc_score(y_teste, prob_teste_sk)
print("AUC na base de teste: %.4f" % acc)

# Same metric on the training set, for comparison with the test figure.
prob_treino_sk = model.predict_proba(X_treino)[:, 1]
acc = roc_auc_score(y_treino, prob_treino_sk)
print("AUC na base de treino: %.4f" % acc)

AUC na base de teste: 0.6273
AUC na base de treino: 0.6306


In [136]:
# Show the intercept term stored on the fitted `model`.
intercepto = model.intercept_
print(intercepto)

[-1.14931386]


In [135]:
# One row per predictor: the column names of X_treino paired positionally with
# the fitted coefficients. `model.coef_` is 2-d, so it is flattened first
# (assumes a single row of coefficients, i.e. a binary target -- TODO confirm).
# Building the frame directly avoids the reset_index/merge/rename round-trip,
# which depended on auto-generated "0_x"/"0_y" suffixes and on `drop(labels, 1)`,
# a positional-axis call that pandas 2.0 no longer accepts.
coef = pd.DataFrame({"feature": X_treino.columns, "coef": model.coef_.ravel()})
coef

Unnamed: 0,feature,coef
0,ps_ind_01,0.011822
1,ps_ind_03,0.025926
2,ps_ind_07_bin,0.244406
3,ps_ind_08_bin,0.217218
4,ps_ind_15,-0.025734
5,ps_ind_16_bin,-0.070292
6,ps_ind_17_bin,0.314396
7,ps_reg_01,0.260752
8,ps_reg_02,0.08248
9,ps_car_11,-0.032008
