# Biblioteca

In [1]:
import pandas as pd
import sklearn
import numpy as np
import statsmodels 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
# Install a blanket "ignore" filter: no category is given, so every warning
# raised after this line is suppressed for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below -- consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

In [2]:
# Notebook-wide presentation settings, gathered as named constants so they are
# easy to tune in one place.
FIGURE_SIZE = (12, 7)      # default matplotlib figure size handed to seaborn
MAX_DISPLAY_ROWS = 200     # rows pandas prints before truncating a frame

# Seaborn defaults first, then the grid style and colour palette on top.
sns.set(rc={"figure.figsize": FIGURE_SIZE})
sns.set_style("whitegrid")
sns.set_palette("husl")

pd.set_option("display.max_rows", MAX_DISPLAY_ROWS)

# Dados

In [3]:
# Load the full cleaned dataset.
dados = pd.read_csv("porto_seguro_limpo.csv")

# Summary statistics with the `id` column excluded; only the first row of the
# transposed table is shown (the recorded output shows that row is `target`).
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally to `DataFrame.drop` was deprecated in pandas 1.3 and is
# rejected by pandas 2.x.
dados.drop(columns=["id"]).describe().T.head(1)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
target,595212.0,0.036448,0.187401,0.0,0.0,0.0,0.0,1.0


In [4]:
# Paths of the two CSV files read below (presumably the train and test
# partitions, going by the file names -- confirm against the split notebook).
TREINO_CSV = "porto_seguro_limpo_treino.csv"
TESTE_CSV = "porto_seguro_limpo_teste.csv"

dados_treino = pd.read_csv(TREINO_CSV)
dados_teste = pd.read_csv(TESTE_CSV)

In [5]:
# Design matrix: every column except the row identifier and the label.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# was deprecated in pandas 1.3 and is rejected by pandas 2.x.
X = dados_treino.drop(columns=["id", "target"])
# The label is kept as a one-column DataFrame (list selector), not a Series.
y = dados_treino.loc[:, ["target"]]

# Same split for the second frame, used below for out-of-sample scoring.
X_teste = dados_teste.drop(columns=["id", "target"])
y_teste = dados_teste.loc[:, ["target"]]

# Previsão

In [6]:
# The array-based `Logit` class lives in `statsmodels.api`.
# `statsmodels.formula.api` is the formula interface (lowercase `logit`) and
# recent statsmodels releases do not export `Logit` from it, so importing the
# class from there fails with AttributeError.
import statsmodels.api as sm

# NOTE(review): no constant column is added here (no `sm.add_constant`), so the
# model only has an intercept if X already contains one -- confirm.
model = sm.Logit(y, X)
result = model.fit()

# Predicted probabilities for the rows of X_teste.
predictions = result.predict(X_teste)

Optimization terminated successfully.
         Current function value: 0.153064
         Iterations 8


# Resultados

In [7]:
from scipy import stats


def _chisqprob(chisq, df):
    """Return the chi-square survival function evaluated at `chisq` with `df` degrees of freedom."""
    return stats.chi2.sf(chisq, df)


# Attach the helper as `scipy.stats.chisqprob` before building the summary.
# NOTE(review): presumably a shim for a statsmodels version that still calls
# the removed `scipy.stats.chisqprob` -- confirm whether it is still needed.
stats.chisqprob = _chisqprob
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:               416648
Model:                          Logit   Df Residuals:                   416578
Method:                           MLE   Df Model:                           69
Date:                Mon, 06 May 2019   Pseudo R-squ.:                 0.02627
Time:                        10:01:49   Log-Likelihood:                -63774.
converged:                       True   LL-Null:                       -65494.
                                        LLR p-value:                     0.000
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
ps_ind_01                           0.0096      0.005      2.054      0.040       0.000       0.019
ps_ind_03                           0.0255      0.003      7.379    

In [8]:
from sklearn.metrics import roc_auc_score

# Out-of-sample ROC AUC (note: `acc` holds an AUC, not an accuracy).
acc = roc_auc_score(y_true=y_teste, y_score=predictions)
print("AUC na base de teste: %.4f" % (acc,))
# Previous value: 0.6197 --> a considerable gain, given that the competition
# winner's AUC was 0.64849.

# In-sample ROC AUC, for comparison with the test figure.
previsoes_treino = result.predict(X)
acc = roc_auc_score(y_true=y, y_score=previsoes_treino)
print("AUC na base de treino: %.4f" % (acc,))

AUC na base de teste: 0.6272
AUC na base de treino: 0.6308


# Stepwise p-value < 50%

In [12]:
# Feature subset used to refit the logit model. Per the section heading it
# comes from a stepwise selection with a p-value cutoff of 50%.
# Order is preserved from the original list; grouping comments follow the
# column-name prefixes only.
selected_features = [
    # ps_ind_*
    "ps_ind_01",
    "ps_ind_03",
    "ps_ind_06_bin",
    "ps_ind_07_bin",
    "ps_ind_08_bin",
    "ps_ind_12_bin",
    "ps_ind_15",
    "ps_ind_16_bin",
    "ps_ind_17_bin",
    "ps_ind_18_bin",
    # ps_reg_* / ps_car_*
    "ps_reg_01",
    "ps_reg_02",
    "ps_car_08_cat",
    "ps_car_11",
    "ps_car_15",
    # ps_calc_*
    "ps_calc_01",
    "ps_calc_02",
    "ps_calc_03",
    "ps_calc_05",
    "ps_calc_06",
    "ps_calc_09",
    "ps_calc_14",
    "ps_calc_17_bin",
    "ps_calc_19_bin",
    # *_cat_* indicator columns
    "ps_car_07_cat_null",
    "ps_car_07_cat_1",
    "ps_car_05_cat_1",
    "ps_car_03_cat_null",
    "ps_car_03_cat_1",
    "ps_ind_04_cat_1",
    "ps_ind_02_cat_null",
    "ps_ind_02_cat_2_3_4",
    "ps_ind_05_cat_0",
    "ps_ind_05_cat_1_3_4_5_6",
    "ps_car_01_cat_6_7",
    "ps_car_01_cat_3_4_5_10",
    "ps_car_01_cat_0_1_2_8_11",
    "ps_car_01_cat_9",
    "ps_car_04_cat_0_4",
    "ps_car_04_cat_1_2",
    "ps_car_04_cat_3_8",
    "ps_car_04_cat_6_9",
    "ps_car_06_cat_0_1_3_4_6_7_11_14",
    "ps_car_06_cat_10_12_15_16",
    "ps_car_09_cat_0_2_3",
    "ps_car_11_cat_A",
    "ps_car_11_cat_B",
    "ps_car_11_cat_C",
    # *_no_out columns
    "ps_reg_03_no_out",
    "ps_car_12_no_out",
    "ps_car_13_no_out",
    "ps_car_14_no_out",
]

In [13]:
# Rebuild the design matrices using only the selected columns. The labels stay
# one-column DataFrames, as in the earlier split.
X = dados_treino.loc[:, selected_features]
y = dados_treino[["target"]]

X_teste = dados_teste.loc[:, selected_features]
y_teste = dados_teste[["target"]]

### Previsão 

In [14]:
# The array-based `Logit` class lives in `statsmodels.api`.
# `statsmodels.formula.api` is the formula interface (lowercase `logit`) and
# recent statsmodels releases do not export `Logit` from it, so importing the
# class from there fails with AttributeError.
import statsmodels.api as sm

# Refit on the current X / y (the reduced feature set selected above).
# NOTE(review): no constant column is added here (no `sm.add_constant`), so the
# model only has an intercept if X already contains one -- confirm.
model = sm.Logit(y, X)
result = model.fit()

# Predicted probabilities for the rows of X_teste.
predictions = result.predict(X_teste)

Optimization terminated successfully.
         Current function value: 0.153076
         Iterations 8


### Resultados

In [15]:
from scipy import stats


def _chisqprob(chisq, df):
    """Return the chi-square survival function evaluated at `chisq` with `df` degrees of freedom."""
    return stats.chi2.sf(chisq, df)


# Attach the helper as `scipy.stats.chisqprob` before building the summary.
# NOTE(review): presumably a shim for a statsmodels version that still calls
# the removed `scipy.stats.chisqprob` -- confirm whether it is still needed.
stats.chisqprob = _chisqprob
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:               416648
Model:                          Logit   Df Residuals:                   416596
Method:                           MLE   Df Model:                           51
Date:                Mon, 06 May 2019   Pseudo R-squ.:                 0.02619
Time:                        10:09:12   Log-Likelihood:                -63779.
converged:                       True   LL-Null:                       -65494.
                                        LLR p-value:                     0.000
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
ps_ind_01                           0.0095      0.005      2.035      0.042       0.000       0.019
ps_ind_03                           0.0252      0.003      7.379    

In [16]:
from sklearn.metrics import roc_auc_score

# Out-of-sample ROC AUC (note: `acc` holds an AUC, not an accuracy).
acc = roc_auc_score(y_true=y_teste, y_score=predictions)
print("AUC na base de teste: %.4f" % (acc,))
# Previous value: 0.6197 --> a considerable gain, given that the competition
# winner's AUC was 0.64849.

# In-sample ROC AUC, for comparison with the test figure.
previsoes_treino = result.predict(X)
acc = roc_auc_score(y_true=y, y_score=previsoes_treino)
print("AUC na base de treino: %.4f" % (acc,))

AUC na base de teste: 0.6269
AUC na base de treino: 0.6308


# R outputs

### Stepwise p-value < 0.15

### Stepwise p-value < 0.5

# Comentários

Usar stepwise / backward selection:
- definir o ponto de corte pelo valor p acima de 50%
- depois limpar o modelo na mão
- pedir a saída do VIF (variance inflation factor)