# 8.5. Aplicação Prática Regressão Logística

In [1]:
import numpy as np
import pandas as pd
import scipy.stats

from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Carregando a base de dados

path = r"/Users/leandroguerra/Library/CloudStorage/OneDrive-Personale/MBA - XP Educação/M3 - Códigos/german_credit_data.csv"
credit = pd.read_csv(path, header = 0, names = ['Index', 'Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
       'Checking account', 'Credit amount', 'Duration', 'Purpose', 'default'])

credit.head(10)

Unnamed: 0,Index,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,default
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
5,5,35,male,1,free,,,9055,36,education,good
6,6,53,male,2,own,quite rich,,2835,24,furniture/equipment,good
7,7,35,male,3,rent,little,moderate,6948,36,car,good
8,8,61,male,1,own,rich,,3059,12,radio/TV,good
9,9,28,male,3,own,little,moderate,5234,30,car,bad


In [3]:
credit["default"].value_counts()

good    700
bad     300
Name: default, dtype: int64

In [5]:
credit["Saving accounts"].value_counts()

little        603
NA            183
moderate      103
quite rich     63
rich           48
Name: Saving accounts, dtype: int64

In [6]:
credit["Purpose"].value_counts()

car                    337
radio/TV               280
furniture/equipment    181
business                97
education               59
repairs                 22
domestic appliances     12
vacation/others         12
Name: Purpose, dtype: int64

In [4]:
credit["Saving accounts"] = credit["Saving accounts"].fillna(value = "NA")
credit["Checking account"] = credit["Checking account"].fillna(value = "NA")

In [7]:
# Removendo os NAs
credit["Saving accounts"]= credit["Saving accounts"].map(
    {'little': 'little', 'moderate': 'moderate', 'quite rich':'other','rich':'other', 'NA':'other'})
credit["Checking account"]= credit["Checking account"].map(
    {'little': 'little', 'moderate': 'moderate','rich':'other', 'NA':'other' })

# Ajustando a coluna Purpose
le = LabelEncoder()
credit["Purpose"] = le.fit_transform(credit["Purpose"])

In [9]:
credit["Purpose"].value_counts()

1    337
5    280
4    181
0     97
3     59
6     22
2     12
7     12
Name: Purpose, dtype: int64

In [10]:
credit["default"] = credit["default"].map({"good":0, "bad": 1})
credit["default"].value_counts()

0    700
1    300
Name: default, dtype: int64

In [11]:
x = credit[['Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
       'Checking account', 'Credit amount', 'Duration', 'Purpose']]

y = credit["default"]

In [12]:
# Criando dummies das colunas categóricas e normalização das colunas numéricas

cat_cols = list(x.columns[x.dtypes == "object"])
num_cols = list(x.columns[x.dtypes != "object"])

dummy =  pd.get_dummies(x[cat_cols], drop_first = True)

x = pd.concat([dummy, x[num_cols]], axis=1, join = "inner")

ss = StandardScaler()

x = ss.fit_transform(x)

In [13]:
x

array([[ 0.67028006,  0.63444822, -0.4669334 , ..., -0.74513141,
        -1.23647786,  1.07326286],
       [-1.49191369,  0.63444822, -0.4669334 , ...,  0.94981679,
         2.24819436,  1.07326286],
       [ 0.67028006,  0.63444822, -0.4669334 , ..., -0.41656241,
        -0.73866754,  0.06170503],
       ...,
       [ 0.67028006,  0.63444822, -0.4669334 , ..., -0.87450324,
        -0.73866754,  1.07326286],
       [ 0.67028006, -1.57617276, -0.4669334 , ..., -0.50552769,
         1.9992892 ,  1.07326286],
       [ 0.67028006,  0.63444822, -0.4669334 , ...,  0.46245715,
         1.9992892 , -0.9498528 ]])

In [14]:
# Separando entre treinamento e teste

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.5, random_state = 42)

In [15]:
lr = LogisticRegression()

model = lr.fit(x_train, y_train)

model.score(x_test, y_test)

0.74

In [16]:
# Previsão dos resultados
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

In [17]:
conf = confusion_matrix(y_test, y_pred_test)
print(conf)
print("Registros erroneamente classificados como Default     =  %.1f" %(conf[0][1]/ sum(sum(conf))*100 ) )
print("Registros erroneamente classificados como Não-Default = %.1f"%(conf[1][0]/ sum(sum(conf))*100 ) )
print("Registros corretamente classificados como Default     = %.1f"% (conf[1][1]/ sum(sum(conf))*100 ) )
print("Registros corretamente classificados como Não-Default = %.1f"% (conf[0][0]/ sum(sum(conf))*100 ))

[[315  38]
 [ 92  55]]
Registros erroneamente classificados como Default     =  7.6
Registros erroneamente classificados como Não-Default = 18.4
Registros corretamente classificados como Default     = 11.0
Registros corretamente classificados como Não-Default = 63.0


In [18]:
# Calculando a ROC (AUC) e o Gini - Treinamento

fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_pred_train)
roc_auc_train = auc(fpr_train, tpr_train)

print("AUC - Treinamento : %f" % roc_auc_train)
print("Gini - Treinamento: ", round(2*roc_auc_train-1,2))
print("\n")

# Calculando a ROC e o Gini - Teste
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_pred_test)
roc_auc_test = auc(fpr_test, tpr_test)

print("AUC - Teste : %f" % roc_auc_test)
print("Gini - Teste: ", round(2*roc_auc_test-1,2))

AUC - Treinamento : 0.654395
Gini - Treinamento:  0.31


AUC - Teste : 0.633250
Gini - Teste:  0.27
