In [1]:
# To support both python 2 and python 3
# 让这份笔记同步支持 python 2 和 python 3
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

In [2]:
#使用statsmodels便于统计分析数据
import statsmodels.api as sm
import statsmodels.formula.api as smf
#使用sklearn便于预测（机器学习）
from sklearn import linear_model
from sklearn.model_selection import train_test_split
#使用patsy便于生成模型
import patsy

**5. In Chapter 4, we used logistic regression to predict the probability of
default using income and balance on the Default data set. We will
now estimate the test error of this logistic regression model using the
validation set approach. Do not forget to set a random seed before
beginning your analysis.**

**(a) Fit a logistic regression model that uses income and balance to
predict default.**

In [3]:
# Load the Default data set and discard the 'Unnamed: 0' column
# (presumably a row index saved into the CSV -- confirm against the file).
Default = (
    pd.read_csv("data/Default.csv")
    .drop(columns = ['Unnamed: 0'])
)
Default.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


In [4]:
# One-hot encode the categorical 'default' and 'student' columns so they can be used
# as numeric terms in model formulas. The original string columns are kept, and the
# indicator columns '<column>_No' / '<column>_Yes' are appended to a new frame.
key_titles = ['default','student']
# dtype = int forces 0/1 integer indicators: recent pandas versions return booleans
# by default, which the formula interface treats as categorical rather than numeric.
dummies = pd.get_dummies(Default[key_titles], columns = key_titles, prefix = key_titles, dtype = int)
# join returns a new frame, so Default itself is left unmodified.
Default_Dummy = Default.join(dummies)
Default_Dummy.head()

Unnamed: 0,default,student,balance,income,default_No,default_Yes,student_No,student_Yes
0,No,No,729.526495,44361.625074,1,0,1,0
1,No,Yes,817.180407,12106.1347,1,0,0,1
2,No,No,1073.549164,31767.138947,1,0,1,0
3,No,No,529.250605,35704.493935,1,0,1,0
4,No,No,785.655883,38463.495879,1,0,1,0


In [5]:
# (a) Logistic regression of the default indicator on income and balance,
# fitted on every observation in Default_Dummy.
full_data_model = smf.logit('default_Yes ~ income + balance', data = Default_Dummy)
glm_smf = full_data_model.fit()
glm_smf.summary()

Optimization terminated successfully.
         Current function value: 0.078948
         Iterations 10


0,1,2,3
Dep. Variable:,default_Yes,No. Observations:,10000.0
Model:,Logit,Df Residuals:,9997.0
Method:,MLE,Df Model:,2.0
Date:,"Tue, 31 Dec 2019",Pseudo R-squ.:,0.4594
Time:,15:59:14,Log-Likelihood:,-789.48
converged:,True,LL-Null:,-1460.3
Covariance Type:,nonrobust,LLR p-value:,4.541e-292

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-11.5405,0.435,-26.544,0.000,-12.393,-10.688
income,2.081e-05,4.99e-06,4.174,0.000,1.1e-05,3.06e-05
balance,0.0056,0.000,24.835,0.000,0.005,0.006


**(b) Using the validation set approach, estimate the test error of this
model. In order to do this, you must perform the following steps:**

- **i.** Split the sample set into a training set and a validation set.
- **ii.** Fit a multiple logistic regression model using only the training observations.
- **iii.** Obtain a prediction of default status for each individual in
the validation set by computing the posterior probability of
default for that individual, and classifying the individual to
the default category if the posterior probability is greater
than 0.5.
- **iv.** Compute the validation set error, which is the fraction of
the observations in the validation set that are misclassified.

In [6]:
# i. Split the sample set into a training set and a validation set.
# test_size = 0.5 puts half of the rows in each set; the fixed random_state makes
# the shuffle, and therefore the reported error, reproducible.
train_set, test_set = train_test_split(Default_Dummy, test_size = 0.5, random_state = 1983)

In [7]:
# ii. Fit a multiple logistic regression model using only the training observations.
train_model = smf.logit('default_Yes ~ income + balance', data = train_set)
glm_smf = train_model.fit()

Optimization terminated successfully.
         Current function value: 0.073830
         Iterations 10


In [8]:
# iii. Obtain a prediction of default status for each individual in the validation set
# by computing the posterior probability of default for that individual, and classifying
# the individual to the default category if the posterior probability is greater than 0.5.
posterior_default = glm_smf.predict(test_set)
default_Yes_predict = posterior_default > 0.5

In [9]:
# iv. Compute the validation set error, which is the fraction of the observations
# in the validation set that are misclassified.
misclassified = test_set['default_Yes'] != default_Yes_predict
misclassified.mean()

0.0294

**(c) Repeat the process in (b) three times, using three different splits
of the observations into a training set and a validation set. Comment
on the results obtained.**

In [10]:
# (c) Repeat the validation-set estimate for three different train/validation splits.
# The loop counter doubles as the random seed, so each split is reproducible.
for repeat_time in range(3):
    train_set, test_set = train_test_split(Default_Dummy, test_size = 0.5, random_state = repeat_time)
    glm_smf = smf.logit('default_Yes ~ income + balance',data = train_set).fit()
    # Classify as default when the predicted probability exceeds 0.5.
    default_Yes_predict = (glm_smf.predict(test_set) > 0.5)
    # Fraction of validation observations whose predicted class differs from the truth.
    validation_error = (test_set['default_Yes'] != default_Yes_predict).mean()
    # Build one string before printing: under Python 2 without print_function,
    # print(a, b) would display a tuple instead of the two values.
    print('the {} fraction :  {}'.format(repeat_time, validation_error))

Optimization terminated successfully.
         Current function value: 0.072956
         Iterations 10
the 0 fraction :  0.029
Optimization terminated successfully.
         Current function value: 0.079028
         Iterations 10
the 1 fraction :  0.025
Optimization terminated successfully.
         Current function value: 0.084655
         Iterations 10
the 2 fraction :  0.0248


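三次划分得到的验证集错误率分别为 0.029、0.025 和 0.0248，在约 2.5% 到 2.9% 之间波动：验证集方法给出的测试误差估计依赖于具体的训练集/验证集划分。

The three splits give validation error rates of 0.029, 0.025 and 0.0248, i.e. between roughly 2.5% and 2.9%: the validation-set estimate of the test error depends on which observations fall into the training and validation sets.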

**(d) Now consider a logistic regression model that predicts the probability
of default using income, balance, and a dummy variable
for student. Estimate the test error for this model using the
validation set approach. Comment on whether or not including a
dummy variable for student leads to a reduction in the test error
rate.**

In [11]:
# (d) Same three seeded splits as in (c), but with the student dummy added to the
# model, so the error rates can be compared split by split.
for repeat_time in range(3):
    train_set, test_set = train_test_split(Default_Dummy, test_size = 0.5, random_state = repeat_time)
    glm_smf = smf.logit('default_Yes ~ income + balance + student_Yes',data = train_set).fit()
    # Classify as default when the predicted probability exceeds 0.5.
    default_Yes_predict = (glm_smf.predict(test_set) > 0.5)
    # Fraction of validation observations whose predicted class differs from the truth.
    validation_error = (test_set['default_Yes'] != default_Yes_predict).mean()
    # Build one string before printing: under Python 2 without print_function,
    # print(a, b) would display a tuple instead of the two values.
    print('the {} fraction :  {}'.format(repeat_time, validation_error))

Optimization terminated successfully.
         Current function value: 0.072293
         Iterations 10
the 0 fraction :  0.0292
Optimization terminated successfully.
         Current function value: 0.077791
         Iterations 10
the 1 fraction :  0.0262
Optimization terminated successfully.
         Current function value: 0.083772
         Iterations 10
the 2 fraction :  0.0254


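从结果上看，加入 student 哑变量并没有降低测试错误率：在相同的三次划分下，错误率为 0.0292、0.0262、0.0254，而不含 student 的模型为 0.029、0.025、0.0248，两者差别很小，且加入后略高。

Including the student dummy does not reduce the test error rate: on the same three splits the error rates are 0.0292, 0.0262 and 0.0254, versus 0.029, 0.025 and 0.0248 without it. The differences are very small and, if anything, slightly in the wrong direction.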