In [1]:
# 모듈 가져오기
import pandas as pd
import statsmodels.api as sm
import itertools
import time

In [2]:
# 데이터 가져오기 - 도요타 중고차 가격 데이터
Toyota = pd.read_csv('./Data/ToyotaCorolla.csv')

In [3]:
# 데이터 확인
Toyota.head()

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Central_Lock,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,1,0,0,0,1,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,1,0,1,0,0,0,1,0,0,0
2,3,?TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,...,0,0,1,0,0,0,1,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,...,0,0,1,0,0,0,1,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,...,1,1,1,0,1,0,1,0,0,0


In [4]:
# 데이터 컬럼정보 확인
Toyota.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 37 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                1436 non-null   int64 
 1   Model             1436 non-null   object
 2   Price             1436 non-null   int64 
 3   Age_08_04         1436 non-null   int64 
 4   Mfg_Month         1436 non-null   int64 
 5   Mfg_Year          1436 non-null   int64 
 6   KM                1436 non-null   int64 
 7   Fuel_Type         1436 non-null   object
 8   HP                1436 non-null   int64 
 9   Met_Color         1436 non-null   int64 
 10  Automatic         1436 non-null   int64 
 11  cc                1436 non-null   int64 
 12  Doors             1436 non-null   int64 
 13  Cylinders         1436 non-null   int64 
 14  Gears             1436 non-null   int64 
 15  Quarterly_Tax     1436 non-null   int64 
 16  Weight            1436 non-null   int64 
 17  Mfr_Guarantee 

In [5]:
# 데이터 확인
dummies = pd.get_dummies(Toyota['Fuel_Type'])

In [6]:
# 불필요한 변수 제거 및 더미변수 추가
remove_cols = ['Id', 'Model', 'Fuel_Type']
tmp = Toyota.drop(remove_cols, axis = 1)
toyota_new = pd.concat([tmp, dummies], axis = 1)

In [7]:
# 데이터 확인
toyota_new

Unnamed: 0,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,HP,Met_Color,Automatic,cc,Doors,...,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Tow_Bar,CNG,Diesel,Petrol
0,13500,23,10,2002,46986,90,1,0,2000,3,...,0,0,0,1,0,0,0,0,1,0
1,13750,23,10,2002,72937,90,1,0,2000,3,...,0,0,0,1,0,0,0,0,1,0
2,13950,24,9,2002,41711,90,1,0,2000,3,...,0,0,0,1,0,0,0,0,1,0
3,14950,26,7,2002,48000,90,0,0,2000,3,...,0,0,0,1,0,0,0,0,1,0
4,13750,30,3,2002,38500,90,0,0,2000,3,...,0,1,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1431,7500,69,12,1998,20544,86,1,0,1300,3,...,0,1,1,1,0,0,0,0,0,1
1432,10845,72,9,1998,19000,86,0,0,1300,3,...,0,0,1,1,0,0,0,0,0,1
1433,8500,71,10,1998,17016,86,0,0,1300,3,...,0,0,0,1,0,0,0,0,0,1
1434,7250,70,11,1998,16916,86,1,0,1300,3,...,0,0,0,1,0,0,0,0,0,1


In [10]:
# 데이터 분할
toyota_added = sm.add_constant(toyota_new, has_constant = 'add')
X = toyota_added.drop('Price', axis = 1)
y = toyota_added['Price']

## AIC가가 가장 작은값을 가지는 모델 -> 최적 회귀모델

In [25]:
def processSubset(X, y, feature_set):
    model = sm.OLS(y, X[feature_set])
    regression = model.fit()
    AIC = regression.aic    
    return {'model' : regression, "AIC" : AIC}

def getBest(X,y,k):
    tic = time.time()
    results = list()
    for combo in itertools.combinations(X.columns.difference(['const']), k):
        combo = list(combo) + ['const']
        results.append(processSubset(X,y,feature_set = combo))
    models = pd.DataFrame(results)
    bestModel = models.loc[models['AIC'].argmin()]
    toc = time.time()
    print('Processed', models.shape[0], 'models on', k, 'predictors in', (toc-tic), 'seconds')
    return models, bestModel

In [27]:
# 최적 모델 확인
best_model = getBest(X,y,len(X.columns.difference(['const'])) - 1)[1]

Processed 36 models on 35 predictors in 0.2154226303100586 seconds


In [28]:
# 최적 모델 summary
best_model['model'].summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.909
Model:,OLS,Adj. R-squared:,0.907
Method:,Least Squares,F-statistic:,436.4
Date:,"Tue, 10 Nov 2020",Prob (F-statistic):,0.0
Time:,22:14:43,Log-Likelihood:,-12088.0
No. Observations:,1436,AIC:,24240.0
Df Residuals:,1403,BIC:,24420.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
ABS,-268.4855,112.874,-2.379,0.018,-489.905,-47.066
Age_08_04,-118.8924,3.452,-34.439,0.000,-125.665,-112.120
Airbag_1,118.3702,220.070,0.538,0.591,-313.331,550.071
Airbag_2,-77.3409,115.119,-0.672,0.502,-303.164,148.482
Airco,198.4050,79.304,2.502,0.012,42.838,353.972
Automatic,346.0122,134.701,2.569,0.010,81.776,610.249
Automatic_airco,2441.3807,167.488,14.576,0.000,2112.828,2769.934
BOVAG_Guarantee,491.4573,112.028,4.387,0.000,271.697,711.218
Backseat_Divider,-264.8703,114.059,-2.322,0.020,-488.614,-41.127

0,1,2,3
Omnibus:,150.375,Durbin-Watson:,1.716
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1152.366
Skew:,0.08,Prob(JB):,5.8500000000000004e-251
Kurtosis:,7.386,Cond. No.,1.33e+16


### 전진선택법

In [34]:
# 변수 사전 정의
predictors = X.columns

# 전진 선택법
def forward(X,y,predictors):
    remainingPredictors = [p for p in X.columns.difference(['const']) if p not in predictors]
    tic = time.time()
    results = list()
    for p in remainingPredictors:
        results.append(processSubset(X, y, feature_set=predictors + [p] + ['const']))
        models = pd.DataFrame(results)
        bestModel = models.loc[models['AIC'].argmin(), :]
        toc = time.time()
        print('Processed', models.shape[0], 'models on', len(predictors) + 1, 'predictors in', (toc-tic))
        print('Selected predictors:', bestModel['model'].model.exog_names, 'AIC:' , bestModel['AIC'])
        return bestModel
    
# 전진 선택법 모델
def forward_model(X,y):
    fModels = pd.DataFrame(columns = ['AIC', 'model'])
    tic = time.time()
    predictors = list()
    for i in range(1, len(X.columns.difference(['const']))+1):
        forwardResult = forward(X,y,predictors)
        if i > 1:
            if forwardResult['AIC'] > fmodelBefore:
                break
        fModels.loc[i] = forwardResult
        predictors = fModels.loc[i]['model'].model.exog_names
        fmodelBefore = fModels.loc[i]['AIC']
        predictors = [k for k in predictors if k != 'const']
    toc = time.time()
    print("Total elapesed time : ", (toc - tic), "seconds.")
    return (fModels['model'][len(fModels['model'])])

In [36]:
# 최적회귀모델
forward_model(X,y).summary()

Processed 1 models on 1 predictors in 0.01094961166381836
Selected predictors: ['ABS', 'const'] AIC: 27476.22486484318
Processed 1 models on 2 predictors in 0.006992340087890625
Selected predictors: ['ABS', 'Age_08_04', 'const'] AIC: 25495.50966652165
Processed 1 models on 3 predictors in 0.004986286163330078
Selected predictors: ['ABS', 'Age_08_04', 'Airbag_1', 'const'] AIC: 25495.551241670393
Total elapesed time :  0.04687333106994629 seconds.


0,1,2,3
Dep. Variable:,Price,R-squared:,0.772
Model:,OLS,Adj. R-squared:,0.772
Method:,Least Squares,F-statistic:,2428.0
Date:,"Tue, 10 Nov 2020",Prob (F-statistic):,0.0
Time:,22:22:23,Log-Likelihood:,-12745.0
No. Observations:,1436,AIC:,25500.0
Df Residuals:,1433,BIC:,25510.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
ABS,-625.9303,128.834,-4.858,0.000,-878.654,-373.206
Age_08_04,-176.3490,2.700,-65.323,0.000,-181.645,-171.053
const,2.111e+04,221.252,95.394,0.000,2.07e+04,2.15e+04

0,1,2,3
Omnibus:,348.274,Durbin-Watson:,1.261
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2670.318
Skew:,0.913,Prob(JB):,0.0
Kurtosis:,9.426,Cond. No.,315.0


### 후진제거법

In [37]:
# 후진 제거법
def backward(X,y,predictors):
    tic = time.time()
    results = list()
    for combo in itertools.combinations(predictors, len(predictors)-1):
        results.append(processSubset(X,y, list(combo) + ['const']))
    models = pd.DataFrame(results)
    bestModel = models.loc[models['AIC'].argmin()]
    toc = time.time()
    print("Processed",models.shape[0],"models on",len(predictors)-1,
          "predictors in",(toc - tic))
    print("Selected predictors :",bestModel['model'].model.exog_names,
          ' AIC:',bestModel['AIC'])
    return bestModel
# 후진 제거법 모델
def backward_model(X,y):
    BModels = pd.DataFrame(columns = ['AIC', 'model'])
    tic = time.time()
    predictors = X.columns.difference(['const'])
    BmodelBefore = processSubset(X,y,predictors)['AIC']
    while (len(predictors)>1):
        backwardResult = backward(X,y,predictors)
        if backwardResult['AIC'] > BmodelBefore:
            break
        BModels.loc[len(predictors)-1] = backwardResult
        predictors = BModels.loc[len(predictors)-1]['model'].model.exog_names
        BmodelBefore = backwardResult['AIC']
        predictors = [k for k in predictors if k != 'const']
    
    
    toc = time.time()
    print("Total elapsed time :",(toc - tic), "seconds.")
    return BModels['model'].dropna().iloc[-1]

In [38]:
backward_model(X,y).summary()

Processed 36 models on 35 predictors in 0.23839163780212402
Selected predictors : ['ABS', 'Age_08_04', 'Airbag_1', 'Airbag_2', 'Airco', 'Automatic', 'Automatic_airco', 'BOVAG_Guarantee', 'Backseat_Divider', 'Boardcomputer', 'CD_Player', 'CNG', 'Central_Lock', 'Cylinders', 'Diesel', 'Doors', 'Gears', 'Guarantee_Period', 'HP', 'KM', 'Metallic_Rim', 'Mfg_Month', 'Mfg_Year', 'Mfr_Guarantee', 'Mistlamps', 'Petrol', 'Power_Steering', 'Powered_Windows', 'Quarterly_Tax', 'Radio', 'Radio_cassette', 'Sport_Model', 'Tow_Bar', 'Weight', 'cc', 'const']  AIC: 24242.24759492178
Processed 35 models on 34 predictors in 0.23041749000549316
Selected predictors : ['ABS', 'Age_08_04', 'Airbag_1', 'Airbag_2', 'Airco', 'Automatic', 'Automatic_airco', 'BOVAG_Guarantee', 'Backseat_Divider', 'Boardcomputer', 'CD_Player', 'CNG', 'Central_Lock', 'Cylinders', 'Diesel', 'Doors', 'Gears', 'Guarantee_Period', 'HP', 'KM', 'Metallic_Rim', 'Mfg_Month', 'Mfg_Year', 'Mfr_Guarantee', 'Mistlamps', 'Petrol', 'Powered_Windows

0,1,2,3
Dep. Variable:,Price,R-squared:,0.908
Model:,OLS,Adj. R-squared:,0.907
Method:,Least Squares,F-statistic:,608.7
Date:,"Tue, 10 Nov 2020",Prob (F-statistic):,0.0
Time:,22:25:51,Log-Likelihood:,-12091.0
No. Observations:,1436,AIC:,24230.0
Df Residuals:,1412,BIC:,24360.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
ABS,-297.4536,87.734,-3.390,0.001,-469.557,-125.350
Age_08_04,-22.9327,9.007,-2.546,0.011,-40.601,-5.265
Airco,191.3195,75.749,2.526,0.012,42.726,339.913
Automatic,315.9247,132.939,2.376,0.018,55.146,576.704
Automatic_airco,2418.4800,157.138,15.391,0.000,2110.230,2726.730
BOVAG_Guarantee,497.0693,110.651,4.492,0.000,280.012,714.127
Backseat_Divider,-290.6908,103.011,-2.822,0.005,-492.762,-88.619
Boardcomputer,-269.4259,102.935,-2.617,0.009,-471.349,-67.503
CD_Player,219.7123,85.235,2.578,0.010,52.511,386.914

0,1,2,3
Omnibus:,151.108,Durbin-Watson:,1.714
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1183.057
Skew:,0.051,Prob(JB):,1.2700000000000001e-257
Kurtosis:,7.445,Cond. No.,1.33e+16


### 단계적 방법

In [42]:
def Stepwise_model(X,y):
    stepModels = pd.DataFrame(columns = ['AIC', 'model'])
    tic = time.time()
    predictors = list()
    SmodelBefore = processSubset(X,y,predictors + ['const'])['AIC']
    for i in range(1, len(X.columns.difference(['const']))+1):
        forwardResult = forward(X,y,predictors)
        print('forward')
        stepModels.loc[i] = forwardResult
        predictors = stepModels.loc[i]['model'].model.exog_names
        predictors = [k for k in predictors if k != 'const']
        backwardResult = backward(X,y,predictors)
        if backwardResult['AIC'] < forwardResult['AIC']:
            stepModels.loc[i] = backwardResult
            predictors = stepModels.loc[i]['model'].model.exog_names
            smodelBefore = stepModels.loc[i]['AIC']
            predictors = [k for k in predictors if k != 'const']
            print('backward')
        if stepModels.loc[i]['AIC'] > SmodelBefore:
            break
        else:
            smodelBefore = stepModels.loc[i]['AIC']
    toc = time.time()
    print("Total elapsed time : ", (toc - tic), "seconds")
    return stepModels['model'][len(stepModels['model'])]

In [43]:
Stepwise_model(X, y).summary()

Processed 1 models on 1 predictors in 0.005980968475341797
Selected predictors: ['ABS', 'const'] AIC: 27476.22486484318
forward
Processed 1 models on 0 predictors in 0.004976749420166016
Selected predictors : ['const']  AIC: 27615.537867847517
Processed 1 models on 2 predictors in 0.008977651596069336
Selected predictors: ['ABS', 'Age_08_04', 'const'] AIC: 25495.50966652165
forward
Processed 2 models on 1 predictors in 0.00897669792175293
Selected predictors : ['Age_08_04', 'const']  AIC: 25516.97057066566
Processed 1 models on 3 predictors in 0.005984067916870117
Selected predictors: ['ABS', 'Age_08_04', 'Airbag_1', 'const'] AIC: 25495.551241670393
forward
Processed 3 models on 2 predictors in 0.008976936340332031
Selected predictors : ['ABS', 'Age_08_04', 'const']  AIC: 25495.50966652165
backward
Processed 1 models on 3 predictors in 0.00498652458190918
Selected predictors: ['ABS', 'Age_08_04', 'Airbag_1', 'const'] AIC: 25495.551241670393
forward
Processed 3 models on 2 predictors in

Processed 1 models on 3 predictors in 0.0029916763305664062
Selected predictors: ['ABS', 'Age_08_04', 'Airbag_1', 'const'] AIC: 25495.551241670393
forward
Processed 3 models on 2 predictors in 0.010970115661621094
Selected predictors : ['ABS', 'Age_08_04', 'const']  AIC: 25495.50966652165
backward
Processed 1 models on 3 predictors in 0.0049896240234375
Selected predictors: ['ABS', 'Age_08_04', 'Airbag_1', 'const'] AIC: 25495.551241670393
forward
Processed 3 models on 2 predictors in 0.007980823516845703
Selected predictors : ['ABS', 'Age_08_04', 'const']  AIC: 25495.50966652165
backward
Processed 1 models on 3 predictors in 0.003989458084106445
Selected predictors: ['ABS', 'Age_08_04', 'Airbag_1', 'const'] AIC: 25495.551241670393
forward
Processed 3 models on 2 predictors in 0.012965202331542969
Selected predictors : ['ABS', 'Age_08_04', 'const']  AIC: 25495.50966652165
backward
Processed 1 models on 3 predictors in 0.003989696502685547
Selected predictors: ['ABS', 'Age_08_04', 'Airba

0,1,2,3
Dep. Variable:,Price,R-squared:,0.772
Model:,OLS,Adj. R-squared:,0.772
Method:,Least Squares,F-statistic:,2428.0
Date:,"Tue, 10 Nov 2020",Prob (F-statistic):,0.0
Time:,22:33:00,Log-Likelihood:,-12745.0
No. Observations:,1436,AIC:,25500.0
Df Residuals:,1433,BIC:,25510.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
ABS,-625.9303,128.834,-4.858,0.000,-878.654,-373.206
Age_08_04,-176.3490,2.700,-65.323,0.000,-181.645,-171.053
const,2.111e+04,221.252,95.394,0.000,2.07e+04,2.15e+04

0,1,2,3
Omnibus:,348.274,Durbin-Watson:,1.261
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2670.318
Skew:,0.913,Prob(JB):,0.0
Kurtosis:,9.426,Cond. No.,315.0
