In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
import re

In [26]:
csv_in = 'condo.csv'
df = pd.read_csv(csv_in, sep=',', skiprows=0, header=0, encoding='shift_jis')
print(df.shape)
print(df.info())
display(df.head())

(25, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   No         25 non-null     int64 
 1   minutes    25 non-null     int64 
 2   Price      25 non-null     int64 
 3   Area       25 non-null     int64 
 4   Year       25 non-null     int64 
 5   Structure  25 non-null     object
 6   BCR        25 non-null     int64 
 7   FAR        25 non-null     int64 
 8   Reformed   25 non-null     int64 
dtypes: int64(8), object(1)
memory usage: 1.9+ KB
None


Unnamed: 0,No,minutes,Price,Area,Year,Structure,BCR,FAR,Reformed
0,1,3,190000000,70,1,ＲＣ,80,500,0
1,2,5,350000000,200,39,ＳＲＣ,60,400,0
2,4,4,38000000,30,16,ＲＣ,80,700,0
3,5,2,30000000,20,17,ＳＲＣ,80,700,0
4,6,4,29000000,25,14,ＲＣ,80,600,0


In [19]:
X = df.drop(columns=['No', 'Price'])
y = df['Price']
print('X:', X.shape)
display(X.head())
print('y:', y.shape)
print(y.head())

X: (25, 7)


Unnamed: 0,minutes,Area,Year,Structure,BCR,FAR,Reformed
0,3,70,1,ＲＣ,80,500,0
1,5,200,39,ＳＲＣ,60,400,0
2,4,30,16,ＲＣ,80,700,0
3,2,20,17,ＳＲＣ,80,700,0
4,4,25,14,ＲＣ,80,600,0


y: (25,)
0    190000000
1    350000000
2     38000000
3     30000000
4     29000000
Name: Price, dtype: int64


In [20]:
X_dumm = pd.get_dummies(X, drop_first=True, dtype='uint8')
print('X_dumm:', X_dumm.shape)
display(X_dumm.head())

X_dumm: (25, 7)


Unnamed: 0,minutes,Area,Year,BCR,FAR,Reformed,Structure_ＳＲＣ
0,3,70,1,80,500,0,0
1,5,200,39,60,400,0,1
2,4,30,16,80,700,0,0
3,2,20,17,80,700,0,1
4,4,25,14,80,600,0,0


In [21]:
X_scaled_ar = scale(X_dumm)
y_scaled_ar = scale(y)
X_scaled = pd.DataFrame(X_scaled_ar, columns=X_dumm.columns)
y_scaled = pd.Series(y_scaled_ar, name=y.name)

In [22]:
d_scaled = pd.concat([X_scaled, y_scaled], axis=1)
display(d_scaled.head())

Unnamed: 0,minutes,Area,Year,BCR,FAR,Reformed,Structure_ＳＲＣ,Price
0,-0.242536,0.57039,-1.483523,0.294884,-0.711057,-0.436436,-0.75,1.640727
1,0.970143,3.88069,1.404482,-3.391165,-1.646659,-0.436436,1.333333,3.849716
2,0.363803,-0.448164,-0.343521,0.294884,1.160146,-0.436436,-0.75,-0.457813
3,-0.848875,-0.702802,-0.26752,0.294884,1.160146,-0.436436,1.333333,-0.568263
4,0.363803,-0.575483,-0.495521,0.294884,0.224544,-0.436436,-0.75,-0.582069


In [23]:
def step_aic_forward(model, exog, endog, **kwargs):
    exog = np.r_[[exog]].flatten()
    endog = np.r_[[endog]].flatten()
    remaining = set(exog)
    selected = []
    
    formula_head = 'Q("' + '") + Q("'.join(endog) + '") ~ '
    formula = formula_head + '1'
    aic = model(formula=formula, **kwargs).fit().aic
    print('AIC: {:.3f}, formula: {}'.format(aic, formula))
    
    current_score, best_new_score = aic, aic
    
    while True:
        score_with_candidates = []
        for candidate in remaining:
            formula_tail = 'Q("' + '") + Q("'.join(selected + [candidate]) + '")'
            formula = formula_head + formula_tail
            aic = model(formula=formula, **kwargs).fit().aic
            print('AIC: {:.3f}, formula: {}'.format(aic, formula))
            score_with_candidates.append((aic, candidate))
        
        score_with_candidates.sort()
        best_score, best_candidate = score_with_candidates[0]
        
        improved = False
        if best_score < current_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_score
            improved = True
            
        if not remaining or not improved: break
    
    formula = formula_head + 'Q("' + '") + Q("'.join(selected) + '")'
    print('The best formula: {}'.format(formula))
    aic = model(formula=formula, **kwargs).fit().aic
    print('Minimum AIC: {:.3f}'.format(aic))
    
    ret = model(formula, **kwargs).fit()
    ret.model.exog_names_org = [re.sub(r'Q\(\"(.*)\"\)',r'\1',x) for x in list(ret.model.exog_names)]
    ret.model.endog_names_org = re.sub(r'Q\(\"(.*)\"\)',r'\1',ret.model.endog_names)
    return ret

In [24]:
header_y = y_scaled.name
header_x = X_scaled.columns
model = step_aic_forward(smf.ols, header_x, header_y, data=d_scaled)

AIC: 72.947, formula: Q("Price") ~ 1
AIC: 74.493, formula: Q("Price") ~ Q("Structure_ＳＲＣ")
AIC: 74.898, formula: Q("Price") ~ Q("minutes")
AIC: 69.082, formula: Q("Price") ~ Q("FAR")
AIC: 74.278, formula: Q("Price") ~ Q("Year")
AIC: 56.065, formula: Q("Price") ~ Q("BCR")
AIC: 74.079, formula: Q("Price") ~ Q("Reformed")
AIC: 11.323, formula: Q("Price") ~ Q("Area")
AIC: 74.493, formula: Q("Price") ~ Q("Structure_ＳＲＣ")
AIC: 74.898, formula: Q("Price") ~ Q("minutes")
AIC: 69.082, formula: Q("Price") ~ Q("FAR")
AIC: 74.278, formula: Q("Price") ~ Q("Year")
AIC: 56.065, formula: Q("Price") ~ Q("BCR")
AIC: 74.079, formula: Q("Price") ~ Q("Reformed")
AIC: 11.323, formula: Q("Price") ~ Q("Area")
AIC: 11.483, formula: Q("Price") ~ Q("Area") + Q("Structure_ＳＲＣ")
AIC: 12.593, formula: Q("Price") ~ Q("Area") + Q("minutes")
AIC: 11.179, formula: Q("Price") ~ Q("Area") + Q("FAR")
AIC: 6.957, formula: Q("Price") ~ Q("Area") + Q("Year")
AIC: 12.855, formula: Q("Price") ~ Q("Area") + Q("BCR")
AIC: 10.550

In [25]:
print("Selected variables:", model.model.exog_names_org[1:])
print("Adjusted R-squared:", model.rsquared_adj)

Selected variables: ['Area', 'Year', 'FAR']
Adjusted R-squared: 0.9386299216689996
