In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
import re

In [40]:
# Load the condominium listing data. The CSV is Shift-JIS encoded (the
# Structure column holds full-width text such as ＲＣ / ＳＲＣ).
df = pd.read_csv('condo.csv', sep=',', header=0,encoding='shift-jis')
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that only appends a stray "None" line.
df.info()
display(df.head())

(25, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   No         25 non-null     int64 
 1   minutes    25 non-null     int64 
 2   Price      25 non-null     int64 
 3   Area       25 non-null     int64 
 4   Year       25 non-null     int64 
 5   Structure  25 non-null     object
 6   BCR        25 non-null     int64 
 7   FAR        25 non-null     int64 
 8   Reformed   25 non-null     int64 
dtypes: int64(8), object(1)
memory usage: 1.9+ KB
None


Unnamed: 0,No,minutes,Price,Area,Year,Structure,BCR,FAR,Reformed
0,1,3,190000000,70,1,ＲＣ,80,500,0
1,2,5,350000000,200,39,ＳＲＣ,60,400,0
2,4,4,38000000,30,16,ＲＣ,80,700,0
3,5,2,30000000,20,17,ＳＲＣ,80,700,0
4,6,4,29000000,25,14,ＲＣ,80,600,0


In [41]:
# Explanatory variables: every column except 'No' (presumably just a listing
# number -- TODO confirm) and the target 'Price'.
X = df.drop(columns=['No','Price'])
# Response variable.
y = df['Price']
print('X:', X.shape)
print('y:', y.shape)
# 'Structure' is the only object-dtype column; list its levels and counts.
print(X['Structure'].value_counts())

X: (25, 7)
y: (25,)
Structure
ＲＣ     16
ＳＲＣ     9
Name: count, dtype: int64


In [42]:
# Dummy-encode the non-numeric column(s). drop_first=True drops the first
# level of each categorical, so a two-level column yields a single 0/1
# indicator; dtype='uint8' stores it as an integer rather than bool.
X_dumm = pd.get_dummies(X, drop_first=True, dtype='uint8')
print('X_dumm:', X_dumm.shape)
display(X_dumm.head())

X_dumm: (25, 7)


Unnamed: 0,minutes,Area,Year,BCR,FAR,Reformed,Structure_ＳＲＣ
0,3,70,1,80,500,0,0
1,5,200,39,60,400,0,1
2,4,30,16,80,700,0,0
3,2,20,17,80,700,0,1
4,4,25,14,80,600,0,0


In [43]:
# Full model in the original units: prepend an intercept column ('const')
# and fit ordinary least squares of Price on all explanatory variables.
X_dumm_c = sm.add_constant(X_dumm)
model = sm.OLS(y, X_dumm_c)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.930
Method:                 Least Squares   F-statistic:                     46.51
Date:                Mon, 06 Oct 2025   Prob (F-statistic):           7.36e-10
Time:                        17:03:33   Log-Likelihood:                -450.39
No. Observations:                  25   AIC:                             916.8
Df Residuals:                      17   BIC:                             926.5
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          7.574e+07   9.08e+07      0.834

In [44]:
# Goodness of fit (R-squared, adjusted R-squared) and the raw-unit
# coefficients of the full model.
print(results.rsquared)
print(results.rsquared_adj)
print(results.params)

0.9503795837148971
0.9299476475975017
const            7.573668e+07
minutes         -2.274047e+06
Area             1.775590e+06
Year            -9.504725e+05
BCR             -2.994994e+05
FAR             -6.894131e+04
Reformed         7.205383e+06
Structure_ＳＲＣ    4.965924e+05
dtype: float64


In [45]:
# Standardize predictors and response so that the fitted coefficients are
# standardized partial regression coefficients, comparable across variables.
X_scaled_ar = scale(X_dumm)
y_scaled_ar = scale(y)
# scale() returns plain ndarrays; re-wrap them to keep column / series names.
X_scaled = pd.DataFrame(X_scaled_ar, columns=X_dumm.columns)
y_scaled = pd.Series(y_scaled_ar, name=y.name)
# No constant column is added here. NOTE(review): statsmodels therefore
# reports the *uncentered* R-squared and one more residual degree of freedom
# than the intercept model above, so std errors / AIC of this fit are not
# directly comparable with fits that include an intercept.
model = sm.OLS(y_scaled, X_scaled)
results_scaled = model.fit()
print(results_scaled.summary())

                                 OLS Regression Results                                
Dep. Variable:                  Price   R-squared (uncentered):                   0.950
Model:                            OLS   Adj. R-squared (uncentered):              0.931
Method:                 Least Squares   F-statistic:                              49.25
Date:                Mon, 06 Oct 2025   Prob (F-statistic):                    1.85e-10
Time:                        17:03:33   Log-Likelihood:                          2.0684
No. Observations:                  25   AIC:                                      9.863
Df Residuals:                      18   BIC:                                      18.40
Df Model:                           7                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------

In [46]:
# Rank the standardized coefficients by absolute size (largest effect first).
print(results_scaled.params.sort_values(key=np.abs, ascending=False))

Area             0.962703
Year            -0.172663
FAR             -0.101733
minutes         -0.051779
Reformed         0.036470
BCR             -0.022436
Structure_ＳＲＣ    0.003291
dtype: float64


In [47]:
# Two hand-made example rows to predict. The values are positional, so their
# order must match X_dumm.columns (see the displayed header below).
X_test = pd.DataFrame([[5,65, 4, 75, 510, 1,1],
                       [15,75, 10, 65, 410, 0,0],
                        ],
                        columns=X_dumm.columns)  # illustrative inputs, not observed data
print('X for prediction:')
display(X_test)

X for prediction:


Unnamed: 0,minutes,Area,Year,BCR,FAR,Reformed,Structure_ＳＲＣ
0,5,65,4,75,510,1,1
1,15,75,10,65,410,0,0


In [48]:
# has_constant='add' forces the intercept column to be added even if one of
# the columns of the new rows happens to be constant.
X_test_c = sm.add_constant(X_test,has_constant='add')
# Despite the name, y_test holds model predictions, not observed prices.
y_test = results.predict(X_test_c)
print(y_test)

0    1.260574e+08
1    1.175571e+08
dtype: float64


In [49]:
# Fitted values of the full model on the data it was trained on.
y_pred = results.predict(X_dumm_c)
print(y_pred.head())

0    1.338248e+08
1    3.373661e+08
2    3.248177e+07
3    1.882008e+07
4    3.239889e+07
dtype: float64


In [50]:
# In-sample error: y_pred was computed on the training rows, so this measures
# goodness of fit rather than out-of-sample accuracy.
mse = mean_squared_error(y, y_pred)
print('MSE, RMSE:', mse, np.sqrt(mse))

MSE, RMSE: 260323312082204.3 16134537.863918021


In [51]:
# Single frame holding standardized predictors plus the standardized target,
# as required by the formula interface (which looks variables up by name).
d_scaled=pd.concat([X_scaled,y_scaled],axis=1)
display(d_scaled.head())

Unnamed: 0,minutes,Area,Year,BCR,FAR,Reformed,Structure_ＳＲＣ,Price
0,-0.242536,0.57039,-1.483523,0.294884,-0.711057,-0.436436,-0.75,1.640727
1,0.970143,3.88069,1.404482,-3.391165,-1.646659,-0.436436,1.333333,3.849716
2,0.363803,-0.448164,-0.343521,0.294884,1.160146,-0.436436,-0.75,-0.457813
3,-0.848875,-0.702802,-0.26752,0.294884,1.160146,-0.436436,1.333333,-0.568263
4,0.363803,-0.575483,-0.495521,0.294884,0.224544,-0.436436,-0.75,-0.582069


In [52]:
def step_aic_forward(model, exog, endog, **kwargs):
    """Select explanatory variables by forward stepwise search on AIC.

    Starting from the intercept-only model, each round fits one candidate
    model per remaining variable, adds the variable that gives the lowest
    AIC, and stops as soon as no candidate lowers the AIC any further.
    Every fitted formula and its AIC is printed as a trace.

    Parameters
    ----------
    model : callable
        Formula-style model factory, called as ``model(formula=..., **kwargs)``
        (e.g. ``statsmodels.formula.api.ols``). The returned object must offer
        ``fit()``, whose result exposes ``aic`` and a ``model`` attribute with
        ``exog_names`` and ``endog_names``.
    exog : str or sequence of str
        Candidate explanatory variable names.
    endog : str or sequence of str
        Response variable name(s).
    **kwargs
        Forwarded unchanged to every ``model`` call (e.g. ``data=...``).

    Returns
    -------
    The fitted result of the selected model. Two attributes are added to
    ``ret.model``: ``exog_names_org`` and ``endog_names_org``, i.e. the term
    names with the ``Q("...")`` wrapper stripped. If no candidate improves on
    the intercept-only model (or ``exog`` is empty), the intercept-only fit
    is returned.
    """
    def quoted(names):
        # Q("...") is the formula language's quoting function; it lets column
        # names be referenced verbatim, whatever characters they contain.
        return ' + '.join('Q("{}")'.format(name) for name in names)

    # ravel() accepts a single name as well as any sequence / Index of names.
    # A list (not a set) keeps the caller's order so the printed trace is the
    # same on every run; dict.fromkeys drops duplicate names.
    remaining = list(dict.fromkeys(str(name) for name in np.asarray(exog).ravel()))
    endog_names = [str(name) for name in np.asarray(endog).ravel()]
    selected = []

    formula_head = quoted(endog_names) + ' ~ '

    # Baseline: intercept-only model.
    formula = formula_head + '1'
    current_score = model(formula=formula, **kwargs).fit().aic
    print('AIC: {:.3f}, formula: {}'.format(current_score, formula))

    while remaining:
        score_with_candidates = []
        for candidate in remaining:
            formula = formula_head + quoted(selected + [candidate])
            aic = model(formula=formula, **kwargs).fit().aic
            print('AIC: {:.3f}, formula: {}'.format(aic, formula))
            score_with_candidates.append((aic, candidate))

        # Lowest AIC wins; ties are broken by candidate name.
        best_score, best_candidate = min(score_with_candidates)
        if not best_score < current_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_score

    # With nothing selected the right-hand side must be the explicit
    # intercept '1'; an empty Q("") term is not a valid formula.
    formula = formula_head + (quoted(selected) if selected else '1')
    print('The best formula: {}'.format(formula))
    # Fit the winning formula once and reuse it for both the report and the
    # return value.
    ret = model(formula=formula, **kwargs).fit()
    print('Minimum AIC: {:.3f}'.format(ret.aic))

    # Expose the plain variable names alongside the Q("...")-wrapped ones.
    ret.model.exog_names_org = [re.sub(r'Q\(\"(.*)\"\)',r'\1',x) for x in list(ret.model.exog_names)]
    ret.model.endog_names_org = re.sub(r'Q\(\"(.*)\"\)',r'\1',ret.model.endog_names)
    return ret

In [53]:
# Run forward AIC selection on the standardized data via the formula API.
header_y = y_scaled.name
header_x = X_scaled.columns
# NOTE: step_aic_forward returns a *fitted result*, so from here on `model`
# is a results object and no longer the sm.OLS model bound in earlier cells.
model = step_aic_forward(smf.ols, header_x,header_y, data=d_scaled)

AIC: 72.947, formula: Q("Price") ~ 1
AIC: 69.082, formula: Q("Price") ~ Q("FAR")
AIC: 74.898, formula: Q("Price") ~ Q("minutes")
AIC: 11.323, formula: Q("Price") ~ Q("Area")
AIC: 56.065, formula: Q("Price") ~ Q("BCR")
AIC: 74.079, formula: Q("Price") ~ Q("Reformed")
AIC: 74.278, formula: Q("Price") ~ Q("Year")
AIC: 74.493, formula: Q("Price") ~ Q("Structure_ＳＲＣ")
AIC: 11.179, formula: Q("Price") ~ Q("Area") + Q("FAR")
AIC: 12.593, formula: Q("Price") ~ Q("Area") + Q("minutes")
AIC: 12.855, formula: Q("Price") ~ Q("Area") + Q("BCR")
AIC: 10.550, formula: Q("Price") ~ Q("Area") + Q("Reformed")
AIC: 6.957, formula: Q("Price") ~ Q("Area") + Q("Year")
AIC: 11.483, formula: Q("Price") ~ Q("Area") + Q("Structure_ＳＲＣ")
AIC: 5.838, formula: Q("Price") ~ Q("Area") + Q("Year") + Q("FAR")
AIC: 7.669, formula: Q("Price") ~ Q("Area") + Q("Year") + Q("minutes")
AIC: 8.169, formula: Q("Price") ~ Q("Area") + Q("Year") + Q("BCR")
AIC: 8.950, formula: Q("Price") ~ Q("Area") + Q("Year") + Q("Reformed")
AI

In [54]:
# Refit on the standardized data using only Area, Year and FAR (hard-coded
# here -- presumably the variables chosen by the stepwise search above).
X_scaled2 = X_scaled[['Area', 'Year', 'FAR']]

# No constant column is added, so this is a no-intercept fit; its AIC and
# degrees of freedom are not directly comparable with intercept models.
mod2 = sm.OLS(y_scaled,X_scaled2)
res2 = mod2.fit()
print(res2.summary())

                                 OLS Regression Results                                
Dep. Variable:                  Price   R-squared (uncentered):                   0.946
Model:                            OLS   Adj. R-squared (uncentered):              0.939
Method:                 Least Squares   F-statistic:                              129.2
Date:                Mon, 06 Oct 2025   Prob (F-statistic):                    4.04e-14
Time:                        17:03:34   Log-Likelihood:                          1.0811
No. Observations:                  25   AIC:                                      3.838
Df Residuals:                      22   BIC:                                      7.494
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [55]:
# Multicollinearity check: variance inflation factor of every column of the
# reduced design matrix (mod2.exog is the standardized predictor array).
num_cols = mod2.exog.shape[1] 
vifs = [variance_inflation_factor(mod2.exog, i)
        for i in range(0, num_cols)]
# Last expression of the cell: rendered as a table with one row per variable.
pd.DataFrame(vifs, index=mod2.exog_names, columns=["VIF"])

Unnamed: 0,VIF
Area,1.2816
Year,1.10085
FAR,1.191255


In [56]:
# Final model: the same three predictors, but in their original units and
# with an intercept, so coefficients and predictions are in price units.
X_final = X_dumm[['Area', 'Year', 'FAR']]

mod_final = sm.OLS(y, sm.add_constant(X_final))
res_final = mod_final.fit()
print(res_final.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.946
Model:                            OLS   Adj. R-squared:                  0.939
Method:                 Least Squares   F-statistic:                     123.4
Date:                Mon, 06 Oct 2025   Prob (F-statistic):           1.71e-13
Time:                        17:03:34   Log-Likelihood:                -451.37
No. Observations:                  25   AIC:                             910.7
Df Residuals:                      21   BIC:                             915.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        3.84e+07   2.47e+07      1.552      0.1

In [57]:
# Sample covariance matrix of the final predictors (DataFrame.cov divides by
# n - 1), wrapped as np.matrix; used as the metric for the Mahalanobis distance.
Sigma = np.asmatrix(X_final.cov())

def Mahala2(vec_x, vec_mean, mat):
    """Return the squared Mahalanobis distance of ``vec_x`` from ``vec_mean``.

    Computes ``(x - m)^T  mat^{-1}  (x - m)``.

    Parameters
    ----------
    vec_x : array-like of length p
        The point to measure (list, ndarray or pandas Series).
    vec_mean : array-like of length p
        The centre of the distribution (list, ndarray or pandas Series).
        Values are matched to ``vec_x`` by position, not by label.
    mat : array-like of shape (p, p)
        Covariance matrix (ndarray, np.matrix or DataFrame).

    Returns
    -------
    numpy.float64
        The squared distance.

    Raises
    ------
    ValueError
        If the vector lengths and the matrix shape are inconsistent.
    numpy.linalg.LinAlgError
        If ``mat`` is singular.
    """
    x = np.asarray(vec_x, dtype='float64').ravel()
    mean = np.asarray(vec_mean, dtype='float64').ravel()
    cov = np.asarray(mat, dtype='float64')
    if cov.ndim != 2 or cov.shape[0] != cov.shape[1]:
        raise ValueError('mat must be a square matrix, got shape {}'.format(cov.shape))
    if x.shape != mean.shape or x.size != cov.shape[0]:
        raise ValueError(
            'vec_x, vec_mean and mat have inconsistent sizes: {}, {}, {}'.format(
                x.size, mean.size, cov.shape))

    diff = x - mean
    # Solve cov @ z = diff instead of forming the explicit inverse: same
    # quantity, one factorisation, and numerically better behaved.
    return diff.dot(np.linalg.solve(cov, diff))

# Sample size.
n = len(X_final) 
# Model degrees of freedom as reported by statsmodels (number of regressors;
# the summary above shows it does not count the intercept).
dfm = res_final.df_model 
# Upper 2.5% point of Student's t with n - dfm - 1 degrees of freedom, i.e.
# the critical value for two-sided 95% intervals.
t_0025 = ss.t.isf(q=0.05/2, df=n-dfm-1) 
# Column means of the predictors: the centre used for the Mahalanobis distance.
vec_mean = X_final.mean() 
print(vec_mean)
display(X_final.head())

Area     47.60
Year     20.52
FAR     576.00
dtype: float64


Unnamed: 0,Area,Year,FAR
0,70,1,500
1,200,39,400
2,30,16,700
3,20,17,700
4,25,14,600


In [58]:
# 95% confidence interval for the fitted (mean) value at the first training row.
# A dedicated name is used for the row vector: rebinding the notebook-wide `X`
# (the feature DataFrame) to a single-row Series would break any re-run of the
# earlier cells that read `X`.
x_obs = sm.add_constant(X_final).iloc[0,:]
print(x_obs)
# Point estimate: inner product of [1, Area, Year, FAR] with the coefficients.
hat_y = x_obs.dot(res_final.params)

# Squared Mahalanobis distance of this row from the predictor means.
D2_0 = Mahala2(X_final.iloc[0,:], vec_mean, Sigma)
# Residual variance estimate of the final model.
Ve = res_final.scale
# Standard error of the fitted value: 1/n + D^2/(n-1) is the leverage of the
# point (Sigma uses the n-1 denominator, hence the division by n-1).
Se_o = np.sqrt((1/n + D2_0 / (n-1)) * Ve)

ci_low = hat_y - t_0025 * Se_o
ci_up = hat_y + t_0025 * Se_o

print("理論値（theoretical value）:", hat_y)
print("理論値の信頼区間（CI of theoretical value）：({0},{1})".format(ci_low,ci_up))

const      1.0
Area      70.0
Year       1.0
FAR      500.0
Name: 0, dtype: float64
理論値（theoretical value）: 131076914.15414217
理論値の信頼区間（CI of theoretical value）：(114604611.27388398,147549217.03440034)


In [59]:
# 95% prediction interval for a new observation (first row of X_test).
X_test2 = X_test[['Area', 'Year', 'FAR']]
# has_constant='add' (as in the earlier prediction cell) guarantees the
# intercept column is present: with the default 'skip', add_constant silently
# omits it whenever some column of the new rows is constant, and the dot
# product with the coefficients would then be misaligned.
# A dedicated name is used instead of rebinding the notebook-wide `X`.
x_new = sm.add_constant(X_test2, has_constant='add').iloc[0,:]
print(x_new)
hat_y = x_new.dot(res_final.params)

# Squared Mahalanobis distance of the new point from the training means.
D2_0 = Mahala2(X_test2.iloc[0,:], vec_mean, Sigma)
# Standard error for a single new observation: the leading 1 adds the residual
# variance of the observation itself on top of the uncertainty of the fitted
# mean. `Ve` is the residual variance set in the previous cell.
Se_p = np.sqrt((1 + 1/n + D2_0 / (n-1)) * Ve)

pi_low = hat_y - t_0025 * Se_p
pi_up = hat_y + t_0025 * Se_p

# NOTE: the label below says "CI", but the interval printed is a prediction
# interval; the text is kept unchanged to match the recorded output.
print("予測値（Predicted value）:", hat_y)
print("予測値の信頼区間（CI of predicted value）：({0},{1})".format(pi_low,pi_up))

const      1.0
Area      65.0
Year       4.0
FAR      510.0
Name: 0, dtype: float64
予測値（Predicted value）: 119198934.78865579
予測値の信頼区間（CI of predicted value）：(78467939.23492995,159929930.34238163)
