In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
import re

In [2]:
df = pd.read_csv('usedcar.csv')
print(df.shape)
display(df.head())
display(df.describe())

(12, 4)


Unnamed: 0,価格,走行距離,乗車年数,車検
0,89,4.3,5,24
1,99,1.9,4,18
2,128,5.2,2,13
3,98,5.1,3,4
4,52,4.0,6,15


Unnamed: 0,価格,走行距離,乗車年数,車検
count,12.0,12.0,12.0,12.0
mean,60.666667,5.4,6.166667,14.083333
std,33.823965,2.182159,2.329,9.039895
min,23.0,1.9,2.0,0.0
25%,38.75,3.975,4.75,5.5
50%,47.5,4.95,6.5,14.5
75%,91.25,7.45,8.0,24.0
max,128.0,8.7,10.0,24.0


In [3]:
X = df.drop(columns='価格')
y = df['価格']
print('X:', X.shape)
display(X.head())
print('y:', y.shape)
print(y.head())

X: (12, 3)


Unnamed: 0,走行距離,乗車年数,車検
0,4.3,5,24
1,1.9,4,18
2,5.2,2,13
3,5.1,3,4
4,4.0,6,15


y: (12,)
0     89
1     99
2    128
3     98
4     52
Name: 価格, dtype: int64


In [4]:
X_scaled_ar = scale(X)
y_scaled_ar = scale(y)
X_scaled = pd.DataFrame(X_scaled_ar, columns=X.columns)
y_scaled = pd.Series(y_scaled_ar, name=y.name)
d_scaled = pd.concat([X_scaled, y_scaled], axis=1)
display(d_scaled.head())

Unnamed: 0,走行距離,乗車年数,車検,価格
0,-0.526503,-0.523205,1.145768,0.874918
1,-1.675236,-0.971666,0.45253,1.183713
2,-0.095728,-1.868588,-0.125168,2.079217
3,-0.143592,-1.420127,-1.165024,1.152833
4,-0.670094,-0.074744,0.105911,-0.267622


In [5]:
def step_aic_forward(model, exog, endog, **kwargs):
    exog = np.r_[[exog]].flatten()
    endog = np.r_[[endog]].flatten()
    remaining = set(exog)
    selected = []

    formula_head = 'Q("' + '") + Q("'.join(endog) + '") ~ '
    formula = formula_head + '1'
    aic = model(formula=formula, **kwargs).fit().aic
    print('AIC: {:.3f}, formula: {}'.format(aic, formula))

    current_score, best_new_score = aic, aic

    while True:
        score_with_candidates = []
        for candidate in remaining:
            formula_tail = 'Q("' + '") + Q("'.join(selected + [candidate]) + '")'
            formula = formula_head + formula_tail
            aic = model(formula=formula, **kwargs).fit().aic
            print('AIC: {:.3f}, formula: {}'.format(aic, formula))

            score_with_candidates.append((aic, candidate))

        score_with_candidates.sort()
        best_score, best_candidate = score_with_candidates[0]

        improved = False
        if best_score < current_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_score
            improved = True
            
        if not remaining or not improved: break

    formula = formula_head + 'Q("' + '") + Q("'.join(selected) + '")'
    print('The best formula: {}'.format(formula))
    aic = model(formula=formula, **kwargs).fit().aic
    print('Minimum AIC: {:.3f}'.format(aic))
    
    ret = model(formula, **kwargs).fit()
    ret.model.exog_names_org = [re.sub(r'Q\(\"(.*)\"\)',r'\1',x) for x in list(ret.model.exog_names)]
    ret.model.endog_names_org = re.sub(r'Q\(\"(.*)\"\)',r'\1',ret.model.endog_names)
    return ret

In [6]:
header_y = y_scaled.name
header_x = X_scaled.columns
model = step_aic_forward(smf.ols, header_x, header_y, data=d_scaled)

AIC: 36.055, formula: Q("価格") ~ 1
AIC: 37.968, formula: Q("価格") ~ Q("車検")
AIC: 34.832, formula: Q("価格") ~ Q("走行距離")
AIC: 16.809, formula: Q("価格") ~ Q("乗車年数")
AIC: 17.267, formula: Q("価格") ~ Q("乗車年数") + Q("車検")
AIC: 13.738, formula: Q("価格") ~ Q("乗車年数") + Q("走行距離")
AIC: 14.560, formula: Q("価格") ~ Q("乗車年数") + Q("走行距離") + Q("車検")
The best formula: Q("価格") ~ Q("乗車年数") + Q("走行距離")
Minimum AIC: 13.738


In [8]:
selected_vars = model.model.exog_names_org
print("Selected variables:", selected_vars)
selected_vars_clean = [var for var in selected_vars if var != 'Intercept']
print("Variables for analysis:", selected_vars_clean)
X_scaled_selected = X_scaled[selected_vars_clean]
mod_selected = sm.OLS(y_scaled, X_scaled_selected)
res_selected = mod_selected.fit()
print(res_selected.summary())

Selected variables: ['Intercept', '乗車年数', '走行距離']
Variables for analysis: ['乗車年数', '走行距離']
                                 OLS Regression Results                                
Dep. Variable:                     価格   R-squared (uncentered):                   0.888
Model:                            OLS   Adj. R-squared (uncentered):              0.866
Method:                 Least Squares   F-statistic:                              39.81
Date:                Mon, 29 Sep 2025   Prob (F-statistic):                    1.73e-05
Time:                        17:48:43   Log-Likelihood:                         -3.8691
No. Observations:                  12   AIC:                                      11.74
Df Residuals:                      10   BIC:                                      12.71
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std 



In [9]:
print(f"Adjusted R-squared: {res_selected.rsquared_adj:.3f}")

Adjusted R-squared: 0.866
