In [71]:
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm

# Render floats with four decimal places in every DataFrame/Series display.
pd.set_option('display.float_format', '{:.4f}'.format)

In [None]:
# Load the raw listings and keep only rows whose contract period is two years
# ('2年間' or '24ヶ月間').
df = pd.read_csv('../../data/0320/df_raw.csv')
# .copy() detaches the filtered rows from the frame read above, so the columns
# assigned to `df` in later cells are written to an independent frame instead
# of a slice (which would trigger SettingWithCopyWarning).
df = df[df['room_period'].isin(['2年間', '24ヶ月間'])].copy()
# `shape` is an attribute (a tuple), not a method; calling it raises TypeError.
df.shape

In [26]:
# Total initial fees
def get_initialFee(x):
    """Return the summed initial fees described by one listing row.

    Parameters
    ----------
    x : row-like object (e.g. the ``pandas.Series`` handed over by
        ``DataFrame.apply(..., axis=1)``)
        Must expose ``room_initialFeeText``, a '/'-separated list of fee
        items, and ``room_price``, the amount that month-denominated items
        are multiplied by.

    Returns
    -------
    int or float
        Sum over all recognised items:
        * an item containing 'ヶ月' counts as that many multiples of
          ``x.room_price``;
        * an item containing '万円' counts as that many units of 10,000.
        Every other item (for example one containing '無') adds nothing.
        The result is the int ``0`` when no item is recognised or when the
        fee text is missing. The total is only meaningful if ``room_price``
        is expressed in the same unit as the 10,000-scaled items
        (presumably yen -- confirm against the caller).

    Raises
    ------
    ValueError
        If the remainder of a recognised item cannot be parsed by ``float``.
    """
    text = x.room_initialFeeText
    # A missing cell arrives as NaN/None rather than str; treat it as
    # "no fees listed" instead of failing on .split().
    if not isinstance(text, str):
        return 0

    ifee = 0
    for fee in text.split('/'):
        if 'ヶ月' in fee:
            # Expressed in months of rent.
            ifee += float(fee.replace('ヶ月', '')) * x.room_price
        elif '万円' in fee:
            # Expressed in units of 10,000.
            ifee += float(fee.replace('万円', '')) * 10000
        # Any other item carries no amount and is skipped.
    return ifee

In [None]:
## Variable processing
# NOTE: this cell overwrites the raw string columns (room_price, room_commonFee,
# room_floor, room_area) with numeric ones, so it cannot be run twice on the
# same `df`; re-run the loading cell first if it has to be repeated.

# Number of months the initial fees are spread over when computing the
# effective monthly cost.
CONTRACT_MONTHS = 24


def _flag_contains(column, *keywords):
    """Return a 0/1 Series: 1 where the cell text contains any of ``keywords``.

    Every cell is passed through ``str`` first, so a missing value (NaN)
    yields 0 instead of raising.
    """
    return column.apply(lambda cell: int(any(word in str(cell) for word in keywords)))


detail = df['room_detailText']
parking = df['room_parking']

# --- monetary columns -------------------------------------------------------
# '万円' amounts are scaled by 10,000.
df['room_price'] = df['room_price'].str.replace('万円', '', regex=False).astype(float) * 10000
df['room_commonFee'] = (
    df['room_commonFee']
    .str.replace('円', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('-', '0', regex=False)  # '-' is read as a fee of 0
    .astype(int)
)
df['room_initialFee'] = df.apply(get_initialFee, axis=1)
df['room_monthly'] = df['room_price'] + df['room_commonFee'] + df['room_initialFee'] / CONTRACT_MONTHS

# --- access -----------------------------------------------------------------
# Strip everything except digits and ';', then keep the smallest of the
# ';'-separated numbers (presumably minutes to each listed station -- confirm).
df['building_stationMin'] = df['building_stationText'].apply(
    lambda x: min(int(i) for i in re.sub(r'[^0-9;]', '', x).split(';'))
)

# --- structural features ----------------------------------------------------
df['room_floor'] = (
    df['room_floor']
    .str.replace('-', '1', regex=False)    # '-' is counted as floor 1
    .str.replace('地下', '-', regex=False)  # the '地下' prefix becomes a minus sign
    .str.replace('階', '', regex=False)
    .astype('int')
)
df['room_area'] = df['room_area'].str.replace('㎡', '', regex=False).astype(float)
# A text containing '新築' gets age 0; otherwise the number before '年 / ' is used.
df['building_age'] = df['building_AFText'].apply(lambda x: 0 if '新築' in x else int(x.split('年 / ')[0]))
df['room_2fUp'] = (df['room_floor'] >= 2).astype(int)
df['room_corner'] = _flag_contains(detail, '角部屋')
df['room_south'] = (df['room_facing'] == '南').astype(int)
df['room_elevator'] = _flag_contains(detail, 'エレベーター')
# Substring test: any structure text containing 'PC' or 'RC' matches
# (so a value containing 'SRC' matches as well).
df['building_PRC'] = df['building_structure'].apply(lambda x: 1 if ('PC' in x) or ('RC' in x) else 0)
df['building_S'] = df['building_structure'].apply(lambda x: 1 if x in ('鉄骨造', '軽量鉄骨造') else 0)

# --- in-room equipment flags (keyword search in the detail text) ------------
df['room_internet'] = _flag_contains(detail, 'インターネット対応')
df['room_autolock'] = _flag_contains(detail, 'オートロック')
df['room_tvMonitor'] = _flag_contains(detail, 'TVモニタ付インターホン')
df['room_flooring'] = _flag_contains(detail, 'フローリング')
df['room_washIn'] = _flag_contains(detail, '室内洗濯機置場')
df['room_airCon'] = _flag_contains(detail, 'エアコン')
df['room_gas'] = _flag_contains(detail, 'ガスコンロ')
df['room_cook2'] = _flag_contains(detail, 'コンロ二口')
df['room_bathSep'] = _flag_contains(detail, 'バス・トイレ別')
df['room_reheat'] = _flag_contains(detail, '追焚')
df['room_sinkSep'] = _flag_contains(detail, '洗面所独立')
df['room_washlet'] = _flag_contains(detail, '温水洗浄便座')
df['room_pet'] = _flag_contains(detail, 'ペット可', 'ペット相談')

# --- parking / surroundings flags --------------------------------------------
df['room_parkingIn'] = _flag_contains(parking, '空有')
df['room_bicycle'] = _flag_contains(detail, '駐輪場')
df['room_parkingNear'] = _flag_contains(parking, '近隣')
df['room_citygas'] = _flag_contains(detail, '都市ガス')

df.head(3)

In [68]:
# Engineered column -> short variable code used in the model.
# Keeping each name next to its code in one mapping means the two cannot drift
# out of alignment, which two parallel positional lists would allow silently.
# Insertion order defines the column order of `data`.
VARIABLE_CODES = {
    'room_monthly': 'p',
    'building_stationMin': 'AC2',
    'room_floor': 'SF1',
    'room_area': 'SF2',
    'building_age': 'SF3',
    'room_2fUp': 'SF4',
    'room_corner': 'SF5',
    'room_south': 'SF6',
    'room_elevator': 'SF7',
    'building_PRC': 'SF8',
    'building_S': 'SF9',
    'room_internet': 'IH1',
    'room_autolock': 'IH2',
    'room_tvMonitor': 'IH3',
    'room_flooring': 'IH4',
    'room_washIn': 'IH5',
    'room_airCon': 'IH6',
    'room_gas': 'IH7',
    'room_cook2': 'IH8',
    'room_bathSep': 'IH9',
    'room_reheat': 'IH10',
    'room_sinkSep': 'IH11',
    'room_washlet': 'IH12',
    'room_pet': 'IH13',
    'room_parkingIn': 'EH1',
    'room_bicycle': 'EH2',
    'room_parkingNear': 'SA1',
    'room_citygas': 'SA2',
}
variables = list(VARIABLE_CODES)

# rename() returns a new frame, so `data` is independent of `df`.
data = df.loc[:, variables].rename(columns=VARIABLE_CODES)

# utf-8-sig writes a BOM at the start of the file; the row index is not written.
data.to_csv('../../data/0320/data.csv', encoding='utf-8-sig', index=False)

In [74]:
# Dimensions of the modelling frame as (rows, columns).
# NOTE(review): the recorded output shows 29 columns, one more than the 28
# selected variables; this is consistent with the cell having been executed
# after the regression cell added `log_p` -- confirm with Restart & Run All.
data.shape

(2373, 29)

In [70]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of `data`
data.describe()

Unnamed: 0,p,AC2,SF1,SF2,SF3,SF4,SF5,SF6,SF7,SF8,...,IH8,IH9,IH10,IH11,IH12,IH13,EH1,EH2,SA1,SA2
count,2373.0,2373.0,2373.0,2373.0,2373.0,2373.0,2373.0,2373.0,2373.0,2373.0,...,2373.0,2373.0,2373.0,2373.0,2373.0,2373.0,2373.0,2373.0,2373.0,2373.0
mean,134915.9131,5.657,3.689,32.5888,25.8538,0.8045,0.5394,0.1677,0.5499,0.6991,...,0.3704,0.7611,0.2984,0.4526,0.5171,0.1395,0.032,0.5466,0.1391,0.8458
std,92759.5674,2.594,2.8198,18.7698,15.8644,0.3967,0.4986,0.3737,0.4976,0.4587,...,0.483,0.4265,0.4576,0.4979,0.4998,0.3465,0.1761,0.4979,0.3461,0.3613
min,32000.0,1.0,-4.0,6.6,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,84250.0,4.0,2.0,20.8,13.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,107250.0,6.0,3.0,26.5,23.0,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
75%,165000.0,7.0,5.0,40.52,38.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
max,1350000.0,14.0,35.0,253.64,73.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [73]:
# Multiple regression analysis: the dependent variable is the natural log of `p`,
# the regressors are every other column of `data` plus an intercept.
data['log_p'] = np.log(data['p'])
y = data['log_p']
X = sm.add_constant(
    data[[column for column in data.columns if column not in ('p', 'log_p')]]
)

# Model 1
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  log_p   R-squared:                       0.900
Model:                            OLS   Adj. R-squared:                  0.899
Method:                 Least Squares   F-statistic:                     784.5
Date:                Thu, 21 Mar 2024   Prob (F-statistic):               0.00
Time:                        10:29:36   Log-Likelihood:                 1164.6
No. Observations:                2373   AIC:                            -2273.
Df Residuals:                    2345   BIC:                            -2112.
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         11.0076      0.023    472.141      0.0