In [2]:
###-- Mount to Google Drive
from google.colab import drive
drive.mount('/content/drive')

!ls 'drive/My Drive/jupyter/ProbSpace/RealEstatePrices/data/'

In [0]:
###----------------------------------------###
###           Input information            ###
###----------------------------------------###
##-- CSV filename
train_filename, test_filename = "train_data.csv", "test_data.csv"

##-- Data PATH
PATH_data = "drive/My Drive/jupyter/ProbSpace/RealEstatePrices/data/"

In [0]:
###-----------------------------------###
###        Import Library             ###
###-----------------------------------###
##-- Pandas
import pandas as pd
from pandas import Series, DataFrame
pd.set_option('max_columns', 30)
##-- Numpy
import numpy as np
##-- Matplotlib
import matplotlib.pylab as plt
import matplotlib.cm as cm #-- for gradation
import seaborn as sns
##-- Scikit-learn
import sklearn  #-- print(sklearn.__version__)

##-- Ignore Warning
import warnings
warnings.filterwarnings('ignore')

plt.style.use('bmh')
from itertools import cycle
color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])

In [0]:
###-----------------------------###
###        Read dataset         ###
###-----------------------------###
f = pd.read_csv(PATH_data+"/"+train_filename, encoding="utf-8")
g = pd.read_csv(PATH_data+"/"+test_filename, encoding="utf-8")
##-- Convert Feature names from Japanese into English
##-- List is in "names.json"
import json
with open(PATH_data+"/"+"names.json", "r", encoding="utf-8") as f_json:
     d = json.load(f_json)
f = f.rename(columns=d)
g = g.rename(columns=d)

In [0]:
##-- Remove the values, deviating significantly
# f = f[f["y"] < (90)]
# print(f.shape)
# plt.hist(f['y'], bins=20)

In [0]:
##-- Assign appropriate numerical values to non-numeric data
f['TimeToNearestStation'] = f['TimeToNearestStation'].replace('30分?60分','45')
f['TimeToNearestStation'] = f['TimeToNearestStation'].replace('1H?1H30','75')
f['TimeToNearestStation'] = f['TimeToNearestStation'].replace('1H30?2H','105')
f['TimeToNearestStation'] = f['TimeToNearestStation'].replace('2H?','120')
f['TimeToNearestStation'] = pd.to_numeric(f['TimeToNearestStation'], errors='coerce')
##-----------------------------------------------------------------------------
g['TimeToNearestStation'] = g['TimeToNearestStation'].replace('30分?60分','45')
g['TimeToNearestStation'] = g['TimeToNearestStation'].replace('1H?1H30','75')
g['TimeToNearestStation'] = g['TimeToNearestStation'].replace('1H30?2H','105')
g['TimeToNearestStation'] = g['TimeToNearestStation'].replace('2H?','120')
g['TimeToNearestStation'] = pd.to_numeric(g['TimeToNearestStation'], errors='coerce')

In [0]:
##-- Convert the Japanese calendar to the Western calendar. 
##-- However, it is roughly the value before the war.
f['BuildingYear'] = f['BuildingYear'].dropna()
f['BuildingYear'] = f['BuildingYear'].str.replace('戦前','昭和20年')
f['年号'] = f['BuildingYear'].str[:2]
f['和暦年数'] = f['BuildingYear'].str[2:].str.strip('年').fillna(0).astype(int)
#-- AD: Western calendar
f.loc[f['年号']=='昭和','BuildingYear_AD'] = f['和暦年数'] + 1925
f.loc[f['年号']=='平成','BuildingYear_AD'] = f['和暦年数'] + 1988
##-----------------------------------------------------------------------------
g['BuildingYear'] = g['BuildingYear'].dropna()
g['BuildingYear'] = g['BuildingYear'].str.replace('戦前','昭和20年')
g['年号'] = g['BuildingYear'].str[:2]
g['和暦年数'] = g['BuildingYear'].str[2:].str.strip('年').fillna(0).astype(int)

g.loc[g['年号']=='昭和','BuildingYear_AD'] = g['和暦年数'] + 1925
g.loc[g['年号']=='平成','BuildingYear_AD'] = g['和暦年数'] + 1988

In [0]:
##-- Convert "String" into numerical values.
f['Area'] = f['Area'].replace('2000㎡以上',"3500")
f['Area'] = f['Area'].replace('5000㎡以上',"5000")
f['Area'] = pd.to_numeric(f['Area'], errors='coerce')

g['Area'] = g['Area'].replace('2000㎡以上',"3500")
g['Area'] = g['Area'].replace('5000㎡以上',"5000")
g['Area'] = pd.to_numeric(g['Area'], errors='coerce')

##-- Convert "String" into numerical values.
f['TotalFloorArea'] = f['TotalFloorArea'].replace('2000㎡以上',"3500")
f['TotalFloorArea'] = f['TotalFloorArea'].replace('5000㎡以上',"5000")
f['TotalFloorArea'] = pd.to_numeric(f['TotalFloorArea'], errors='coerce')

g['TotalFloorArea'] = g['TotalFloorArea'].replace('2000㎡以上',"3500")
g['TotalFloorArea'] = g['TotalFloorArea'].replace('5000㎡以上',"5000")
g['TotalFloorArea'] = pd.to_numeric(g['TotalFloorArea'], errors='coerce')

In [0]:
###-- Create the enginearing feature(Number of room)
room_code = {\
            "オープンフロア":1,"スタジオ":1,"メゾネット":2,\
            "１Ｒ":1,"１Ｒ＋Ｓ":1, "１Ｋ":1,"１Ｋ＋Ｓ":1,"１ＤＫ":2, "１ＤＫ＋Ｓ":2,\
            "１Ｌ":2,"１ＬＫ":2,"１ＬＤＫ":2,"１ＬＤＫ＋Ｓ":2,"１ＬＫ＋Ｓ":2,"１Ｌ＋Ｓ":2,\
            "１ＬＤ＋Ｓ":2,
            "２Ｋ":2,"２Ｋ＋Ｓ":2,"２ＤＫ":3,"２ＤＫ＋Ｓ":3,"２Ｋ＋Ｓ":3,"２ＬＫ＋Ｓ":3,\
            "２ＬＤ":3,"２Ｄ":3,"２ＬＤ＋Ｓ":3,"２ＬＫ":3,"２ＬＤＫ":3,"２ＬＤＫ＋Ｓ":3,\
            "３Ｋ":3,"３Ｋ＋Ｓ":3,"３ＤＫ":4,"３ＤＫ＋Ｓ":4,"３Ｋ＋Ｓ":4,"３ＬＫ＋Ｓ":4,\
            "３ＬＤ":4,"３ＬＤ＋Ｓ":4,"３ＬＫ":4,"３ＬＤＫ":4,"３ＬＤＫ＋Ｓ":4,"３ＬＤＫ＋Ｋ":4,\
            "４Ｋ":4,"４Ｋ＋Ｓ":4,"４ＤＫ":5,"４ＤＫ＋Ｓ":5,"４ＬＫ＋Ｓ":5,\
            "４ＬＤ":5,"４ＬＤ＋Ｓ":5,"４ＬＫ":5,"４ＬＤＫ":5,"４ＬＤＫ＋Ｓ":5,\
            "５Ｋ":5,"５Ｋ＋Ｓ":5,"５ＤＫ":6,"５ＤＫ＋Ｓ":6,"５ＬＫ＋Ｓ":6,"５ＬＫ＋Ｓ":6,\
            "５ＬＤ":6,"５ＬＤ＋Ｓ":6,"５ＬＫ":6,"５ＬＤＫ":6,"５ＬＤＫ＋Ｓ":6,\
            "６Ｋ":6,"６Ｋ＋Ｓ":6,"６ＤＫ":7,"６ＤＫ＋Ｓ":7,\
            "６ＬＤ":7,"６ＬＤ＋Ｓ":7,"６ＬＫ":7,"６ＬＤＫ":7,"６ＬＤＫ＋Ｓ":7,\
            "７Ｋ":7,"７Ｋ＋Ｓ":7,"７ＤＫ":8,"７ＤＫ＋Ｓ":8,\
            "７ＬＤ":8,"７ＬＤ＋Ｓ":8,"７ＬＫ":8,"７ＬＤＫ":8,"７ＬＤＫ＋Ｓ":8,\
            }

f["NumberOfRoom"] = f["FloorPlan"].copy()
f["NumberOfRoom"] = f["NumberOfRoom"].fillna(1)
g["NumberOfRoom"] = g["FloorPlan"].copy()
g["NumberOfRoom"] = g["NumberOfRoom"].fillna(1)

for name in room_code:
    f["NumberOfRoom"] = f["NumberOfRoom"].replace(name, room_code[name])
    g["NumberOfRoom"] = g["NumberOfRoom"].replace(name, room_code[name])

# print(f["NumberOfRoom"].head(5))

In [20]:
f["NumberOfRoom"].astype(int)
g["NumberOfRoom"].astype(int)
print(f["NumberOfRoom"].dtype)

int64


In [0]:
###-- Create the enginearing feature(Area per a room)
f["AreaPerRoom"] = f['TotalFloorArea'] / f["NumberOfRoom"]
g["AreaPerRoom"] = g['TotalFloorArea'] / g["NumberOfRoom"]

In [0]:
##-- Feature to use
input_name = [\
              #-- "延床面積（㎡）"
              "TotalFloorArea",\
              #-- "面積（㎡）"(IMPORTANT)
              'Area',\
              #-- "容積率（％）"
              'FloorAreaRatio',\
              #-- "最寄駅：距離（分）"
              'TimeToNearestStation',\
              #-- "建築年(西暦))"
              'BuildingYear_AD',\
              #-- "市区町村名"
              'Municipality',\
              #-- "最寄駅：名称"
              'NearestStation',\
              ###---  Base Line ---###
              #-- '地区名'
              "DistrictName",\
              ##-- "用途"
              'Use',\
              #-- "種類"(IMPORTANT)
              "Type",\
              # #-- "今後の利用目的"
              # "Purpose",\
              # #-- '間取り'(IMPORTANT)
              # 'FloorPlan',\
              # #-- "改装"
              # "Renovation",\
              ###--- Features created by S. Nakamura ---###
              # ##-- "部屋の数"
              # "NumberOfRoom",\
              ##-- "１部屋あたりの面積"
              "AreaPerRoom",\
              ]

##-- Numerical variables
nume_cols = [\
             "TotalFloorArea",\
             'Area',\
             'FloorAreaRatio',\
             'TimeToNearestStation',\
             'BuildingYear_AD',\
            #  "NumberOfRoom",\
             "AreaPerRoom",\
             ]
##-- Categorical variables
cat_cols = [\
            "Municipality",\
            "NearestStation",\
            ##-- Base line --##
            "DistrictName",\
            "Use",\
            "Type",\
            # "Purpose",\
            # 'FloorPlan',\
            # "Renovation"\
            ]

In [24]:
##-- Preapare exploratory and target variables
X = f[input_name]
Y = f['y']
# Y = np.log( f['y'] )  #-- 分布を正規分布に近づけるため対数をとる

X_pre = g[input_name]

###--------------------------------------###
###-- Preprocessing for missing values --###
###--------------------------------------###
##-- In the case of "Categorical variable"
for name in cat_cols:
  X[name] = X[name].fillna("Unknown")
  X_pre[name] = X_pre[name].fillna("Unknown")

##-- In the case of "Numerical variable"
for name in nume_cols:
  X[name] = X[name].fillna(X[name].median())
  X_pre[name] = X_pre[name].fillna(X_pre[name].median())


print(X.isna().sum())

TotalFloorArea          0
Area                    0
FloorAreaRatio          0
TimeToNearestStation    0
BuildingYear_AD         0
Municipality            0
NearestStation          0
DistrictName            0
Use                     0
Type                    0
AreaPerRoom             0
dtype: int64


In [0]:
# ##-- Create interacted variables
# k = 0
# for i in range(len(nume_cols)):
#     for j in range(len(nume_cols)):
#         if i == j:
#             pass
#         else:
#             k += 1
#             ##-- 積
#             name_seki = "feature_seki" + str(k)
#             X[name_seki] = X[nume_cols[i]]*X[nume_cols[j]]
#             X_pre[name_seki] = X_pre[nume_cols[i]]*X_pre[nume_cols[j]]
#             # ##-- 商
#             # name_shou = "feature_shou" + str(k)
#             # X[name_shou] = X[nume_cols[i]]*X[nume_cols[j]]
#             # X_pre[name_shou] = X_pre[nume_cols[i]]*X_pre[nume_cols[j]]
#             # ##-- 差
#             # name_sa = "feature_sa" + str(k)
#             # X[name_sa] = X[nume_cols[i]]*X[nume_cols[j]]
#             # X_pre[name_sa] = X_pre[nume_cols[i]]*X_pre[nume_cols[j]]
# # X.head()

In [0]:
##-- Label Encoding for categorical variable
from sklearn import preprocessing

for name in cat_cols:
  ##-- Create the Label encoding object
  le = preprocessing.LabelEncoder()
  ##-- Merge the train and test datasets
  X_fit = X[name].append(X_pre[name])
  le.fit(X_fit)

  X[name] = le.transform(X[name])
  X_pre[name] = le.transform(X_pre[name])

In [0]:
###-- Split the dataset
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=99)

In [31]:
###-- Import LightGBM --###
import lightgbm as lgb

###--  Set params  --###
params = {
          'boosting_type': 'gbdt',
          'objective': 'regression',
          'metric': 'rmse',
          'learning_rate': 0.05,
          'max_depth': 8,
          'lambda_l1': 1.0,
          'lambda_l2': 1.0,
          'num_threads': 4,
          'verbose': -1,
          'num_leaves': 200,
          'feature_fraction': 0.85,
          'bagging_fraction': 0.85,
          'bagging_freq': 5,
          'min_child_samples': 80,
          #-- Added
          'seed': 99,                                                                        
}

###-- KFold Cross validation --###
num_splits=8
from sklearn.model_selection import KFold
kf = KFold(n_splits=num_splits, shuffle=True, random_state=20200102)

models, loss_list = [], []
i = 0
for train_idx, val_idx in kf.split(X_train):
  i = i + 1
  ###-- Set the dataset --###
  X_kfold_train, Y_kfold_train = X_train.iloc[train_idx, :], Y_train.iloc[train_idx]
  X_kfold_val, Y_kfold_val = X_train.iloc[val_idx, :], Y_train.iloc[val_idx]

  ###-- Set dataset --###
  train_data_set = lgb.Dataset(X_kfold_train, Y_kfold_train, categorical_feature=cat_cols)
  test_data_set = lgb.Dataset(X_kfold_val, Y_kfold_val, reference=train_data_set)

  ###--  Training  --###
  gbm = lgb.train(\
                    params,
                    train_data_set,
                    valid_names=['train', 'valid'],
                    valid_sets=test_data_set,
                    # early_stopping_rounds=20,  ##--default: 20
                    # verbose_eval=40,
                    # num_boost_round=100,
                    early_stopping_rounds=100,  ##--default: 20
                    verbose_eval=100,
                    num_boost_round=5000,
                  )
  
  Y_val_pre = gbm.predict(X_kfold_val)

  ##-- loss: RMSE
  # loss = np.sqrt( sklearn.metrics.mean_squared_error(np.exp(Y_val_pre), np.exp(Y_kfold_val)) )
  loss = np.sqrt( sklearn.metrics.mean_squared_error(Y_val_pre, Y_kfold_val) )
  loss_list.append(loss)
  print(loss)

  #-- Keep the trained model
  models.append(gbm)

Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 231.887
[200]	train's rmse: 227.361
[300]	train's rmse: 222.486
[400]	train's rmse: 219.73
[500]	train's rmse: 218.876
[600]	train's rmse: 217.469
[700]	train's rmse: 217.414
Early stopping, best iteration is:
[655]	train's rmse: 216.432
216.43240674775234
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 189.628
[200]	train's rmse: 192.374
Early stopping, best iteration is:
[111]	train's rmse: 188.875
188.8747675436238
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 140.067
Early stopping, best iteration is:
[70]	train's rmse: 138.688
138.6878759792056
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 173.711
[200]	train's rmse: 174.198
Early stopping, best iteration is:
[106]	train's rmse: 172.921
172.9206690580867
Training until validation scores don't improve for 100 rounds.
[100]	train's rmse: 255.50

In [0]:
def predict_ensemble(models, num_model, X_pre):
    Y_pre_list = []
    for i in range(num_model):
        Y_pre = models[i].predict(X_pre)
        Y_pre_list.append(Y_pre)

    Y_pre_list = np.array(Y_pre_list)

    Y_pre_submit_tem = Y_pre_list[0]
    for i in range(1, num_splits):
        Y_pre_submit_tem += Y_pre_list[i]

    Y_pre_submit = Y_pre_submit_tem / float(num_model)

    return Y_pre_submit

In [33]:
print("train-average loss:", np.array(loss_list).sum() / num_splits)

Y_val_pre = predict_ensemble(models, num_splits, X_val)
los_val = np.sqrt( sklearn.metrics.mean_squared_error(Y_val_pre, Y_val) )
# los_val = np.sqrt( sklearn.metrics.mean_squared_error(np.exp(Y_val_pre), np.exp(Y_val)) )
print("validation loss:", los_val)

train-average loss: 218.95765967723224
validation loss: 150.42030889127952


In [34]:
pd.DataFrame({'Feature': X_train.columns,\
              'importance':gbm.feature_importance(importance_type='gain')})\
              .sort_values('importance', ascending=False)

Unnamed: 0,Feature,importance
1,Area,57254540000.0
0,TotalFloorArea,44363150000.0
2,FloorAreaRatio,20102080000.0
5,Municipality,19793290000.0
3,TimeToNearestStation,7897211000.0
10,AreaPerRoom,5229591000.0
4,BuildingYear_AD,4504509000.0
7,DistrictName,3925658000.0
6,NearestStation,3442482000.0
8,Use,2653830000.0


In [0]:
Y_pre_submit = predict_ensemble(models, num_splits, X_pre)

# Y_pre_submit = np.exp( Y_pre_submit )

In [0]:
###------------------------------------###
###        結果ファイルへの書き出し        ###
###------------------------------------###
from pathlib import Path
root = Path(PATH_data)

submit = pd.DataFrame({'y': Y_pre_submit})
submit.index.name = 'id'
submit.index = submit.index + 1
submit.to_csv(root.joinpath("submission.csv"), index=True)