In [30]:
###-- Mount to Google Drive
##-- Colab-only step: exposes the Drive contents under /content/drive so the
##-- data files can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

!ls 'drive/My Drive/jupyter/ProbSpace/RealEstatePrices/data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
names.json  published_land_price.csv  test_data.csv  train_data.csv


In [1]:
#-- Install Optuna --#
!pip install optuna
##-- Load Optuna and print which version this session is actually using
import optuna
print(optuna.__version__)

In [0]:
###----------------------------------------###
###           Input information            ###
###----------------------------------------###
##-- Directory that holds the competition files
PATH_data = "drive/My Drive/jupyter/ProbSpace/RealEstatePrices/data/"

##-- CSV file names inside PATH_data
train_filename = "train_data.csv"
test_filename = "test_data.csv"

In [0]:
###-----------------------------------###
###        Import Library             ###
###-----------------------------------###
##-- Pandas
import pandas as pd
from pandas import Series, DataFrame
##-- Use the fully qualified option key: the short form 'max_columns' also
##-- matches 'styler.render.max_columns' on recent pandas and raises OptionError.
pd.set_option('display.max_columns', 30)
##-- Numpy
import numpy as np
##-- Matplotlib
import matplotlib.pylab as plt
import matplotlib.cm as cm #-- for gradation
import seaborn as sns
##-- Scikit-learn
import sklearn  #-- print(sklearn.__version__)

##-- Ignore Warning
##-- NOTE: this silences every warning category for the rest of the session,
##-- including deprecation notices from pandas / optuna / lightgbm.
import warnings
warnings.filterwarnings('ignore')

##-- Plot style, plus an endless iterator over the colours of the active style
plt.style.use('bmh')
from itertools import cycle
color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])

In [0]:
###-----------------------------###
###        Read dataset         ###
###-----------------------------###
import json

##-- "names.json" lists the Japanese feature names and their English replacements
with open("/".join([PATH_data, "names.json"]), mode="r", encoding="utf-8") as names_file:
    d = json.load(names_file)

##-- Load train (f) and test (g), renaming the columns right after reading
f = pd.read_csv("/".join([PATH_data, train_filename]), encoding="utf-8").rename(columns=d)
g = pd.read_csv("/".join([PATH_data, test_filename]), encoding="utf-8").rename(columns=d)

In [0]:
##-- Assign appropriate numerical values to non-numeric data
##-- Each textual range label is swapped for one representative number (as text);
##-- anything that still is not numeric afterwards becomes NaN.
station_time_labels = (
    ('30分?60分', '45'),
    ('1H?1H30', '75'),
    ('1H30?2H', '105'),
    ('2H?', '120'),
)
for frame in (f, g):
    station_time = frame['TimeToNearestStation']
    for label, minutes in station_time_labels:
        station_time = station_time.replace(label, minutes)
    frame['TimeToNearestStation'] = pd.to_numeric(station_time, errors='coerce')

In [0]:
##-- Convert the Japanese calendar (era name + era year) to the Western calendar.
##-- '戦前' (pre-war) carries no exact year, so it is approximated as 昭和20年.
##-- Offsets are chosen so that era year 1 maps to 1926 (昭和), 1989 (平成), 2019 (令和).
era_offsets = {'昭和': 1925, '平成': 1988, '令和': 2018}

def add_building_year_ad(frame):
  """Add the columns '年号', '和暦年数' and 'BuildingYear_AD' to `frame`.

  Parameters
  ----------
  frame : DataFrame with a string column 'BuildingYear' holding values such
      as '昭和50年', '平成3年' or '戦前'. Missing values are allowed.

  Returns
  -------
  The same DataFrame object (modified in place):
    * 'BuildingYear'    -- '戦前' replaced by '昭和20年'
    * '年号'            -- first two characters (the era name)
    * '和暦年数'        -- year within the era as int (0 where the year is missing)
    * 'BuildingYear_AD' -- Western year; NaN where the era is missing or unknown
  """
  era_year = frame['BuildingYear'].str.replace('戦前','昭和20年')
  frame['BuildingYear'] = era_year
  frame['年号'] = era_year.str[:2]
  ##-- '元年' denotes the first year of an era; int() cannot parse '元', so map it to '1'
  frame['和暦年数'] = era_year.str[2:].str.strip('年').replace('元','1').fillna(0).astype(int)
  #-- AD: Western calendar
  for era, offset in era_offsets.items():
    frame.loc[frame['年号']==era,'BuildingYear_AD'] = frame['和暦年数'] + offset
  return frame

f = add_building_year_ad(f)
##-----------------------------------------------------------------------------
g = add_building_year_ad(g)

In [0]:
##-- Convert "String" into numerical values.
##-- The open-ended size labels are swapped for a fixed number (as text) first;
##-- whatever is still non-numeric afterwards becomes NaN.
area_labels = (
    ('2000㎡以上', "3500"),
    ('5000㎡以上', "5000"),
)
for column in ('Area', 'TotalFloorArea'):
    for frame in (f, g):
        sizes = frame[column]
        for label, number in area_labels:
            sizes = sizes.replace(label, number)
        frame[column] = pd.to_numeric(sizes, errors='coerce')

In [0]:
##-- Explanatory variables used by the model
input_name = [
    'NearestStation',        #-- nearest station: name
    'TimeToNearestStation',  #-- nearest station: distance (minutes)
    'FloorAreaRatio',        #-- floor-area ratio (%)
    'BuildingYear_AD',       #-- building year (Western calendar)
    ###--  Add  --###
    'Municipality',          #-- municipality name
    'Area',                  #-- area (m^2) (IMPORTANT)
    "DistrictName",          #-- district name
    "Purpose",               #-- intended future use
    'Use',                   #-- use
    'FloorPlan',             #-- floor plan (IMPORTANT)
    "TotalFloorArea",        #-- total floor area (m^2)
    "Renovation",            #-- renovation
    "Type",                  #-- type (IMPORTANT)
]

##-- Numerical variables
nume_cols = [
    'TimeToNearestStation',
    'FloorAreaRatio',
    'BuildingYear_AD',
    'Area',
    "TotalFloorArea",
]

##-- Categorical variables
cat_cols = [
    "NearestStation",
    'Municipality',
    "DistrictName",
    "Purpose",
    'Use',
    'FloorPlan',
    "Renovation",
    "Type",
]

In [2]:
##-- Prepare explanatory and target variables
##-- .copy() makes X / X_pre independent frames, so the column assignments below
##-- are plain writes and not chained assignments on a selection of f / g.
X = f[input_name].copy()
Y = f['y']
# Y = np.log( f['y'] )  #-- take the log to bring the distribution closer to normal

X_pre = g[input_name].copy()

###--------------------------------------###
###-- Preprocessing for missing values --###
###--------------------------------------###
##-- In the case of "Categorical variable": missing becomes its own category
for name in cat_cols:
  X[name] = X[name].fillna("Unknown")
  X_pre[name] = X_pre[name].fillna("Unknown")

##-- In the case of "Numerical variable": fill with the TRAINING median in both
##-- sets, so a missing value is imputed identically at fit and predict time.
for name in nume_cols:
  train_median = X[name].median()
  X[name] = X[name].fillna(train_median)
  X_pre[name] = X_pre[name].fillna(train_median)


##-- Sanity check: remaining missing values per training column
print(X.isna().sum())

In [0]:
##-- Label Encoding for categorical variable
from sklearn import preprocessing

for name in cat_cols:
  ##-- Create the Label encoding object
  le = preprocessing.LabelEncoder()
  ##-- Merge the train and test datasets so every category present in either
  ##-- set receives a code. pd.concat replaces Series.append, which was removed
  ##-- in pandas 2.0.
  X_fit = pd.concat([X[name], X_pre[name]])
  le.fit(X_fit)

  X[name] = le.transform(X[name])
  X_pre[name] = le.transform(X_pre[name])

In [0]:
###-- Split the dataset
##-- 10% of the rows are held out for validation; the seed fixes the partition
from sklearn.model_selection import train_test_split
holdout_parts = train_test_split(
    X, Y,
    random_state=99,
    test_size=0.1,
)
X_train, X_val, Y_train, Y_val = holdout_parts

In [0]:
###-- Import LightGBM --###
##-- LightGBM implemented with Optuna for LGBM
# import optuna.integration.lightgbm as lgb  
import lightgbm as lgb

##-- Definition of Optuna model
def objective_with_datasets(X_train, X_val, Y_train, Y_val):
  """Build an Optuna objective that scores LightGBM hyperparameters.

  Parameters
  ----------
  X_train, Y_train : features / target used for fitting. Inside every trial
      they are split again (75% fit, 25% LightGBM validation set).
  X_val, Y_val : hold-out features / target on which the trial is scored.

  Returns
  -------
  objective : callable(trial) -> float
      RMSE between the model's predictions for X_val and Y_val, measured on
      the same scale as the target the model was trained on.
  """
  ##-- Pure Optuna definition
  def objective(trial):
    ##-- Fixed seed: every trial is fitted on the same inner split, so a change
    ##-- in loss reflects the hyperparameters and not a different partition.
    train_x, test_x, train_y, test_y = train_test_split(
        X_train, Y_train, test_size=0.25, random_state=99)
    dtrain = lgb.Dataset(train_x, label=train_y, categorical_feature=cat_cols)
    dtest = lgb.Dataset(test_x, label=test_y, reference=dtrain)

    param = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-2, 0.1),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-6, 1.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-6, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 30, 200),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'seed': 99,
    }

    gbm = lgb.train(param, dtrain, valid_sets=[dtest])
    Y_pre = gbm.predict(X_val)

    ##-- Accuracy: RMSE on the scale of the training target.
    ##-- The target is not log-transformed before fitting, so predictions and
    ##-- labels must not be passed through exp() before being compared.
    residual = np.asarray(Y_pre, dtype=float) - np.asarray(Y_val, dtype=float)
    loss = float(np.sqrt(np.mean(residual ** 2)))

    return loss

  return objective


In [0]:
##-- Optuna Go !!!
##-- TPE is the sampler create_study uses by default; it is passed explicitly
##-- here only to fix its seed, so the sampler's random draws are repeatable.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=99))
study.optimize( objective_with_datasets(X_train, X_val, Y_train, Y_val), n_trials=20 )

In [0]:
##-- Summarise the search: how many trials ran and the best parameter set found
finished_trials = len(study.trials)
best_params = study.best_trial.params
print('Number of finished trials:', finished_trials)
print('Best trial:', best_params)