In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from sklearn import preprocessing
from matplotlib import pyplot
from time import time, strftime, localtime

In [3]:
### Load data
# Read the three competition CSV files in one pass: the training rows, the
# test rows, and the file whose building_id column is reused for the submission.
train_df, test_df, submit_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../dataset/train.csv",
        "../dataset/test.csv",
        "../dataset/submit_test.csv",
    )
)

In [4]:
### Split data
# Feature frames: skip the first column of each CSV (and, for train, the last
# column as well), then replace every missing value with 0.0.
# NOTE(review): presumably the first column is a row id and the last train
# column is the label - confirm against the CSV header.
label_name = "total_price"

train_fillna_df = train_df.iloc[:, 1:-1].fillna(0.0)
test_fillna_df = test_df.iloc[:, 1:].fillna(0.0)

# Train rows first, test rows second, stacked vertically.
all_fillna_df = pd.concat([train_fillna_df, test_fillna_df], axis=0)

# Single-column frame holding the regression target.
label_df = train_df[label_name].to_frame()

In [6]:
### Remove outliers
# Re-attach the label so feature rows and their target are filtered together.
train_remove_outlier_df = pd.concat([train_fillna_df, label_df], axis=1)

# Row filters, applied one after another. Each entry is
# (function building the boolean keep-mask, whether to print the shape afterwards).
# Filters on total_floor (<= 16) and txn_floor (<= 12) were tried earlier and
# are currently disabled.
outlier_steps = (
    (lambda df: df["building_material"] > 2, True),
    (lambda df: df["building_use"] <= 4, True),
    # Drops rows whose value is exactly 0, which includes values zero-filled
    # by the earlier fillna step.
    (lambda df: df["village_income_median"] != 0, False),
    (lambda df: df["village_income_median"] <= 900, True),
    (lambda df: df["town_area"] <= 150, True),
    (lambda df: df["town_population_density"] <= 35000, True),
    (lambda df: df["death_rate"] <= 8, True),
)
for build_mask, report_shape in outlier_steps:
    train_remove_outlier_df = train_remove_outlier_df[build_mask(train_remove_outlier_df)]
    if report_shape:
        print(train_remove_outlier_df.shape)

### Split x and y
# The label is the last column (it was concatenated last above), so pull it
# out by name and then slice it off the feature frame by position.
label_remove_outliers_df = train_remove_outlier_df[label_name].to_frame()
train_remove_outlier_df = train_remove_outlier_df.iloc[:, :-1]
print("label_remove_outliers_df.shape:", label_remove_outliers_df.shape)
print("train_remove_outlier_df.shape:", train_remove_outlier_df.shape)

(53628, 234)
(48649, 234)
(44883, 234)
(44349, 234)
(43358, 234)
(41175, 234)
label_remove_outliers_df.shape: (41175, 1)
train_remove_outlier_df.shape: (41175, 233)


In [7]:
### Drop unneeded features
# The same columns are removed from the (outlier-filtered) training features
# and from the test features so both frames keep an identical column layout.
dropped_feature_names = [
    "town", "village", "parking_area", "parking_price", "land_area", "building_area",
]
train_drop_df = train_remove_outlier_df.drop(columns=dropped_feature_names)
test_drop_df = test_fillna_df.drop(columns=dropped_feature_names)

In [8]:
# [No-ohe version]
### min max scalar
# Scale the outlier-filtered, feature-dropped training data with a min-max
# scaler, then apply the SAME fitted scaler to the test data.
#
# The test frame must go through `transform`, not `fit_transform`: refitting
# on the test rows would learn a different per-column min/max, so train and
# test features would no longer be on the same scale.
# Test values can therefore fall outside the range seen in training.
#
# Both the row index and the column names are carried over, because the scaler
# returns a bare ndarray.
min_max_scaler = preprocessing.MinMaxScaler()
mms_train_process_df = pd.DataFrame(
    min_max_scaler.fit_transform(train_drop_df),
    index=train_drop_df.index,
    columns=train_drop_df.columns,
)
mms_test_process_df = pd.DataFrame(
    min_max_scaler.transform(test_drop_df),
    index=test_drop_df.index,
    columns=test_drop_df.columns,
)

In [5]:
# [No-ohe version]
### min max scalar
# One scaler is fitted on the stacked train+test features. The scaler returns
# a plain array, so the resulting frame has positional integer column labels
# and a fresh 0..n-1 row index (the original column names are NOT kept).
min_max_scaler = preprocessing.MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(all_fillna_df)
mms_features_df = pd.DataFrame(scaled_values)

### Split to train & test data
# Train rows were stacked first, so the first `train_num` rows are the training set.
# (Alternative tried earlier: train_num = train_remove_outlier_df.shape[0].)
train_num = len(train_df)
mms_train_process_df = mms_features_df.iloc[:train_num]
mms_test_process_df = mms_features_df.iloc[train_num:]

print("mms_train_process_df.shape:", mms_train_process_df.shape)
print("mms_test_process_df.shape:", mms_test_process_df.shape)

mms_train_process_df.shape: (60000, 233)
mms_test_process_df.shape: (10000, 233)


In [6]:
### Shuffle data
# Put the label next to the scaled features (as the last column) and shuffle
# the rows with a fixed seed so the split below is repeatable.
# Alternative inputs tried earlier (unscaled features, feature-dropped
# features, outlier-removed rows paired with label_remove_outliers_df) are
# not used here.
labelled_train_df = pd.concat([mms_train_process_df, label_df], axis=1)
shuff_train_all = shuffle(labelled_train_df, random_state=7)

# training and validation num: the first 85% of the shuffled rows train the
# model, the remainder is held out for validation.
tv_num = round(len(shuff_train_all) * 0.85)
train_part = shuff_train_all.iloc[:tv_num]
val_part = shuff_train_all.iloc[tv_num:]

### Split data
# Everything but the last column is a feature; the last column is the label
# (kept as a one-column frame).
X_train, y_train = train_part.iloc[:, :-1], train_part.iloc[:, -1:]

print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)

# Validation data
X_val, y_val = val_part.iloc[:, :-1], val_part.iloc[:, -1:]

print("X_val.shape:", X_val.shape)
print("y_val.shape:", y_val.shape)

X_train.shape: (51000, 233)
y_train.shape: (51000, 1)
X_val.shape: (9000, 233)
y_val.shape: (9000, 1)


In [7]:
### Transfer into data matrix
# Wrap each split in an xgboost DMatrix; only the train and validation
# matrices carry labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalidation = xgb.DMatrix(data=X_val, label=y_val)

# Test features come from the min-max scaled frame. Unscaled and
# feature-dropped variants (test_process_df, temp_test_df) were tried earlier
# and are no longer used.
dtest = xgb.DMatrix(data=mms_test_process_df)

In [9]:
### Booster parameters
# Baseline hyper-parameters for the regressor. Any key that also appears in
# cv_params below is replaced, candidate by candidate, during the grid search.
xgb_base_params = {
    'colsample_bytree': 0.6,
    'subsample': 0.7,
    'learning_rate': 0.1,
    'max_depth': 8,
    'min_child_weight': 0.3,
    'n_estimators': 2000,
    'reg_alpha': 0.08,
    'reg_lambda': 1.5,
    'gamma': 0.00001,
    'verbosity': 1,
    'random_state': 7,
    'nthread': -1,
    'tree_method': 'gpu_hist',
}

### Fit the model
XGB_Regressor = xgb.XGBRegressor(**xgb_base_params)

# Print the start time, then start the wall-clock timer for the search.
print('開始時間：', strftime("%Y-%m-%d %H:%M:%S", localtime()))
t0 = time()

### Grid Search CV
# Log of the other searches kept from earlier runs (grid -> recorded result, time spent):
#   {'n_estimators': [600, 700, 800, 900, 1000]}                  -> n_estimators=1000
#   {'min_child_weight': [0.3, 0.4, 0.5, 0.6]}  (not run this time) -> min_child_weight=0.3, 2.2min
#   {'gamma': [0.00001, 0.00005, 0.0001, 0.0002]}                 -> gamma=0.00001, 8.1min
#   {'subsample': [0.6, 0.7, 0.8, 0.9],
#    'colsample_bytree': [0.6, 0.7, 0.8, 0.9]}                    -> subsample=0.9, colsample_bytree=0.7, 33.6min
#   {'reg_alpha': [0.08, 0.1, 1.2, 1.5], 'reg_lambda': [1.5, 1.8, 2]} -> reg_alpha=0.08, reg_lambda=1.5, 26.1min
#   {'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2]}               -> learning_rate=0.1, reg_lambda=1, 10.5min
#   {'random_state': [0, 1, 2, 3, 4, 7]}                          -> (no result recorded)
# Active grid; recorded result: max_depth=8, min_child_weight=0.3, 62.3 min.
cv_params = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_child_weight': [0.1, 0.2, 0.3, 0.5],
}
optimized_GBM = GridSearchCV(
    estimator=XGB_Regressor,
    param_grid=cv_params,
    scoring='neg_mean_squared_error',  # higher (closer to 0) is better
    cv=5,
    verbose=3,
    n_jobs=4,
)
optimized_GBM.fit(X_train, y_train)
evalute_result = optimized_GBM.cv_results_

elapsed_seconds = time() - t0
print('spent time: %0.3fs' % elapsed_seconds, strftime("%Y-%m-%d %H:%M:%S", localtime()))

# Full per-candidate results, the winning parameter combination, and its mean CV score.
print('每輪迭代運行結果:{0}'.format(evalute_result))
print('參數的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

開始時間： 2019-06-28 07:33:12
Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  4.4min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 62.6min
[Parallel(n_jobs=4)]: Done 160 out of 160 | elapsed: 153.3min finished


spent time: 9210.715s 2019-06-28 10:06:42
每輪迭代運行結果:{'mean_fit_time': array([ 35.98495727,  40.05808563,  43.41869912,  42.60248203,
        54.23278298,  47.79699197,  46.02632666,  45.79454646,
        77.47144418,  77.88972812,  78.09038963,  78.21625185,
       116.83442054, 116.63865781, 116.35276875, 115.75677724,
       178.33335071, 178.19978971, 178.03031311, 178.78447504,
       280.13561616, 280.63945198, 280.71725101, 280.20129323,
       431.47214141, 432.58263025, 431.21819744, 433.6701448 ,
       655.10812087, 658.54389858, 660.06700168, 647.8750031 ]), 'std_fit_time': array([ 1.1056229 ,  1.59352959,  0.78867451,  0.62067408,  2.24593567,
        1.28640041,  0.31339908,  0.22326453,  0.84557356,  0.42461642,
        1.1806748 ,  0.71751797,  1.96455898,  1.50256922,  1.19775587,
        0.78048966,  3.33308639,  3.56591226,  3.13517831,  3.50327974,
        8.85550094,  8.89541106,  8.93219674,  8.8288072 , 10.78049116,
        9.965835  , 10.87714782,  9.90432799, 19.

In [None]:
### Validation
# Report RMSE on the training split and on the held-out validation split.
#
# The booster is taken from the grid search's refit best estimator, because no
# other cell in the notebook defines `bst`; predictions are made on the DMatrix
# objects built from the same splits.
bst = optimized_GBM.best_estimator_.get_booster()

# In-sample predictions for the training rows. The name `y_test` is kept
# because the summary cell further down reads it.
# NOTE(review): assumed to mean "predictions on the training split", since it
# is compared against y_train for RMSE_train - confirm.
y_test = bst.predict(dtrain)
RMSE_train = np.sqrt(mean_squared_error(y_train, y_test))
print('RMSE_train:', RMSE_train.round(4))

# Predictions for the held-out validation rows.
y_val_test = bst.predict(dvalidation)
RMSE_val = np.sqrt(mean_squared_error(y_val, y_val_test))
print('RMSE_val:', "{:,}".format(RMSE_val.round(4)))

In [None]:
# Summary statistics of the true training labels, the in-sample predictions,
# and the test-set predictions, plus a count of negative predicted prices.
#
# Both prediction arrays are computed here from the grid search's refit best
# estimator so this cell does not depend on variables left over from an
# earlier kernel session.
# NOTE(review): `y_test` is taken to be the predictions on the training split - confirm.
final_booster = optimized_GBM.best_estimator_.get_booster()
y_test = final_booster.predict(dtrain)
xgb_pred = final_booster.predict(dtest)

print("y_train:", pd.DataFrame(y_train).describe())
print("y_test:", pd.DataFrame(y_test).describe())
print("xgb_pred:", pd.DataFrame(xgb_pred).describe())
# A negative price is impossible, so this count is a quick sanity check.
print("number of xgb_pred are negative:", int((xgb_pred < 0).sum()))

In [None]:
import os
import re

def getMaxFileNum(submit_dir="../submit"):
    """Return the next free submission number as a string.

    Every entry name in ``submit_dir`` is scanned for its first run of
    decimal digits; the result is the largest such number plus one. Names
    that contain no digits are ignored, so a directory without any numbered
    entry yields ``"1"``.

    Args:
        submit_dir: Directory to scan. Defaults to ``"../submit"``, the
            location the notebook writes submissions to.

    Returns:
        The next number, formatted as a decimal string.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``submit_dir`` cannot be
            listed (for example ``FileNotFoundError`` if it does not exist).
    """
    file_numbers = []
    for file_name in os.listdir(submit_dir):
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        first_digits = re.search(r'\d+', file_name)
        if first_digits:
            file_numbers.append(int(first_digits.group()))
    # default=0 covers the "no numbered file yet" case without shadowing max().
    return str(max(file_numbers, default=0) + 1)

# Submission
# Output path uses the next unused submission number so earlier files are
# never overwritten.
filePath = "../submit/submit_test_" + getMaxFileNum() + ".csv"

# Pair each prediction with a building_id by row position (both frames are
# joined on their index), then write the two columns without the index.
pred_df = pd.DataFrame(np.array(xgb_pred), columns=["total_price"])
id_df = submit_test_df["building_id"].to_frame()
ans_df = pd.merge(id_df, pred_df, how="outer", left_index=True, right_index=True)
ans_df.to_csv(filePath, sep=",", index=False, encoding="UTF-8")

print('filePath:', filePath)