In [None]:
import gc
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Read the 2016 transactions (which carry the logerror target) and the property table.
print('Loading data ...')
train = pd.read_csv('../input/train_2016.csv')
prop = pd.read_csv('../input/properties_2016.csv')

# Downcast every float64 property column to float32 to reduce memory use.
float64_cols = prop.dtypes[prop.dtypes == np.float64].index
for col in float64_cols:
    prop[col] = prop[col].astype(np.float32)

# Left join on parcelid: every transaction row is kept and gains its property columns.
df_train = pd.merge(train, prop, on='parcelid', how='left')

# Features are everything except the join key, the target, the transaction
# date and two further property columns that are excluded from the model.
non_feature_cols = ['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode']
x_train = df_train.drop(non_feature_cols, axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

In [None]:
# Remember the feature columns (and their order) so the test matrix can be
# built with exactly the same layout later on.
train_columns = x_train.columns

# Turn every object-typed column into a boolean column: True where the cell
# equals True, False for everything else (missing values included).
# NOTE(review): a flag stored as a string would become all-False here —
# confirm against the raw data that this is intended.
object_cols = x_train.dtypes[x_train.dtypes == object].index.values
for col in object_cols:
    x_train[col] = x_train[col].eq(True)

# The merged frame is no longer needed; release it.
del df_train
gc.collect()

In [None]:
# Hold out 10% of the rows for validation.  The split is seeded so that the
# validation score and the resulting submission are reproducible when the
# notebook is restarted and re-run; without a seed every run shuffles differently.
SPLIT_SEED = 42
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.1, random_state=SPLIT_SEED)

# Hand LightGBM plain float32 matrices (any boolean columns become 0.0 / 1.0).
x_train = x_train.values.astype(np.float32, copy=False)
x_valid = x_valid.values.astype(np.float32, copy=False)

# Wrap features and labels in LightGBM's dataset containers.
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_valid, label=y_valid)

In [None]:
# LightGBM training configuration.
params = {
    'max_bin': 10,               # maximum number of histogram bins per feature
    'learning_rate': 0.0021,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # Fixed: this was '12' (digits one-two), which is not a LightGBM metric
    # name; 'l2' (mean squared error) matches the 'regression' objective.
    'metric': 'l2',
    'sub_feature': 0.5,          # alias of feature_fraction
    'bagging_fraction': 0.85,
    'bagging_freq': 40,          # perform bagging every 40 iterations
    'num_leaves': 512,
    'min_data': 500,             # alias of min_data_in_leaf
    'min_hessian': 0.05          # alias of min_sum_hessian_in_leaf
}

# Train for 400 boosting rounds, reporting the metric on the held-out set.
watchlist = [d_valid]
clf = lgb.train(params, d_train, num_boost_round=400, valid_sets=watchlist)

In [None]:
# The model is trained: drop the LightGBM datasets first ...
del d_train, d_valid
gc.collect()
# ... then the underlying feature matrices, so their memory can be reclaimed.
del x_train, x_valid
gc.collect()

In [None]:
print("Prepare for the prediction ...")
sample = pd.read_csv('../input/sample_submission.csv')

# The submission file spells the key 'ParcelId'; duplicate it under the
# lowercase name used as the join key with the property table.
sample['parcelid'] = sample['ParcelId']

# One row per submission parcel, with its property columns attached.
df_test = pd.merge(sample, prop, how='left', on='parcelid')

# Both inputs are fully contained in df_test now; release them.
del sample
del prop
gc.collect()

In [None]:
# Keep only the columns the model was trained on, in the same order.
x_test = df_test.loc[:, train_columns]

# The full merged test frame is no longer needed.
del df_test
gc.collect()

In [None]:
# Object-typed columns become booleans: True where the cell equals True,
# False for every other value (missing values included).
object_cols = x_test.dtypes[x_test.dtypes == object].index.values
for col in object_cols:
    x_test[col] = x_test[col].eq(True)

# Convert the frame to a plain float32 matrix for prediction.
x_test = x_test.values.astype(np.float32, copy=False)

In [None]:
print("Start prediction ...")
# Predicting with num_threads > 1 is very slow in a kernel, so force a single thread.
clf.reset_parameter(dict(num_threads=1))
p_test = clf.predict(x_test)

# The test matrix has served its purpose; release it.
del x_test
gc.collect()

In [None]:
print("Start write result ...")
sub = pd.read_csv('../input/sample_submission.csv')

# Every column except the parcel id receives the same prediction vector.
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

# to_csv does not create missing directories; make sure the target folder
# exists so the run does not fail at the very last step.
out_path = '../output/LightBGM.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
sub.to_csv(out_path, index=False, float_format='%.4f')