In [74]:
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from sklearn.metrics import mean_squared_error

### [Without CD]

In [79]:
# Read the pre-split sub-district tables. The 'Unnamed: 0' column is dropped
# immediately (presumably the index written out by an earlier to_csv -- confirm).
df_train = pd.read_csv(
    os.path.join('Data', 'Modeling', 'Training&Testing', 'train_subdist.csv')
).drop(columns='Unnamed: 0')
df_test = pd.read_csv(
    os.path.join('Data', 'Modeling', 'Training&Testing', 'test_subdist.csv')
).drop(columns='Unnamed: 0')

# Keep the first three test columns aside (week / address-code identifiers,
# judging by the variable name -- confirm against the CSV header).
df_test_week_addrcode = df_test.iloc[:, list(range(3))]

# Features: positional columns 6, 12 and 18..26; target: positional column 3.
# The target is selected with a list so it stays a one-column DataFrame.
X_train = df_train.iloc[:, [6, 12, *range(18, 27)]]
X_test = df_test.iloc[:, [6, 12, *range(18, 27)]]
y_train = df_train.iloc[:, [3]]
y_test = df_test.iloc[:, [3]]

# No train_test_split is needed here: the split is already provided by the
# two CSV files loaded above.

# Wrap both splits as LightGBM datasets; the evaluation set references the
# training set.
# NOTE(review): lgb_eval is built from the test split, so any early stopping
# driven by it is tuned on the same rows that the final RMSE is reported on.
# Consider carving a separate validation split out of the training data.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [80]:
# LightGBM hyper-parameters: GBDT regressor trained with the 'regression'
# objective and evaluated with both L2 and L1 on the validation set.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list (not a set literal) keeps the metric order fixed between runs;
    # iteration order of a set of strings depends on per-process hashing.
    'metric': ['l2', 'l1'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
print('Starting training...')
# Train for at most 20 boosting rounds, stopping once the validation scores
# have not improved for 5 consecutive rounds.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of lgb.train was removed in LightGBM 4.0,
# while the callback form works on both older and newer releases.
# (On LightGBM >= 3.3, add lgb.log_evaluation(period=1) to the callbacks to
# keep per-iteration metric logging on 4.x.)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5)])
# To persist the trained booster, uncomment:
#print('Saving model...')
#gbm.save_model('model.txt')

Starting training...
[1]	valid_0's l2: 0.738732	valid_0's l1: 0.516486
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.738056	valid_0's l1: 0.516336
[3]	valid_0's l2: 0.736778	valid_0's l1: 0.515487
[4]	valid_0's l2: 0.7357	valid_0's l1: 0.514884
[5]	valid_0's l2: 0.734812	valid_0's l1: 0.514011
[6]	valid_0's l2: 0.734041	valid_0's l1: 0.513223
[7]	valid_0's l2: 0.733705	valid_0's l1: 0.51245
[8]	valid_0's l2: 0.732998	valid_0's l1: 0.511504
[9]	valid_0's l2: 0.732626	valid_0's l1: 0.510584
[10]	valid_0's l2: 0.732474	valid_0's l1: 0.50997
[11]	valid_0's l2: 0.732451	valid_0's l1: 0.509705
[12]	valid_0's l2: 0.732674	valid_0's l1: 0.509366
[13]	valid_0's l2: 0.732718	valid_0's l1: 0.508932
[14]	valid_0's l2: 0.733235	valid_0's l1: 0.508865
[15]	valid_0's l2: 0.733874	valid_0's l1: 0.508788
[16]	valid_0's l2: 0.734309	valid_0's l1: 0.508222
Early stopping, best iteration is:
[11]	valid_0's l2: 0.732451	valid_0's l1: 0.509705


In [81]:
print('Starting predicting...')
# Predict on the test features, limited to the best iteration recorded on the
# booster during training.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# Report RMSE: the square root of the mean squared error against y_test.
print(
    'The rmse of prediction is:',
    mean_squared_error(y_true=y_test, y_pred=y_pred) ** 0.5,
)

Starting predicting...
The rmse of prediction is: 0.8558334449512031


### [With CD]

In [82]:
# Read the pre-split sub-district tables. The 'Unnamed: 0' column is dropped
# immediately (presumably the index written out by an earlier to_csv -- confirm).
df_train = pd.read_csv(
    os.path.join('Data', 'Modeling', 'Training&Testing', 'train_subdist.csv')
).drop(columns='Unnamed: 0')
df_test = pd.read_csv(
    os.path.join('Data', 'Modeling', 'Training&Testing', 'test_subdist.csv')
).drop(columns='Unnamed: 0')

# Keep the first three test columns aside (week / address-code identifiers,
# judging by the variable name -- confirm against the CSV header).
df_test_week_addrcode = df_test.iloc[:, list(range(3))]

# Features: positional columns 6, 12 and 18..34 (this variant adds columns
# 27..34 on top of the 18..26 range); target: positional column 3.
# The target is selected with a list so it stays a one-column DataFrame.
X_train = df_train.iloc[:, [6, 12, *range(18, 35)]]
X_test = df_test.iloc[:, [6, 12, *range(18, 35)]]
y_train = df_train.iloc[:, [3]]
y_test = df_test.iloc[:, [3]]

# No train_test_split is needed here: the split is already provided by the
# two CSV files loaded above.

# Wrap both splits as LightGBM datasets; the evaluation set references the
# training set.
# NOTE(review): lgb_eval is built from the test split, so any early stopping
# driven by it is tuned on the same rows that the final RMSE is reported on.
# Consider carving a separate validation split out of the training data.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [83]:
# LightGBM hyper-parameters: GBDT regressor trained with the 'regression'
# objective and evaluated with both L2 and L1 on the validation set.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list (not a set literal) keeps the metric order fixed between runs;
    # iteration order of a set of strings depends on per-process hashing.
    'metric': ['l2', 'l1'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
print('Starting training...')
# Train for at most 20 boosting rounds, stopping once the validation scores
# have not improved for 5 consecutive rounds.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of lgb.train was removed in LightGBM 4.0,
# while the callback form works on both older and newer releases.
# (On LightGBM >= 3.3, add lgb.log_evaluation(period=1) to the callbacks to
# keep per-iteration metric logging on 4.x.)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5)])
# To persist the trained booster, uncomment:
#print('Saving model...')
#gbm.save_model('model.txt')

Starting training...
[1]	valid_0's l2: 0.738482	valid_0's l1: 0.516062
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.736851	valid_0's l1: 0.515167
[3]	valid_0's l2: 0.735482	valid_0's l1: 0.514037
[4]	valid_0's l2: 0.73457	valid_0's l1: 0.513525
[5]	valid_0's l2: 0.733468	valid_0's l1: 0.512624
[6]	valid_0's l2: 0.733072	valid_0's l1: 0.512263
[7]	valid_0's l2: 0.732915	valid_0's l1: 0.511783
[8]	valid_0's l2: 0.732875	valid_0's l1: 0.511177
[9]	valid_0's l2: 0.73291	valid_0's l1: 0.510546
[10]	valid_0's l2: 0.732559	valid_0's l1: 0.510378
[11]	valid_0's l2: 0.732669	valid_0's l1: 0.509976
[12]	valid_0's l2: 0.7329	valid_0's l1: 0.509589
[13]	valid_0's l2: 0.733239	valid_0's l1: 0.509913
[14]	valid_0's l2: 0.73331	valid_0's l1: 0.510281
[15]	valid_0's l2: 0.733721	valid_0's l1: 0.509998
Early stopping, best iteration is:
[10]	valid_0's l2: 0.732559	valid_0's l1: 0.510378


In [84]:
print('Starting predicting...')
# Predict on the test features, limited to the best iteration recorded on the
# booster during training.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# Report RMSE: the square root of the mean squared error against y_test.
print(
    'The rmse of prediction is:',
    mean_squared_error(y_true=y_test, y_pred=y_pred) ** 0.5,
)

Starting predicting...
The rmse of prediction is: 0.8558967769119842
