# Light GBM

In [2]:
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import glob

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## Sub-district

### [Without CD]

In [3]:
df_train =  pd.read_csv(os.path.join('Data','Modeling','Training&Testing','train_subdist.csv'), header=0, skiprows=0)
df_train = df_train.drop('Unnamed: 0', axis = 1)
df_test = pd.read_csv(os.path.join('Data','Modeling','Training&Testing','test_subdist.csv'), header=0, skiprows=0)
df_test = df_test.drop('Unnamed: 0', axis = 1)
df_test_week_addrcode = df_test.iloc[:,[0,1,2,3]]

# Importing the dataset
X_train = df_train.iloc[:,[6,12,18,19,20,21,22,23,24,25,26]]
y_train = df_train.iloc[:,[3]]
X_test = df_test.iloc[:,[6,12,18,19,20,21,22,23,24,25,26]]
y_test = df_test.iloc[:,[3]]


# Splitting the dataset into the Training set and Test set
#from sklearn.cross_validation import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [4]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)
#print('Saving model...')
# save model to file
#gbm.save_model('model.txt')

Starting training...
[1]	valid_0's l2: 0.738732	valid_0's l1: 0.516486
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.738056	valid_0's l1: 0.516336
[3]	valid_0's l2: 0.736778	valid_0's l1: 0.515487
[4]	valid_0's l2: 0.7357	valid_0's l1: 0.514884
[5]	valid_0's l2: 0.734812	valid_0's l1: 0.514011
[6]	valid_0's l2: 0.734041	valid_0's l1: 0.513223
[7]	valid_0's l2: 0.733705	valid_0's l1: 0.51245
[8]	valid_0's l2: 0.732998	valid_0's l1: 0.511504
[9]	valid_0's l2: 0.732626	valid_0's l1: 0.510584
[10]	valid_0's l2: 0.732474	valid_0's l1: 0.50997
[11]	valid_0's l2: 0.732451	valid_0's l1: 0.509705
[12]	valid_0's l2: 0.732674	valid_0's l1: 0.509366
[13]	valid_0's l2: 0.732718	valid_0's l1: 0.508932
[14]	valid_0's l2: 0.733235	valid_0's l1: 0.508865
[15]	valid_0's l2: 0.733874	valid_0's l1: 0.508788
[16]	valid_0's l2: 0.734309	valid_0's l1: 0.508222
Early stopping, best iteration is:
[11]	valid_0's l2: 0.732451	valid_0's l1: 0.509705


In [5]:
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
df_y_pred = pd.DataFrame(y_pred, columns = ['predicted'])
df_compare_addrcode = pd.concat([df_test_week_addrcode, df_y_pred], axis = 1)
df_compare_addrcode.columns = [['Week','Year','addrcode','actual','predicted']]
df_compare_addrcode.to_csv('LGBM_subdist_withoutCD.csv', encoding = 'utf-8')
# eval
print('The RMSE of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('The MAE of prediction is:', mean_absolute_error(y_test, y_pred))

Starting predicting...
The RMSE of prediction is: 0.8558334449512031
The MAE of prediction is: 0.5097053791891465


In [6]:
df_compare_addrcode

Unnamed: 0,Week,Year,addrcode,actual,predicted
0,7,2017,800101,3.434223,0.418336
1,8,2017,800101,0.457896,0.384151
2,9,2017,800101,0.686845,0.384151
3,10,2017,800101,0.686845,0.530001
4,11,2017,800101,0.228948,0.312388
5,12,2017,800101,1.373689,0.286441
6,13,2017,800101,1.144741,0.286441
7,14,2017,800101,0.457896,0.307667
8,15,2017,800101,1.373689,0.291069
9,16,2017,800101,1.373689,0.517168


### [With CD]

In [46]:
df_train =  pd.read_csv(os.path.join('Data','Modeling','Training&Testing','train_subdist.csv'), header=0, skiprows=0)
df_train = df_train.drop('Unnamed: 0', axis = 1)
df_test = pd.read_csv(os.path.join('Data','Modeling','Training&Testing','test_subdist.csv'), header=0, skiprows=0)
df_test = df_test.drop('Unnamed: 0', axis = 1)
df_test_week_addrcode = df_test.iloc[:,[0,1,2,3]]

# Importing the dataset
X_train = df_train.iloc[:,[6,12,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34]]
y_train = df_train.iloc[:,[3]]
X_test = df_test.iloc[:,[6,12,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34]]
y_test = df_test.iloc[:,[3]]


# Splitting the dataset into the Training set and Test set
#from sklearn.cross_validation import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [47]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)
#print('Saving model...')
# save model to file
#gbm.save_model('model.txt')

Starting training...
[1]	valid_0's l1: 0.516062	valid_0's l2: 0.738482
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l1: 0.515167	valid_0's l2: 0.736851
[3]	valid_0's l1: 0.514037	valid_0's l2: 0.735482
[4]	valid_0's l1: 0.513525	valid_0's l2: 0.73457
[5]	valid_0's l1: 0.512624	valid_0's l2: 0.733468
[6]	valid_0's l1: 0.512263	valid_0's l2: 0.733072
[7]	valid_0's l1: 0.511783	valid_0's l2: 0.732915
[8]	valid_0's l1: 0.511177	valid_0's l2: 0.732875
[9]	valid_0's l1: 0.510546	valid_0's l2: 0.73291
[10]	valid_0's l1: 0.510378	valid_0's l2: 0.732559
[11]	valid_0's l1: 0.509976	valid_0's l2: 0.732669
[12]	valid_0's l1: 0.509589	valid_0's l2: 0.7329
[13]	valid_0's l1: 0.509913	valid_0's l2: 0.733239
[14]	valid_0's l1: 0.510281	valid_0's l2: 0.73331
[15]	valid_0's l1: 0.509998	valid_0's l2: 0.733721
Early stopping, best iteration is:
[10]	valid_0's l1: 0.510378	valid_0's l2: 0.732559


In [48]:
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
df_y_pred = pd.DataFrame(y_pred, columns = ['predicted'])
df_compare_addrcode = pd.concat([df_test_week_addrcode, df_y_pred], axis = 1)
df_compare_addrcode.columns = [['Week','Year','addrcode','actual','predicted']]
df_compare_addrcode.to_csv('LGBM_subdist_withCD.csv', encoding = 'utf-8')
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('The MAE of prediction is:', mean_absolute_error(y_test, y_pred))

Starting predicting...
The rmse of prediction is: 0.8558967769119842
The MAE of prediction is: 0.5103778142879005


## District

### [Without CD]

In [24]:
df_train_dist =  pd.read_csv(os.path.join('Data','Modeling','Training&Testing','train_dist.csv'), header=0, skiprows=0)
df_test_dist = pd.read_csv(os.path.join('Data','Modeling','Training&Testing','test_dist.csv'), header=0, skiprows=0)
df_test_week_addrcode_dist = df_test_dist.iloc[:,[0,1,2,3]]


# Importing the dataset
X_train = df_train_dist.iloc[:,[6,12,18,19,20,21,22,23,24,25,26]]
y_train = df_train_dist.iloc[:,[3]]
X_test = df_test_dist.iloc[:,[6,12,18,19,20,21,22,23,24,25,26]]
y_test = df_test_dist.iloc[:,[3]]


# Splitting the dataset into the Training set and Test set
#from sklearn.cross_validation import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [25]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)
#print('Saving model...')
# save model to file
#gbm.save_model('model.txt')

Starting training...
[1]	valid_0's l1: 2.11377	valid_0's l2: 8.94474
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l1: 2.09762	valid_0's l2: 8.82329
[3]	valid_0's l1: 2.0695	valid_0's l2: 8.60186
[4]	valid_0's l1: 2.04186	valid_0's l2: 8.40561
[5]	valid_0's l1: 2.01816	valid_0's l2: 8.23596
[6]	valid_0's l1: 1.99697	valid_0's l2: 8.09747
[7]	valid_0's l1: 1.97866	valid_0's l2: 7.97903
[8]	valid_0's l1: 1.96218	valid_0's l2: 7.88557
[9]	valid_0's l1: 1.94786	valid_0's l2: 7.79846
[10]	valid_0's l1: 1.93442	valid_0's l2: 7.72101
[11]	valid_0's l1: 1.92062	valid_0's l2: 7.65336
[12]	valid_0's l1: 1.90493	valid_0's l2: 7.5758
[13]	valid_0's l1: 1.89093	valid_0's l2: 7.51346
[14]	valid_0's l1: 1.87941	valid_0's l2: 7.46801
[15]	valid_0's l1: 1.8688	valid_0's l2: 7.42238
[16]	valid_0's l1: 1.85665	valid_0's l2: 7.36171
[17]	valid_0's l1: 1.84547	valid_0's l2: 7.30782
[18]	valid_0's l1: 1.8374	valid_0's l2: 7.27209
[19]	valid_0's l1: 1.82927	valid_0's l2: 7.23666


In [26]:
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
df_y_pred = pd.DataFrame(y_pred, columns = ['predicted'])
df_compare_addrcode_dist = pd.concat([df_test_week_addrcode_dist, df_y_pred], axis = 1)
df_compare_addrcode_dist.columns = [['addrcode','Week','Year','actual','predicted']]
df_compare_addrcode_dist.to_csv('LGBM_dist_withoutCD.csv', encoding = 'utf-8')
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('The MAE of prediction is:', mean_absolute_error(y_test, y_pred))

Starting predicting...
The rmse of prediction is: 2.6829901896725614
The MAE of prediction is: 1.8204314754903421


### [With CD] 

In [27]:
df_train_dist =  pd.read_csv(os.path.join('Data','Modeling','Training&Testing','train_dist.csv'), header=0, skiprows=0)
df_test_dist = pd.read_csv(os.path.join('Data','Modeling','Training&Testing','test_dist.csv'), header=0, skiprows=0)
df_test_week_addrcode_dist = df_test_dist.iloc[:,[0,1,2,3]]


# Importing the dataset
X_train = df_train_dist.iloc[:,[6,12,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34]]
y_train = df_train_dist.iloc[:,[3]]
X_test = df_test_dist.iloc[:,[6,12,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34]]
y_test = df_test_dist.iloc[:,[3]]


# Splitting the dataset into the Training set and Test set
#from sklearn.cross_validation import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [28]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)
#print('Saving model...')
# save model to file
#gbm.save_model('model.txt')

Starting training...
[1]	valid_0's l1: 2.11413	valid_0's l2: 8.95521
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l1: 2.08308	valid_0's l2: 8.7218
[3]	valid_0's l1: 2.05456	valid_0's l2: 8.50653
[4]	valid_0's l1: 2.02999	valid_0's l2: 8.33145
[5]	valid_0's l1: 2.00821	valid_0's l2: 8.17647
[6]	valid_0's l1: 1.98509	valid_0's l2: 8.02517
[7]	valid_0's l1: 1.96631	valid_0's l2: 7.91188
[8]	valid_0's l1: 1.94942	valid_0's l2: 7.81759
[9]	valid_0's l1: 1.93375	valid_0's l2: 7.72639
[10]	valid_0's l1: 1.92161	valid_0's l2: 7.65936
[11]	valid_0's l1: 1.90695	valid_0's l2: 7.57794
[12]	valid_0's l1: 1.89394	valid_0's l2: 7.5104
[13]	valid_0's l1: 1.88428	valid_0's l2: 7.46009
[14]	valid_0's l1: 1.87687	valid_0's l2: 7.42399
[15]	valid_0's l1: 1.86488	valid_0's l2: 7.36719
[16]	valid_0's l1: 1.85091	valid_0's l2: 7.29717
[17]	valid_0's l1: 1.83886	valid_0's l2: 7.23428
[18]	valid_0's l1: 1.8294	valid_0's l2: 7.191
[19]	valid_0's l1: 1.81876	valid_0's l2: 7.14161
[

In [29]:
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
df_y_pred = pd.DataFrame(y_pred, columns = ['predicted'])
df_compare_addrcode_dist = pd.concat([df_test_week_addrcode_dist, df_y_pred], axis = 1)
df_compare_addrcode_dist.columns = [['addrcode','Week','Year','actual','predicted']]
df_compare_addrcode_dist.to_csv('LGBM_dist_withCD.csv', encoding = 'utf-8')
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('The MAE of prediction is:', mean_absolute_error(y_test, y_pred))

Starting predicting...
The rmse of prediction is: 2.666239481217889
The MAE of prediction is: 1.8111404261879975


## Separate files

### [Subdistrict]

In [67]:
list_sub = glob.glob(os.path.join('Data','Modeling','Light GBM','Province','Subdistrict','*'))
list_sub

['Data/Modeling/Light GBM/Province/Subdistrict/LGBM_subdist_withoutCD.csv',
 'Data/Modeling/Light GBM/Province/Subdistrict/LGBM_subdist_withCD.csv']

In [68]:
list_sub[0][:-4][45:]

'LGBM_subdist_withoutCD'

In [37]:
df_available = pd.read_csv(os.path.join('Data','Data Statistics','available_addrcode_subdistrict.csv'))
df_available['addrcode'] = df_available['addrcode'].astype(str)
addrcode_list = df_available['addrcode']
addrcode_nakhon_sub = []

for i in range(len(addrcode_list)):
    if addrcode_list[i].startswith('80'):
        addrcode_nakhon_sub.append(addrcode_list[i])

In [70]:
for i in range(len(list_sub)):
    for j in range(len(addrcode_nakhon_sub)):
        df_result = pd.read_csv(list_sub[i])
        df_result['addrcode'] = df_result['addrcode'].astype(str)
        df_result = df_result.drop('Unnamed: 0', axis =1 )
        df_result = df_result.loc[df_result['addrcode'] == addrcode_nakhon_sub[j]]
        df_result = df_result.reset_index()
        df_result = df_result.drop('index', axis = 1)
        df_result.to_csv(list_sub[i][:-4][45:]+'_'+addrcode_nakhon_sub[j]+'.csv', encoding = 'utf-8')

### [District]

In [38]:
list_dist = glob.glob(os.path.join('Data','Modeling','Light GBM','Province','District','*'))
list_dist

['Data/Modeling/Light GBM/Province/District/LGBM_dist_withCD.csv',
 'Data/Modeling/Light GBM/Province/District/LGBM_dist_withoutCD.csv']

In [39]:
list_dist[0][:-4][42:]

'LGBM_dist_withCD'

In [40]:
addrcode_nakhon_dist = []

for i in range(len(addrcode_nakhon_sub)):
    addrcode_nakhon_sub[i] = addrcode_nakhon_sub[i][:-2]
    addrcode_nakhon_dist.append(addrcode_nakhon_sub[i]) 
    
addrcode_nakhon_dist = list(set(addrcode_nakhon_dist))

In [41]:
for i in range(len(list_dist)):
    for j in range(len(addrcode_nakhon_dist)):
        df_result = pd.read_csv(list_dist[i])
        df_result['addrcode'] = df_result['addrcode'].astype(str)
        df_result = df_result.drop('Unnamed: 0', axis =1 )
        df_result = df_result.loc[df_result['addrcode'] == addrcode_nakhon_dist[j]]
        df_result = df_result.reset_index()
        df_result = df_result.drop('index', axis = 1)
        df_result.to_csv(list_dist[i][:-4][42:]+'_'+addrcode_nakhon_dist[j]+'.csv', encoding = 'utf-8')

## Compute Different and MAE

## Seperated Level

#### [Subdistrict]

In [76]:
file_sub = glob.glob(os.path.join('Data','Modeling','Light GBM','Separated','Subdistrict','*'))
len(file_sub)

334

In [79]:
file_sub[0][46:]

'LGBM_subdist_withoutCD_801607.csv'

In [80]:
for i in range(len(file_sub)):
    df_diff = pd.read_csv(file_sub[i])
    df_diff = df_diff.drop('Unnamed: 0', axis = 1)
    df_diff['different'] = df_diff['predicted'] - df_diff['actual']
    df_diff['different'] = df_diff['different'].abs()
    df_diff.loc[len(df_diff)] = ['MAE',None,None,None,None,df_diff['different'].sum()/len(df_diff)]
    df_diff.to_csv(file_sub[i])

### [District]

In [42]:
file_dist = glob.glob(os.path.join('Data','Modeling','Light GBM','Separated','District','*'))
len(file_dist)

46

In [43]:
file_dist[0][43:]

'LGBM_dist_withoutCD_8018.csv'

In [44]:
for i in range(len(file_dist)):
    df_diff = pd.read_csv(file_dist[i])
    df_diff = df_diff.drop('Unnamed: 0', axis = 1)
    df_diff['different'] = df_diff['predicted'] - df_diff['actual']
    df_diff['different'] = df_diff['different'].abs()
    df_diff.loc[len(df_diff)] = ['MAE',None,None,None,None,df_diff['different'].sum()/len(df_diff)]
    df_diff.to_csv(file_dist[i])

## Province Level

### [Subdistrict]

In [60]:
file_pro_sub = glob.glob(os.path.join('Data','Modeling','Light GBM','Province','Subdistrict','*'))
len(file_pro_sub)

2

In [61]:
file_pro_sub[0][45:][:-4]

'LGBM_subdist_withoutCD'

In [62]:
for i in range(len(file_pro_sub)):
    df_diff = pd.read_csv(file_pro_sub[i])
    df_diff = df_diff.drop('Unnamed: 0', axis = 1)
    df_diff['different'] = df_diff['predicted'] - df_diff['actual']
    df_diff['different'] = df_diff['different'].abs()
    df_diff.loc[len(df_diff)] = ['MAE',None,None,None,None,df_diff['different'].sum()/len(df_diff)]
    df_diff.to_csv(file_pro_sub[i][45:][:-4]+'_diff.csv')

### [District]

In [63]:
file_pro_dist = glob.glob(os.path.join('Data','Modeling','Light GBM','Province','District','*'))
len(file_pro_dist)

2

In [64]:
file_pro_dist[0][42:][:-4]

'LGBM_dist_withCD'

In [65]:
for i in range(len(file_pro_dist)):
    df_diff = pd.read_csv(file_pro_dist[i])
    df_diff = df_diff.drop('Unnamed: 0', axis = 1)
    df_diff['different'] = df_diff['predicted'] - df_diff['actual']
    df_diff['different'] = df_diff['different'].abs()
    df_diff.loc[len(df_diff)] = ['MAE',None,None,None,None,df_diff['different'].sum()/len(df_diff)]
    df_diff.to_csv(file_pro_dist[i][42:][:-4]+'_diff.csv')