# Import libraries

We will make extensive use of `pandas` and `LightGBM` throughout this demo. `pickle` will be used to save and load model files.

In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import csv
import pickle
from sklearn.metrics import mean_squared_error
import matplotlib
%matplotlib inline
from matplotlib.pylab import rcParams
# Set matplotlib's default figure size to 12 x 4 (width, height)
rcParams['figure.figsize'] = 12, 4

# Slack channel notifications

Import `SlackClient` and create a basic function that posts a Slack notification to `channel` when the code has finished running

In [2]:
from slackclient import SlackClient
def slack_message(message, channel):
    """Post ``message`` to the Slack ``channel`` with ``chat.postMessage``.

    A ``SlackClient`` is created on every call from the placeholder token
    below and a single API request is sent. The API response is not
    inspected and nothing is returned.
    """
    token = 'your_token'  # placeholder: replace with a real Slack API token
    client = SlackClient(token)
    post_options = {
        'channel': channel,
        'text': message,
        'username': 'My Sweet Bot',
        'icon_emoji': ':upside_down_face:',
    }
    client.api_call('chat.postMessage', **post_options)

# Import data and set data types

Set the working directory and make sure the schema is correct before importing the train and test data. The date column `dates` is parsed into datetimes while the file is read (via `pd.to_datetime`); check the parsed values afterwards, although the parser is usually reliable

In [None]:
# Folder holding the input data (placeholder path). The trailing slash
# matters because the file name is appended by plain string concatenation.
data_dir = '/your/directory/'
data_file = '{}{}'.format(data_dir, 'data_file')

In [None]:
# Read the tab-separated data file and parse the `dates` column as datetimes.
# `parse_dates` alone performs the conversion; the `date_parser` argument is
# deliberately not used because it is deprecated in pandas 2.x.
# NOTE(review): later cells index the date column as 'DATE' while this cell
# parses 'dates' - confirm which name the file actually uses.
data = pd.read_csv(data_file, sep="\t", parse_dates=['dates'])

# Combine train and test set

Combine the `train` and `test` data sets before passing them through a one-hot encoder or a dense vector encoding. This is especially important for one-hot encoding because we want to maintain the same set of columns across both train and test sets. These can be inconsistent if a particular level of a categorical variable is present in one data set but not the other

* `cat_cols` are categorical columns that will be used in model training
* `index_cols` are columns that are used for indexing purposes and will not be fit in the model
* `pred_cols` are the response variable columns
* `num_cols` are the numeric columns that will be used in model training

In [None]:
# Column roles (placeholder names - replace with the real column names)
cat_cols = ['ATTRIBUTE_1', 'ATTRIBUTE_2', 'ATTRIBUTE_3']  # categorical model features
index_cols = ['FACTOR_1', 'FACTOR_2', 'FACTOR_3']  # indexing only, never fitted
pred_cols = ['RESPONSE']  # response variable

# Every column that is not categorical, index or response is treated as a
# numeric feature; the original column order of `data` is preserved.
non_numeric_cols = set(cat_cols + index_cols + pred_cols)
num_cols = [col for col in data.columns if col not in non_numeric_cols]

# Convert categorical variables to dense vectors

In [None]:
# Work on a separate frame that holds only the categorical columns
data_cat = pd.DataFrame(data[cat_cols])

# Only string-valued (object dtype) columns need encoding; each distinct
# string is replaced by its integer category code
string_features = [col for col in cat_cols if data_cat[col].dtype == 'object']
for col in string_features:
    data_cat[col] = pd.Categorical(data[col]).codes

# Prepare final dataframe before resplitting into train and test sets

Importantly, we want to ensure that `train_final` and `test_final` are the same rows of data as `train` and `test`. `DATE_SPLIT` is the date we want to use to split our train and test sets

In [None]:
# Reassemble the modelling frame: encoded categoricals next to the numeric columns
data_num = data[num_cols]
data_final = pd.concat([data_cat, data_num], axis=1)
# DATE is carried along for the train/test split; RESPONSE is the target
data_final['DATE'] = data['DATE']
data_final['RESPONSE'] = data['RESPONSE']
# print() call form so the cell runs on Python 3 as well as Python 2
print(data_final.shape)

In [None]:
# Split the encoded frame on DATE relative to the 'DATE_SPLIT' placeholder
# (replace the literal with the actual split date).
# NOTE(review): both comparisons are inclusive, so rows dated exactly on the
# split date land in both frames - confirm this overlap is intended.
final_train_mask = data_final['DATE'] <= 'DATE_SPLIT'
final_test_mask = data_final['DATE'] >= 'DATE_SPLIT'
train_final = data_final[final_train_mask]
test_final = data_final[final_test_mask]

for split_frame in (train_final, test_final):
    print(split_frame.shape)

In [None]:
# Apply the same date split to the raw (un-encoded) data so that its rows
# line up with the encoded train/test frames.
# NOTE(review): both comparisons are inclusive, so rows dated exactly on the
# split date land in both frames - confirm this overlap is intended.
raw_train_mask = data['DATE'] <= 'DATE_SPLIT'
raw_test_mask = data['DATE'] >= 'DATE_SPLIT'
train = data[raw_train_mask]
test = data[raw_test_mask]

for raw_frame in (train, test):
    print(raw_frame.shape)

# Create design matrix and response vector

In [None]:
# Response vectors
y_train = train_final['RESPONSE']
y_test = test_final['RESPONSE']

# Design matrices: everything except the response and the split date
non_feature_cols = ['RESPONSE', 'DATE']
x_train = train_final.drop(non_feature_cols, axis=1)
x_test = test_final.drop(non_feature_cols, axis=1)

# print() call form so the cell runs on Python 3 as well as Python 2
print(x_train.columns.values)

# Create Dataset objects for LightGBM

In [None]:
# Training set for LightGBM
lgb_train = lgb.Dataset(
    x_train,
    label=y_train,
    free_raw_data=False,
)
# Evaluation set, tied to the training set through `reference`
lgb_test = lgb.Dataset(
    x_test,
    label=y_test,
    reference=lgb_train,
    free_raw_data=False,
)

# Set hyperparameters for LightGBM

Set hyperparameters for training GBM. Early stopping rounds have also been implemented, so we can be ambitious and increase `n_estimators` to `1000`

In [None]:
# Leaf budget derived from a nominal tree depth: 2**depth - 1
depth = 8
num_leaves = (1 << depth) - 1

# NOTE(review): `n_estimators` here and `num_boost_round` in the training cell
# both specify 1000 rounds - confirm which of the two LightGBM honours.
# NOTE(review): in LightGBM's native API `subsample` presumably only takes
# effect together with a non-zero `subsample_freq` - confirm.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='l2',
    num_leaves=num_leaves,
    max_depth=-1,
    learning_rate=0.02,
    n_estimators=1000,
    min_split_gain=0.05,
    min_child_weight=0.5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.2,
    seed=100,
    silent=False,
)

# Train GBM

Train model against validation set. 

To do: Implement cross-validation

In [None]:
# Training budget and early-stopping patience
num_boost_round = 1000
early_stopping_rounds = 10
# Passed to lgb.train below and plotted in a later cell
evals_result = {}

# Both the training set and the held-out set are monitored during training,
# reported under the names 'train' and 'eval'
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=num_boost_round,
    valid_sets=[lgb_train, lgb_test],
    valid_names=['train', 'eval'],
    evals_result=evals_result,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=True,
)

# Notify the Slack channel once training has returned
slack_message("Booster object completed!", 'channel')

# Plot feature importance and print values

Plot the top 30 features by `split` importance. Create a dataframe that records the `split` and `gain` importance of each feature

In [None]:
# Plot at most 30 features, using 'split' importance
lgb.plot_importance(gbm, importance_type='split', max_num_features=30)

In [None]:
# One row per model feature with its 'split' and 'gain' importance
importance_columns = ['Feature', 'ImportanceWeight', 'ImportanceGain']
importance = pd.DataFrame(
    {
        'Feature': x_train.columns.values,
        'ImportanceWeight': gbm.feature_importance(importance_type='split'),
        'ImportanceGain': gbm.feature_importance(importance_type='gain'),
    },
    columns=importance_columns,
)

# Highest 'split' importance first
importance = importance.sort_values(by='ImportanceWeight', ascending=False)
importance.head()

# Plot L2 during training

In [None]:
# Plot the 'l2' metric from the evals_result dict that was passed to lgb.train
lgb.plot_metric(evals_result, metric='l2')

# Produce predictions for train and test sets before measuring accuracy

Calculate predictions for both train and test sets, and then calculate MSE and RMSE for both datasets

In [None]:
# Predict with the booster's recorded best iteration
gbm_train_preds = gbm.predict(x_train, num_iteration=gbm.best_iteration)
gbm_test_preds = gbm.predict(x_test, num_iteration=gbm.best_iteration)

# print() call form so the cell runs on Python 3 as well as Python 2
print(gbm_train_preds.shape)
print(gbm_test_preds.shape)

In [None]:
# Compute each MSE once and reuse it for the RMSE lines
train_mse = mean_squared_error(y_train, gbm_train_preds)
test_mse = mean_squared_error(y_test, gbm_test_preds)

# print() call form so the cell runs on Python 3 as well as Python 2
print("\nModel Report")
print("MSE Train : %f" % train_mse)
print("MSE Test: %f" % test_mse)
print("RMSE Train: %f" % train_mse ** 0.5)
print("RMSE Test: %f" % test_mse ** 0.5)

# Save LGBM model file and write .csv files to working directory

Save the LightGBM model file for future reference. A similar snippet that loads a previously saved model is commented out below. Then, write all files to the working directory

In [None]:
# Persist the trained booster; the context manager guarantees the file handle
# is flushed and closed as soon as the dump finishes
with open("gbm.pickle.dat", "wb") as model_file:
    pickle.dump(gbm, model_file)

In [None]:
# Optional: uncomment to reload a previously saved booster and recompute the
# predictions. Only unpickle files you trust - pickle can execute arbitrary
# code while loading.
# gbm = pickle.load(open("gbm.pickle.dat", "rb"))
# gbm_train_preds = gbm.predict(x_train)
# gbm_test_preds = gbm.predict(x_test)

In [None]:
# Optional: uncomment to print the model report for a reloaded booster
# (written with print() calls so it runs on Python 2 and Python 3)
# print("\nModel Report")
# print("MSE Train : %f" % mean_squared_error(y_train, gbm_train_preds))
# print("MSE Test: %f" % mean_squared_error(y_test, gbm_test_preds))
# print("RMSE Train: %f" % mean_squared_error(y_train, gbm_train_preds)**0.5)
# print("RMSE Test: %f" % mean_squared_error(y_test, gbm_test_preds)**0.5)

In [None]:
# Wrap the prediction arrays in single-column frames named 'RESPONSE'.
# Naming the column in the constructor keeps both frames consistent;
# assigning to a misspelt attribute such as `.column` would silently leave
# the default column label 0 in place.
train_preds = pd.DataFrame(gbm_train_preds, columns=['RESPONSE'])
test_preds = pd.DataFrame(gbm_test_preds, columns=['RESPONSE'])

In [None]:
# Raw data and predictions are written with their index column
exports = [
    (train, 'LGBM Train.csv'),
    (train_preds, 'LGBM Train Preds.csv'),
    (test, 'LGBM Test.csv'),
    (test_preds, 'LGBM Test Preds.csv'),
]
for frame, filename in exports:
    frame.to_csv(filename, sep=',')

# The feature-importance table is written without its index
importance.to_csv('LGBM Feature Importance.csv', index=False)

# Notify the Slack channel once every file has been written
slack_message("Files saved!", 'channel')