# 5) Model Improvement and Alternatives

In this notebook, we will look for different ways to improve the result, including but not limited to:

- Changing time interval for forward price changes;
- Using different numbers of hidden layers and hidden dimensions;
- Using other regression algorithms provided by SageMaker.

To make things more convenient, a helper class `SagemakerEstimatorHelper` is provided in `model_helper.py` to shorten the code needed to evaluate a model.

In [None]:
import sagemaker
import pandas as pd
import numpy as np
import boto3
import sagemaker

from model_helper import *
from data_processing import *
from data_reader import *
from features_helper import *
from plotting_helper import *

# `IPython.display` is the public import location for these helpers; importing
# them from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# Inject a CSS rule that sets the notebook's `.container` element to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# SageMaker handles shared by every experiment below: the session, the
# execution role, and the session's default S3 bucket.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# `data_dir` and `prefix` are passed to `upload_data` in each experiment;
# `output_path` is handed to `SagemakerEstimatorHelper` as its output location.
data_dir = 'processed_data'
prefix = 'sagemaker/capstone_capstone'
output_path = 's3://{}/{}'.format(bucket, prefix)

# Raw input frame from which every experiment derives its features.
original_df = read_all_csvs()

Each alternative is labelled with a tag, and its evaluation result is stored in `summary_dict` under that tag.

In [None]:
# Maps an experiment tag (e.g. 'original') to the value returned by
# `evalute_result` for that experiment; filled in by the cells below.
summary_dict = dict()

---

### Original Version

In [None]:
# --- Baseline experiment: 13 features, hidden layers "8_6", input_period=1 ---

# Build the feature frame. A dedicated name is used for the feature helper so
# that `helper` below refers to exactly one kind of object (the estimator helper).
feature_builder = FeaturesHelper(original_df)
target_features_list = [
    '60m_chg_std', '60m_z_price', '60m_z_volume', '60m_draw_up', '60m_draw_down',
    '5m_smoothed_volume_chg', 'log_volume', 'close_to_high', 'close_to_low',
    'close_to_open', 'adx', 'macd', 'fso',
]
feature_builder.run_features_list(target_features_list, log=False)
# `input_period` is presumably the forward price-change interval in minutes -- TODO confirm.
df = preprocess_data(feature_builder.get_result(bool_dropna=True), input_period=1)

input_df = df.copy()
train_df, test_df = split_train_test_df(input_df)
# The first column is used as the label, every remaining column as a feature.
label_col, feature_cols = train_df.columns[0], train_df.columns[1:]

# Define model
helper = SagemakerEstimatorHelper(target_algorithm='neural_network',
                                  output_path=output_path,
                                  train_entry_point='train.py',
                                  predict_entry_point='predict.py',
                                  source_dir='source',
                                  hyperparameters={'input_dim': len(target_features_list),
                                                   # presumably underscore-separated hidden layer sizes -- TODO confirm
                                                   'hidden_dim_list': "8_6",
                                                   'output_dim': 1,
                                                   'epochs': 100})

# Upload data to S3
helper.upload_data(train_df, 'train', data_dir=data_dir, prefix=prefix, force_update=True)
helper.upload_data(test_df, 'test', data_dir=data_dir, prefix=prefix, force_update=True)

# Train and Deploy model
helper.est_fit()
helper.deploy()

try:
    # Predict after deployment
    train_preds = helper.predict_in_chunks(train_df[feature_cols])
    test_preds = helper.predict_in_chunks(test_df[feature_cols])
    summary_dict['original'] = evalute_result(train_pred_array=train_preds,
                                              train_label_array=train_df[label_col],
                                              test_pred_array=test_preds,
                                              test_label_array=test_df[label_col])
finally:
    # Delete endpoint -- also when prediction or evaluation raises, so that a
    # failed run does not leave a deployed endpoint behind.
    helper.delete_endpoint()

In [None]:
# Predictions vs. actual labels on the training split (baseline run).
plot_predictions(
    predictions_array=train_preds,
    actuals_array=train_df[label_col],
    title='Train | Neural Network | Normal',
)

In [None]:
# Predictions vs. actual labels on the test split (baseline run).
plot_predictions(
    predictions_array=test_preds,
    actuals_array=test_df[label_col],
    title='Test | Neural Network | Normal',
)

---

### Alternative 1: Using more layers of neurons in the model

In [None]:
# --- Alternative: same 13 features as the baseline, deeper network "8_10_8_6_4" ---

# Build the feature frame. A dedicated name is used for the feature helper so
# that `helper` below refers to exactly one kind of object (the estimator helper).
feature_builder = FeaturesHelper(original_df)
target_features_list = [
    '60m_chg_std', '60m_z_price', '60m_z_volume', '60m_draw_up', '60m_draw_down',
    '5m_smoothed_volume_chg', 'log_volume', 'close_to_high', 'close_to_low',
    'close_to_open', 'adx', 'macd', 'fso',
]
feature_builder.run_features_list(target_features_list, log=False)
# `input_period` is presumably the forward price-change interval in minutes -- TODO confirm.
df = preprocess_data(feature_builder.get_result(bool_dropna=True), input_period=1)

input_df = df.copy()
train_df, test_df = split_train_test_df(input_df)
# The first column is used as the label, every remaining column as a feature.
label_col, feature_cols = train_df.columns[0], train_df.columns[1:]

# Define model
helper = SagemakerEstimatorHelper(target_algorithm='neural_network',
                                  output_path=output_path,
                                  train_entry_point='train.py',
                                  predict_entry_point='predict.py',
                                  source_dir='source',
                                  hyperparameters={'input_dim': len(target_features_list),
                                                   # presumably underscore-separated hidden layer sizes -- TODO confirm
                                                   'hidden_dim_list': "8_10_8_6_4",
                                                   'output_dim': 1,
                                                   'epochs': 100})

# Upload data to S3
helper.upload_data(train_df, 'train', data_dir=data_dir, prefix=prefix, force_update=True)
helper.upload_data(test_df, 'test', data_dir=data_dir, prefix=prefix, force_update=True)

# Train and Deploy model
helper.est_fit()
helper.deploy()

try:
    # Predict after deployment
    train_preds = helper.predict_in_chunks(train_df[feature_cols])
    test_preds = helper.predict_in_chunks(test_df[feature_cols])
    summary_dict['more_layers'] = evalute_result(train_pred_array=train_preds,
                                                 train_label_array=train_df[label_col],
                                                 test_pred_array=test_preds,
                                                 test_label_array=test_df[label_col])
finally:
    # Delete endpoint -- also when prediction or evaluation raises, so that a
    # failed run does not leave a deployed endpoint behind.
    helper.delete_endpoint()

In [None]:
# Predictions vs. actual labels on the training split (deeper network).
plot_predictions(
    predictions_array=train_preds,
    actuals_array=train_df[label_col],
    title='Train | Neural Network | More Layers',
)

In [None]:
# Predictions vs. actual labels on the test split (deeper network).
plot_predictions(
    predictions_array=test_preds,
    actuals_array=test_df[label_col],
    title='Test | Neural Network | More Layers',
)

---

### Alternative 2: Use fewer features (same number of layers)

In [None]:
# --- Alternative: reduced set of 8 features, same "8_6" network as the baseline ---

# Build the feature frame. A dedicated name is used for the feature helper so
# that `helper` below refers to exactly one kind of object (the estimator helper).
feature_builder = FeaturesHelper(original_df)
target_features_list = [
    '60m_chg_std', '60m_z_price', '60m_z_volume',
    'close_to_high', 'close_to_low', 'close_to_open',
    'macd', 'fso',
]
feature_builder.run_features_list(target_features_list, log=True)
# `input_period` is presumably the forward price-change interval in minutes -- TODO confirm.
df = preprocess_data(feature_builder.get_result(bool_dropna=True), input_period=1)

input_df = df.copy()
train_df, test_df = split_train_test_df(input_df)
# The first column is used as the label, every remaining column as a feature.
label_col, feature_cols = train_df.columns[0], train_df.columns[1:]

# Define model
helper = SagemakerEstimatorHelper(target_algorithm='neural_network',
                                  output_path=output_path,
                                  train_entry_point='train.py',
                                  predict_entry_point='predict.py',
                                  source_dir='source',
                                  hyperparameters={'input_dim': len(target_features_list),
                                                   # presumably underscore-separated hidden layer sizes -- TODO confirm
                                                   'hidden_dim_list': "8_6",
                                                   'output_dim': 1,
                                                   'epochs': 100})

# Upload data to S3
helper.upload_data(train_df, 'train', data_dir=data_dir, prefix=prefix, force_update=True)
helper.upload_data(test_df, 'test', data_dir=data_dir, prefix=prefix, force_update=True)

# Train and Deploy model
helper.est_fit()
helper.deploy()

try:
    # Predict after deployment
    train_preds = helper.predict_in_chunks(train_df[feature_cols])
    test_preds = helper.predict_in_chunks(test_df[feature_cols])
    summary_dict['less_features'] = evalute_result(train_pred_array=train_preds,
                                                   train_label_array=train_df[label_col],
                                                   test_pred_array=test_preds,
                                                   test_label_array=test_df[label_col])
finally:
    # Delete endpoint -- also when prediction or evaluation raises, so that a
    # failed run does not leave a deployed endpoint behind.
    helper.delete_endpoint()

In [None]:
# Predictions vs. actual labels on the training split (reduced feature set).
plot_predictions(
    predictions_array=train_preds,
    actuals_array=train_df[label_col],
    title='Train | Neural Network | Less Features',
)

In [None]:
# Predictions vs. actual labels on the test split (reduced feature set).
plot_predictions(
    predictions_array=test_preds,
    actuals_array=test_df[label_col],
    title='Test | Neural Network | Less Features',
)

---

### Alternative 3: Use more features (same number of layers)

In [None]:
# --- Alternative: every feature the helper reports as available, same "8_6" network ---

# Build the feature frame. A dedicated name is used for the feature helper so
# that `helper` below refers to exactly one kind of object (the estimator helper).
feature_builder = FeaturesHelper(original_df)
target_features_list = feature_builder.get_available_features()
feature_builder.run_features_list(target_features_list, log=True)
# `input_period` is presumably the forward price-change interval in minutes -- TODO confirm.
df = preprocess_data(feature_builder.get_result(bool_dropna=True), input_period=1)

input_df = df.copy()
train_df, test_df = split_train_test_df(input_df)
# The first column is used as the label, every remaining column as a feature.
label_col, feature_cols = train_df.columns[0], train_df.columns[1:]
# No placeholder entry is written to `summary_dict` here: the tag is only
# recorded once evaluation succeeds, so a failed run cannot leave an empty
# 'more_features' result in the saved summary.

# Define model
helper = SagemakerEstimatorHelper(target_algorithm='neural_network',
                                  output_path=output_path,
                                  train_entry_point='train.py',
                                  predict_entry_point='predict.py',
                                  source_dir='source',
                                  hyperparameters={'input_dim': len(target_features_list),
                                                   # presumably underscore-separated hidden layer sizes -- TODO confirm
                                                   'hidden_dim_list': "8_6",
                                                   'output_dim': 1,
                                                   'epochs': 100})

# Upload data to S3
helper.upload_data(train_df, 'train', data_dir=data_dir, prefix=prefix, force_update=True)
helper.upload_data(test_df, 'test', data_dir=data_dir, prefix=prefix, force_update=True)

# Train and Deploy model
helper.est_fit()
helper.deploy()

try:
    # Predict after deployment
    train_preds = helper.predict_in_chunks(train_df[feature_cols])
    test_preds = helper.predict_in_chunks(test_df[feature_cols])
    summary_dict['more_features'] = evalute_result(train_pred_array=train_preds,
                                                   train_label_array=train_df[label_col],
                                                   test_pred_array=test_preds,
                                                   test_label_array=test_df[label_col])
finally:
    # Delete endpoint -- also when prediction or evaluation raises, so that a
    # failed run does not leave a deployed endpoint behind.
    helper.delete_endpoint()

In [None]:
# Predictions vs. actual labels on the training split (all available features).
plot_predictions(
    predictions_array=train_preds,
    actuals_array=train_df[label_col],
    title='Train | Neural Network | More Features',
)

In [None]:
# Predictions vs. actual labels on the test split (all available features).
plot_predictions(
    predictions_array=test_preds,
    actuals_array=test_df[label_col],
    title='Test | Neural Network | More Features',
)

---

### Alternative 3: Predicting forward price changes of a different interval (5 minutes)

In [None]:
# --- Alternative: baseline features and network, input_period=5 ---

# Build the feature frame. A dedicated name is used for the feature helper so
# that `helper` below refers to exactly one kind of object (the estimator helper).
feature_builder = FeaturesHelper(original_df)
target_features_list = [
    '60m_chg_std', '60m_z_price', '60m_z_volume', '60m_draw_up', '60m_draw_down',
    '5m_smoothed_volume_chg', 'log_volume', 'close_to_high', 'close_to_low',
    'close_to_open', 'adx', 'macd', 'fso',
]
feature_builder.run_features_list(target_features_list, log=False)
# `input_period` is presumably the forward price-change interval in minutes -- TODO confirm.
df = preprocess_data(feature_builder.get_result(bool_dropna=True), input_period=5)

input_df = df.copy()
train_df, test_df = split_train_test_df(input_df)
# The first column is used as the label, every remaining column as a feature.
label_col, feature_cols = train_df.columns[0], train_df.columns[1:]

# Define model
helper = SagemakerEstimatorHelper(target_algorithm='neural_network',
                                  output_path=output_path,
                                  train_entry_point='train.py',
                                  predict_entry_point='predict.py',
                                  source_dir='source',
                                  hyperparameters={'input_dim': len(target_features_list),
                                                   # presumably underscore-separated hidden layer sizes -- TODO confirm
                                                   'hidden_dim_list': "8_6",
                                                   'output_dim': 1,
                                                   'epochs': 100})

# Upload data to S3
helper.upload_data(train_df, 'train', data_dir=data_dir, prefix=prefix, force_update=True)
helper.upload_data(test_df, 'test', data_dir=data_dir, prefix=prefix, force_update=True)

# Train and Deploy model
helper.est_fit()
helper.deploy()

try:
    # Predict after deployment. Each split is sent to the endpoint once; the
    # earlier version of this cell ran both prediction passes twice.
    train_preds = helper.predict_in_chunks(train_df[feature_cols])
    test_preds = helper.predict_in_chunks(test_df[feature_cols])
    summary_dict['5_minutes'] = evalute_result(train_pred_array=train_preds,
                                               train_label_array=train_df[label_col],
                                               test_pred_array=test_preds,
                                               test_label_array=test_df[label_col])
finally:
    # Delete endpoint -- also when prediction or evaluation raises, so that a
    # failed run does not leave a deployed endpoint behind.
    helper.delete_endpoint()

In [None]:
# Predictions vs. actual labels on the training split (input_period=5 run).
plot_predictions(
    predictions_array=train_preds,
    actuals_array=train_df[label_col],
    title='Train | Neural Network | 5 Minutes Forward Price Changes',
)

In [None]:
# Predictions vs. actual labels on the test split (input_period=5 run).
plot_predictions(
    predictions_array=test_preds,
    actuals_array=test_df[label_col],
    title='Test | Neural Network | 5 Minutes Forward Price Changes',
)

---

### Alternative 4: Predicting forward price changes of a different interval (10 minutes)

In [None]:
# --- Alternative: baseline features and network, input_period=10 ---

# Build the feature frame. A dedicated name is used for the feature helper so
# that `helper` below refers to exactly one kind of object (the estimator helper).
feature_builder = FeaturesHelper(original_df)
target_features_list = [
    '60m_chg_std', '60m_z_price', '60m_z_volume', '60m_draw_up', '60m_draw_down',
    '5m_smoothed_volume_chg', 'log_volume', 'close_to_high', 'close_to_low',
    'close_to_open', 'adx', 'macd', 'fso',
]
feature_builder.run_features_list(target_features_list, log=False)
# `input_period` is presumably the forward price-change interval in minutes -- TODO confirm.
df = preprocess_data(feature_builder.get_result(bool_dropna=True), input_period=10)

input_df = df.copy()
train_df, test_df = split_train_test_df(input_df)
# The first column is used as the label, every remaining column as a feature.
label_col, feature_cols = train_df.columns[0], train_df.columns[1:]

# Define model
helper = SagemakerEstimatorHelper(target_algorithm='neural_network',
                                  output_path=output_path,
                                  train_entry_point='train.py',
                                  predict_entry_point='predict.py',
                                  source_dir='source',
                                  hyperparameters={'input_dim': len(target_features_list),
                                                   # presumably underscore-separated hidden layer sizes -- TODO confirm
                                                   'hidden_dim_list': "8_6",
                                                   'output_dim': 1,
                                                   'epochs': 100})

# Upload data to S3
helper.upload_data(train_df, 'train', data_dir=data_dir, prefix=prefix, force_update=True)
helper.upload_data(test_df, 'test', data_dir=data_dir, prefix=prefix, force_update=True)

# Train and Deploy model
helper.est_fit()
helper.deploy()

try:
    # Predict after deployment
    train_preds = helper.predict_in_chunks(train_df[feature_cols])
    test_preds = helper.predict_in_chunks(test_df[feature_cols])
    summary_dict['10_minutes'] = evalute_result(train_pred_array=train_preds,
                                                train_label_array=train_df[label_col],
                                                test_pred_array=test_preds,
                                                test_label_array=test_df[label_col])
finally:
    # Delete endpoint -- also when prediction or evaluation raises, so that a
    # failed run does not leave a deployed endpoint behind.
    helper.delete_endpoint()

In [None]:
# Predictions vs. actual labels on the training split (input_period=10 run).
plot_predictions(
    predictions_array=train_preds,
    actuals_array=train_df[label_col],
    title='Train | Neural Network | 10 Minutes Forward Price Changes',
)

In [None]:
# Predictions vs. actual labels on the test split (input_period=10 run).
plot_predictions(
    predictions_array=test_preds,
    actuals_array=test_df[label_col],
    title='Test | Neural Network | 10 Minutes Forward Price Changes',
)

---

### Alternative 5: Predicting forward price changes using XGBoost (cells below are currently commented out)

In [None]:
# helper = FeaturesHelper(original_df)
# target_features_list = ['60m_chg_std', '60m_z_price', '60m_z_volume', '60m_draw_up', '60m_draw_down', '5m_smoothed_volume_chg', 
#                         'log_volume', 'close_to_high', 'close_to_low', 'close_to_open', 'adx', 'macd', 'fso']
# helper.run_features_list(target_features_list, log=False)
# df = preprocess_data(helper.get_result(bool_dropna=True), input_period=10)

# input_df = df.copy()
# train_df, test_df = split_train_test_df(input_df)
# label_col, feature_cols = train_df.columns[0], train_df.columns[1:]

# # Define model
# helper = SagemakerEstimatorHelper(target_algorithm='xgboost', 
#                                   output_path=output_path)
# helper.set_hyperparameters(input_hyperparameters={'max_depth': 5,
#                                                   'eta': 0.2,
#                                                   'gamma': 4,
#                                                   'min_child_weight': 6,
#                                                   'subsample': 0.8,
#                                                   'early_stopping_rounds': 10,
#                                                   'num_round': 200})

# # Upload data to S3
# helper.upload_data(train_df, 'train', data_dir=data_dir, prefix=prefix, force_update=True)
# helper.upload_data(test_df, 'test', data_dir=data_dir, prefix=prefix, force_update=True)

# # Train and Deploy model
# helper.est_fit()
# helper.deploy()

# # Predict after deployment
# train_preds = helper.predict_in_chunks(train_df[feature_cols])
# test_preds = helper.predict_in_chunks(test_df[feature_cols])
# summary_dict['xgboost'] = evalute_result(train_pred_array=train_preds, train_label_array=train_df[label_col], test_pred_array=test_preds, test_label_array=test_df[label_col])


# # Delete endpoint
# helper.delete_endpoint()

In [None]:
# Training set plotting
# plot_predictions(predictions_array=train_preds, actuals_array=train_df[label_col], title='Train | XGBoost | Normal')

In [None]:
# Testing set plotting
# plot_predictions(predictions_array=test_preds, actuals_array=test_df[label_col], title='Test | XGBoost | Normal')

---

### Alternative 6: Predicting forward price changes using LinearLearner

In [None]:
# --- Alternative: baseline features, SageMaker 'linear_learner' instead of the neural network ---

# Build the feature frame. A dedicated name is used for the feature helper so
# that `helper` below refers to exactly one kind of object (the estimator helper).
feature_builder = FeaturesHelper(original_df)
target_features_list = [
    '60m_chg_std', '60m_z_price', '60m_z_volume', '60m_draw_up', '60m_draw_down',
    '5m_smoothed_volume_chg', 'log_volume', 'close_to_high', 'close_to_low',
    'close_to_open', 'adx', 'macd', 'fso',
]
feature_builder.run_features_list(target_features_list, log=False)
# NOTE(review): this uses input_period=10 (as in the '10_minutes' run), not the
# baseline's input_period=1, although the plots for this run are titled
# 'Normal' -- confirm which baseline this result is meant to be compared with.
df = preprocess_data(feature_builder.get_result(bool_dropna=True), input_period=10)

input_df = df.copy()
train_df, test_df = split_train_test_df(input_df)
# The first column is used as the label, every remaining column as a feature.
label_col, feature_cols = train_df.columns[0], train_df.columns[1:]

# Define model
helper = SagemakerEstimatorHelper(target_algorithm='linear_learner',
                                  output_path=output_path,
                                  hyperparameters={'epochs': 100})

# Upload data to S3
helper.upload_data(train_df, 'train', data_dir=data_dir, prefix=prefix, force_update=True)
helper.upload_data(test_df, 'test', data_dir=data_dir, prefix=prefix, force_update=True)

# Train and Deploy model
helper.est_fit()
helper.deploy()

try:
    # Predict after deployment
    train_preds = helper.predict_in_chunks(train_df[feature_cols])
    test_preds = helper.predict_in_chunks(test_df[feature_cols])
    summary_dict['linear_learner'] = evalute_result(train_pred_array=train_preds,
                                                    train_label_array=train_df[label_col],
                                                    test_pred_array=test_preds,
                                                    test_label_array=test_df[label_col])
finally:
    # Delete endpoint -- also when prediction or evaluation raises, so that a
    # failed run does not leave a deployed endpoint behind.
    helper.delete_endpoint()

In [None]:
# Predictions vs. actual labels on the training split (linear learner run).
plot_predictions(
    predictions_array=train_preds,
    actuals_array=train_df[label_col],
    title='Train | Linear Learner | Normal',
)

In [None]:
# Predictions vs. actual labels on the test split (linear learner run).
plot_predictions(
    predictions_array=test_preds,
    actuals_array=test_df[label_col],
    title='Test | Linear Learner | Normal',
)

### Wrap-up: Save summary to local directory

In [None]:
import json

# Write every collected evaluation result, keyed by experiment tag, to a JSON
# file in the current working directory (overwriting any previous file).
with open('summary_dict.json', mode='w') as summary_file:
    json.dump(summary_dict, summary_file)