In [None]:
import os
import boto3
import pandas as pd
import sys
from datetime import datetime
from datetime import timedelta 

from matplotlib import pyplot
import numpy as np
import matplotlib.pyplot as plt
import io
import time
from sklearn.preprocessing import MinMaxScaler
import psutil
from keras import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
import random
from keras.layers import Dense, Dropout, Flatten,Conv2D, MaxPooling2D
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report
from statistics import mean
import seaborn as sns

%matplotlib inline 

In [None]:
# Location of the labelled dataset on S3: a common root directory plus the
# sub-directory of the feature combination to load (joined into the key prefix
# by the loading cell).
# NOTE(review): the path value is a redacted placeholder; replace it with a
# quoted string before running this cell.
labelledDataCommonPath = <path_to_labelleddata_dir_on_s3>
featureCombinationName = 'Open_High_Low_Close_Volume_GADF_15PPND_New'

In [None]:
# NOTE(review): redacted placeholder; substitute the real bucket name as a string.
BUCKET_NAME = <s3_bucket_name>
# The resource handle is used to list objects in the bucket; the low-level
# client is used to download each object's body.
s3Res = boto3.resource('s3')
s3Client = boto3.client('s3')
my_bucket = s3Res.Bucket(BUCKET_NAME)

# Side length of the square input matrix; used as both height and width of
# the CNN input_shape.
INPUT_MATRIX_WIDTH = 21

In [None]:
# Key prefix under which the pickled feature (x) and label (y) files live.
prefix = "{}/{}/".format(labelledDataCommonPath, featureCombinationName)

X_data = []
Y_data = []

# Walk every object under the prefix: keys containing 'x.pkl' feed X_data,
# keys containing 'y.pkl' feed Y_data, anything else is skipped.
# NOTE(review): samples and labels are paired purely by listing order, and
# pickle.loads can execute arbitrary code from the payload, so only point this
# at a trusted bucket.
for s3_object in my_bucket.objects.filter(Prefix=prefix):
    object_key = s3_object.key
    if 'x.pkl' in object_key:
        destination = X_data
    elif 'y.pkl' in object_key:
        destination = Y_data
    else:
        continue

    print(object_key)
    payload = s3Client.get_object(Bucket=BUCKET_NAME, Key=object_key)['Body'].read()
    destination.append(pickle.loads(payload))

In [None]:
# Label array used as the target by the grid searches and cross-validation
# loops further down.
Y_dataBinary = np.array(Y_data)
# Bare expression: echoes the label array as this cell's output.
Y_dataBinary

In [None]:
# Materialise the downloaded sample and label lists as NumPy arrays.
X_dataArr, Y_dataArr = (np.array(collected) for collected in (X_data, Y_data))

In [None]:
def create_model(activation='relu', dropout=True, pooling=True,
                 filters=32, dense_units=128, dropoutp1=0.25, dropoutp2=0.5):
    """Build and compile a two-convolution CNN for binary classification.

    Layer order: Conv2D -> [MaxPooling2D] -> Conv2D -> [MaxPooling2D] ->
    [Dropout] -> Flatten -> Dense -> [Dropout] -> Dense(1, sigmoid).
    Bracketed layers are only added when the matching flag is truthy.

    Parameters
    ----------
    activation : str
        Activation used by both convolutions and the hidden dense layer.
    dropout : bool
        Add a Dropout layer after the convolution stack and after the hidden
        dense layer.
    pooling : bool
        Add a 2x2 MaxPooling2D layer after each convolution.
    filters : int
        Number of filters in each convolution (default 32, the original
        hard-coded value).
    dense_units : int
        Width of the hidden dense layer (default 128, the original value).
    dropoutp1, dropoutp2 : float
        Dropout rates after the convolution stack and after the hidden dense
        layer (defaults 0.25 and 0.5, the original values).

    Returns
    -------
    A compiled keras ``Sequential`` model (binary cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    cnn = Sequential()
    # Input is INPUT_MATRIX_WIDTH x INPUT_MATRIX_WIDTH with 5 channels.
    cnn.add(Conv2D(filters=filters, kernel_size=(2,2), padding='same', activation=activation,
                   input_shape=(INPUT_MATRIX_WIDTH, INPUT_MATRIX_WIDTH, 5)))

    if pooling:
        cnn.add(MaxPooling2D(pool_size=(2,2)))

    cnn.add(Conv2D(filters=filters, kernel_size=(2,2), padding='same', activation=activation))

    if pooling:
        cnn.add(MaxPooling2D(pool_size=(2,2)))

    if dropout:
        cnn.add(Dropout(dropoutp1))

    cnn.add(Flatten())
    cnn.add(Dense(dense_units, activation=activation))

    if dropout:
        cnn.add(Dropout(dropoutp2))

    # Single sigmoid unit: the output is one value in (0, 1) per sample.
    cnn.add(Dense(1, activation='sigmoid'))
    cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn

In [None]:
# Exhaustive 3-fold grid search over activation, dropout, pooling and batch
# size for the network built by create_model; every candidate trains 25 epochs.
model = KerasClassifier(build_fn=create_model, epochs=25, verbose=0)

activation = ['relu', 'tanh']
dropout = [True, False]
pooling = [True, False]
batch_size = [50, 100]

param_grid = {
    'activation': activation,
    'dropout': dropout,
    'pooling': pooling,
    'batch_size': batch_size,
}

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=3,
)
grid_result = grid.fit(X_dataArr, Y_dataBinary)

print("best %f using %s" %(grid_result.best_score_, grid_result.best_params_))

## 32, 32, 128, pooling=true, dropout=false

In [None]:
# Build one network with the configuration evaluated in this section (ReLU,
# pooling on, dropout off) just to print its layer-by-layer summary.
cnn = create_model(activation='relu', dropout=False, pooling=True)
cnn.summary()

In [None]:
# 3-fold stratified cross-validation of the create_model network
# (ReLU, pooling, no dropout), 40 epochs, batch size 100.
kf = StratifiedKFold(n_splits=3)
history, confusions, classifReports = [], [], []

fold = 0

for train, test in kf.split(X_dataArr, Y_dataBinary):
    print('Running fold [%d]'.ljust(100,'*') %fold)
    fold += 1

    # A fresh, untrained network per fold so no weights carry over.
    cnn = create_model(activation='relu', dropout=False, pooling=True)

    x_train, y_train = X_dataArr[train], Y_dataBinary[train]
    x_test, y_test = X_dataArr[test], Y_dataBinary[test]

    # validation_split=0.2 holds out part of the training fold so the History
    # object also records val_* curves.
    history.append(
        cnn.fit(x=x_train, y=y_train, validation_split=0.2, epochs=40, batch_size=100, verbose=0)
    )

    # Round the sigmoid outputs to hard 0/1 predictions before scoring.
    y_pred_R = np.round(cnn.predict(x_test))
    confusions.append(confusion_matrix(y_test, y_pred_R))

    clfr = classification_report(y_test, y_pred_R, output_dict=True)
    print(clfr)
    classifReports.append(clfr)

In [None]:
# Training vs. validation accuracy curves of one cross-validation fold.
j=2  # index of the fold to inspect; the loss-plot cell that follows reuses it
plt.plot(history[j].history['acc'])
plt.plot(history[j].history['val_acc'])
# The legend must name the plotted series: these are accuracy curves, which
# the previous legend mislabelled as 'loss'/'val_loss'.
plt.legend(['acc','val_acc'])

In [None]:
# Training vs. validation loss curves for fold `j` (set in an earlier cell).
for series in ('loss', 'val_loss'):
    plt.plot(history[j].history[series])
plt.legend(['loss','val_loss'])

In [None]:
# Re-run of the 3-fold cross-validation with training limited to 22 epochs
# (same network configuration, batch size 100).

kf = StratifiedKFold(n_splits=3)
history, confusions, classifReports = [], [], []

fold = 0

for train, test in kf.split(X_dataArr, Y_dataBinary):
    print('Running fold [%d]'.ljust(100,'*') %fold)
    fold += 1

    # A fresh, untrained network per fold so no weights carry over.
    cnn = create_model(activation='relu', dropout=False, pooling=True)

    x_train, y_train = X_dataArr[train], Y_dataBinary[train]
    x_test, y_test = X_dataArr[test], Y_dataBinary[test]

    history.append(
        cnn.fit(x=x_train, y=y_train, validation_split=0.2, epochs=22, batch_size=100, verbose=0)
    )

    # Round the sigmoid outputs to hard 0/1 predictions before scoring.
    y_pred_R = np.round(cnn.predict(x_test))
    confusions.append(confusion_matrix(y_test, y_pred_R))

    clfr = classification_report(y_test, y_pred_R, output_dict=True)
    print(clfr)
    classifReports.append(clfr)

In [None]:
# Per-fold scores of the class reported under key '1', followed by the mean
# of each score across folds (printed in the order f1, recall, precision).
class_one_rows = [fold_report['1'] for fold_report in classifReports]

f1 = [row['f1-score'] for row in class_one_rows]
recal = [row['recall'] for row in class_one_rows]
prec = [row['precision'] for row in class_one_rows]

for fold_scores in (f1, recal, prec):
    print(mean(fold_scores))

In [None]:
# Pool the per-fold confusion matrices into a single 2x2 count matrix.
# Each fold matrix is added as a whole (no i/j index loops), so the
# notebook-level `j` that the history-plot cells use to pick a fold is not
# overwritten by this cell.
finConf = np.zeros((2,2), dtype=int)
for fold_conf in confusions:
    # reshape(2, 2) is a no-op for a binary confusion matrix and raises
    # instead of silently broadcasting if a fold produced another shape.
    finConf += np.asarray(fold_conf).reshape(2, 2)

# Cell names in confusion_matrix layout (rows: true class, columns: predicted class).
labels = ['True Neg','False Pos','False Neg','True Pos']
labels = np.asarray(labels).reshape(2,2)

# Annotate every cell with its name and its share of all pooled test samples;
# `labels` used to be built but never passed to the heatmap.
shares = finConf/np.sum(finConf)
annotations = np.asarray(
    ['{}\n{:.2%}'.format(name, share) for name, share in zip(labels.flatten(), shares.flatten())]
).reshape(2,2)
sns.heatmap(shares, annot=annotations, fmt='', cmap='Blues')

In [None]:
# Macro-averaged precision / recall / f1 of every fold, then the mean over folds.
macro_rows = [fold_report['macro avg'] for fold_report in classifReports]
macroPrec = [row['precision'] for row in macro_rows]
macroRecall = [row['recall'] for row in macro_rows]
macrof1 = [row['f1-score'] for row in macro_rows]

print(np.mean(macroPrec))
print(np.mean(macroRecall))
print(np.mean(macrof1))


# Same summary for the support-weighted averages.
weighted_rows = [fold_report['weighted avg'] for fold_report in classifReports]
weighPrec = [row['precision'] for row in weighted_rows]
weighRecall = [row['recall'] for row in weighted_rows]
weighf1 = [row['f1-score'] for row in weighted_rows]

print(np.mean(weighPrec))
print(np.mean(weighRecall))
print(np.mean(weighf1))

# Grid search 64,64,128

In [None]:
def create_model_64(dropout=True, pooling=True, dropoutp1=0.25, dropoutp2=0.5):
    """Build and compile the 64-filter variant of the two-convolution CNN.

    Layer order: Conv2D(64) -> [MaxPooling2D] -> Conv2D(64) -> [MaxPooling2D]
    -> [Dropout(dropoutp1)] -> Flatten -> Dense(128) -> [Dropout(dropoutp2)]
    -> Dense(1, sigmoid). Bracketed layers are added only when the matching
    flag compares equal to True. All hidden activations are ReLU.

    Returns the compiled model (binary cross-entropy, Adam, accuracy metric).
    """
    with_pooling = pooling == True
    with_dropout = dropout == True

    # Settings shared by both convolution layers.
    conv_settings = dict(filters=64, kernel_size=(2,2), padding='same', activation='relu')

    cnn = Sequential()
    cnn.add(Conv2D(input_shape=(INPUT_MATRIX_WIDTH, INPUT_MATRIX_WIDTH, 5), **conv_settings))
    if with_pooling:
        cnn.add(MaxPooling2D(pool_size=(2,2)))

    cnn.add(Conv2D(**conv_settings))
    if with_pooling:
        cnn.add(MaxPooling2D(pool_size=(2,2)))

    if with_dropout:
        cnn.add(Dropout(dropoutp1))

    cnn.add(Flatten())
    cnn.add(Dense(128, activation='relu'))

    if with_dropout:
        cnn.add(Dropout(dropoutp2))

    cnn.add(Dense(1, activation='sigmoid'))
    cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn

In [None]:
# Exhaustive 3-fold grid search for the 64-filter network: dropout and pooling
# switches, batch size and both dropout rates; every candidate trains 25 epochs.
model = KerasClassifier(build_fn=create_model_64, epochs=25, verbose=0)

dropout = [True, False]
pooling = [True, False]
batch_size = [50, 100]
dropoutp1 = [0.25,0.5]
dropoutp2 = [0.25,0.5]

param_grid = {
    'dropout': dropout,
    'pooling': pooling,
    'batch_size': batch_size,
    'dropoutp1': dropoutp1,
    'dropoutp2': dropoutp2,
}

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=3,
)
grid_result = grid.fit(X_dataArr, Y_dataBinary)

print("best %f using %s" %(grid_result.best_score_, grid_result.best_params_))

In [None]:
# 3-fold stratified cross-validation of the 64-filter network
# (pooling and dropout on, rates 0.25 / 0.5), 40 epochs, batch size 50.
kf = StratifiedKFold(n_splits=3)
history, confusions, classifReports = [], [], []

fold = 0

for train, test in kf.split(X_dataArr, Y_dataBinary):
    print('Running fold [%d]'.ljust(100,'*') %fold)
    fold += 1

    # A fresh, untrained network per fold so no weights carry over.
    cnn = create_model_64(dropout=True, pooling=True, dropoutp1=0.25, dropoutp2=0.5)

    x_train, y_train = X_dataArr[train], Y_dataBinary[train]
    x_test, y_test = X_dataArr[test], Y_dataBinary[test]

    history.append(
        cnn.fit(x=x_train, y=y_train, validation_split=0.2, epochs=40, batch_size=50, verbose=0)
    )

    # Round the sigmoid outputs to hard 0/1 predictions before scoring.
    y_pred_R = np.round(cnn.predict(x_test))
    confusions.append(confusion_matrix(y_test, y_pred_R))

    clfr = classification_report(y_test, y_pred_R, output_dict=True)
    print(clfr)
    classifReports.append(clfr)

In [None]:
# Build one 64-filter network with the configuration used in the
# cross-validation above, only to print its layer-by-layer summary.
# Note that this rebinds `cnn`, replacing the model trained in the last fold.
cnn = create_model_64(dropout=True, pooling=True, dropoutp1=0.25, dropoutp2=0.5)
cnn.summary()

In [None]:
# Accuracy curves of fold `j`, then the cross-fold means for the class
# reported under key '1'.
j = 2
for series in ('acc', 'val_acc'):
    plt.plot(history[j].history[series])

plt.legend(['acc','val_acc'])

class_one_rows = [fold_report['1'] for fold_report in classifReports]
f1 = [row['f1-score'] for row in class_one_rows]
recal = [row['recall'] for row in class_one_rows]
prec = [row['precision'] for row in class_one_rows]

for fold_scores in (f1, recal, prec):
    print(mean(fold_scores))

In [None]:
# Training vs. validation loss curves for fold `j` (set in an earlier cell).
for series in ('loss', 'val_loss'):
    plt.plot(history[j].history[series])
plt.legend(['loss','val_loss'])

In [None]:
# Re-run of the 64-filter cross-validation with training limited to 14 epochs
# (same configuration, batch size 50).

kf = StratifiedKFold(n_splits=3)
history, confusions, classifReports = [], [], []

fold = 0

for train, test in kf.split(X_dataArr, Y_dataBinary):
    print('Running fold [%d]'.ljust(100,'*') %fold)
    fold += 1

    # A fresh, untrained network per fold so no weights carry over.
    cnn = create_model_64(dropout=True, pooling=True, dropoutp1=0.25, dropoutp2=0.5)

    x_train, y_train = X_dataArr[train], Y_dataBinary[train]
    x_test, y_test = X_dataArr[test], Y_dataBinary[test]

    history.append(
        cnn.fit(x=x_train, y=y_train, validation_split=0.2, epochs=14, batch_size=50, verbose=0)
    )

    # Round the sigmoid outputs to hard 0/1 predictions before scoring.
    y_pred_R = np.round(cnn.predict(x_test))
    confusions.append(confusion_matrix(y_test, y_pred_R))

    clfr = classification_report(y_test, y_pred_R, output_dict=True)
    print(clfr)
    classifReports.append(clfr)

In [None]:
# Pool the per-fold confusion matrices into a single 2x2 count matrix.
# Each fold matrix is added as a whole (no i/j index loops), so the
# notebook-level `j` that the history-plot cells use to pick a fold is not
# overwritten by this cell.
finConf = np.zeros((2,2), dtype=int)
for fold_conf in confusions:
    # reshape(2, 2) is a no-op for a binary confusion matrix and raises
    # instead of silently broadcasting if a fold produced another shape.
    finConf += np.asarray(fold_conf).reshape(2, 2)

# Cell names in confusion_matrix layout (rows: true class, columns: predicted class).
labels = ['True Neg','False Pos','False Neg','True Pos']
labels = np.asarray(labels).reshape(2,2)

# Annotate every cell with its name and its share of all pooled test samples;
# `labels` used to be built but never passed to the heatmap.
shares = finConf/np.sum(finConf)
annotations = np.asarray(
    ['{}\n{:.2%}'.format(name, share) for name, share in zip(labels.flatten(), shares.flatten())]
).reshape(2,2)
sns.heatmap(shares, annot=annotations, fmt='', cmap='Blues')

In [None]:
# Macro-averaged precision / recall / f1 of every fold, then the mean over folds.
macro_rows = [fold_report['macro avg'] for fold_report in classifReports]
macroPrec = [row['precision'] for row in macro_rows]
macroRecall = [row['recall'] for row in macro_rows]
macrof1 = [row['f1-score'] for row in macro_rows]

print(np.mean(macroPrec))
print(np.mean(macroRecall))
print(np.mean(macrof1))


# Same summary for the support-weighted averages.
weighted_rows = [fold_report['weighted avg'] for fold_report in classifReports]
weighPrec = [row['precision'] for row in weighted_rows]
weighRecall = [row['recall'] for row in weighted_rows]
weighf1 = [row['f1-score'] for row in weighted_rows]

print(np.mean(weighPrec))
print(np.mean(weighRecall))
print(np.mean(weighf1))

# 64, 64, 128, pooling=false, dropout=true

In [None]:
# 3-fold stratified cross-validation of a 64/64/128 network built inline:
# no pooling layers, dropout 0.25 / 0.5, 40 epochs, batch size 50.
kf = StratifiedKFold(n_splits=3)
history, confusions, classifReports = [], [], []

fold = 0

for train, test in kf.split(X_dataArr, Y_dataBinary):
    print('Running fold [%d]'.ljust(100,'*') %fold)
    fold += 1

    # Settings shared by both convolution layers.
    conv_settings = dict(filters=64, kernel_size=(2,2), padding='same', activation='relu')

    # A fresh, untrained network per fold so no weights carry over.
    cnn = Sequential()
    cnn.add(Conv2D(input_shape=(INPUT_MATRIX_WIDTH, INPUT_MATRIX_WIDTH, 5), **conv_settings))
    cnn.add(Conv2D(**conv_settings))
    cnn.add(Dropout(0.25))
    cnn.add(Flatten())
    cnn.add(Dense(128, activation='relu'))
    cnn.add(Dropout(0.5))
    cnn.add(Dense(1, activation='sigmoid'))
    cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    x_train, y_train = X_dataArr[train], Y_dataBinary[train]
    x_test, y_test = X_dataArr[test], Y_dataBinary[test]

    history.append(
        cnn.fit(x=x_train, y=y_train, validation_split=0.2, epochs=40, batch_size=50, verbose=0)
    )

    # Round the sigmoid outputs to hard 0/1 predictions before scoring.
    y_pred_R = np.round(cnn.predict(x_test))
    confusions.append(confusion_matrix(y_test, y_pred_R))

    clfr = classification_report(y_test, y_pred_R, output_dict=True)
    print(clfr)
    classifReports.append(clfr)

In [None]:
# Accuracy and loss curves (training and validation) of fold `j` on one axis.
j = 2
for series in ('acc', 'val_acc', 'loss', 'val_loss'):
    plt.plot(history[j].history[series])
plt.legend(['acc','val_acc','loss','val_loss'])

In [None]:
# Per-fold scores of the class reported under key '1', followed by the mean
# of each score across folds (printed in the order f1, recall, precision).
class_one_rows = [fold_report['1'] for fold_report in classifReports]

f1 = [row['f1-score'] for row in class_one_rows]
recal = [row['recall'] for row in class_one_rows]
prec = [row['precision'] for row in class_one_rows]

for fold_scores in (f1, recal, prec):
    print(mean(fold_scores))

# 64, 64, 256, pooling=false, dropout=true

In [None]:
# 3-fold stratified cross-validation of a 64/64/256 network built inline:
# no pooling layers, dropout 0.25 / 0.5, 20 epochs, batch size 50.
kf = StratifiedKFold(n_splits=3)
history, confusions, classifReports = [], [], []

fold = 0

for train, test in kf.split(X_dataArr, Y_dataBinary):
    print('Running fold [%d]'.ljust(100,'*') %fold)
    fold += 1

    # Settings shared by both convolution layers.
    conv_settings = dict(filters=64, kernel_size=(2,2), padding='same', activation='relu')

    # A fresh, untrained network per fold so no weights carry over.
    cnn = Sequential()
    cnn.add(Conv2D(input_shape=(INPUT_MATRIX_WIDTH, INPUT_MATRIX_WIDTH, 5), **conv_settings))
    cnn.add(Conv2D(**conv_settings))
    cnn.add(Dropout(0.25))
    cnn.add(Flatten())
    cnn.add(Dense(256, activation='relu'))
    cnn.add(Dropout(0.5))
    cnn.add(Dense(1, activation='sigmoid'))
    cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    x_train, y_train = X_dataArr[train], Y_dataBinary[train]
    x_test, y_test = X_dataArr[test], Y_dataBinary[test]

    history.append(
        cnn.fit(x=x_train, y=y_train, validation_split=0.2, epochs=20, batch_size=50, verbose=0)
    )

    # Round the sigmoid outputs to hard 0/1 predictions before scoring.
    y_pred_R = np.round(cnn.predict(x_test))
    confusions.append(confusion_matrix(y_test, y_pred_R))

    clfr = classification_report(y_test, y_pred_R, output_dict=True)
    print(clfr)
    classifReports.append(clfr)

In [None]:
# Accuracy and loss curves (training and validation) of fold `j` on one axis.
j = 2
for series in ('acc', 'val_acc', 'loss', 'val_loss'):
    plt.plot(history[j].history[series])
plt.legend(['acc','val_acc','loss','val_loss'])

In [None]:
# Per-fold scores of the class reported under key '1', followed by the mean
# of each score across folds (printed in the order f1, recall, precision).
class_one_rows = [fold_report['1'] for fold_report in classifReports]

f1 = [row['f1-score'] for row in class_one_rows]
recal = [row['recall'] for row in class_one_rows]
prec = [row['precision'] for row in class_one_rows]

for fold_scores in (f1, recal, prec):
    print(mean(fold_scores))