In [None]:
!pip install dateparser

In [None]:
!pip install tqdm
from tqdm import tqdm

In [None]:
!pip install pyts

In [None]:
import os
import boto3
import pandas as pd
import sys
from datetime import datetime
from datetime import timedelta 
import dateparser as dp
from matplotlib import pyplot
import numpy as np
import matplotlib.pyplot as plt
from pyts.image import GramianAngularField
from mpl_toolkits.axes_grid1 import ImageGrid
from pyts.image import MarkovTransitionField
from pyts.image import RecurrencePlot
import io
import matplotlib.image as mpimg
import time
from sklearn.preprocessing import MinMaxScaler
import gc
import psutil
import pickle
from keras import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
import random
from keras.layers import Dense, Dropout, Flatten,Conv2D, MaxPooling2D
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report
from statistics import mean
import seaborn as sns
from numpy import loadtxt
from keras.models import load_model

%matplotlib inline 

In [None]:
BUCKET_NAME = <s3_bucket_name>
DATA_POINTS_PER_WINDOW = 21
s3Res = boto3.resource('s3')
bucket = s3Res.Bucket(BUCKET_NAME)
commonPath = <s3_path>
labelledDataCommonPath = '{}/SMOTED_DATA'.format(commonPath)
tempDiskSaveLoc= '15PNDImage.png'
s3Client = boto3.client('s3')
INPUT_MATRIX_WIDTH = 21
ENCODED_FEATURES=2

In [None]:
def getGAFMatrix(df, feature, index, method='summation', span=10):
    """Encode the window centred on ``index`` as a Gramian Angular Field.

    The window is the label slice ``[index - span minutes, index + span minutes]``
    of column ``feature`` (``.loc`` label slices include both end points).

    Args:
        df: DataFrame whose index supports timedelta arithmetic
            (presumably a DatetimeIndex -- confirm against the loader).
        feature: Name of the column to encode.
        index: Index label at the centre of the window.
        method: Passed through to ``GramianAngularField`` ('summation' or
            'difference').
        span: Half-width of the window, in minutes.

    Returns:
        The result of ``GramianAngularField.fit_transform`` on the single
        window; callers in this notebook read the image from element ``[0]``.

    Raises:
        Exception: If the window does not contain exactly
            ``DATA_POINTS_PER_WINDOW`` rows.
    """
    half_width = timedelta(minutes=span)
    window = df.loc[index - half_width : index + half_width, feature]

    n_points = len(window)
    if n_points != DATA_POINTS_PER_WINDOW:
        raise Exception('GAF Length != %d, %d' %(DATA_POINTS_PER_WINDOW, n_points))

    encoder = GramianAngularField(method=method, overlapping=False)
    return encoder.fit_transform([window])
    

def getMTFMatrix(df, feature, index, bins=10, span=10):
    """Encode the window ending at ``index`` as a Markov Transition Field.

    The window is the label slice ``[index - span minutes, index]`` of column
    ``feature`` (``.loc`` label slices include both end points).

    NOTE(review): this window is one-sided, whereas ``getGAFMatrix`` uses a
    symmetric window of the same ``span``. Both helpers require
    ``DATA_POINTS_PER_WINDOW`` rows, so for the same data they cannot both be
    satisfied with the same ``span`` -- confirm the intended span/default.

    Args:
        df: DataFrame whose index supports timedelta arithmetic.
        feature: Name of the column to encode.
        index: Index label at the end of the window.
        bins: Number of bins passed to ``MarkovTransitionField`` as ``n_bins``.
        span: Length of the look-back window, in minutes.

    Returns:
        The result of ``MarkovTransitionField.fit_transform`` on the single window.

    Raises:
        Exception: If the window does not contain exactly
            ``DATA_POINTS_PER_WINDOW`` rows.
    """
    window_start = index - timedelta(minutes=span)
    window = df.loc[window_start:index, feature]

    n_points = len(window)
    if n_points != DATA_POINTS_PER_WINDOW:
        raise Exception('MTF Length != %d, %d' %(DATA_POINTS_PER_WINDOW, n_points))

    encoder = MarkovTransitionField(n_bins=bins, strategy='uniform', overlapping=False)
    return encoder.fit_transform([window])


def getRecurrencePlotMatrix(df, feature, index, threshold=None, span=10):
    """Encode the window ending at ``index`` as a recurrence plot.

    The window is the label slice ``[index - span minutes, index]`` of column
    ``feature`` (``.loc`` label slices include both end points).

    NOTE(review): this window is one-sided, whereas ``getGAFMatrix`` uses a
    symmetric window of the same ``span``, yet both require
    ``DATA_POINTS_PER_WINDOW`` rows -- confirm the intended span/default.

    Args:
        df: DataFrame whose index supports timedelta arithmetic.
        feature: Name of the column to encode.
        index: Index label at the end of the window.
        threshold: Passed through to ``RecurrencePlot``.
        span: Length of the look-back window, in minutes.

    Returns:
        The result of ``RecurrencePlot.fit_transform`` on the single window.

    Raises:
        Exception: If the window does not contain exactly
            ``DATA_POINTS_PER_WINDOW`` rows.
    """
    window_start = index - timedelta(minutes=span)
    window = df.loc[window_start:index, feature]

    n_points = len(window)
    if n_points != DATA_POINTS_PER_WINDOW:
        raise Exception('RP Length != %d, %d' %(DATA_POINTS_PER_WINDOW, n_points))

    encoder = RecurrencePlot(threshold=threshold)
    return encoder.fit_transform([window])

In [None]:
my_bucket = s3Res.Bucket(BUCKET_NAME)

# Collect every object key under the labelled-data prefix that contains '.csv'
# (substring match, not a suffix match), in the order S3 lists them.
csvFileList = [
    s3_object.key
    for s3_object in my_bucket.objects.filter(Prefix=labelledDataCommonPath)
    if '.csv' in s3_object.key
]

# Echo the selected keys so the file order used by later slices is visible.
for csv_key in csvFileList:
    print(csv_key)

In [None]:
# Relative path of the saved Keras model file that is loaded below.
modelSaveLocOnDisk = 'GADF_CNN5.h5'

# Load the saved model and print its layer summary; the prediction loop further
# down calls gadfCnn5.predict on (1, 21, 21, 2)-shaped inputs.
gadfCnn5 = load_model(modelSaveLocOnDisk)
gadfCnn5.summary()

# Prediction main loop (runs the loaded model on the files after the training subset)

In [None]:
# Run the loaded model over every timestamp of the selected files and collect
# the predictions next to the ground-truth labels in `predictionDf`.
fileIndex = 0
encodedFeatures = ['Price', 'Volume']
encoded_feature_count = len(encodedFeatures)
minVicinity = 20  # not referenced in this cell; kept in case another cell reads it

# The slice below starts after the first NUMBER_OF_FILES_USEDTO_TRAIN files
# (used for training, per the constant's name) and stops before HOLDOUT_END_INDEX.
NUMBER_OF_FILES_USEDTO_TRAIN = 20
HOLDOUT_END_INDEX = 24
# A coin's mismatch list is printed only when it is longer than this.
MISMATCH_REPORT_THRESHOLD = 50
# Half-width (minutes) of the window handed to getGAFMatrix.
GAF_SPAN_MINUTES = 10

resultColumns = ['CoinName', 'PredictedLabel', 'Label']
# Defined up front so the name exists even if the loop is interrupted.
predictionDf = pd.DataFrame(columns=resultColumns)
# Per-file result frames; concatenated once after the loop instead of growing
# predictionDf on every iteration.
perFilePredictions = []

for file_name in csvFileList[NUMBER_OF_FILES_USEDTO_TRAIN:HOLDOUT_END_INDEX]:
    print(file_name)
    fileStart = time.time()

    # The coin name is the second '_'-separated token of the file's base name.
    coin_name = file_name.split('/')[-1].split('_')[1]
    fileIndex += 1
    obj = s3Client.get_object(Bucket=BUCKET_NAME, Key=file_name)
    df = pd.read_csv(obj['Body'], index_col='0', parse_dates=True)
    mismatches = []

    df['CoinName'] = coin_name
    df['PredictedLabel'] = 0
    # Marks the rows the model actually scored. Rows whose window could not be
    # built must not reach the metrics with the default PredictedLabel of 0.
    wasPredicted = np.zeros(len(df), dtype=bool)

    for rowPos, i in enumerate(tqdm(df.index)):
        try:
            price = getGAFMatrix(df, 'Price', i, method='difference', span=GAF_SPAN_MINUTES)
            vol = getGAFMatrix(df, 'Volume', i, method='difference', span=GAF_SPAN_MINUTES)
        except Exception as exc:
            # getGAFMatrix raises when the window around `i` does not hold
            # DATA_POINTS_PER_WINDOW rows. Catching Exception (not a bare
            # except) lets KeyboardInterrupt stop the run.
            print("An exception occurred for coin when GADF encoded {} at {}".format(coin_name, i.strftime('%Y-%m-%d %H%M%S') ), exc)
            continue

        # One sample, one channel per encoded feature.
        mat = np.zeros((1, DATA_POINTS_PER_WINDOW, DATA_POINTS_PER_WINDOW, encoded_feature_count), 'float32')
        mat[0, :, :, 0] = price[0]
        mat[0, :, :, 1] = vol[0]

        # Round the model output to a hard label.
        y_pred_R = np.round(gadfCnn5.predict(mat))
        predictedLabel = y_pred_R[0][0]

        df.loc[i, 'PredictedLabel'] = predictedLabel
        wasPredicted[rowPos] = True

        if df.loc[i, 'Label'] != predictedLabel:
            mismatches.append( (i.strftime('%Y-%m-%d %H%M%S'), df.loc[i, 'Label'], predictedLabel) )

    if len(mismatches) > MISMATCH_REPORT_THRESHOLD:
        print ('******** Number of mismatches for coin{} is high={} !!!'.format(coin_name, len(mismatches)))
        print(mismatches)

    # Keep only the rows that were really scored by the model.
    perFilePredictions.append(df.loc[wasPredicted, resultColumns])

    print('-------------- processed files %d' %fileIndex)
    print('scored %d of %d rows in %.1fs' % (wasPredicted.sum(), len(df), time.time() - fileStart))
    print(psutil.virtual_memory())

# pd.concat rejects an empty list, so only concatenate when a file was processed.
if perFilePredictions:
    predictionDf = pd.concat(perFilePredictions, axis=0)

In [None]:
# Ground-truth labels and model predictions as row-aligned NumPy arrays,
# in the form the sklearn metric functions below take.
act = predictionDf.loc[:, 'Label'].values
pred = predictionDf.loc[:, 'PredictedLabel'].values

In [None]:
# Write the collected predictions to a CSV in the working directory.
# index=False means the frame's index is not written to the file.
predictionDf.to_csv('gadfcnn5_prediction.csv', index=False)

In [None]:
# Pin the class order to [0, 1] so the matrix is always 2x2 (rows = true label,
# columns = predicted label), even when only one class is present in the
# evaluated rows. The plotting cell below indexes it as a fixed 2x2 array.
conf = confusion_matrix(act, pred, labels=[0, 1])
print(conf)

# Per-class precision / recall / F1 as a nested dict.
clfr = classification_report(act, pred, output_dict=True)
print(clfr)

In [None]:
# Take the binary 2x2 part of the confusion matrix (rows = true label,
# columns = predicted label, as returned by sklearn's confusion_matrix).
finConf = np.asarray(conf)[:2, :2].astype(int)
if finConf.shape != (2, 2):
    # Fail with a clear message rather than an IndexError further down.
    raise ValueError('Expected at least a 2x2 confusion matrix, got shape {}'.format(np.shape(conf)))

labels = ['True Neg','False Pos','False Neg','True Pos']
labels = np.asarray(labels).reshape(2,2)

# Share of all evaluated samples per cell; guard against an empty evaluation set.
totalSamples = finConf.sum()
fractions = finConf / totalSamples if totalSamples else np.zeros((2, 2))

# Put the cell name and its percentage into each annotation, so the `labels`
# defined above actually appear on the plot.
annotations = np.asarray(
    ['{}\n{:.2%}'.format(name, share) for name, share in zip(labels.ravel(), fractions.ravel())]
).reshape(2, 2)

# fmt='' because the annotations are already formatted strings.
ax = sns.heatmap(fractions, annot=annotations, fmt='', cmap='Blues')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion matrix (share of evaluated samples)');