In [None]:
!pip install dateparser

In [None]:
!pip install pyts

In [None]:
import os
import boto3
import pandas as pd
import sys
from datetime import datetime
from datetime import timedelta 
import dateparser as dp
from matplotlib import pyplot
import numpy as np
import matplotlib.pyplot as plt
from pyts.image import GramianAngularField
from mpl_toolkits.axes_grid1 import ImageGrid
from pyts.image import MarkovTransitionField
from pyts.image import RecurrencePlot
import io
import matplotlib.image as mpimg
import time
from sklearn.preprocessing import MinMaxScaler
import gc
import psutil
import pickle

from keras import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
import random
from keras.layers import Dense, Dropout, Flatten,Conv2D, MaxPooling2D
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report
from statistics import mean
import seaborn as sns

%matplotlib inline 

In [None]:
# --- Notebook configuration -------------------------------------------------
# Placeholder: name of the S3 bucket that holds the labelled CSV files.
BUCKET_NAME = <s3_bucket_name>
# Number of samples every window must contain; getGAFMatrix raises for any
# window whose length differs from this value.
DATA_POINTS_PER_WINDOW = 21
s3Res = boto3.resource('s3')
# NOTE(review): `bucket` is not referenced by the cells visible here (the
# listing cell builds its own `my_bucket`) - confirm before removing.
bucket = s3Res.Bucket(BUCKET_NAME)
# Placeholder: key prefix under which the labelled CSV files are listed.
labelledDataCommonPath = <path_to_labelleddata_dir_on_s3>
# NOTE(review): not used by any cell visible here - presumably a scratch
# location for rendered images; confirm.
tempDiskSaveLoc = '<path>/tmpImage.png'
# Low-level client used to download each CSV object in the training loop.
s3Client = boto3.client('s3')
# Height/width of the CNN input (see create_gadfcnn5_model). Must stay equal
# to DATA_POINTS_PER_WINDOW, because the training loop allocates its sample
# tensors with that constant.
INPUT_MATRIX_WIDTH = 21
# Channel count of the CNN input. Must stay equal to the number of entries in
# `encodedFeatures` defined in the training-loop cell.
ENCODED_FEATURES=2

In [None]:
def getGAFMatrix(df, feature, index, method='summation', span=10):
    """Encode the window of `feature` centred on `index` as a Gramian Angular Field.

    Parameters
    ----------
    df : DataFrame whose index supports label slicing with
        `index +/- timedelta(minutes=span)`.
    feature : name of the column to encode.
    index : centre of the window (an index label of `df`).
    method : forwarded to `GramianAngularField` (default 'summation').
    span : half-width of the window in minutes; both end points are included
        by the label-based slice.

    Returns
    -------
    The result of `GramianAngularField.fit_transform` on the single window
    (callers read the image via `result[0]`).

    Raises
    ------
    Exception
        If the window does not contain exactly DATA_POINTS_PER_WINDOW rows.
    """
    halfWindow = timedelta(minutes=span)
    window = df.loc[(index - halfWindow) : (index + halfWindow), feature]
    pointCount = len(window)

    # Windows with missing rows (gaps, file edges) cannot produce an image of
    # the size the model expects, so they are reported and rejected.
    if pointCount != DATA_POINTS_PER_WINDOW:
        print("GAF error..{} Length != {}, {} point={}".format(feature, DATA_POINTS_PER_WINDOW, pointCount, index))
        raise Exception('GAF Length != %d, %d' %(DATA_POINTS_PER_WINDOW, pointCount))

    transformer = GramianAngularField(method = method, overlapping = False)
    # fit_transform expects a collection of series, hence the one-element list.
    return transformer.fit_transform([window])

In [None]:
my_bucket = s3Res.Bucket(BUCKET_NAME)

csvFileList = []

# Collect (and echo) the key of every object under the labelled-data prefix
# whose key contains '.csv'.
labelledObjects = my_bucket.objects.filter(Prefix=labelledDataCommonPath)
for s3Object in labelledObjects:
    objectKey = s3Object.key
    if '.csv' not in objectKey:
        continue
    print(objectKey)
    csvFileList.append(objectKey)

In [None]:
def getSurroundingIndexesToPosIndex(posIndexes, df, random_state=None):
    """Pick one random neighbouring timestamp for every positive timestamp.

    For each entry of `posIndexes`, one row is drawn at random from the rows
    of `df` lying within +/- 10 minutes of it. Timestamps that are themselves
    in `posIndexes` are never drawn: the caller labels the returned
    timestamps as negatives, so returning a positive would put the same
    window in the data set with both labels.

    Parameters
    ----------
    posIndexes : iterable of index labels of `df` (the positive cases).
    df : DataFrame whose index supports label slicing with
        `pos +/- timedelta(minutes=10)`.
    random_state : optional int seed for reproducible draws. The default
        (None) keeps the previous behaviour of using NumPy's global RNG.

    Returns
    -------
    pandas.Index holding the sorted, de-duplicated sampled timestamps. It is
    empty when `posIndexes` is empty or no positive has an eligible neighbour.
    """
    halfWindow = timedelta(minutes=10)
    # One shared generator so successive draws differ but remain reproducible.
    rng = np.random.RandomState(random_state) if random_state is not None else None

    sampled = []
    for pos in posIndexes:
        neighbours = df.loc[(pos - halfWindow) : (pos + halfWindow)].index
        candidates = neighbours.difference(posIndexes)
        if len(candidates) == 0:
            # Nothing but positives (or nothing at all) around this point;
            # sampling from an empty set would raise.
            continue
        sampled.append(candidates.to_series().sample(1, random_state=rng).index[0])

    return pd.Index(sampled).unique().sort_values()

# Training main loop

In [None]:
fileIndex = 0

# Columns that are GASF-encoded; channel k of every sample holds the image of
# encodedFeatures[k].
encodedFeatures = ['Price', 'Volume']
encoded_feature_count = len(encodedFeatures)
minVicinity = 20

X_data=[]
Y_data=[]

NUMBER_OF_FILES_USEDTO_TRAIN = 20


def _appendEncodedWindows(df, indexes, label, caseName, coinName):
    """Encode the window around each timestamp and append it to X_data/Y_data.

    Every feature in `encodedFeatures` becomes one channel of a
    (DATA_POINTS_PER_WINDOW, DATA_POINTS_PER_WINDOW, n_features) float32
    tensor. Timestamps whose window cannot be encoded are reported and
    skipped.
    """
    for i in indexes:
        mat = np.zeros((DATA_POINTS_PER_WINDOW, DATA_POINTS_PER_WINDOW, encoded_feature_count), 'float32')
        try:
            for channel, feature in enumerate(encodedFeatures):
                mat[:, :, channel] = getGAFMatrix(df, feature, i, method='summation', span=10)[0]
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C / kernel
            # interrupts still stop the loop.
            print("{} case={} exception occurred for coin when GASF {}".format(caseName, i.strftime('%Y-%m-%d_%H%M%S'), coinName))
            continue

        X_data.append(mat)
        Y_data.append(label)


for file_name in csvFileList[:NUMBER_OF_FILES_USEDTO_TRAIN]:
    print(file_name)

    # The coin name is the second '_'-separated token of the file's base name.
    coin_name = file_name.split('/')[-1].split('_')[1]
    fileIndex +=1
    obj = s3Client.get_object(Bucket = BUCKET_NAME, Key = file_name)
    df = pd.read_csv(obj['Body'], index_col='0', parse_dates=True)

    # Positive class: every row labelled 1.
    anomalyIndexes = df[df.Label==1].index
    _appendEncodedWindows(df, anomalyIndexes, 1, 'Anomaly', coin_name)

    # Negative class, part 1: a seeded random sample of rows labelled 0, sized
    # at half the anomaly count (or half of the label-0 rows if there are too
    # few of them).
    normalRows = df[df.Label==0]
    halfAnomalyCount = len(anomalyIndexes) // 2
    if len(normalRows) > halfAnomalyCount:
        sampleSize = halfAnomalyCount
    else:
        sampleSize = len(normalRows) // 2
    nonAnomalousIndexes = normalRows.sample(sampleSize, random_state=79).index

    # Negative class, part 2: timestamps close to an anomaly (first half of
    # the sorted set returned by the helper).
    surroundingIndexesToPosIndexes = getSurroundingIndexesToPosIndex(anomalyIndexes, df)
    nonAnomalousIndexes = nonAnomalousIndexes.union(surroundingIndexesToPosIndexes[:int(len(surroundingIndexesToPosIndexes)/2)])
    # A neighbour drawn around an anomaly can itself be an anomaly; drop those
    # so no window enters the data set with both labels.
    nonAnomalousIndexes = nonAnomalousIndexes.difference(anomalyIndexes)

    print("number of non anom cases={}".format(len(nonAnomalousIndexes)))
    print("number of anom cases={}".format(len(anomalyIndexes)))

    _appendEncodedWindows(df, nonAnomalousIndexes, 0, 'NonAnomaly', coin_name)

    print('-------------- processed files %d' %fileIndex)
    print(psutil.virtual_memory())

In [None]:
# Turn the accumulated Python lists into arrays: one label per entry of
# Y_data and one encoded window per entry of X_data.
Y_dataArr = np.array(Y_data)
X_dataArr = np.array(X_data)

In [None]:
# Class balance check: number of samples per label value in Y_dataArr.
ax = sns.countplot(x=Y_dataArr, palette="Set3")

In [None]:
def create_gadfcnn5_model():
    """Build and compile the binary-classification CNN used in this notebook.

    Architecture: two 2x2 'same'-padded convolutions with 64 filters each,
    dropout, a 256-unit dense layer, dropout, and a single sigmoid output.
    The input shape is (INPUT_MATRIX_WIDTH, INPUT_MATRIX_WIDTH,
    ENCODED_FEATURES). The model is compiled with binary cross-entropy, the
    Adam optimizer and the accuracy metric.

    Returns
    -------
    A freshly initialised, compiled keras `Sequential` model.
    """
    inputShape = (INPUT_MATRIX_WIDTH, INPUT_MATRIX_WIDTH, ENCODED_FEATURES)
    layerStack = [
        Conv2D(filters=64, kernel_size=(2,2), padding='same', activation='relu', input_shape=inputShape),
        Conv2D(filters=64, kernel_size=(2,2), padding='same', activation='relu'),
        Dropout(0.25),
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]

    model = Sequential(layerStack)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# 10-fold stratified cross-validation. Each fold trains a fresh model and
# stores its training history, confusion matrix and classification report.
kf = StratifiedKFold(n_splits=10)
history = []
confusions= []
classifReports= []

fold = 0

for train, test in kf.split(X_dataArr, Y_dataArr):
    print('Running fold [%d]'.ljust(100,'*') %fold)
    fold +=1

    # New model per fold so no weights carry over between folds.
    cnn=create_gadfcnn5_model()

    x_train, x_test = X_dataArr[train], X_dataArr[test]
    y_train, y_test = Y_dataArr[train], Y_dataArr[test]

    hist = cnn.fit(x=x_train, y=y_train, validation_split=0.2, epochs=20, batch_size=500, verbose=0)
    history.append(hist)

    y_pred = cnn.predict(x_test)

    # Threshold the sigmoid outputs at 0.5. y_pred_R keeps the model's output
    # shape for later inspection cells.
    y_pred_R = np.round(y_pred)
    # sklearn metrics expect a flat vector of class labels; handing over the
    # (n, 1) float matrix mixes int and float labels and triggers conversion
    # warnings.
    y_pred_labels = y_pred_R.ravel().astype(int)

    # Pin the label set so every fold yields a 2x2 matrix (rows = true class,
    # columns = predicted class), which the aggregation cell relies on.
    conf = confusion_matrix(y_test, y_pred_labels, labels=[0, 1])
    confusions.append(conf)

    clfr = classification_report(y_test, y_pred_labels, output_dict=True)
    print(clfr)
    classifReports.append(clfr)

In [None]:
import statistics

# Per-fold macro-averaged scores taken from the classification reports.
macroReports = [rep['macro avg'] for rep in classifReports]
f1s = [macro['f1-score'] for macro in macroReports]
recalls = [macro['recall'] for macro in macroReports]
precisions = [macro['precision'] for macro in macroReports]

# Sample variance across folds, printed in the order f1, recall, precision.
for foldScores in (f1s, recalls, precisions):
    print(statistics.variance(foldScores))

In [None]:
# Sample standard deviation across folds, in the order f1, recall, precision.
for foldScores in (f1s, recalls, precisions):
    print(statistics.stdev(foldScores))

In [None]:
# Positions at which the rounded predictions equal 1. y_pred_R is whatever the
# cross-validation loop assigned last, i.e. the predictions of the final fold.
np.where(y_pred_R==1)

In [None]:
# Training vs. validation accuracy of cross-validation fold j.
j=9
foldHistory = history[j].history
# Keras records the metric under 'acc' in older releases and under 'accuracy'
# in newer ones; use whichever key this run produced instead of failing with
# a KeyError.
accKey = 'acc' if 'acc' in foldHistory else 'accuracy'
plt.plot(foldHistory[accKey])
plt.plot(foldHistory['val_' + accKey])

plt.legend(['acc','val_acc'])

In [None]:
# Training vs. validation loss of cross-validation fold j.
j=9
for curveName in ('loss', 'val_loss'):
    plt.plot(history[j].history[curveName])
plt.legend(['loss','val_loss'])

In [None]:
import seaborn as sns

# Sum the per-fold confusion matrices element-wise into one 2x2 matrix.
finConf=np.zeros((2,2), dtype=int)
for elem in confusions:
    finConf += np.asarray(elem)

# Cell names in sklearn's layout: rows are the true class, columns the
# predicted class.
labels = ['True Neg','False Pos','False Neg','True Pos']
labels = np.asarray(labels).reshape(2,2)

# Share of all evaluated samples falling in each cell.
fractions = finConf/np.sum(finConf)
# Put the cell name above its percentage so the heatmap is self-explanatory
# (previously `labels` was built but never shown).
annotations = np.asarray(
    ['{}\n{:.2%}'.format(name, frac) for name, frac in zip(labels.ravel(), fractions.ravel())]
).reshape(2,2)
# fmt='' because the annotations are already formatted strings.
sns.heatmap(fractions, annot=annotations, fmt='', cmap='Blues')

In [None]:
# Macro-averaged precision / recall / f1 of every fold ...
macroPrec = [report['macro avg']['precision'] for report in classifReports]
macroRecall = [report['macro avg']['recall'] for report in classifReports]
macrof1 = [report['macro avg']['f1-score'] for report in classifReports]

# ... and their mean over the folds, printed in that order.
for foldScores in (macroPrec, macroRecall, macrof1):
    print(np.mean(foldScores))

In [None]:
# Support-weighted precision / recall / f1 of every fold ...
weighPrec = [report['weighted avg']['precision'] for report in classifReports]
weighRecall = [report['weighted avg']['recall'] for report in classifReports]
weighf1 = [report['weighted avg']['f1-score'] for report in classifReports]

# ... and their mean over the folds, printed in that order.
for foldScores in (weighPrec, weighRecall, weighf1):
    print(np.mean(foldScores))

# Train a model using the X and Y created from the first 20 files

In [None]:
# Train one fresh model on the complete data set (10 epochs here, versus 20
# in the cross-validation loop), with 20% of the samples passed to Keras as
# the validation split.
gadfCnn5 = create_gadfcnn5_model()
hist = gadfCnn5.fit(x=X_dataArr, y=Y_dataArr, validation_split=0.2, epochs=10, batch_size=500, verbose=0)

In [None]:
# Training vs. validation accuracy of the final model.
finalHistory = hist.history
# Keras records the metric under 'acc' in older releases and under 'accuracy'
# in newer ones; use whichever key this run produced instead of failing with
# a KeyError.
accKey = 'acc' if 'acc' in finalHistory else 'accuracy'
plt.plot(finalHistory[accKey])
plt.plot(finalHistory['val_' + accKey])
plt.legend(['acc','val_acc'])

In [None]:
# Training vs. validation loss of the final model.
for curveName in ('loss', 'val_loss'):
    plt.plot(hist.history[curveName])
plt.legend(['loss','val_loss'])

In [None]:
# Persist the trained model to disk. '<path>' is a placeholder for the target
# directory and must be replaced before running.
modelSaveLocOnDisk = '<path>/GASF_CNN5.h5'
gadfCnn5.save(modelSaveLocOnDisk)
print("Saved model to disk at {}".format(modelSaveLocOnDisk))

In [None]:
# NOTE(review): `loadtxt` is not used by any cell visible here - confirm
# before removing the import.
from numpy import loadtxt
from keras.models import load_model

# Reload the model that was just saved and print its layer summary.
model = load_model(modelSaveLocOnDisk)
model.summary()

In [None]:
# Shapes of the feature tensor and of the label vector, in that order.
for dataArr in (X_dataArr, Y_dataArr):
    print(dataArr.shape)

In [None]:
# Evaluate the reloaded model and print its second reported metric as a
# percentage.
# NOTE: X_dataArr / Y_dataArr are the same arrays the final model was fitted
# on, so this is a training-set score, not a held-out estimate.
score = model.evaluate(X_dataArr, Y_dataArr, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))