In [None]:
!pip install dateparser

In [None]:
!pip install imblearn

In [None]:
import os
import boto3
import pandas as pd
import sys
from datetime import datetime
from datetime import timedelta 
from matplotlib import pyplot
import numpy as np
import matplotlib.pyplot as plt
import io
import time
from sklearn.preprocessing import MinMaxScaler
import gc
import psutil

In [None]:
# Shared configuration and S3 handles for the cells below.
# Template placeholder: replace <s3_bucket_name> with the real bucket name before running.
BUCKET_NAME = <s3_bucket_name>
# NOTE(review): presumably 10 left + 1 centre + 10 right samples per window;
# this constant is not referenced by any cell visible here - confirm it is still needed.
DATA_POINTS_PER_WINDOW = 21
s3Res = boto3.resource('s3')
# NOTE(review): `bucket` is not referenced in the cells visible here (`my_bucket` is used instead).
bucket = s3Res.Bucket(BUCKET_NAME)
# Template placeholder: replace with the S3 key prefix of the labelled-data directory.
labelledDataCommonPath = <path_to_labelleddata_dir_on_s3>
s3Client = boto3.client('s3')

In [None]:
# Bucket handle used to enumerate object keys in the listing cell.
my_bucket = s3Res.Bucket(BUCKET_NAME)

In [None]:
# Download the reviewed-anomalies CSV from S3 and preview its first rows.
fileName = '<path>/DetectedAnomalies_reviewed.csv'
obj = s3Client.get_object(Key=fileName, Bucket=BUCKET_NAME)
reviewedLabelsDf = pd.read_csv(filepath_or_buffer=obj['Body'])
reviewedLabelsDf.head()

In [None]:
# File names of every review row with Agree == 1 (one entry per row, duplicates kept).
list(reviewedLabelsDf.loc[reviewedLabelsDf.Agree == 1, 'FileName'])

In [None]:
# Collect (and print) the key of every object under the enriched-data prefix
# whose key contains '.csv'.
# The prefix root is labelledDataCommonPath, the only path constant this notebook
# defines; `commonPath` is never assigned, so referencing it raises NameError on a
# fresh kernel.
# NOTE(review): confirm labelledDataCommonPath is the intended root for this prefix.
csvFileList = []

enrichedPrefix = '{}/EnrichedData_15P_PND/'.format(labelledDataCommonPath)
for my_bucket_object in my_bucket.objects.filter(Prefix=enrichedPrefix):
    if '.csv' in my_bucket_object.key:
        print(my_bucket_object.key)
        csvFileList.append(my_bucket_object.key)

In [None]:
def insertleftRightWindowFeatues(df, featureName, windowSize=10):
    """Add shifted copies of one column to ``df`` in place.

    For every ``i`` in ``1..windowSize`` two columns are created:

    * ``"<featureName> t-<i>"`` holds ``df[featureName].shift(-i)`` (the value
      found ``i`` rows further down); the trailing gaps left by the shift are
      forward-filled.
    * ``"<featureName> t+<i>"`` holds ``df[featureName].shift(i)`` (the value
      found ``i`` rows further up); the leading gaps are back-filled.

    NOTE(review): whether "i rows further down" really is time ``t-i`` depends
    on the row ordering of ``df`` (newest-first vs oldest-first) - confirm
    against the input files.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    featureName : str
        Name of the existing column to shift.
    windowSize : int, default 10
        Number of shifted columns to create on each side.

    Returns
    -------
    None
    """
    source = df[featureName]

    # Series.ffill()/bfill() replace the deprecated fillna(method=...) spelling
    # and produce the same values.
    for i in range(1, windowSize+1):
        df["{} t-{}".format(featureName, i)] = source.shift(-i).ffill()

    for i in range(1, windowSize+1):
        df["{} t+{}".format(featureName, i)] = source.shift(i).bfill()

In [None]:
# Assemble the labelled sample set from every file named in the review table.
# For each file: download it, add the 10-step window columns for four features,
# set Label = 1 on the timestamps listed with Agree == 1, then keep
#   * every Label == 1 row,
#   * a 0.01 % random sample of the Label == 0 rows (fixed seed 33), and
#   * the rows at the timestamps listed with Agree == 0.
# The pieces are gathered in a list and concatenated once after the loop; calling
# pd.concat on the growing frame in every iteration re-copies all accumulated rows.
windowFeatureNames = ['Price', 'Volume', 'Close', 'High']
unusedColumns = ['Number of Trades', 'isOpenToClosePriceAbove15P', 'isWindowClosingTurnoverSignificant',
                 'isLeftWindowPumping', 'isRightWindowDumping', 'Turnover', 'Open', 'Low']
sampledFrames = []

for file_name in reviewedLabelsDf.FileName.unique():
    # The coin name is the second '_'-separated token of the file's base name.
    coin_name = file_name.split('/')[-1].split('_')[1]

    print('Reviewed coin name={}'.format(coin_name))

    obj = s3Client.get_object(Bucket = BUCKET_NAME, Key = file_name)
    df = pd.read_csv(obj['Body'], index_col='0', parse_dates=True)
    df['Coin'] = coin_name

    for featureName in windowFeatureNames:
        insertleftRightWindowFeatues(df, featureName, 10)

    df['Label'] = 0

    reviewsForFile = reviewedLabelsDf[reviewedLabelsDf.FileName == file_name]
    positiveIndexes = reviewsForFile[reviewsForFile.Agree == 1].TimestampIndexes
    print(positiveIndexes)

    # NOTE(review): assigning through .loc with a label that is missing from the
    # index appends a new row instead of failing - confirm every reviewed
    # timestamp exists in the file.
    for posIndex in positiveIndexes:
        df.loc[posIndex, 'Label'] = 1

    df = df.drop(columns=unusedColumns)

    positiveCasesDf = df[df.Label == 1]
    negativeCasesDf = df[df.Label == 0].sample(frac=0.0001, random_state=33)

    negativeIndexes = reviewsForFile[reviewsForFile.Agree == 0].TimestampIndexes
    negativeCasesTricky = df.loc[negativeIndexes]

    sampledFrames.extend([positiveCasesDf, negativeCasesDf, negativeCasesTricky])

    # Release the full per-file frame before the next download.
    del df

# pd.concat raises on an empty list, so fall back to an empty frame.
bigDataframe = pd.concat(sampledFrames, axis=0) if sampledFrames else pd.DataFrame()

In [None]:
import seaborn as sns
from matplotlib.legend_handler import HandlerBase
from matplotlib.text import Text

# Bar chart of how many rows carry each Label value, on a log-scaled count axis.
ax = sns.countplot(x=bigDataframe['Label'])
ax.set_yscale("log")

class TextHandler(HandlerBase):
    """Legend handler that draws a ``(text, colour)`` tuple as bold, centred text."""

    def create_artists(self, legend, tup, xdescent, ydescent,
                       width, height, fontsize, trans):
        """Return a single Text artist placed at the centre of the legend box.

        ``tup[0]`` is the string to draw and ``tup[1]`` the colour to draw it in.
        """
        caption = tup[0]
        colour = tup[1]
        centred = Text(
            width/2., height/2, caption,
            fontsize=fontsize,
            ha="center",
            va="center",
            color=colour,
            fontweight="bold",
        )
        return [centred]
    
# Build a legend whose handles are the tick texts, drawn in the matching bar
# colour, and whose entries are the readable class names.
handltext = ["0", "1"]
labels = ["Non-PND", "PND"]

t = ax.get_xticklabels()
labeldic = {tickText: className for tickText, className in zip(handltext, labels)}

# Legend entry for each tick actually present on the axis.
labels = [labeldic[tick.get_text()] for tick in t]
# One (tick text, bar face colour) tuple per tick/bar pair.
handles = [(tick.get_text(), bar.get_fc()) for tick, bar in zip(t, ax.patches)]

ax.legend(handles, labels, handler_map={tuple: TextHandler()})

In [None]:
from collections import Counter
from matplotlib import pyplot
from numpy import where

# Tally how many rows carry each Label value.
counter = Counter(bigDataframe['Label'])

In [None]:
# Target vector: the Label column.
y = bigDataframe['Label']

In [None]:
# Feature matrix: for each of Price, Volume, Close and High, the columns
# "<f> t-10" ... "<f> t-1", "<f>", "<f> t+1" ... "<f> t+10", in that order.
X = bigDataframe.loc[:, [
    column
    for feature in ('Price', 'Volume', 'Close', 'High')
    for column in (
        ['{} t-{}'.format(feature, step) for step in range(10, 0, -1)]
        + [feature]
        + ['{} t+{}'.format(feature, step) for step in range(1, 11)]
    )
]]

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Rebalance the classes in two steps: SMOTE over-samples the minority class to a
# 0.9 minority/majority ratio, then random under-sampling trims the majority
# class to a 1:1 ratio.
# Both samplers are seeded so the resampled set is identical on every run;
# leaving RandomUnderSampler unseeded makes it drop different rows each time.
over = SMOTE(sampling_strategy=0.9, random_state=17)
under = RandomUnderSampler(sampling_strategy=1, random_state=17)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

X_smote, y_smote = pipeline.fit_resample(X, y)

# Class counts after resampling.
counter = Counter(y_smote)

In [None]:
# Put the resampled features and their labels side by side in one frame.
finalDf = pd.concat(objs=[X_smote, y_smote], axis='columns')

In [None]:
# Template placeholder: replace with the destination S3 URL prefix.
# The output file name is appended to this value with plain string
# concatenation, so include any trailing '/'.
destS3Url = <dest_s3_url>

In [None]:
# Write the resampled data set to the destination as CSV, without the index column.
finalDf.to_csv(
    path_or_buf=destS3Url + "SMOTE_Samples_from_ReviewedCases.csv",
    index=False,
)