In [None]:
!pip install dateparser

In [None]:
!pip install tqdm
from tqdm import tqdm

In [None]:
!pip install imblearn

In [None]:
import os
import boto3
import pandas as pd
import sys
from datetime import datetime
from datetime import timedelta 
from matplotlib import pyplot
import numpy as np
import matplotlib.pyplot as plt
import io
import time
from sklearn.preprocessing import MinMaxScaler
import gc
import psutil
import seaborn as sns
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [None]:
# Name of the S3 bucket that every later cell reads from.
# Template placeholder: must be replaced with a quoted string before this cell can run.
BUCKET_NAME = <s3_bucket_name>

# Not referenced by any other cell visible in this notebook. 21 equals 2 * 10 + 1 —
# presumably the number of rows in a window built with windowSize=10; TODO confirm.
DATA_POINTS_PER_WINDOW = 21

# Resource-level S3 handle; Bucket objects created from it are used to list keys.
s3Res = boto3.resource('s3')
# NOTE(review): `bucket` is not used by the later visible cells, which go through `my_bucket`.
bucket = s3Res.Bucket(BUCKET_NAME)
# Key prefix shared by the input datasets (template placeholder, same caveat as BUCKET_NAME).
commonPath = <s3_path>
labelledDataCommonPath = '{}/LabelledData15PPnd'.format(commonPath)

# Client-level S3 handle; later cells call get_object() on it to download individual files.
s3Client = boto3.client('s3')

In [None]:
# Bucket handle iterated by the file-listing cell; it points at the same bucket as `bucket`.
my_bucket = s3Res.Bucket(BUCKET_NAME)

In [None]:
# Load the sample windows (presumably SMOTE-generated, judging by the file name) from S3.
fileName = '{}/Price_Volume_15PPND_Labels/SMOTE_Samples_from_ReviewedCases.csv'.format(labelledDataCommonPath)
obj = s3Client.get_object(Bucket = BUCKET_NAME, Key = fileName)
# obj['Body'] is the response stream; pandas reads the CSV straight from it.
smoteSamplesDf = pd.read_csv(obj['Body'])
smoteSamplesDf.head()

In [None]:
# Class balance of the loaded samples: number of rows per distinct Label value.
sns.countplot(x=smoteSamplesDf.Label)

In [None]:
# Feature matrix: every column of the samples except the last one.
smoteX = smoteSamplesDf.iloc[:,:-1]

In [None]:
# Target vector: the last column (presumably `Label` — TODO confirm); the scatter plot
# below splits the points on smoteY == 0 / smoteY == 1.
smoteY=smoteSamplesDf.iloc[:,-1]

In [None]:
# Min-max scale each feature column to [0, 1] before PCA.
# A column whose values are all equal has max - min == 0; dividing by that would fill the
# column with NaN, so zero ranges are replaced by 1 and such a column scales to all zeros.
X_norm = (smoteX - smoteX.min())/(smoteX.max() - smoteX.min()).replace(0, 1)

In [None]:
pca = sklearnPCA(n_components=2) #2-dimensional PCA
# Fit on the normalised features and keep the projection as a DataFrame; its columns
# 0 and 1 are the two components drawn by the scatter plot in the next cell.
transformed = pd.DataFrame(pca.fit_transform(X_norm))

In [None]:
# Scatter the two PCA components, one series per class value of smoteY:
# 0 is drawn as blue crosses ('No-PND'), 1 as red dots ('PND').
classStyles = [
    (0, 'No-PND', 'blue', 'x'),
    (1, 'PND', 'red', '.'),
]
for classValue, legendText, colour, markerShape in classStyles:
    classPoints = transformed[smoteY==classValue]
    plt.scatter(classPoints[0], classPoints[1], label=legendText, c=colour, marker=markerShape)

plt.legend()
plt.show()

In [None]:
# Collect the S3 keys of every CSV stored under the EnrichedData_15P_PND/ prefix.
csvFileList = []

for my_bucket_object in my_bucket.objects.filter(Prefix='{}/EnrichedData_15P_PND/'.format(commonPath)):
    # Match on the extension only: a plain substring test would also accept keys such as
    # 'x.csv.bak', or any object under a folder whose name happens to contain '.csv'.
    if my_bucket_object.key.endswith('.csv'):
        print(my_bucket_object.key)
        csvFileList.append(my_bucket_object.key)

In [None]:
def applyWindowValues(targetDf, targetDfIndex, featureName, smoteDf, smoteIndex, windowSize = 10):
    """Overwrite one feature's window in ``targetDf`` with the values of one ``smoteDf`` row.

    The window spans ``targetDfIndex - windowSize`` minutes to ``targetDfIndex + windowSize``
    minutes (label-based slice, both ends included). The replacement values are read from
    the row ``smoteIndex`` of ``smoteDf``, in the column order
    ``'<feature> t-windowSize' ... '<feature> t-1', '<feature>', '<feature> t+1' ...
    '<feature> t+windowSize'``.

    Scaling: when the first row of the target window is non-zero and the first sample value
    is positive, the sample values are multiplied by (target first value / sample first
    value) so the window starts at the target's existing level; otherwise they are written
    unchanged.

    Parameters
    ----------
    targetDf : DataFrame indexed by timestamps; modified in place.
    targetDfIndex : timestamp at the centre of the window.
    featureName : column of ``targetDf`` to overwrite, and prefix of the ``smoteDf`` columns.
    smoteDf : DataFrame holding the sample rows.
    smoteIndex : index label of the sample row to use.
    windowSize : half-width of the window, in minutes and in sample columns.

    Returns
    -------
    None. If the target window does not contain exactly as many rows as there are sample
    values, a message is printed and ``targetDf`` is left untouched.
    """
    # Sample column names in the order they are laid over the target window.
    leadingNames = ['{} t-{}'.format(featureName, step) for step in range(windowSize, 0, -1)]
    trailingNames = ['{} t+{}'.format(featureName, step) for step in range(1, windowSize + 1)]
    featureNames = leadingNames + [featureName] + trailingNames

    halfWindow = timedelta(minutes=windowSize)
    windowStart = targetDfIndex - halfWindow
    windowEnd = targetDfIndex + halfWindow

    # Selecting with a one-element Index yields a one-row frame; [0] takes that row's values.
    sampleValues = smoteDf.loc[pd.Index([smoteIndex]), featureNames].values[0]

    if len(targetDf.loc[windowStart:windowEnd, featureName]) != len(sampleValues):
        print("{} not changed due to inconistant length at {} returning..".format(featureName, targetDfIndex))
        return

    anchorValue = targetDf.loc[windowStart, featureName]
    if anchorValue == 0 or not (sampleValues[0] > 0):
        # No usable reference level on one of the two sides: write the sample as-is.
        replacement = sampleValues
    else:
        # Rescale so the first written value equals the value already in the target.
        replacement = sampleValues * anchorValue / sampleValues[0]

    targetDf.loc[windowStart:windowEnd, featureName] = replacement

In [None]:
def pickDistributedIndexes(df, segment = 180, windowSize=10, random_state=None):
    """Pick one random timestamp near every ``segment``-th row of ``df``.

    Starting at row position 30 and stepping by ``segment`` rows, one row is drawn at random
    from the label-based slice ``df.index[i] - windowSize`` minutes to
    ``df.index[i] + windowSize`` minutes, and its index label is collected.

    Parameters
    ----------
    df : DataFrame indexed by timestamps.
    segment : number of rows between two consecutive anchor positions.
    windowSize : half-width, in minutes, of the neighbourhood a pick is drawn from.
    random_state : None, int, or numpy RandomState, optional.
        ``None`` (default) keeps the previous behaviour of drawing from pandas' default
        random source. An int seeds a single generator shared by all draws, which makes
        the whole selection reproducible.

    Returns
    -------
    list of index labels, one per anchor position (empty when ``df`` has 30 rows or fewer).
    """
    if isinstance(random_state, (int, np.integer)):
        # Build the generator once: passing the same integer to every sample() call would
        # restart the sequence each time and pick the same relative position in every window.
        random_state = np.random.RandomState(random_state)

    # NOTE(review): the first anchor is row 30, presumably to keep the earliest pick away
    # from the start of the frame — confirm before changing.
    firstAnchor = 30
    halfWindow = timedelta(minutes=windowSize)

    selection = []
    for i in range(firstAnchor, len(df.index), segment):
        anchor = df.index[i]
        neighbourhood = df.loc[(anchor - halfWindow):(anchor + halfWindow)]
        selection.append(neighbourhood.sample(1, random_state=random_state).index[0])

    return selection

In [None]:
def applySmoteSamples(targetDf, smoteDf, windowSize=10):
    """Inject randomly chosen sample windows from ``smoteDf`` into ``targetDf`` in place.

    Target timestamps come from ``pickDistributedIndexes``. As many sample rows as there are
    target timestamps are drawn from ``smoteDf``: ``int(n / 10)`` rows with ``Label == 0``
    and the remainder with ``Label == 1``. For every (target timestamp, sample row) pair
    the sample's ``Label`` is written at the target timestamp and the 'Price', 'Volume',
    'High' and 'Close' windows around it are overwritten through ``applyWindowValues``.

    Parameters
    ----------
    targetDf : DataFrame indexed by timestamps; modified in place.
    smoteDf : DataFrame of sample rows with a ``Label`` column and the window columns
        expected by ``applyWindowValues``.
    windowSize : half-width of the injected window, in minutes.

    Returns
    -------
    None.

    Raises
    ------
    ValueError : from ``DataFrame.sample`` when ``smoteDf`` holds fewer rows of a class than
        requested (sampling is done without replacement).
    """
    # Forward windowSize so picking and injection use the same window; previously the
    # picker always ran with its own default regardless of the argument given here.
    targetSampleIndexes = pickDistributedIndexes(targetDf, windowSize=windowSize)

    print("selected target indexes len={}".format(len(targetSampleIndexes)))

    numberOfSampleToApply = len(targetSampleIndexes)
    numberOfNegatives = int(numberOfSampleToApply/10)

    smoteSampleIndexesNegative = smoteDf[smoteDf.Label==0].sample(numberOfNegatives).index
    smoteSampleIndexesPositive = smoteDf[smoteDf.Label==1].sample(numberOfSampleToApply-numberOfNegatives).index
    # union() merges both label sets into one Index, so the samples are consumed in the
    # order of that merged Index rather than in the order they were drawn.
    smoteSampleIndexes = smoteSampleIndexesPositive.union(smoteSampleIndexesNegative)

    print("Smote negative indexes len={}".format(len(smoteSampleIndexesNegative)))
    print("Smote positive indexes len={}".format(len(smoteSampleIndexesPositive)))

    halfWindow = timedelta(minutes=windowSize)
    expectedRows = 2*windowSize + 1

    for i in range(numberOfSampleToApply):
        targetIndex = targetSampleIndexes[i]
        smoteIndex = smoteSampleIndexes[i]

        # applyWindowValues refuses to write into a window that does not hold exactly
        # 2*windowSize + 1 rows. Check that first, so a row is never labelled from a
        # sample whose values were not actually injected.
        availableRows = len(targetDf.loc[(targetIndex - halfWindow):(targetIndex + halfWindow)])
        if availableRows != expectedRows:
            print("window at {} has {} rows instead of {}; sample skipped".format(targetIndex, availableRows, expectedRows))
            continue

        targetDf.loc[targetIndex, 'Label'] = smoteDf.loc[pd.Index([smoteIndex]), 'Label'].values[0]

        for featureName in ('Price', 'Volume', 'High', 'Close'):
            applyWindowValues(targetDf, targetIndex, featureName, smoteDf, smoteIndex, windowSize)

In [None]:
# Keep an untouched copy so the "before" plots can be compared with the modified frame.
# NOTE(review): sampleDf is not defined in any cell visible in this notebook; if this is the
# whole notebook, this cell fails on a fresh kernel — confirm where sampleDf is loaded.
cpdf = sampleDf.copy()
# Modifies sampleDf in place (Label plus the Price/Volume/High/Close windows).
applySmoteSamples(sampleDf, smoteSamplesDf, 10)
sampleDf.head()

In [None]:
# "Before" view: Volume taken from the untouched copy (cpdf), 30 minutes either side of
# the seventh row of sampleDf whose Label is 1.
featureName='Volume'
timeDeltaFromCenter=30
index=sampleDf[sampleDf.Label==1].index[6]

fig, ax = plt.subplots(figsize=(20,10))
halfSpan = timedelta(minutes=timeDeltaFromCenter)
plottedSeries = cpdf.loc[(index - halfSpan):(index + halfSpan), featureName]
# Draw on the axes created above (the same axes pandas would pick up implicitly).
plottedSeries.plot(kind='line', color='black', legend=True, marker='.', linewidth=0.75, ax=ax)
ax.legend(prop={"size":20}, loc='upper right')
ax.grid(axis='both', which='both', color='g', linestyle='--', linewidth=1)
fig.suptitle('Volume before SMOTE', fontsize=20)

In [None]:
# "After" view: Volume taken from the modified frame (sampleDf), 30 minutes either side
# of the seventh row of sampleDf whose Label is 1.
featureName='Volume'
timeDeltaFromCenter=30
index=sampleDf[sampleDf.Label==1].index[6]

fig, ax = plt.subplots(figsize=(20,10))
halfSpan = timedelta(minutes=timeDeltaFromCenter)
plottedSeries = sampleDf.loc[(index - halfSpan):(index + halfSpan), featureName]
# Draw on the axes created above (the same axes pandas would pick up implicitly).
plottedSeries.plot(kind='line', color='black', legend=True, marker='.', linewidth=0.75, ax=ax)
ax.legend(prop={"size":20}, loc='upper right')
ax.grid(axis='both', which='both', color='g', linestyle='--', linewidth=1)
fig.suptitle('Volume After SMOTE', fontsize=20)

In [None]:
# "Before" view: Price taken from the untouched copy (cpdf) for the sixth row of sampleDf
# whose Label is 1.
# NOTE(review): the slice runs from 20 to 10 minutes BEFORE that row, not around it —
# confirm this is the intended range.
featureName='Price'
timeDeltaFromCenter=10
index=sampleDf[sampleDf.Label==1].index[5]

fig, ax = plt.subplots(figsize=(20,10))
rangeStart = index - timedelta(minutes=2*timeDeltaFromCenter)
rangeEnd = index - timedelta(minutes=timeDeltaFromCenter)
plottedSeries = cpdf.loc[rangeStart:rangeEnd, featureName]
# Draw on the axes created above (the same axes pandas would pick up implicitly).
plottedSeries.plot(kind='line', color='black', legend=True, marker='.', linewidth=0.75, ax=ax)
ax.legend(prop={"size":20}, loc='upper right')
ax.grid(axis='both', which='both', color='g', linestyle='--', linewidth=1)
fig.suptitle('Price before SMOTE', fontsize=20)

In [None]:
# "After" view: Price taken from the modified frame (sampleDf) for the sixth row of
# sampleDf whose Label is 1.
# NOTE(review): the slice runs from 20 to 10 minutes BEFORE that row, not around it —
# confirm this is the intended range.
fig, ax = plt.subplots(figsize=(20,10))
index=sampleDf[sampleDf.Label==1].index[5]
timeDeltaFromCenter=10
featureName='Price'

sampleDf.loc[(index - timedelta(minutes=2*timeDeltaFromCenter)):(index - timedelta(minutes=timeDeltaFromCenter)), featureName].plot(kind='line', color='black', legend=True, marker='.', linewidth=0.75, ax=ax)
ax.legend(prop={"size":20}, loc='upper right')
plt.grid(axis='both', which='both', color='g', linestyle='--', linewidth=1)
# This figure shows the modified frame, so it is the "after" plot; the title used to say
# "before", which made it indistinguishable from the previous figure.
fig.suptitle('Price After SMOTE', fontsize=20)

In [None]:
def insertleftRightWindowFeatues(df, featureName, windowSize=10):
    """Add ``2 * windowSize`` shifted copies of ``featureName`` to ``df`` in place.

    For every ``i`` in ``1..windowSize`` two columns are created:

    * ``'<feature> t-i'`` — ``df[featureName].shift(-i)``: each row receives the value
      found ``i`` rows further down; the last ``i`` rows are forward-filled.
    * ``'<feature> t+i'`` — ``df[featureName].shift(i)``: each row receives the value
      found ``i`` rows further up; the first ``i`` rows are back-filled.

    NOTE(review): with rows in chronological order this is the reverse of the usual
    lag/lead naming ('t-i' holds the later value). It is kept as-is because existing
    sample files may have been built with it — confirm before swapping the directions.

    Parameters
    ----------
    df : DataFrame containing ``featureName``; the new columns are added to it.
    featureName : name of the source column.
    windowSize : number of shifted columns to create in each direction.

    Returns
    -------
    None.
    """
    source = df[featureName]

    # .ffill() / .bfill() replace fillna(method=...), which newer pandas releases deprecate.
    for i in range(1, windowSize+1):
        df["{} t-{}".format(featureName, i)] = source.shift(-i).ffill()

    for i in range(1, windowSize+1):
        df["{} t+{}".format(featureName, i)] = source.shift(i).bfill()

In [None]:
# Destination prefix for the generated CSVs (template placeholder: must be replaced with a
# quoted string). File names are appended to it directly, so any trailing separator has
# to be part of this value.
destS3Url = <destination_s3_url>

In [None]:
# Columns of the enriched input files that are not carried over to the output.
droppedColumns = [
    'Open',
    'Low',
    'Number of Trades',
    'Turnover',
    'isOpenToClosePriceAbove15P',
    'isWindowClosingTurnoverSignificant',
    'isLeftWindowPumping',
    'isRightWindowDumping',
]

# For each listed file: download it, reset the labels, inject sample windows, upload the result.
# NOTE(review): only the entries from position 150 onwards are processed — presumably a
# resumed run; confirm before reusing this cell on the full list.
for file_name in tqdm(csvFileList[150:]):
    # 'path/Enriched_X.csv' -> 'Smoted_X.csv'
    baseName = file_name.split('/')[-1]
    new_fileName = 'Smoted_' + baseName.split('Enriched_')[-1]
    destination = destS3Url + new_fileName
    print(destination)

    obj = s3Client.get_object(Bucket = BUCKET_NAME, Key = file_name )
    # The column named '0' becomes the index and is parsed as dates.
    smotedDf = pd.read_csv(obj['Body'], index_col='0', parse_dates=True)

    smotedDf = smotedDf.drop(columns=droppedColumns)
    # Every row starts as label 0; applySmoteSamples then overwrites the injected rows.
    smotedDf['Label'] = 0

    applySmoteSamples(smotedDf, smoteSamplesDf, 10)

    smotedDf.to_csv(destination, index=True)