# 2.c Data Preprocessing and Feature Engineering (Sliding Window (Close))

In [53]:
import numpy as np
import math
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import boto3 
import io  
from sagemaker import get_execution_role
# Execution role ARN resolved through the SageMaker SDK; it is displayed in a later cell.
role = get_execution_role()
# Library to suppress warnings or deprecation notes.
# NOTE(review): the "ignore" action with no category or module filter silences
# every warning for the rest of the kernel session, deprecation notices included.
import warnings
warnings.filterwarnings("ignore")

In [54]:
# boto3 session built from the default credential chain; the export cells at
# the end of the notebook use it to create the S3 resource.
session = boto3.Session()

In [55]:
# NOTE(review): the rendered output of this cell contains the full role ARN,
# including the AWS account ID; clear the output before sharing the notebook.
role

'arn:aws:iam::190183093544:role/service-role/AmazonSageMaker-ExecutionRole-20220602T141236'

### Getting Data from Yahoo Finance

In [56]:
# Yahoo Finance CSV download URL for BTC-USD, assembled from adjacent string
# literals so each query parameter sits on its own line.
# period1/period2 are presumably Unix epoch seconds (they decode to
# 2014-09-17 and 2022-07-01 UTC) - TODO confirm against the Yahoo endpoint.
btc = (
    'https://query1.finance.yahoo.com/v7/finance/download/BTC-USD'
    '?period1=1410912000'
    '&period2=1656633600'
    '&interval=1d'
    '&events=history'
    '&includeAdjustedClose=true'
)

### Load the Data

In [57]:
# Fetch the CSV straight from the URL and parse it into a DataFrame; this
# cell needs network access every time it runs.
data= pd.read_csv(btc)


### Making new dataframe with required features

In [58]:
# Keep only the closing price. The double brackets keep `data` a one-column
# DataFrame rather than a Series, so later cells can still index data['Close'].
data = data[['Close']]

In [59]:
# Preview of the closing prices; pandas elides the middle rows in the rendered output.
data

Unnamed: 0,Close
0,457.334015
1,424.440002
2,394.795990
3,408.903992
4,398.821014
...,...
2840,20735.478516
2841,20280.634766
2842,20104.023438
2843,19784.726563


In [60]:
# The row count drives the window arithmetic below: 2845 rows - 30 lags = 2815 windows.
data.shape

(2845, 1)

In [61]:
# First five closing prices, to eyeball the start of the series.
data.head()

Unnamed: 0,Close
0,457.334015
1,424.440002
2,394.79599
3,408.903992
4,398.821014


### Creating a function for sliding window

In [62]:
def mv_window(row, col, i_start_a, win_1, dataset, column='Close'):
    """Build a lag matrix by sliding a fixed-length window down one column.

    Row ``i`` of the result holds ``win_1`` consecutive values of
    ``dataset[column]``, starting at position ``i + i_start_a``; that is,
    ``X[i, d]`` is the value at position ``i + i_start_a + d``.

    Values are read by position. For a frame with the default 0..n-1 index
    (as produced by ``pd.read_csv``) this is the same as looking them up by
    label.

    Parameters
    ----------
    row : int
        Number of windows (output rows).
    col : int
        Number of output columns. When ``col > win_1`` the trailing
        ``col - win_1`` columns are left at zero.
    i_start_a : int
        Position of the first value of the first window.
    win_1 : int
        Number of consecutive values copied into each row.
    dataset : pandas.DataFrame
        Frame (or any mapping of array-likes) that contains ``column``.
    column : str, default 'Close'
        Name of the column to slide over.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(row, col)``.

    Raises
    ------
    IndexError
        If ``win_1`` exceeds ``col``, if ``i_start_a`` is negative, or if
        the last window would read past the end of the column.
    """
    values = np.asarray(dataset[column])
    X = np.zeros((row, col))

    # Nothing is copied for an empty request; return the zero matrix as is.
    if row <= 0 or win_1 <= 0:
        return X

    if win_1 > col:
        raise IndexError(
            f"win_1={win_1} does not fit in col={col} output columns"
        )

    # Validate once up front instead of failing part-way through the copy.
    last_position = i_start_a + (row - 1) + (win_1 - 1)
    if i_start_a < 0 or last_position >= len(values):
        raise IndexError(
            f"windows span positions {i_start_a}..{last_position}, "
            f"but '{column}' has only {len(values)} values"
        )

    # Column d is the series shifted by d, so each lag is a single
    # contiguous slice instead of `row` separate lookups.
    for d in range(win_1):
        first = i_start_a + d
        X[:, d] = values[first:first + row]
    return X

In [63]:
#selecting the window and size
# Each sample holds this many consecutive daily closes.
window_size = 30
# One window per day that still has a following close to predict. Deriving
# the row count from the data keeps the windows aligned with the targets if
# the download ever returns a different number of rows (2845 - 30 = 2815 for
# the frame shown above).
sliding=mv_window(len(data) - window_size, window_size, 0, window_size, data)

In [64]:
# Wrap the NumPy lag matrix in a DataFrame; the columns are labelled 0..29
# (lag position) and the rows 0..n-1 (window number).
sliding = pd.DataFrame(data=sliding)

In [65]:
# Sanity check: every row should equal the previous row shifted left by one day.
sliding.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
2810,28814.900391,29445.957031,31726.390625,31792.310547,29799.080078,30467.488281,29704.390625,29832.914063,29906.662109,31370.671875,...,20471.482422,19017.642578,20553.271484,20599.537109,20710.597656,19987.029297,21085.876953,21231.65625,21502.337891,21027.294922
2811,29445.957031,31726.390625,31792.310547,29799.080078,30467.488281,29704.390625,29832.914063,29906.662109,31370.671875,31155.478516,...,19017.642578,20553.271484,20599.537109,20710.597656,19987.029297,21085.876953,21231.65625,21502.337891,21027.294922,20735.478516
2812,31726.390625,31792.310547,29799.080078,30467.488281,29704.390625,29832.914063,29906.662109,31370.671875,31155.478516,30214.355469,...,20553.271484,20599.537109,20710.597656,19987.029297,21085.876953,21231.65625,21502.337891,21027.294922,20735.478516,20280.634766
2813,31792.310547,29799.080078,30467.488281,29704.390625,29832.914063,29906.662109,31370.671875,31155.478516,30214.355469,30111.998047,...,20599.537109,20710.597656,19987.029297,21085.876953,21231.65625,21502.337891,21027.294922,20735.478516,20280.634766,20104.023438
2814,29799.080078,30467.488281,29704.390625,29832.914063,29906.662109,31370.671875,31155.478516,30214.355469,30111.998047,29083.804688,...,20710.597656,19987.029297,21085.876953,21231.65625,21502.337891,21027.294922,20735.478516,20280.634766,20104.023438,19784.726563


In [66]:
# Window i covers the closes at positions i..i+29, so its target is the close
# at position i + 30. The first 30 closes have no preceding window and are
# therefore omitted.
y = data["Close"].iloc[30:]

In [67]:
# Promote the target Series to a one-column DataFrame. Its index labels are
# kept, so they start at 30 rather than 0.
y=pd.DataFrame(y)

### Splitting the last 30 days as the validation set

In [68]:
# Targets of the most recent windows (row position 2785 onward), held out for
# validation. .iloc makes the positional slice explicit, since the labels of
# `y` start at 30.
y_val = y.iloc[2785:]

In [69]:
# Feature windows at the same row positions as y_val (2785 onward).
X_val = sliding.iloc[2785:]

In [80]:
# The last 30 windows are omitted here because they form the validation set.
# Both frames are cut at the same row position so features and targets stay
# paired.
X, y1 = sliding.iloc[:2785], y.iloc[:2785]

### Split Data to prepare train and test set

In [71]:
# Chronological hold-out: the earliest 70% of the windows train the model and
# the most recent 30% test it.
# Consecutive windows share 29 of their 30 values, and the target of window i
# is the last feature of window i + 1. A shuffled split would therefore put
# test targets inside training rows and inflate the test score.
# shuffle=False keeps the rows in time order; random_state is dropped because
# no shuffling takes place. The split sizes are unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X, y1, test_size=0.30, shuffle=False
)

In [72]:
# Confirm the sizes of the 70/30 split; both sets keep the 30 lag columns.
print("Shape of Training set : ", X_train.shape)
print("Shape of test set : ", X_test.shape)

Shape of Training set :  (1949, 30)
Shape of test set :  (836, 30)


### Export Data to S3 for Model Training

In [73]:
#for method 1
# Write every split to a .npy file in the working directory. The file names
# match the paths that the upload cells below send to S3.
splits_by_file = {
    "X_train_close.npy": X_train,
    "y_train_close.npy": y_train,
    "X_test_close.npy": X_test,
    "y_test_close.npy": y_test,
    "X_val_close.npy": X_val,
    "y_val_close.npy": y_val,
}
for npy_name, split in splits_by_file.items():
    np.save(npy_name, split)

In [74]:
# Upload the training features to the project bucket, reusing the local file
# name as the object key.
s3 = session.resource('s3')
local_path = s3_filename = 'X_train_close.npy'
result = s3.Bucket('data-use-case-btc-prediction').upload_file(
    Filename=local_path, Key=s3_filename
)
# The call returned None in the recorded run, so this line printed "None".
print(result)

None


In [75]:
# Upload the training targets to the project bucket, reusing the local file
# name as the object key.
s3 = session.resource('s3')
local_path = s3_filename = 'y_train_close.npy'
result = s3.Bucket('data-use-case-btc-prediction').upload_file(
    Filename=local_path, Key=s3_filename
)
# The call returned None in the recorded run, so this line printed "None".
print(result)

None


In [76]:
# Upload the test features to the project bucket, reusing the local file name
# as the object key.
s3 = session.resource('s3')
local_path = s3_filename = 'X_test_close.npy'
result = s3.Bucket('data-use-case-btc-prediction').upload_file(
    Filename=local_path, Key=s3_filename
)
# The call returned None in the recorded run, so this line printed "None".
print(result)

None


In [77]:
# Upload the test targets to the project bucket, reusing the local file name
# as the object key.
s3 = session.resource('s3')
local_path = s3_filename = 'y_test_close.npy'
result = s3.Bucket('data-use-case-btc-prediction').upload_file(
    Filename=local_path, Key=s3_filename
)
# The call returned None in the recorded run, so this line printed "None".
print(result)

None


In [78]:
# Upload the validation features to the project bucket, reusing the local
# file name as the object key.
s3 = session.resource('s3')
local_path = s3_filename = 'X_val_close.npy'
result = s3.Bucket('data-use-case-btc-prediction').upload_file(
    Filename=local_path, Key=s3_filename
)
# The call returned None in the recorded run, so this line printed "None".
print(result)

None


In [79]:
# Upload the validation targets to the project bucket, reusing the local file
# name as the object key.
s3 = session.resource('s3')
local_path = s3_filename = 'y_val_close.npy'
result = s3.Bucket('data-use-case-btc-prediction').upload_file(
    Filename=local_path, Key=s3_filename
)
# The call returned None in the recorded run, so this line printed "None".
print(result)

None
