In [1]:
import pandas as pd
import json

df = pd.read_csv('./data-source/crypto_data/LTC-USD.csv', 
                  names=['time', 'low', 'high', 'open', 'close', 'volume'])
print(df.head())

         time        low       high       open      close      volume
0  1528968660  96.580002  96.589996  96.589996  96.580002    9.647200
1  1528968720  96.449997  96.669998  96.589996  96.660004  314.387024
2  1528968780  96.470001  96.570000  96.570000  96.570000   77.129799
3  1528968840  96.449997  96.570000  96.570000  96.500000    7.216067
4  1528968900  96.279999  96.540001  96.500000  96.389999  524.539978


In [2]:
# create a dataframe with close and volume data of 4 cryptos combined from 4 different csv files
main_df = pd.DataFrame()

cryptos = ['BTC-USD', 'LTC-USD', 'BCH-USD', 'ETH-USD']
for crypto in cryptos:
    print(crypto)
    dataset = f'./data-source/crypto_data/{crypto}.csv'
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])
    df.rename(columns={'close': f'{crypto}_close', 'volume': f'{crypto}_volume'}, inplace=True)
    df.set_index('time', inplace=True)
    df = df[[f'{crypto}_close', f'{crypto}_volume']]
    
    if len(main_df) == 0:
        main_df = df 
    else:
        main_df = main_df.join(df)
        
main_df.fillna(method='ffill', inplace=True)
main_df.dropna(inplace=True)
print(main_df.head())

BTC-USD
LTC-USD
BCH-USD
ETH-USD
            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  
time                                                                      
1528968720     870.859985       26.856577      486.01001       26.019083  
1528968780     870.099976        1.124300      486.00000        8.449400  
1528968840     870.789978        1.749862      485.75000       26.994646  
1528968900     870.000000        1.680500      486.00000    

In [3]:
# Create a Target, choose how Far Out to Predict
# We can make prediction A Regression Question using a Linear Activation with the output layer
# Instead we go with Binary Classification Here:

SEQ_LEN = 60 # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3 # how far into future to predict
RATIO_TO_PREDICT = "LTC-USD"

# Simple Classification Function
def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0
    
# To do this we need a future column
# shift will just shift the columns for us
# A Negative 
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)

# Now we use the future values to make a target
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))

print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  \
time                                                                       
1528968720     870.859985       26.856577      486.01001       26.019083   
1528968780     870.099976        1.124300      486.00000        8.449400   
1528968840     870.789978        1.749862      485.75000       26.994646   
1528968900     870.000000        1.680500      486.00000       77.355759   
1528968960 

Since our sample data here is 1min sequential, we have to separate out the validation data.
what we did above was shuffle data then slice it which results in overfitting pour over into the validation set.
So we'll take the last 5% of the data, then balance and normalize it....

In [4]:
times = sorted(main_df.index.values) #get the times
last_5pct = sorted(main_df.index.values)[-int(0.05*len(times))] #get the last 5% of the times

#make validation data where the index is in the last 5%
validation_main_df = main_df[(main_df.index >= last_5pct)]

#now the main_df contains all the data up to the last 5%
main_df = main_df[(main_df.index < last_5pct)]

In [7]:
#balance and normalize data

#train_x, train_y = preprocess_df(main_df) validation_x, validation_y = preprocess_df(validation_main_df)

from sklearn import preprocessing
from collections import deque

def preprocess_df(df):
    df = df.drop("future", 1) #don't need the fututre anymore
    
    for col in df.columns:
        if col != "target":
            df[col] = df[col].pct_change()
            df.dropna(inplace=True) #remove nas created bt percentage change
            df[col] = preprocessing.scale(df[col].values) #scale between 0 and 1
            
    df.dropna(inplace=True) #cleanup NaNs again

In [8]:
#next we create our actual sequences

sequential_data = []
prev_days = deque(maxlen=SEQ_LEN)

for i in df.values:
    prev_days.append([n for n in i[:-1]]) #store all but the target
    if len(prev_days) == SEQ_LEN:
        sequential_data.append([np.array(prev_days), i[-1]])
        
random.shuffle(sequential_data) #shuffle for good measure

NameError: name 'np' is not defined