In [15]:
import pandas as pd
import os
import random
import numpy as np

from sklearn import preprocessing  # pip install sklearn ... if you don't have it!
from collections import deque

SEQ_LEN = 60  # number of preceding rows collected into each sequence fed to the RNN
FUTURE_PERIOD_PREDICT = 3  # how many rows ahead of the current row the "future" price is taken from
RATIO_TO_PREDICT = "LTC-USD"  # ticker whose close price is used to build the future/target columns


def preprocess_df(df):
    """Turn a merged price/volume frame into shuffled (sequence, target) samples.

    Steps performed:
      1. Drop the helper ``future`` column (it is only needed to derive ``target``).
      2. Replace every non-target column with its percent change, then
         standardize it with ``sklearn.preprocessing.scale`` (zero mean, unit
         variance -- NOT a 0..1 min-max scaling).
      3. Slide a window of ``SEQ_LEN`` consecutive rows over the frame; every
         full window becomes one sample labelled with the last value of the
         window's final row.
      4. Shuffle the samples with ``random.shuffle``.

    Args:
        df: DataFrame with a ``future`` column, feature columns and a
            ``target`` column. The windowing code treats the LAST column of
            each row as the label, so ``target`` must be the final column once
            ``future`` has been dropped.

    Returns:
        A tuple ``(X, y)`` where ``X`` is an ``np.ndarray`` stacking the
        windows (one per sample) and ``y`` is the list of matching labels.
        ``X`` is empty when fewer than ``SEQ_LEN`` rows survive cleaning.
    """
    # Keyword form: the positional ``axis`` argument (``drop("future", 1)``)
    # was removed in pandas 2.0 and raises TypeError there.
    # ``drop`` returns a new frame, so the in-place edits below do not touch
    # the caller's DataFrame.
    df = df.drop(columns="future")

    feature_cols = [col for col in df.columns if col != "target"]  # everything except the label
    for col in feature_cols:
        # Percent change puts tickers with very different price levels on a
        # comparable footing: we care about relative movement, not magnitude.
        df[col] = df[col].pct_change()
        df.dropna(inplace=True)  # pct_change leaves a NaN in the first row
        df[col] = preprocessing.scale(df[col].values)  # standardize: mean 0, variance 1

    df.dropna(inplace=True)  # final safety pass in case scaling introduced NaNs

    sequential_data = []  # list of [window, label] pairs
    # A bounded deque discards the oldest row automatically once it holds
    # SEQ_LEN rows, which gives a sliding window for free.
    prev_days = deque(maxlen=SEQ_LEN)

    for row in df.values:
        prev_days.append(list(row[:-1]))  # features only; the last value is the label
        if len(prev_days) == SEQ_LEN:  # emit samples only once the window is full
            sequential_data.append([np.array(prev_days), row[-1]])

    random.shuffle(sequential_data)  # break up the chronological ordering of samples

    # Split the shuffled pairs into model inputs and labels. The original code
    # stopped after shuffling and implicitly returned None.
    X = [window for window, _ in sequential_data]
    y = [label for _, label in sequential_data]
    return np.array(X), y



def classify(current, future):
    """Label a price move: 1 when ``future`` is strictly above ``current``, else 0.

    Both arguments are converted with ``float()`` before the comparison, so
    numeric strings are accepted. Equal prices -- and any comparison involving
    NaN -- yield 0.
    """
    price_rose = float(future) > float(current)
    return int(price_rose)

# Build one wide frame indexed by time that holds close/volume for every ticker.
main_df = pd.DataFrame()  # starts empty; the first ticker's frame replaces it

ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]  # the 4 ratios we want to consider
for ratio in ratios:
    dataset = f'crypto_data/{ratio}.csv'  # path to this ticker's CSV
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])

    # Prefix close/volume with the ticker so the columns stay distinguishable
    # after joining, index by time (the shared join key), and keep only the
    # two columns we use.
    df = (
        df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"})
        .set_index("time")
        [[f"{ratio}_close", f"{ratio}_volume"]]
    )

    if len(main_df) == 0:  # first ticker: nothing to join onto yet
        main_df = df
    else:  # later tickers: left-join onto the timestamps already collected
        main_df = main_df.join(df)

# Timestamps missing for one ticker show up as NaN after the join; carry the
# last known value forward so a gap does not poison the labels below.
main_df = main_df.ffill()

# "future" is the predicted ticker's close FUTURE_PERIOD_PREDICT rows ahead.
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))

# Drop rows that still contain NaN: leading rows before a ticker's first
# observation, and the final FUTURE_PERIOD_PREDICT rows whose future price is
# unknown. classify() labels any NaN comparison 0, so keeping those rows
# would feed fabricated "price did not rise" targets into training.
main_df = main_df.dropna()

# Chronological split: the most recent ~5% of timestamps become validation.
times = sorted(main_df.index.values)
# max(1, ...) guards small datasets: int(0.05 * n) is 0 for n < 20, and
# times[-0] is times[0], which would move every row into validation.
n_validation = max(1, int(0.05 * len(times)))
last_5pct = times[-n_validation]  # first timestamp belonging to the validation range
print(last_5pct)

validation_main_df = main_df[(main_df.index >= last_5pct)]  # rows inside the last 5%
main_df = main_df[(main_df.index < last_5pct)]  # everything before the last 5%

# train_x, train_y = preprocess_df(main_df)
# validation_x, validation_y = preprocess_df(validation_main_df)

# Debug helper (disabled): list the columns of the merged frame.
# for c in main_df.columns:
#     print (c)

df id              time          low         high         open        close  \
0      1528968660  6489.549805  6489.560059  6489.560059  6489.549805   
1      1528968720  6487.370117  6489.560059  6489.549805  6487.379883   
2      1528968780  6479.410156  6487.370117  6487.370117  6479.410156   
3      1528968840  6479.410156  6479.419922  6479.419922  6479.410156   
4      1528968900  6475.930176  6479.979980  6479.410156  6479.979980   
5      1528968960  6477.959961  6480.000000  6477.959961  6480.000000   
6      1528969020  6477.220215  6480.000000  6479.990234  6477.220215   
7      1528969080  6477.220215  6480.000000  6477.220215  6480.000000   
8      1528969140  6479.990234  6479.990234  6479.990234  6479.990234   
9      1528969200  6477.259766  6479.990234  6479.990234  6478.660156   
10     1528969260  6478.649902  6478.660156  6478.660156  6478.660156   
11     1528969320  6478.660156  6479.339844  6478.660156  6479.339844   
12     1528969380  6479.339844  6479.350098  

KeyError: 'LTC-USD_close'

In [5]:
# IPython shell escape: list the contents of the current working directory.
!ls

[34mcrypto_data[m[m     cryptornn.ipynb rnn.ipynb
