In [0]:
import pandas as pd
import os
import numpy as np

In [59]:
SEQ_LEN = 60
FUTURE_PERIOD_PREDICT = 3
RATIO_tO_PrEDICT = 'LTC-USD'

from sklearn import preprocessing  # pip install sklearn ... if you don't have it!
from collection import deque
import random

def preprocess_df(df):
    df = df.drop("future", 1)  # don't need this anymore.

    for col in df.columns:  # go through all of the columns
        if col != "target":  # normalize all ... except for the target itself!
            df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
            df.dropna(inplace=True)  # remove the nas created by pct_change
            df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.

    df.dropna(inplace=True)  # cleanup again... jic. Those nasty NaNs love to creep in.
    
    sequential_data = []
  
    prev_days = deque(maxlen=SEQ_LEN)
   
    for i in df.values:
        prev_days.append([n for n in i[:-1]])
    
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), i[-1]])
      
    random.shuffle(sequential_data)

def classify(current, future):
  if float(future) > float(current):
    return 1
  else:
    return 0

ModuleNotFoundError: ignored

In [6]:
from google.colab import files

uploaded = files.upload()

Saving LTC-USD.csv to LTC-USD.csv


In [11]:
from google.colab import files

uploaded = files.upload()

Saving BCH-USD.csv to BCH-USD.csv
Saving BTC-USD.csv to BTC-USD.csv
Saving ETH-USD.csv to ETH-USD.csv


In [32]:
main_df = pd.DataFrame() # begin empty

ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]  # the 4 ratios we want to consider
for ratio in ratios:  # begin iteration
    print(ratio)
    dataset = f'{ratio}.csv'  # get the full path to the file.
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])  # read in specific file

    # rename volume and close to include the ticker so we can still which close/volume is which:
    df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)

    df.set_index("time", inplace=True)  # set time as index so we can join them on this shared time
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]  # ignore the other columns besides price and volume

    if len(main_df)==0:  # if the dataframe is empty
        main_df = df  # then it's just the current df
    else:  # otherwise, join this data to the main one
        main_df = main_df.join(df)

        
main_df['future'] = main_df[f"{RATIO_tO_PrEDICT}_close"].shift(-FUTURE_PERIOD_PREDICT)
main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)

main_df['target'] = list(map(classify, main_df[f'{RATIO_tO_PrEDICT}_close'], main_df['future']))
print(main_df.head())                                               
                                               

BTC-USD
LTC-USD
BCH-USD
ETH-USD
            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  \
time                                                                       
1528968720     870.859985       26.856577      486.01001       26.019083   
1528968780     870.099976        1.124300      486.00000        8.449400   
1528968840     870.789978        1.749862      485.75000       26.994646   
1528968900     870.000000        1.680500      486.0000

In [36]:
df.info(), main_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102831 entries, 1528968720 to 1535215260
Data columns (total 2 columns):
ETH-USD_close     102831 non-null float64
ETH-USD_volume    102831 non-null float64
dtypes: float64(2)
memory usage: 7.4 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 97723 entries, 1528968720 to 1535215200
Data columns (total 10 columns):
BTC-USD_close     97723 non-null float64
BTC-USD_volume    97723 non-null float64
LTC-USD_close     97723 non-null float64
LTC-USD_volume    97723 non-null float64
BCH-USD_close     97723 non-null float64
BCH-USD_volume    97723 non-null float64
ETH-USD_close     97723 non-null float64
ETH-USD_volume    97723 non-null float64
future            97723 non-null float64
target            97723 non-null int64
dtypes: float64(9), int64(1)
memory usage: 8.2 MB


(None, None)

In [37]:
df.head(), main_df.head()

(            ETH-USD_close  ETH-USD_volume
 time                                     
 1528968720      486.01001       26.019083
 1528968780      486.00000        8.449400
 1528968840      485.75000       26.994646
 1528968900      486.00000       77.355759
 1528968960      486.00000        7.503300,
             BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
 time                                                                       
 1528968720    6487.379883        7.706374      96.660004      314.387024   
 1528968780    6479.410156        3.088252      96.570000       77.129799   
 1528968840    6479.410156        1.404100      96.500000        7.216067   
 1528968900    6479.979980        0.753000      96.389999      524.539978   
 1528968960    6480.000000        1.490900      96.519997       16.991997   
 
             BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  \
 time                                                                       
 15

In [38]:
   print(main_df[[f'{RATIO_tO_PrEDICT}_close', 'future', 'target']].head(10))  # how did we do??


            LTC-USD_close     future  target
time                                        
1528968720      96.660004  96.389999       0
1528968780      96.570000  96.519997       0
1528968840      96.500000  96.440002       0
1528968900      96.389999  96.470001       1
1528968960      96.519997  96.400002       0
1528969020      96.440002  96.400002       0
1528969080      96.470001  96.400002       0
1528969140      96.400002  96.400002       0
1528969200      96.400002  96.400002       0
1528969260      96.400002  96.449997       1


Normalizing and creating sequences for our cryptocurrency predicting RNN

In [0]:
times = sorted(main_df.index.values)  # get the times
last_5pct = sorted(main_df.index.values)[-int(0.05*len(times))]  # get the last 5% of the times

validation_main_df = main_df[(main_df.index >= last_5pct)]  # make the validation data where the index is in the last 5%
main_df = main_df[(main_df.index < last_5pct)]  # now the main_df is all the data up to the last 5%

Now balancing and normalizing the data


now we need to create our actual ***sequences***