In [4]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization
drive.mount('/content/drive')

import pandas as pd
import os
from sklearn import preprocessing
from collections import deque
import random
import numpy as np

SEQ_LEN = 60 # how many preceding rows make up one input sequence for the RNN
FUTURE_PERIOD_PREDICT = 3 # how many rows into the future we are trying to predict
RATIO_TO_PREDICT = "LTC-USD" # ticker whose "<ticker>_close" column is the one being predicted

def classify(current, future):
  """Label a price move: 1 when `future` is strictly greater than `current`, else 0.

  Both arguments are converted with float() before being compared, so
  numeric strings are accepted. Equal prices, and comparisons involving
  NaN, yield 0.
  """
  went_up = float(future) > float(current)
  return int(went_up)

def preprocess_df(df):
  """Normalize the feature columns of `df` and cut them into SEQ_LEN-row sequences.

  Steps:
    1. Drop the helper column 'future'.
    2. Replace every column except 'target' with its percent change and
       standardize it with sklearn's preprocessing.scale.
    3. Slide a window of SEQ_LEN rows over the frame; each full window is
       paired with the 'target' value of the window's last row.
    4. Shuffle the pairs and sort them into buys (target 1) and sells (target 0).

  Args:
    df: DataFrame with a 'future' column and a 'target' column. 'target'
      must be the last column once 'future' is removed, because each row
      is split positionally into features (all but last) and label (last).

  Returns:
    None. NOTE(review): the buys/sells lists are built but not returned
    or used yet; the function looks unfinished - confirm what the caller
    is supposed to receive.
  """
  # Keyword form: the positional axis argument (df.drop('future', 1)) was
  # removed in pandas 2.0.
  df = df.drop(columns="future") # don't need this anymore

  for col in df.columns: # go through all of the columns
    if col != "target": # normalize all ... except for the target itself
      # pct change puts coins with vastly different price levels on a
      # comparable footing; we care about the movements, not the levels
      df[col] = df[col].pct_change()
      df.dropna(inplace=True) # remove the NaNs created by pct_change
      # standardize to zero mean / unit variance (this is NOT a 0..1 range)
      df[col] = preprocessing.scale(df[col].values)

  df.dropna(inplace=True) # cleanup again, in case any NaNs crept back in

  sequential_data = [] # list of [sequence, target] pairs
  # deque with maxlen keeps only the newest SEQ_LEN rows, discarding the
  # oldest one each time a new row is appended
  prev_days = deque(maxlen=SEQ_LEN)

  for row in df.values: # iterate over the rows
    prev_days.append(list(row[:-1])) # store all but the target
    if len(prev_days) == SEQ_LEN: # only emit once the window is full
      sequential_data.append([np.array(prev_days), row[-1]])

  random.shuffle(sequential_data) # shuffle for good measure

  buys = []
  sells = []

  # The loop variable was misspelled `tartet`, so `target` below was
  # undefined inside the loop.
  for seq, target in sequential_data:
    if target == 0:
      sells.append([seq, target])
    elif target == 1:
      buys.append([seq, target])
      
# take featuresets and combine them into sequences of 60 of these featuresets

main_df = pd.DataFrame() # start empty; each ticker's columns get joined onto this

ratios = ["BTC-USD", "LTC-USD", "ETH-USD", "BCH-USD"] # the 4 ratios we want to consider
for ratio in ratios:
  # full path to this ticker's CSV on the mounted Drive
  dataset = f"/content/drive/My Drive/Colab Notebooks/crypto/crypto_data/{ratio}.csv"

  # names= supplies the column labels for the file
  df = pd.read_csv(dataset, names=["time", "low", "high", "open", "close", "volume"])

  # prefix close and volume with the ticker so we can tell which is which after joining
  df = df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"})

  # index by time so the per-ticker frames can be joined on the shared timestamp
  df = df.set_index("time")

  # keep only price and volume; the other columns are not used
  df = df[[f"{ratio}_close", f"{ratio}_volume"]]

  # the first ticker becomes the main frame; later tickers are joined onto it
  main_df = df if len(main_df) == 0 else main_df.join(df)
  
# 'future' is the RATIO_TO_PREDICT close price FUTURE_PERIOD_PREDICT rows ahead of each row
main_df["future"] = main_df[f"{RATIO_TO_PREDICT}_close"].shift(-FUTURE_PERIOD_PREDICT)

# shift() leaves NaN where no future price exists (at least the final
# FUTURE_PERIOD_PREDICT rows). classify() would turn those NaN comparisons
# into 0, labelling rows with an unknown outcome as "sell", so drop them first.
main_df.dropna(subset=["future"], inplace=True)

# 1 when the future price is above the current price, otherwise 0
main_df["target"] = list(map(classify, main_df[f"{RATIO_TO_PREDICT}_close"], main_df["future"]))

# print(main_df[[f"{RATIO_TO_PREDICT}_close", "future", "target"]].head(10))

# Scale, normalize and put data in sequences

times = sorted(main_df.index.values) # every index value, in ascending order
holdout_count = int(0.05*len(times)) # how many entries make up the last 5%
last_5pct = times[-holdout_count] # threshold: the first time value inside the last 5%
# print(last_5pct)

# rows at or after the threshold are held out for validation ...
in_validation = main_df.index >= last_5pct
# ... and everything strictly before it stays in the training data
in_training = main_df.index < last_5pct

validation_main_df = main_df.loc[in_validation]
main_df = main_df.loc[in_training]

preprocess_df(main_df)
# train_x, train_y = preprocess_df(main_df)
# validation_x, validation_y = preprocess_df(validation_main_df)


Mounted at /content/drive
1534922100
