In [1]:
import os
import pandas as pd 
from collections import deque
import random 

In [2]:
# Empty frame that the loading loop joins each pair's close/volume columns into.
merged_df = pd.DataFrame()

In [3]:
# Trading pairs to load; each name is used to build the path data/<pair>.csv
# and to prefix that pair's close/volume column names.
crytos = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]

## DATASET PREPROCESSING

In [4]:
# Start from an empty accumulator inside this cell so that re-running the cell
# does not try to join columns that a previous run already added.
merged_df = pd.DataFrame()

for crypto in crytos:
    print(f"Processing {crypto}")
    dataset_path = f"data/{crypto}.csv"

    # Column names are supplied explicitly via names=, so the first line of
    # each file is read as data rather than as a header.
    df = pd.read_csv(dataset_path, names=['time', 'low', 'high', 'open', 'close', 'volume'])
    print(df.head())

    # Keep only close and volume, prefixed with the pair name so the columns
    # stay unique after joining, and index the rows by timestamp.
    close_col = f"{crypto}_close"
    volume_col = f"{crypto}_volume"
    df = (
        df.rename(columns={"close": close_col, "volume": volume_col})
        .set_index("time")
        .loc[:, [close_col, volume_col]]
    )

    # The first pair defines the index; later pairs are left-joined onto it.
    if merged_df.empty:
        merged_df = df
    else:
        merged_df = merged_df.join(df)

# Timestamps missing for a later pair show up as NaN after the left join:
# carry the last known value forward, then drop rows that still have gaps
# (i.e. leading rows with nothing to carry forward).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
merged_df = merged_df.ffill().dropna()



Processing BTC-USD
         time          low         high         open        close    volume
0  1528968660  6489.549805  6489.560059  6489.560059  6489.549805  0.587100
1  1528968720  6487.370117  6489.560059  6489.549805  6487.379883  7.706374
2  1528968780  6479.410156  6487.370117  6487.370117  6479.410156  3.088252
3  1528968840  6479.410156  6479.419922  6479.419922  6479.410156  1.404100
4  1528968900  6475.930176  6479.979980  6479.410156  6479.979980  0.753000
Processing LTC-USD
         time        low       high       open      close      volume
0  1528968660  96.580002  96.589996  96.589996  96.580002    9.647200
1  1528968720  96.449997  96.669998  96.589996  96.660004  314.387024
2  1528968780  96.470001  96.570000  96.570000  96.570000   77.129799
3  1528968840  96.449997  96.570000  96.570000  96.500000    7.216067
4  1528968900  96.279999  96.540001  96.500000  96.389999  524.539978
Processing BCH-USD
         time         low        high        open       close     v

  merged_df.fillna(method='ffill', inplace=True)


In [5]:
# Inspect the first rows of the joined frame (one close/volume column pair per trading pair).
merged_df.head()

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759
1528968960,6480.0,1.4909,96.519997,16.991997,869.98999,1.669014,486.0,7.5033


In [6]:
# List every column label of the merged frame, one per line.
for column_name in merged_df.columns.tolist():
    print(column_name)

BTC-USD_close
BTC-USD_volume
LTC-USD_close
LTC-USD_volume
BCH-USD_close
BCH-USD_volume
ETH-USD_close
ETH-USD_volume


In [7]:
# Number of consecutive rows in each input sequence (used as the deque maxlen
# when sequences are built in preprocessing_df).
SEQ_LEN = 60
# How many rows ahead the "future" price is taken from (the shift distance).
# NOTE(review): rows appear to be 1-minute candles (timestamps 60 s apart in
# the displayed head) — confirm against the data source.
FUTURE_PERIOD_PREDICT = 3
# Trading pair whose close price is used to derive the "future" and "target" columns.
CRYPTO_TO_PREDICT = "LTC-USD" 

In [8]:
def classify(current, future):
    """Label one row: 1 if the future price is strictly higher, otherwise 0.

    Both arguments are converted with ``float`` before comparing, so numeric
    strings are accepted. An equal price, a lower price, or a NaN future
    (NaN comparisons are always False) all yield 0.
    """
    price_rises = float(future) > float(current)
    return 1 if price_rises else 0

In [9]:
# shift(-N) moves the close from N rows later onto the current row, so "future"
# holds the close FUTURE_PERIOD_PREDICT rows ahead. The last N rows have no
# later value and become NaN.
merged_df['future'] = merged_df[f"{CRYPTO_TO_PREDICT}_close"].shift(-FUTURE_PERIOD_PREDICT)

In [10]:
# Row-wise label: 1 when the future close is above the current close, else 0.
# Rows whose "future" is NaN (the tail produced by the shift) are labelled 0,
# because a comparison against NaN is False inside classify.
merged_df['target'] = [
    classify(current_close, future_close)
    for current_close, future_close in zip(merged_df[f"{CRYPTO_TO_PREDICT}_close"],
                                           merged_df["future"])
]

In [11]:
# Check the new "future" and "target" columns against the close prices.
merged_df.head()


Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083,96.389999,0
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494,96.519997,0
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646,96.440002,0
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759,96.470001,1
1528968960,6480.0,1.4909,96.519997,16.991997,869.98999,1.669014,486.0,7.5033,96.400002,0


## NORMALIZING AND CREATING SEQUENCES


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [12]:
# Chronological hold-out: the most recent ~5% of timestamps become the
# validation set and everything before them stays as training data.
times = sorted(merged_df.index.values)

# int(0.05 * n) is 0 for fewer than 20 rows, and times[-0] is times[0], which
# would put every row in validation and leave training empty. Hold out at
# least one row instead.
holdout_rows = max(1, int(0.05 * len(times)))
last_5pct = times[-holdout_rows]

# NOTE: merged_df is overwritten with the training part, so re-running this
# cell on its own splits the already-reduced frame again. Re-run from the
# loading cell to get the original split back.
is_validation = merged_df.index >= last_5pct
validation_df = merged_df[is_validation]
merged_df = merged_df[merged_df.index < last_5pct]

In [13]:
# After the split, merged_df holds only the training portion (rows before last_5pct).
merged_df

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.010010,26.019083,96.389999,0
1528968780,6479.410156,3.088252,96.570000,77.129799,870.099976,1.124300,486.000000,8.449400,96.519997,0
1528968840,6479.410156,1.404100,96.500000,7.216067,870.789978,1.749862,485.750000,26.994646,96.440002,0
1528968900,6479.979980,0.753000,96.389999,524.539978,870.000000,1.680500,486.000000,77.355759,96.470001,1
1528968960,6480.000000,1.490900,96.519997,16.991997,869.989990,1.669014,486.000000,7.503300,96.400002,0
...,...,...,...,...,...,...,...,...,...,...
1534921800,6686.250000,0.478039,57.509998,18.782650,551.299988,0.336000,285.230011,38.141129,57.509998,0
1534921860,6686.250000,0.440793,57.500000,8.449425,551.299988,0.010847,285.489990,17.549879,57.509998,1
1534921920,6686.250000,2.678847,57.509998,6.070000,551.299988,5.713912,285.739990,6.953944,57.509998,0
1534921980,6686.250000,0.220156,57.509998,15.697691,551.299988,5.713912,286.000000,24.460905,57.509998,0


In [27]:
from sklearn import preprocessing
import numpy as np 

def preprocessing_df(df):
    """Convert a labelled price/volume frame into balanced, shuffled sequences.

    Steps:
      1. Drop the helper column ``future``.
      2. For every column except ``target``: replace values with their
         row-to-row percentage change, drop rows with missing values, then
         standardise the column with ``preprocessing.scale``.
      3. Slide a window of ``SEQ_LEN`` rows over the frame. Each full window
         becomes one sample, labelled with the ``target`` of its last row.
      4. Balance the classes by truncating the larger class to the size of
         the smaller one, then shuffle.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``future`` and ``target`` columns. ``target`` has to be
        the last column once ``future`` is removed, because the features of
        each row are taken as ``row[:-1]`` and the label as ``row[-1]``.
        The caller's frame is not modified (``drop`` returns a new frame).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a numpy array stacking the windows and
        ``y`` is a plain Python list of the matching targets (kept as a list
        because callers use ``list.count`` on it).
    """
    df = df.drop("future", axis=1)
    for col in df.columns:
        if col != "target":
            # pct_change gives +/-inf when the previous value is 0, and
            # preprocessing.scale rejects infinite input. Treat those entries
            # as missing so the dropna below removes them.
            df[col] = df[col].pct_change().replace([np.inf, -np.inf], np.nan)
            df.dropna(inplace=True)

            df[col] = preprocessing.scale(df[col].values)

    df.dropna(inplace=True)

    # Rolling window: the deque discards its oldest row once SEQ_LEN rows are
    # held, so consecutive samples overlap by SEQ_LEN - 1 rows.
    seq_data = []
    previous_days = deque(maxlen=SEQ_LEN)

    for row in df.values:
        previous_days.append(list(row[:-1]))
        if len(previous_days) == SEQ_LEN:
            seq_data.append([np.array(previous_days), row[-1]])

    # Split by class. No shuffle is needed before this point because each
    # class list is shuffled on its own right below.
    buys = [[seq, target] for seq, target in seq_data if target == 1]
    sells = [[seq, target] for seq, target in seq_data if target == 0]

    random.shuffle(buys)
    random.shuffle(sells)

    # Keep the same number of samples from each class.
    lower = min(len(buys), len(sells))
    seq_data = buys[:lower] + sells[:lower]

    # Mix the two classes so they are not grouped in the output.
    random.shuffle(seq_data)

    X = [seq for seq, _ in seq_data]
    y = [target for _, target in seq_data]

    return np.array(X), y
        

In [28]:
# Build balanced sequence sets. The training and validation frames are passed
# through preprocessing_df separately, so each is scaled on its own values.
X_train, y_train = preprocessing_df(merged_df)
X_val, y_val = preprocessing_df(validation_df)

# y_train / y_val are plain Python lists, hence list.count for the class tallies.
print(f"train data: {len(X_train)} validation: {len(X_val)}")
print(f"Dont buys: {y_train.count(0)}, buys: {y_train.count(1)}")
print(f"VALIDATION Dont buys: {y_val.count(0)}, buys: {y_val.count(1)}")

train data: 77922 validation: 3860
Dont buys: 38961, buys: 38961
VALIDATION Dont buys: 1930, buys: 1930
