In [1]:
import math
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import time

from datetime import date
from matplotlib import pyplot as plt
from numpy.random import seed
from pylab import rcParams
from sklearn.metrics import mean_squared_error
from tqdm import tqdm_notebook
from sklearn.preprocessing import MinMaxScaler
from tensorflow import set_random_seed
from keras.models import Sequential
from keras.callbacks import ReduceLROnPlateau
from keras.layers import Dense, Dropout, LSTM, Activation
from keras.utils import plot_model

Using TensorFlow backend.


In [2]:
def get_x(data, N):
    x= []
    for i in range(N, len(data)):
        x.append(data[i-N:i])
    x = np.array(x)
    return x

def get_y(data, N):
    y = []
    for i in range(N, len(data)-1):
        if(data[i]-data[i-1] > 0):
            y.append([0,1])
        elif(data[i]-data[i-1] < 0):
            y.append([1,0])
        else:
            y.append([0,0])
    y = np.array(y)
    return y

def get_y_single(data, N):
    y = []
    for i in range(N, len(data)-1):
        if(data[i]-data[i-1] > 0):
            y.append([1])
        elif(data[i]-data[i-1] < 0):
            y.append([-1])
        else:
            y.append([0])
    y = np.array(y)
    return y



In [17]:
class LSTMMODEL:
    test_size = 0.2                # proportion of dataset to be used as test set
    cv_size = 0.2                  # proportion of dataset to be used as cross-validation set
    N = 20                          # for feature at day t, we use lags from t-1, t-2, ..., t-N as features. 
    lstm_units=32
    dropout_prob=0.5                 
    optimizer='adagrad'
    epochs=10
    batch_size=100
    # model_seed = 100
    close_scaler = MinMaxScaler(feature_range=(0, 1))

    def __init__(self):
        self.data_path = ""

    def initData(self,path):
        self.data_path = path
        df = pd.read_csv(self.data_path, sep = ",")

        num_cv = int(self.cv_size*len(df))
        num_test = int(self.test_size*len(df))
        num_train = len(df) - num_cv - num_test

        self.open_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(np.array(df['open']).reshape(-1,1))
        self.close_scaled = self.close_scaler.fit_transform(np.array(df['close']).reshape(-1,1))
        self.high_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(np.array(df['high']).reshape(-1,1))
        self.low_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(np.array(df['low']).reshape(-1,1))
        # self.amount_scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(np.array(df['amount']).reshape(-1,1))
        close = np.array(df['close']).reshape(-1,1)
        last_close = close[:-1]
        last_close = np.vstack((close[0], last_close))
        self.change = (close - last_close)/close
        self.change_scaled = MinMaxScaler(feature_range=(-1, 1)).fit_transform(self.change)

        combined_train_cv = np.hstack((self.open_scaled[:num_train+num_cv-1],self.close_scaled[:num_train+num_cv-1],self.high_scaled[:num_train+num_cv-1],self.low_scaled[:num_train+num_cv-1]))
        combined_test_cv = np.hstack((self.open_scaled[num_train+num_cv:-1],self.close_scaled[num_train+num_cv:-1],self.high_scaled[num_train+num_cv:-1],self.low_scaled[num_train+num_cv:-1]))
        print("train_cv.shape:\n",combined_train_cv.shape)
        print("test.shape:\n",combined_test_cv.shape)
        self.x_train_cv = get_x(combined_train_cv, self.N)
        # self.x_train_cv = get_x(self.change_scaled[:num_train+num_cv-1], self.N)
        self.y_train_cv = get_y(np.array(df['close'])[:num_train+num_cv],self.N).reshape(-1,2)

        self.x_test = get_x(combined_test_cv, self.N)
        # self.x_test = get_x(self.change_scaled[num_train+num_cv:-1], self.N)
        self.y_test = get_y(np.array(df['close'])[num_train+num_cv:],self.N).reshape(-1,2)
        
#         print("close:\n", np.array(df['close']).reshape(-1,1)[:10])
#         print("x_train:\n",get_x(self.change[:num_train+num_cv-1], self.N)[:10])
#         print("y_train_cv:\n",'\n',self.y_train_cv[:10])
        
        print("train_cv.shape:\n", self.x_train_cv.shape, self.y_train_cv.shape)
        print("test.shape:\n", self.x_test.shape, self.y_test.shape)

    def trainAndPred(self):
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=10, mode='auto')
        model = Sequential()
        model.add(LSTM(units=self.lstm_units, return_sequences=True, input_shape=(self.x_train_cv.shape[1], self.x_train_cv.shape[2])))
        model.add(Dropout(self.dropout_prob))
        model.add(LSTM(units=self.lstm_units)) 
        model.add(Dropout(self.dropout_prob))
        model.add(Dense(2, kernel_initializer='random_uniform',
                bias_initializer='zeros'))
        model.add(Activation('softmax'))
        
        model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
        model.summary()
        
        model.fit(self.x_train_cv, self.y_train_cv, epochs=self.epochs, batch_size=self.batch_size, verbose=2, callbacks=[reduce_lr])

        res = model.predict(self.x_test)
        
        print("originRes:\n",'\n',res)
#         res[res>=0.5] = 1
#         res[res<=-0.5] = 2
#         res[res<0.5] = 0
#         res[res==2] = -1

#         print("finalRes:\n",res[:5],'\n',self.y_test[:5])

#         df = pd.DataFrame(np.hstack((res,self.y_test)))
#         df.columns = ["pred_close","real_close"]

#         rc = df.loc[:, "real_close"]
#         pc = df.loc[:, "pred_close"]
#         count = 0
#         for i in range(1, df.shape[0]):
#             if(rc[i] == pc[i]):
#                 count+=1
#         print(count, rc.shape[0]-1)
#         print(count/(rc.shape[0]-1))
        
        ###

        # rcParams['figure.figsize'] = 10, 8
        # ax = df.plot(y=['pred_close','real_close'], color=['red','blue'], grid=True)
        # plt.show()

        # df.to_csv("new_pred_data.csv")


In [18]:
lstmModel = LSTMMODEL()
lstmModel.initData("./rb000.csv")



train_cv.shape:
 (1910, 4)
test.shape:
 (476, 4)
train_cv.shape:
 (1890, 20, 4) (1890, 2)
test.shape:
 (456, 20, 4) (456, 2)


In [5]:
# lstmModel.trainAndPred()

In [6]:
lstmModel.x_test[:5]

array([[[0.43037619, 0.45395845, 0.44760312, 0.42623413],
        [0.45480067, 0.44357103, 0.45540691, 0.44682652],
        [0.44076362, 0.47782145, 0.48940914, 0.44090268],
        [0.48371701, 0.50842223, 0.5041806 , 0.48349788],
        [0.51122965, 0.50982594, 0.5039019 , 0.49760226]],

       [[0.45480067, 0.44357103, 0.45540691, 0.44682652],
        [0.44076362, 0.47782145, 0.48940914, 0.44090268],
        [0.48371701, 0.50842223, 0.5041806 , 0.48349788],
        [0.51122965, 0.50982594, 0.5039019 , 0.49760226],
        [0.50926446, 0.49550814, 0.50613155, 0.48857546]],

       [[0.44076362, 0.47782145, 0.48940914, 0.44090268],
        [0.48371701, 0.50842223, 0.5041806 , 0.48349788],
        [0.51122965, 0.50982594, 0.5039019 , 0.49760226],
        [0.50926446, 0.49550814, 0.50613155, 0.48857546],
        [0.48905109, 0.49270073, 0.5203456 , 0.49111425]],

       [[0.48371701, 0.50842223, 0.5041806 , 0.48349788],
        [0.51122965, 0.50982594, 0.5039019 , 0.49760226],
        

In [32]:
# KNN

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

train_len = int(0.8*len(lstmModel.close_scaled))
x_train = get_x(lstmModel.close_scaled[:1910], 20).reshape(-1,20)
y_train = lstmModel.y_train_cv
x_test = get_x(lstmModel.close_scaled[1911:-1], 20).reshape(-1,20)
y_test = lstmModel.y_test

neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(x_train, y_train)

res = neigh.predict(x_test)

count = 0
x=[]
for i in range(len(res)):
    if(res[i][0] == 0 and res[i][1] == 1):
        x.append(1)
    elif(res[i][0] == 1 and res[i][1] == 0):
        x.append(-1)
    else:
        x.append(0)
    if(res[i][0] == lstmModel.y_test[i][0] and res[i][1] == lstmModel.y_test[i][1]):
        count += 1
df = pd.DataFrame(np.array(x))
df.to_csv('./knn.csv')
print(count/len(res))
res

0.48464912280701755


array([[0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1,

In [8]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

train_len = int(0.8*len(lstmModel.close_scaled))
x_train = get_x(lstmModel.close_scaled[:1910], 5).reshape(-1,5)
y_train = lstmModel.y_train_cv
x_test = get_x(lstmModel.close_scaled[1912:], 5).reshape(-1,5)
y_test = lstmModel.y_test

logreg = LogisticRegression(random_state=42)
logreg.fit(x_train, y_train)

res = logreg.predict(x_test)

x_train



ValueError: bad input shape (1905, 2)

In [28]:
# SVM
from sklearn.svm import SVC

train_len = int(0.8*len(lstmModel.close_scaled))
x_train = get_x(lstmModel.close_scaled[:1910], 20).reshape(-1,20)
y_train = lstmModel.y_train_cv
x_test = get_x(lstmModel.close_scaled[1911:-1], 5).reshape(-1,20)
y_test = lstmModel.y_test

svm_linear = SVC( kernel = 'linear')
svm_linear.fit(x_train, y_train)

res = logreg.predict(x_test)

res


ValueError: bad input shape (1890, 2)

In [33]:
# Decision Tree
from sklearn import tree

train_len = int(0.8*len(lstmModel.close_scaled))
x_train = get_x(lstmModel.close_scaled[:1910], 20).reshape(-1,20)
y_train = lstmModel.y_train_cv
x_test = get_x(lstmModel.close_scaled[1911:-1], 20).reshape(-1,20)
y_test = lstmModel.y_test

dtc = tree.DecisionTreeClassifier(random_state=42)
dtc.fit(x_train, y_train)

res = dtc.predict(x_test)

count = 0
x=[]
for i in range(len(res)):
    if(res[i][0] == 0 and res[i][1] == 1):
        x.append(1)
    elif(res[i][0] == 1 and res[i][1] == 0):
        x.append(-1)
    else:
        x.append(0)
    if(res[i][0] == lstmModel.y_test[i][0] and res[i][1] == lstmModel.y_test[i][1]):
        count += 1
df = pd.DataFrame(np.array(x))
df.to_csv('./decisionTree.csv')
print(count/len(res))
res

0.5241228070175439


array([[1., 0.],
       [0., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.

In [34]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier 

train_len = int(0.8*len(lstmModel.close_scaled))
x_train = get_x(lstmModel.close_scaled[:1910], 20).reshape(-1,20)
y_train = lstmModel.y_train_cv
x_test = get_x(lstmModel.close_scaled[1911:-1], 20).reshape(-1,20)
y_test = lstmModel.y_test

forest_reg = RandomForestClassifier(random_state=42)
forest_reg.fit(x_train, y_train)

res = forest_reg.predict(x_test)


count = 0
x=[]
for i in range(len(res)):
    if(res[i][0] == 0 and res[i][1] == 1):
        x.append(1)
    elif(res[i][0] == 1 and res[i][1] == 0):
        x.append(-1)
    else:
        x.append(0)
    if(res[i][0] == lstmModel.y_test[i][0] and res[i][1] == lstmModel.y_test[i][1]):
        count += 1
df = pd.DataFrame(np.array(x))
df.to_csv('./randomForest.csv')
print(count/len(res))
res



0.3530701754385965


array([[0., 1.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [1., 0.],
       [0., 0.],
       [1., 0.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [0., 1.],
       [1., 0.],
       [0., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [0., 1.],
       [0., 0.],
       [0., 0.],
       [1., 0.],
       [1., 0.],
       [0., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.

In [23]:
# True Trend
res = lstmModel.y_test
x=[]
for i in range(len(res)):
    if(res[i][0] == 0 and res[i][1] == 1):
        x.append(1)
    elif(res[i][0] == 1 and res[i][1] == 0):
        x.append(-1)
    else:
        x.append(0)
df = pd.DataFrame(np.array(x))
df.to_csv('./trueTrend.csv')


array([[1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0,

array([[0.44357103, 0.47782145, 0.50842223, 0.50982594, 0.49550814],
       [0.47782145, 0.50842223, 0.50982594, 0.49550814, 0.49270073],
       [0.50842223, 0.50982594, 0.49550814, 0.49270073, 0.51852892],
       ...,
       [0.53172375, 0.52751263, 0.52807412, 0.5300393 , 0.53705783],
       [0.52751263, 0.52807412, 0.5300393 , 0.53705783, 0.54716451],
       [0.52807412, 0.5300393 , 0.53705783, 0.54716451, 0.53144301]])