In [264]:
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import os, datetime
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
print('Tensorflow version: {}'.format(tf.__version__))
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler

Tensorflow version: 2.9.1


In [265]:
batch_size = 32
seq_len = 128

d_k = 256
d_v = 256
n_heads = 12
ff_dim = 256

In [266]:
stock_ticket = "GLBS"
start = datetime.datetime(2012,1,1)
end = datetime.datetime(2022,7,19)
df = web.DataReader(stock_ticket,"yahoo",start,end)
df.drop(columns=['Adj Close'], inplace=True)
df

Unnamed: 0_level_0,High,Low,Open,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-01-03,15960.00,12600.00,12920.00,15600.00,8.0
2012-01-04,18720.00,15960.00,15960.00,16760.00,16.0
2012-01-05,17920.00,16040.00,17920.00,16800.00,4.0
2012-01-06,17880.00,16000.00,16560.00,17200.00,1.0
2012-01-09,18000.00,16720.00,17720.00,17120.00,3.0
...,...,...,...,...,...
2022-07-13,1.54,1.48,1.52,1.51,69700.0
2022-07-14,1.53,1.49,1.53,1.50,44600.0
2022-07-15,1.65,1.50,1.50,1.62,481000.0
2022-07-18,1.73,1.63,1.67,1.64,297300.0


In [267]:
x_sc = MinMaxScaler()
train_set = x_sc.fit_transform(df.values)
y_sc = MinMaxScaler()
test_set = y_sc.fit_transform(df['Close'].values.reshape(-1,1))

In [268]:
last_10pct = int(len(df)*0.8)
last_20pct = int(len(df)*0.65) # Last 20% of series
# Convert pandas columns into arrays
train_data =train_set[:last_20pct]
val_data = train_set[last_20pct:last_10pct]
test_data = train_set[last_10pct:]
print('Training data shape: {}'.format(train_data.shape))
print('Validation data shape: {}'.format(val_data.shape))
print('Test data shape: {}'.format(test_data.shape))

Training data shape: (1724, 5)
Validation data shape: (398, 5)
Test data shape: (531, 5)


In [269]:
# Training data
n_future = 30
X_train, y_train = [], []
for i in range(seq_len, len(train_data)-n_future+1):
    X_train.append(train_data[i-seq_len:i]) # Chunks of training data with a length of 128 df-rows
    y_train.append(train_data[i:i+n_future, 3]) #Value of 4th column (Close Price) of df-row 128+1
X_train, y_train = np.array(X_train), np.array(y_train)

###############################################################################

# Validation data
X_val, y_val = [], []
for i in range(seq_len, len(val_data)-n_future+1):
    X_val.append(val_data[i-seq_len:i])
    y_val.append(val_data[i:i+n_future, 3])
X_val, y_val = np.array(X_val), np.array(y_val)

###############################################################################

# Test data
X_test, y_test = [], []
for i in range(seq_len, len(test_data)-n_future+1):
    X_test.append(test_data[i-seq_len:i])
    y_test.append(test_data[i:i+n_future, 3])    
X_test, y_test = np.array(X_test), np.array(y_test)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

(1635, 60, 5) (1635, 30)
(309, 60, 5) (309, 30)


In [270]:
class Time2Vector(Layer):
    def __init__(self, seq_len, **kwargs):
        super(Time2Vector, self).__init__()
        self.seq_len = seq_len

    def build(self, input_shape):
        self.weights_linear = self.add_weight(name='weight_linear',
                                shape=(int(self.seq_len),),
                                initializer='uniform',
                                trainable=True)
    
        self.bias_linear = self.add_weight(name='bias_linear',
                                shape=(int(self.seq_len),),
                                initializer='uniform',
                                trainable=True)
    
        self.weights_periodic = self.add_weight(name='weight_periodic',
                                shape=(int(self.seq_len),),
                                initializer='uniform',
                                trainable=True)
    
        self.bias_periodic = self.add_weight(name='bias_periodic',
                                shape=(int(self.seq_len),),
                                initializer='uniform',
                                trainable=True)

    def call(self, x):
        x = tf.math.reduce_mean(x[:,:,:4], axis=-1) 
        time_linear = self.weights_linear * x + self.bias_linear # Linear time feature
        time_linear = tf.expand_dims(time_linear, axis=-1) # Add dimension (batch, seq_len, 1)
    
        time_periodic = tf.math.sin(tf.multiply(x, self.weights_periodic) + self.bias_periodic)
        time_periodic = tf.expand_dims(time_periodic, axis=-1) # Add dimension (batch, seq_len, 1)
        return tf.concat([time_linear, time_periodic], axis=-1) # shape = (batch, seq_len, 2)
   
    def get_config(self): # Needed for saving and loading model with custom layer
        config = super().get_config().copy()
        config.update({'seq_len': self.seq_len})
        return config

In [271]:
class SingleAttention(Layer):
    def __init__(self, d_k, d_v):
        super(SingleAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v

    def build(self, input_shape):
        self.query = Dense(self.d_k, 
                       input_shape=input_shape, 
                       kernel_initializer='glorot_uniform', 
                       bias_initializer='glorot_uniform')
    
        self.key = Dense(self.d_k, 
                     input_shape=input_shape, 
                     kernel_initializer='glorot_uniform', 
                     bias_initializer='glorot_uniform')
    
        self.value = Dense(self.d_v, 
                       input_shape=input_shape, 
                       kernel_initializer='glorot_uniform', 
                       bias_initializer='glorot_uniform')

    def call(self, inputs): # inputs = (in_seq, in_seq, in_seq)
        q = self.query(inputs[0])
        k = self.key(inputs[1])

        attn_weights = tf.matmul(q, k, transpose_b=True)
        attn_weights = tf.map_fn(lambda x: x/np.sqrt(self.d_k), attn_weights)
        attn_weights = tf.nn.softmax(attn_weights, axis=-1)
    
        v = self.value(inputs[2])
        attn_out = tf.matmul(attn_weights, v)
        return attn_out    

#############################################################################

class MultiAttention(Layer):
    def __init__(self, d_k, d_v, n_heads):
        super(MultiAttention, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.n_heads = n_heads
        self.attn_heads = list()

    def build(self, input_shape):
        for n in range(self.n_heads):
            self.attn_heads.append(SingleAttention(self.d_k, self.d_v))  
    
    # input_shape[0]=(batch, seq_len, 7), input_shape[0][-1]=7 
        self.linear = Dense(input_shape[0][-1], 
                        input_shape=input_shape, 
                        kernel_initializer='glorot_uniform', 
                        bias_initializer='glorot_uniform')

    def call(self, inputs):
        attn = [self.attn_heads[i](inputs) for i in range(self.n_heads)]
        concat_attn = tf.concat(attn, axis=-1)
        multi_linear = self.linear(concat_attn)
        return multi_linear   

#############################################################################

class TransformerEncoder(Layer):
    def __init__(self, d_k, d_v, n_heads, ff_dim, dropout=0.1, **kwargs):
        super(TransformerEncoder, self).__init__()
        self.d_k = d_k
        self.d_v = d_v
        self.n_heads = n_heads
        self.ff_dim = ff_dim
        self.attn_heads = list()
        self.dropout_rate = dropout

    def build(self, input_shape):
        self.attn_multi = MultiAttention(self.d_k, self.d_v, self.n_heads)
        self.attn_dropout = Dropout(self.dropout_rate)
        self.attn_normalize = LayerNormalization(input_shape=input_shape, epsilon=1e-6)

        self.ff_conv1D_1 = Conv1D(filters=self.ff_dim, kernel_size=1, activation='relu')
        # input_shape[0]=(batch, seq_len, 7), input_shape[0][-1] = 7 
        self.ff_conv1D_2 = Conv1D(filters=input_shape[0][-1], kernel_size=1) 
        self.ff_dropout = Dropout(self.dropout_rate)
        self.ff_normalize = LayerNormalization(input_shape=input_shape, epsilon=1e-6)    
  
    def call(self, inputs): # inputs = (in_seq, in_seq, in_seq)
        attn_layer = self.attn_multi(inputs)
        attn_layer = self.attn_dropout(attn_layer)
        attn_layer = self.attn_normalize(inputs[0] + attn_layer)

        ff_layer = self.ff_conv1D_1(attn_layer)
        ff_layer = self.ff_conv1D_2(ff_layer)
        ff_layer = self.ff_dropout(ff_layer)
        ff_layer = self.ff_normalize(inputs[0] + ff_layer)
        return ff_layer 

    def get_config(self): # Needed for saving and loading model with custom layer
        config = super().get_config().copy()
        config.update({'d_k': self.d_k,
                   'd_v': self.d_v,
                   'n_heads': self.n_heads,
                   'ff_dim': self.ff_dim,
                   'attn_heads': self.attn_heads,
                   'dropout_rate': self.dropout_rate})
        return config         

In [272]:
def create_model():
    time_embedding = Time2Vector(seq_len)
    attn_layer1 = TransformerEncoder(d_k, d_v, n_heads, ff_dim)
    attn_layer2 = TransformerEncoder(d_k, d_v, n_heads, ff_dim)
    attn_layer3 = TransformerEncoder(d_k, d_v, n_heads, ff_dim)

    in_seq = Input(shape=(seq_len, 5))
    x = time_embedding(in_seq)
    x = Concatenate(axis=-1)([in_seq, x])
    x = attn_layer1((x, x, x))
    x = attn_layer2((x, x, x))
    x = attn_layer3((x, x, x))
    x = GlobalAveragePooling1D(data_format='channels_first')(x)
    x = Dropout(0.1)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.1)(x)
    out = Dense(n_future, activation='linear')(x)

    model = Model(inputs=in_seq, outputs=out)
    model.compile(loss='mse', optimizer='adam', metrics=['mae', 'mape'])
    return model


model = create_model()
model.summary()

callback = tf.keras.callbacks.ModelCheckpoint('Transformer+TimeEmbedding.hdf5', 
                                              monitor='val_loss', 
                                              save_best_only=True, 
                                              verbose=1)

history = model.fit(X_train, y_train, 
                    batch_size=batch_size, 
                    epochs=20, 
                    callbacks=[callback],
                    validation_data=(X_val, y_val))  




Model: "model_10"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_11 (InputLayer)          [(None, 60, 5)]      0           []                               
                                                                                                  
 time2_vector_28 (Time2Vector)  (None, 60, 2)        240         ['input_11[0][0]']               
                                                                                                  
 concatenate_10 (Concatenate)   (None, 60, 7)        0           ['input_11[0][0]',               
                                                                  'time2_vector_28[0][0]']        
                                                                                                  
 transformer_encoder_84 (Transf  (None, 60, 7)       99114       ['concatenate_10[0][0]',  

# 预测！

In [273]:
model = tf.keras.models.load_model('Transformer+TimeEmbedding.hdf5',
                                   custom_objects={'Time2Vector': Time2Vector, 
                                                   'SingleAttention': SingleAttention,
                                                   'MultiAttention': MultiAttention,
                                                   'TransformerEncoder': TransformerEncoder})


#Calculate predication for training, validation and test data
train_pred = model.predict(X_train)
val_pred = model.predict(X_val)
test_pred = model.predict(X_test)

#Print evaluation metrics for all datasets
train_eval = model.evaluate(X_train, y_train, verbose=0)
val_eval = model.evaluate(X_val, y_val, verbose=0)
test_eval = model.evaluate(X_test, y_test, verbose=0)
print(' ')
print('Evaluation metrics')
print('Training Data - Loss: {:.4f}, MAE: {:.4f}, MAPE: {:.4f}'.format(train_eval[0], train_eval[1], train_eval[2]))
print('Validation Data - Loss: {:.4f}, MAE: {:.4f}, MAPE: {:.4f}'.format(val_eval[0], val_eval[1], val_eval[2]))
print('Test Data - Loss: {:.4f}, MAE: {:.4f}, MAPE: {:.4f}'.format(test_eval[0], test_eval[1], test_eval[2]))

 
Evaluation metrics
Training Data - Loss: 0.0061, MAE: 0.0557, MAPE: 32.3928
Validation Data - Loss: 0.0008, MAE: 0.0271, MAPE: 581.3891
Test Data - Loss: 0.0013, MAE: 0.0353, MAPE: 92426.3828


In [274]:
import sklearn.metrics as metrics

real_price = y_sc.inverse_transform(y_test)
test_pred = y_sc.inverse_transform(test_pred)
period_pred_accuracy = metrics.mean_absolute_percentage_error(real_price,test_pred)
print(real_price)
print(test_pred)

[[12.         12.         12.         ... 14.         13.
  13.        ]
 [12.         12.         12.         ... 13.         13.
  13.        ]
 [12.         12.         12.         ... 13.         13.
  13.        ]
 ...
 [ 2.03999996  2.33999991  2.3900001  ...  1.50999999  1.5
   1.62      ]
 [ 2.33999991  2.3900001   2.43000007 ...  1.5         1.62
   1.63999999]
 [ 2.3900001   2.43000007  2.4000001  ...  1.62        1.63999999
   1.88999999]]
[[667.1186  811.92914 681.7896  ... 963.8876  850.8713  911.79675]
 [667.13794 811.9795  681.8116  ... 963.90985 850.8911  911.79926]
 [667.1383  812.0065  681.7478  ... 963.89685 850.8601  911.79205]
 ...
 [667.3059  812.104   681.9637  ... 964.0984  851.0137  911.942  ]
 [667.3102  812.0371  682.0063  ... 964.0705  851.06415 911.9086 ]
 [667.2904  812.0615  682.0264  ... 964.0731  851.06775 911.8886 ]]


In [275]:
#30 days prediction error in total
print(period_pred_accuracy)

253.13395864611553


In [277]:
#预测并输出预测值
f_X_test = np.array([test_data[len(test_data)-seq_len:]])
f_pred = model.predict(f_X_test)
f_pred = y_sc.inverse_transform(f_pred)
f_pred = [item[i] for item in f_pred for i in range(30)]
date_index = [df.index[-1]+datetime.timedelta(days=i) for i in range(1,30+1)]
prediction_list = pd.DataFrame(f_pred,index = date_index,columns=['Prediction Pirce'])



In [278]:
data_output = df[['Close','Volume']]
data_output = pd.merge(data_output,prediction_list,left_index=True,right_index=True,how='outer')

In [279]:
path = '/Users/ningxuying/Desktop/Data/'+str(stock_ticket)+'变形金刚.csv'
data_output.to_csv(path)

In [280]:
import joblib
randomclassifier = joblib.load('/Users/ningxuying/Desktop/model/SentimentAnalysis.model')

In [281]:
#Scraping
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
import pandas as pd

# Let's pick a company ticker symbol (AMZN for Amazon)
company_ticker = stock_ticket
# Add the ticker symbol to the "finviz" search box url
url = ("http://finviz.com/quote.ashx?t=" + company_ticker.lower())
# Most websites block requests that are without a User-Agent header (these simulate a typical browser)

# Send a Request to the url and return an html file
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

# open and read the request
webpage = urlopen(req).read()

# make a soup using BeautifulSoup from webpage
html = soup(webpage, "html.parser")

# Extract the 'class' = 'fullview-news-outer' from our html code, and create a dataframe from it
news = pd.read_html(str(html), attrs={'class': 'fullview-news-outer'})[0]
# extract the links for each news by finding all the "a" tags and 'class' = 'tab-link-news'
links = []
for a in html.find_all('a', class_="tab-link-news"):
    links.append(a['href'])

# Clean up our news dataframe
news.columns = ['Date', 'News_Headline']
news['Article_Link'] = links

In [282]:
#backtest for 30 days!
#backtest 1
print("Backtest Result")
predict_price = test_pred_price['Prediction Pirce']
invest_length = 30
real_price = df['Close'][-30-1:]
real_open_price = df['Open'][-30-1:]
print("Invest Length:",invest_length,"Days")
list(predict_price).insert(0,df['Close'][-30-1])
buy = 0
net_return = 0
right_decision=0
#预测出来下一天涨，就买入
#预测出来下一天涨，就买入
for i in range(1,30):
    if predict_price[i] > predict_price[i-1]:
        buy = buy +1
        net_return = net_return + (real_price[i+1]-real_open_price[i])
        if real_price[i] > real_price[i-1]:
            right_decision = right_decision+1
print("Total Investment Times:",buy,"\nTimes That has the Right Decision:",right_decision,"\nRate of the Right Decision:",right_decision/buy)
# print(buy)
print("Net Return:",net_return)
return_rate = net_return/real_open_price[0]
print("Return Rate:",return_rate)

Backtest Result
Invest Length: 30 Days
Total Investment Times: 17 
Times That has the Right Decision: 7 
Rate of the Right Decision: 0.4117647058823529
Net Return: -0.5499997138977051
Return Rate: -0.2696077079325603


In [283]:
#other to save
other_data = {'ticket':[stock_ticket],
    'Backtest Length':[invest_length],
              'Total Investment Times':[buy],
              'Rate of the Right Decision':[right_decision],
              'Backtest Net Return':[net_return],
              "Backtest Return Rate(%)":[return_rate*100],
               "Next Day's Predicted Price":[prediction_list['Prediction Pirce'][0]],
              "Daily Stock Price Prediction Accuracy(%)":['NULL'],
             "30 Day's Stock Price Prediction Accuracy(%)":[100-period_pred_accuracy*100],
             'Expected Net Return in Future 30 Days':[prediction_list['Prediction Pirce'][-1]-df['Close'][-1]],
             "Expected Return Rate in Future 30 Days(%)":[(prediction_list['Prediction Pirce'][-1]-df['Close'][-1])/df['Close'][-1]*100]}
news_data = {'News Dates':news['Date'][:5],
             'News Headlines':news['News_Headline'][:5],
             'News Links':news['Article_Link'][:5]}
other_data = pd.DataFrame(other_data)
news_data = pd.DataFrame(news_data)
other_data = pd.concat([other_data,news_data],axis = 1,join = 'outer')
od_path = '/Users/ningxuying/Desktop/OD/'+str(stock_ticket)+'变形金刚.csv'
other_data.to_csv(od_path)