# Classification With Transformers

In [52]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [53]:
data_train = pd.read_csv("./data/aapl_5m_train.csv").dropna()
data_test = pd.read_csv("./data/aapl_5m_test.csv").dropna()

In [54]:
data_train.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,Gmtoffset,Datetime,Open,High,Low,Close,Volume
0,0,1609770600,0,2021-01-04 14:30:00,133.570007,133.611602,132.389999,132.809997,6624663.0
1,1,1609770900,0,2021-01-04 14:35:00,132.75,132.75,131.809997,131.889999,2541553.0
2,2,1609771200,0,2021-01-04 14:40:00,131.5,132.339996,131.5,132.059997,2492415.0
3,3,1609771500,0,2021-01-04 14:45:00,132.0,132.25,131.899993,132.25,1859131.0
4,4,1609771800,0,2021-01-04 14:50:00,132.0,132.018096,131.520004,131.589996,1780105.0


## Normalizing Data

In [55]:
train_mean = data_train.loc[:, ["Open", "High", "Low", "Close"]].mean()
train_std = data_train.loc[:, ["Open", "High", "Low", "Close"]].std()

norm_data_train = (data_train.loc[:, ["Open", "High", "Low", "Close"]] - train_mean) / train_std
norm_data_test = (data_test.loc[:, ["Open", "High", "Low", "Close"]] - train_mean) / train_std

## Generating our features

In [56]:
lags = 5

X_train = pd.DataFrame()
X_test = pd.DataFrame()

for lag in range(lags):
    X_train[f"Open_{lag}"] = norm_data_train.Open.shift(lag)
    X_train[f"High_{lag}"] = norm_data_train.High.shift(lag)
    X_train[f"Low_{lag}"] = norm_data_train.Low.shift(lag)
    X_train[f"Close_{lag}"] = norm_data_train.Close.shift(lag)
    
    X_test[f"Open_{lag}"] = norm_data_test.Open.shift(lag)
    X_test[f"High_{lag}"] = norm_data_test.High.shift(lag)
    X_test[f"Low_{lag}"] = norm_data_test.Low.shift(lag)
    X_test[f"Close_{lag}"] = norm_data_test.Close.shift(lag)

Y_train = (X_train.Close_0 * (1 + 0.01) < X_train.Close_0.shift(-1)).astype(float)
Y_test = (X_test.Close_0 * (1 + 0.01) < X_test.Close_0.shift(-1)).astype(float)

# Removing nans and last value
X_train = X_train.iloc[5:-1, :].values
X_test = X_test.iloc[5:-1, :].values

Y_train = Y_train.iloc[5:-1].values.reshape(-1, 1)
Y_test = Y_test.iloc[5:-1].values.reshape(-1, 1)

## Reshaping Tensors

In [57]:
features = X_train.shape[1]

X_train = X_train.reshape(-1, features, 1)
X_test = X_test.reshape(-1, features, 1)

## Regular DNN

In [102]:
dnn = tf.keras.models.Sequential(layers=[
    tf.keras.layers.Dense(units=120, activation="relu", input_shape=(features, 1)),
    tf.keras.layers.Dense(units=60, activation="relu"),
    tf.keras.layers.Dense(units=60, activation="relu"),
    tf.keras.layers.Dense(units=30, activation="relu"),
    tf.keras.layers.Dense(units=1, activation="softmax")
])
# metric = tf.keras.metrics.SparseCategoricalAccuracy()

dnn.compile(optimizer="adam",
            loss="sparse_categorical_crossentropy",
            metrics=["sparse_categorical_accuracy"])

In [103]:
dnn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_50 (Dense)            (None, 20, 120)           240       
                                                                 
 dense_51 (Dense)            (None, 20, 60)            7260      
                                                                 
 dense_52 (Dense)            (None, 20, 60)            3660      
                                                                 
 dense_53 (Dense)            (None, 20, 30)            1830      
                                                                 
 dense_54 (Dense)            (None, 20, 1)             31        
                                                                 
Total params: 13021 (50.86 KB)
Trainable params: 13021 (50.86 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [105]:
dnn.fit(X_train, Y_train, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x2a3c95bd0>

### Another way of creating a model

In [108]:
l1 = tf.keras.layers.Input(shape=(features, 1))
l2 = tf.keras.layers.Dense(units=120, activation="relu")(l1)
l3 = tf.keras.layers.Dense(units=60, activation="relu")(l2)
l4 = tf.keras.layers.Dense(units=30, activation="relu")(l3)

outputs = tf.keras.layers.Dense(units=1, activation="softmax")(l4)

dnn2 = tf.keras.Model(inputs=l1, outputs=outputs)

In [109]:
dnn2.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_11 (InputLayer)       [(None, 20, 1)]           0         
                                                                 
 dense_59 (Dense)            (None, 20, 120)           240       
                                                                 
 dense_60 (Dense)            (None, 20, 60)            7260      
                                                                 
 dense_61 (Dense)            (None, 20, 30)            1830      
                                                                 
 dense_62 (Dense)            (None, 20, 1)             31        
                                                                 
Total params: 9361 (36.57 KB)
Trainable params: 9361 (36.57 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [110]:
dnn2.compile(optimizer="adam",
             loss="sparse_categorical_crossentropy",
             metrics=["sparse_categorical_accuracy"])

In [111]:
dnn2.fit(X_train, Y_train)



<keras.src.callbacks.History at 0x2a430eb50>

## Classification Model

In [58]:
def create_transformer(inputs, head_size, num_heads, dnn_dim):
    # Stacking layers
    l1 = tf.keras.layers.MultiHeadAttention(key_dim=head_size,
                                            num_heads=num_heads,
                                            dropout=0.2)(inputs, inputs)
    l2 = tf.keras.layers.Dropout(0.2)(l1)
    l3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(l2)
    
    res = l3 + inputs
    
    # Traditional DNN
    l4 = tf.keras.layers.Conv1D(filters=4, kernel_size=1, activation="relu")(res)
    l5 = tf.keras.layers.Dropout(0.2)(l4)
    l6 = tf.keras.layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(l5)
    l7 = tf.keras.layers.LayerNormalization(epsilon=1e-6)(l6)
    return l7 + res

In [59]:
input_shape = X_train.shape[1:]

# Hyperparams
head_size = 256
num_heads = 4
num_transformer_blocks = 4
dnn_dim = 4
units = 128


# Defining input_shape as Input layer
input_layer = tf.keras.layers.Input(input_shape)

# Creating our transformers based on the input layer
transformer_layers = input_layer

for _ in range(num_transformer_blocks):
    # Stacking transformers
    transformer_layers = create_transformer(inputs=transformer_layers,
                                            head_size=head_size,
                                            num_heads=num_heads,
                                            dnn_dim=dnn_dim)

# Adding global pooling
pooling_layer = tf.keras.layers.GlobalAveragePooling1D(data_format="channels_last")\
                                                      (transformer_layers)

# Adding MLP layers
l1 = tf.keras.layers.Dense(units=128, activation="leaky_relu")(pooling_layer)
l2 = tf.keras.layers.Dropout(0.3)(l1)
l3 = tf.keras.layers.Dense(units=128, activation="leaky_relu")(l2)

# Last layer, units = 2 for True and False values
outputs = tf.keras.layers.Dense(units=2, activation="softmax")(l3)

# Model
model = tf.keras.Model(inputs=input_layer,
                       outputs=outputs,
                       name="transformers_classification")

metric = tf.keras.metrics.SparseCategoricalAccuracy()
adam_optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-4)
#callbacks = [tf.keras.callbacks.EarlyStopping(monitor="loss",
#                                              patience=10,
#                                              restore_best_weights=True)]

In [60]:
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=adam_optimizer,
    metrics=[metric],
)

In [61]:
model.summary()

Model: "transformers_classification"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None, 20, 1)]              0         []                            
                                                                                                  
 multi_head_attention_32 (M  (None, 20, 1)                7169      ['input_9[0][0]',             
 ultiHeadAttention)                                                  'input_9[0][0]']             
                                                                                                  
 dropout_72 (Dropout)        (None, 20, 1)                0         ['multi_head_attention_32[0][0
                                                                    ]']                           
                                                                        

 dropout_77 (Dropout)        (None, 20, 4)                0         ['conv1d_68[0][0]']           
                                                                                                  
 conv1d_69 (Conv1D)          (None, 20, 1)                5         ['dropout_77[0][0]']          
                                                                                                  
 layer_normalization_69 (La  (None, 20, 1)                2         ['conv1d_69[0][0]']           
 yerNormalization)                                                                                
                                                                                                  
 tf.__operators__.add_69 (T  (None, 20, 1)                0         ['layer_normalization_69[0][0]
 FOpLambda)                                                         ',                            
                                                                     'tf.__operators__.add_68[0][0
          

In [62]:
model.fit(
    X_train,
    Y_train,
    epochs=100,
    batch_size=64,
    # callbacks=callbacks,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x2a0963d50>

In [12]:
model.save("transformer_classifier.keras")

In [63]:
y_hat_train = model.predict(X_train)



In [88]:
sum(y_hat_train.argmax(axis=1) == 0)

18598