### Project: Apple Share Price Prediction (Part 2)
Aims: predict the next 10 days (2 weeks) share price based on the last 5 years data

Features used: 
- Open Price
- High Price
- Low Price
- Volume, IXIC (NASDAQ Index)
- GSPC (S&P 500 Index)
- VIX (Volatility Index)
- DX-Y.NYB (US Dollor Index)
- TNX (US Treasury Yield)
- SOX (PHLX Semiconductor Index)

Note: Only using the closed prices for all other index


# Note: TensorFlow is incompatible with Python 3.12, need to be Python 3.10 or 3.11

### Model A1: Feed-Forward Neural Network (MLP) with PyTorch (not in this file)

### Model A2: Feed-Forward Neural Network (MLP) with TensorFlow and Karas (in this file)

### Model B: LSTM/Sequence Model (not in this file)

### Model C: Transformer Model (not in this file)

In [4]:
import yfinance as yf
apple = yf.Ticker("AAPL")
apple_data = apple.history(period = "5y")
tickers = [ "^IXIC", "^GSPC", "DJI", "^VIX", "DX-Y.NYB", "^TNX", "^SOX"]
others_data = yf.download(tickers, period = "5y")["Close"] # Only using the Close Prices for all indexes


  others_data = yf.download(tickers, period = "5y")["Close"] # Only using the Close Prices for all indexes
[*********************100%***********************]  7 of 7 completed


### Filling the missing value with previous available values

In [5]:
apple_data = apple_data.ffill()

# Checking if there is any missing value in apple_data
apple_data.isna().sum()

Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock Splits    0
dtype: int64

In [6]:
others_data.ffill()
others_data.isna().sum()

# Note too many missing values in DJI, dropping the DJI column

Ticker
DJI         966
DX-Y.NYB      0
^GSPC         3
^IXIC         3
^SOX          3
^TNX          2
^VIX          2
dtype: int64

In [7]:
others_data.drop(columns = ['DJI'], inplace = True)

In [8]:
others_data.isna().sum()

Ticker
DX-Y.NYB    0
^GSPC       3
^IXIC       3
^SOX        3
^TNX        2
^VIX        2
dtype: int64

In [9]:
others_data.bfill()

Ticker,DX-Y.NYB,^GSPC,^IXIC,^SOX,^TNX,^VIX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-14,90.709999,3647.489990,12440.040039,2736.250000,0.892,24.719999
2020-12-15,90.470001,3694.620117,12595.059570,2774.790039,0.923,22.889999
2020-12-16,90.449997,3701.169922,12658.190430,2773.419922,0.920,22.500000
2020-12-17,89.820000,3722.479980,12764.750000,2778.139893,0.930,21.930000
2020-12-18,90.019997,3709.409912,12755.639648,2764.739990,0.948,21.570000
...,...,...,...,...,...,...
2025-12-08,99.089996,6846.509766,23545.900391,7375.220215,4.172,16.660000
2025-12-09,99.220001,6840.509766,23576.490234,7372.509766,4.186,16.930000
2025-12-10,98.790001,6886.680176,23654.150391,7467.490234,4.164,15.770000
2025-12-11,98.349998,6901.000000,23593.859375,7411.479980,4.141,14.850000


In [10]:
others_data = others_data.dropna()

In [11]:
others_data.isna().sum()


Ticker
DX-Y.NYB    0
^GSPC       0
^IXIC       0
^SOX        0
^TNX        0
^VIX        0
dtype: int64

In [12]:
print(apple_data.index.tz)  
print(others_data.index.tz)

# Comment: apple_data times are timezone-awared while others_data times are timezone-naive (no timezone)

America/New_York
None


In [13]:
# Convert the times in apple_data to timezone-naive

apple_data.index = apple_data.index.tz_localize(None)

In [14]:
full_df = apple_data.join(others_data, how = "inner")
full_df.drop(columns = ["Dividends", "Stock Splits"], inplace = True)

In [15]:
len(full_df)

1255

In [16]:
full_df.tail(10)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,DX-Y.NYB,^GSPC,^IXIC,^SOX,^TNX,^VIX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2025-11-28,277.26001,279.0,275.98999,278.850006,20135600,99.459999,6849.089844,23365.689453,7025.149902,4.017,16.35
2025-12-01,278.01001,283.420013,276.140015,283.100006,46587700,99.410004,6812.629883,23275.919922,7020.529785,4.096,17.24
2025-12-02,283.0,287.399994,282.630005,286.190002,53669500,99.360001,6829.370117,23413.669922,7149.470215,4.086,16.59
2025-12-03,286.200012,288.619995,283.299988,284.149994,43538700,98.849998,6849.720215,23454.089844,7280.509766,4.057,16.08
2025-12-04,284.100006,284.730011,278.589996,280.700012,43989100,98.989998,6857.120117,23505.140625,7215.970215,4.108,15.78
2025-12-05,280.540009,281.140015,278.049988,278.779999,47265800,98.989998,6870.399902,23578.130859,7294.839844,4.139,15.41
2025-12-08,278.130005,279.670013,276.149994,277.890015,38211800,99.089996,6846.509766,23545.900391,7375.220215,4.172,16.66
2025-12-09,278.160004,280.029999,276.920013,277.179993,32193300,99.220001,6840.509766,23576.490234,7372.509766,4.186,16.93
2025-12-10,277.75,279.75,276.440002,278.779999,33038300,98.790001,6886.680176,23654.150391,7467.490234,4.164,15.77
2025-12-11,279.100006,279.589996,273.809998,278.029999,33207600,98.349998,6901.0,23593.859375,7411.47998,4.141,14.85


In [17]:
y = full_df["Close"]
X = full_df.drop(columns = ["Close"])

### Preprocess the data

Putting 30 days of data into 1 row as X ('Open', 'High', 'Low', 'Volume', 'DX-Y.NYB', '^GSPC', '^IXIC', '^SOX', '^TNX', '^VIX'), the next 10 days data ("Close") as y

In [18]:
import numpy as np

# Need create an overlapping window for X (30 days) to predict y (10 days)
# Reshape the whole dataset such that X = [[day1],[day2],....,[day30]], [day2,....,day31], y = [day31,....,day40], [day32,...,day41]

# Window 1 (t = 0 → 4):
# X₀ = [100, 102, 101, 103, 104]
# y₀ = next 10 days

# Window 2 (t = 1 → 5):
# X₁ = [102, 101, 103, 104, 106]
# y₁ = next 10 days

window_x = 30
window_y = 10

X = []
y = []

cols = ['Open', 'High', 'Low', 'Volume', 'DX-Y.NYB', '^GSPC', '^IXIC', '^SOX', '^TNX', '^VIX']

# range(30,3200) creates a range object starting from 30,31,32,......3199

# full_df[col].iloc[i-window:i].values return list of arrays wirh each row treated as an array

for i in range(window_x, len(full_df) - window_y + 1):
    X.append(full_df[cols].iloc[i-window_x : i].values)
    y.append(full_df[["Close"]].iloc[i:i + window_y].values)

X = np.array(X)

y = np.array(y)


In [19]:
num_samples = X.shape[0] #1206 rows
window_x = X.shape[1] # 30 days of data in 1 row
num_features = X.shape[2] # 10 features per day

# Flatten X,now there are 1206 rows, with each row having 300 values (10 features * 30 days)
X = X.reshape(num_samples, window_x * num_features)


# Flatten y
y = y.reshape(y.shape[0], y.shape[1])

In [20]:
X.shape

(1216, 300)

In [21]:
y.shape

(1216, 10)

### Scale the data

### MLP: Standard Scaler, inputs centered at around 0 to prevent gradient explodes or vanishes
Gradient vanishing means during backpropagation the gradients become extremely small as they move backward thru the network, resuiting in model learning very slow or not learning at all.

Gradient explosion means that the gradient become extremely large, they model jumps around instead of learning gradually.

### LSTM: MinMaxScaler
### Transformer: MinMaxScaler
LSTM and Transformer contain sigmoid and softmax, which break when values arenot bounded.

There will be some extreme values after standard scaling, e.g. -3, -5, -7

LSTM input gate: sigmoid, sigmoid(x) = 1 / (1 + exp(-x))

LSTM forget gate: sigmoid

LSTM output gate: sigmoid

LSTM candidate state: tanh, tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))

Transformer self-attention: Attention = softmax(QKᵀ / sqrt(d))

### Model A2 - Feed-Forward Neural Network (MLP) with TensorFlow and Karas

### Split the training, validation and test datasets

In [24]:
# Training set = first 70%, validation set = next 15%, test set = next 15%
# Note: 80-10-10 fails when the validation set is too small to be reliable.
n = len(X)
train_end = int(0.7 * n)
validation_end = int(0.85 * n)

X_train = X[:train_end]
X_validation = X[train_end:validation_end]
X_test = X[validation_end:]

y_train = y[:train_end]
y_validation = y[train_end:validation_end]
y_test  = y[validation_end:]

### Scale the data 

In [25]:
# Need to scale the training, validation and test data seperately such that there is no data leakage
# Scale X
from sklearn.preprocessing import StandardScaler

scaler_X = StandardScaler()

scaler_X.fit(X_train) # fit() learns the statistic from your training data, i.e. means and SDs

scaled_X_train_mlp = scaler_X.transform(X_train) # transform() used the learned statistics to apply the scaling

scaled_X_validation_mlp  = scaler_X.transform(X_validation) # It uses the learned means and SDs to scale the test data

scaled_X_test_mlp  = scaler_X.transform(X_test) # It uses the learned means and SDs to scale the test data

# Scale y

scaler_y = StandardScaler()

scaler_y.fit(y_train) 

scaled_y_train_mlp = scaler_y.transform(y_train)

scaled_y_validation_mlp = scaler_y.transform(y_validation)

scaled_y_test_mlp = scaler_y.transform(y_test)

### Transfrom the training and testing datasets (both X and y) into tensors (must do before passing them into the model)

In [27]:
import tensorflow as tf
# torch.tensor() is equivalent to tf.convert_to_tensor()
# For NNs, you must convert to float32

X_train_tensor = tf.convert_to_tensor(scaled_X_train_mlp, dtype = tf.float32)

X_validation_tensor = tf.convert_to_tensor(scaled_X_validation_mlp, dtype = tf.float32)

X_test_tensor = tf.convert_to_tensor(scaled_X_test_mlp, dtype = tf.float32)

y_train_tensor = tf.convert_to_tensor(scaled_y_train_mlp, dtype = tf.float32)

y_validation_tensor = tf.convert_to_tensor(scaled_y_validation_mlp, dtype = tf.float32)

y_test_tensor = tf.convert_to_tensor(scaled_y_test_mlp, dtype = tf.float32)


2025-12-13 00:55:54.212721: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4
2025-12-13 00:55:54.212974: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-12-13 00:55:54.213058: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-12-13 00:55:54.213550: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-12-13 00:55:54.213569: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Create Datasets and and load train_dataset and val_dataset (for scalability and performance)

In [29]:
# tf.data.Dataset.from_tensor_slices() takes the full tensors X, and y, splits them into individual samples, creates pairs(X_i, y_i)
# .prefetch(tf.data.AUTOTUNE): GPU does not wait for data, while GPU trains on batch N, CPU prepares batch N + 1


train_dataset = tf.data.Dataset.from_tensor_slices((X_train_tensor, y_train_tensor)).batch(64).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices((X_validation_tensor, y_validation_tensor)).batch(64).prefetch(tf.data.AUTOTUNE)

#### Dynamic MLP class with Keras 
- better than using Sequential() becasue it gives you more control

In [None]:
# import tensorflow as tf

# from tensorflow.keras import layers, models

# input_dim = 300

# model = models.Sequential([
#         layers.Input(shape = (input_dim,)),
#         layers.Dense(128, activation = "gelu"),
#         layers.Dense(128, activation = "gelu"),
#         layers.Dense(10, activation = "linear")
# ])

In [None]:
class FlexibleMLP(tf.keras.Model): # kf.keras.Model is the parent class
    def __init__(self, input_dim, hidden_layers, activations, dropouts, output_dim):
        super().__init__()  # super().__init__() call the constructor(__init__) of the parent class, the FlexibleMLP gets all public methods and behaviours (e.g. self.*)

        if not (len(hidden_layers) == len(activations) == len(dropouts)):
            raise ValueError("Lengths must match")

        act_map = {
            "relu": tf.keras.layers.ReLU(),
            "gelu": tf.keras.layers.GELU(),
            "tanh": tf.keras.layers.Activation("tanh"),
            "sigmoid": tf.keras.layers.Activation("sigmoid"),
            "elu": tf.keras.layers.ELU(),
        }

        self.layers_list = []

        for h, act, d in zip(hidden_layers, activations, dropouts):
            self.layers_list.append(tf.keras.layers.Dense(h))
            self.layers_list.append(act_map[act]) # Note: activation function is applied after the neuron, which decided whether there will be output passed to the next Dense layer (if not dropout)
            if d > 0:
                self.layers_list.append(tf.keras.layers.Dropout(d))

        self.out = tf.keras.layers.Dense(output_dim)

    def call(self, x, training=False):
        for layer in self.layers_list:
            if isinstance(layer, tf.keras.layers.Dropout):
                x = layer(x, training=training)
            else:
                x = layer(x)
        return self.out(x)


### Use Optuna to find the best hyperparameters first

In [31]:
import optuna
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

def objective(trial):

    n_layers = trial.suggest_int("n_layers", 1, 4)
    base_units = trial.suggest_int("base_units", 32, 512)
    lr = trial.suggest_loguniform("lr", 1e-4, 1e-2, log = True)

    hidden_layers = []
    activations = []
    dropouts = []

    units = base_units

    for i in range(n_layers):
        hidden_layers.append(units)

        activations.append(
            trial.suggest_categorical(
                f"activation_l{i}",
                ["relu", "gelu", "tanh", "elu"]
            )
        )

        dropouts.append(
            trial.suggest_float(
                f"dropout_l{i}",
                0.0, 0.5
            )
        )

        units = max(units // 2, 16)  # funnel

    model = FlexibleMLP(
        input_dim=X_train_tensor.shape[1],
        hidden_layers=hidden_layers,
        activations=activations,
        dropouts=dropouts,
        output_dim=1
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(lr),
        loss="mse"
    )

    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=50,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(
                patience=5,
                restore_best_weights=True
            )
        ],
        verbose=0
    )

    return min(history.history["val_loss"])


In [None]:
import optuna

study = optuna.create_study(direction = "minimize")

[I 2025-12-13 01:29:03,288] A new study created in memory with name: no-name-a3cc1dd1-2610-4e6e-a626-a417ba376ea2


In [33]:
study.best_value

ValueError: No trials are completed yet.

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
input_dim = 300
model = models.Sequential([
        layers.Input(shape = (input_dim,)),
        layers.Dense(128, activation = "gelu"),
        layers.Dense(128, activation = "gelu"),
        layers.Dense(10, activation = "linear")
])

2025-12-12 22:31:55.834328: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4
2025-12-12 22:31:55.834362: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-12-12 22:31:55.834368: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-12-12 22:31:55.834398: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-12-12 22:31:55.834407: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
