In [12]:
import pandas as pd
import numpy as np
import os
import time
from sklearn.model_selection import train_test_split

In [13]:
# Load the combined per-ticker price history; the path is relative to the notebook's
# working directory.
combined_stocks_df = pd.read_csv("filtered_stocks_combined.csv")

In [14]:
# Show the loaded frame as the cell output (the rich repr elides the middle rows).
combined_stocks_df

Unnamed: 0,Date,index,Open,High,Low,Close,Volume,OpenInt,ticker
0,2011-01-03,45725,41.600,42.293,41.600,42.204,123930383,0,AAPL
1,2011-01-04,45726,42.579,42.579,42.023,42.426,86135637,0,AAPL
2,2011-01-05,45727,42.260,42.817,42.196,42.772,70669988,0,AAPL
3,2011-01-06,45728,42.902,42.932,42.632,42.739,83619699,0,AAPL
4,2011-01-07,45729,42.787,43.073,42.503,43.045,86506108,0,AAPL
...,...,...,...,...,...,...,...,...,...
75495,2016-12-23,14745947,87.556,87.625,87.201,87.383,4428429,0,XOM
75496,2016-12-27,14745948,87.499,87.768,87.257,87.423,5100402,0,XOM
75497,2016-12-28,14745949,87.354,87.768,86.949,86.989,6834213,0,XOM
75498,2016-12-29,14745950,86.797,87.277,86.728,87.036,6938299,0,XOM


In [15]:
# Convert Date to datetime and set it as the index.
# Guarded so that re-running this cell is a no-op instead of a KeyError: after the
# first run "Date" is already the index and no longer a column.
if "Date" in combined_stocks_df.columns:
    combined_stocks_df = combined_stocks_df.assign(
        Date=pd.to_datetime(combined_stocks_df["Date"])
    ).set_index("Date")

# Drop the leftover "index" column; errors="ignore" keeps the cell re-runnable
# once the column is already gone.
combined_stocks_df = combined_stocks_df.drop(columns=["index"], errors="ignore")

# Pivot to two-level columns: (feature, ticker), one row per Date.
# pivot_table aggregates with its default (mean), so duplicate (Date, ticker)
# rows would be averaged rather than rejected.
value_columns = [col for col in combined_stocks_df.columns if col != "ticker"]
stocks_df = combined_stocks_df.pivot_table(
    index=combined_stocks_df.index,
    columns="ticker",
    values=value_columns,
)

# Sort columns for clarity
stocks_df = stocks_df.sort_index(axis=1, level=0)

# Swap the column levels so the ticker is level 0 and the feature is level 1,
# then sort by ticker
stocks_df_leveled = stocks_df.swaplevel(axis=1).sort_index(axis=1, level=0)

# Preview the new structure
stocks_df_leveled.head()


ticker,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,ABT,ABT,ABT,ABT,...,WMT,WMT,WMT,WMT,XOM,XOM,XOM,XOM,XOM,XOM
Unnamed: 0_level_1,Close,High,Low,Open,OpenInt,Volume,Close,High,Low,Open,...,Low,Open,OpenInt,Volume,Close,High,Low,Open,OpenInt,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2011-01-03,42.204,42.293,41.6,41.6,0.0,123930383.0,19.358,19.549,19.338,19.549,...,46.027,46.089,0.0,16789275.0,60.341,60.535,59.604,59.668,0.0,28807445.0
2011-01-04,42.426,42.579,42.023,42.579,0.0,86135637.0,19.54,19.561,19.35,19.448,...,46.139,46.419,0.0,14296931.0,60.625,60.665,60.235,60.469,0.0,24744869.0
2011-01-05,42.772,42.817,42.196,42.26,0.0,70669988.0,19.54,19.764,19.46,19.52,...,46.174,46.539,0.0,16723328.0,60.462,60.6,60.009,60.43,0.0,20448359.0
2011-01-06,42.739,42.932,42.632,42.902,0.0,83619699.0,19.5,19.682,19.342,19.634,...,45.712,46.166,0.0,18335156.0,60.853,61.052,60.439,60.625,0.0,27829692.0
2011-01-07,43.045,43.073,42.503,42.787,0.0,86506108.0,19.582,19.615,19.444,19.489,...,45.698,45.81,0.0,9374462.0,61.182,61.431,60.777,60.876,0.0,23838996.0


In [16]:
# 3. Chronological split per ticker: first 60% train, next 20% validation, rest test
train_dict, val_dict, test_dict = {}, {}, {}

for ticker in stocks_df_leveled.columns.levels[0]:
    # Daily log return ln(Close_t / Close_{t-1}); the first row has no previous
    # close, so it is NaN.
    # NOTE(review): this writes a new (ticker, 'log_return') column back onto the
    # shared stocks_df_leveled frame, which later cells also read - confirm intended.
    close = stocks_df_leveled[ticker]['Close']
    stocks_df_leveled.loc[:, (ticker, 'log_return')] = np.log(close / close.shift(1))

    # Per-ticker frame without NaN rows, in date order
    df = stocks_df_leveled[ticker].dropna().sort_index()
    total_len = len(df)
    train_end = int(total_len * 0.6)
    val_end = train_end + int(total_len * 0.2)

    train_dict[ticker], val_dict[ticker], test_dict[ticker] = (
        df.iloc[:train_end],
        df.iloc[train_end:val_end],
        df.iloc[val_end:],
    )

# Stack the per-ticker pieces into long frames indexed by (Ticker, Date)
train_df, val_df, test_df = (
    pd.concat(parts, names=["Ticker", "Date"])
    for parts in (train_dict, val_dict, test_dict)
)


In [17]:
# Show the stacked training split, indexed by (Ticker, Date), as the cell output.
train_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,High,Low,Open,OpenInt,Volume,log_return
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAPL,2011-01-04,42.426,42.579,42.023,42.579,0.0,86135637.0,0.005246
AAPL,2011-01-05,42.772,42.817,42.196,42.260,0.0,70669988.0,0.008122
AAPL,2011-01-06,42.739,42.932,42.632,42.902,0.0,83619699.0,-0.000772
AAPL,2011-01-07,43.045,43.073,42.503,42.787,0.0,86506108.0,0.007134
AAPL,2011-01-10,43.855,43.956,43.179,43.393,0.0,124888228.0,0.018643
...,...,...,...,...,...,...,...,...
XOM,2014-08-04,88.145,88.411,86.542,86.921,0.0,13804459.0,0.013374
XOM,2014-08-05,86.445,87.625,86.077,87.492,0.0,14847864.0,-0.019475
XOM,2014-08-06,87.133,87.723,86.445,86.445,0.0,11264688.0,0.007927
XOM,2014-08-07,86.507,87.889,85.944,87.828,0.0,11379351.0,-0.007210


In [20]:
# Function to create rolling window sequences
def create_rolling_window_data(df, window_size=10, target_shift=1):
    """Build flattened rolling-window features and Open/Close targets.

    Each sample is `window_size` consecutive rows of `df` flattened row by row
    (all columns, in `df`'s column order). Its target is the row
    `target_shift` steps after the last row of the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with two-level columns: ticker on level 0, feature on level 1.
        Every ticker must have 'Open' and 'Close' columns. NaN values are
        passed through unchanged.
    window_size : int, default 10
        Number of consecutive rows in each input window (>= 1).
    target_shift : int, default 1
        How many rows after the window the target row lies (>= 1).

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size * n_columns)
    y : numpy.ndarray, shape (n_samples, 2 * num_tickers)
        For each ticker in sorted level-0 order: Open, then Close.
    num_tickers : int

    Raises
    ------
    ValueError
        If `window_size` or `target_shift` is smaller than 1.
    KeyError
        If a ticker lacks an 'Open' or 'Close' column.
    """
    if window_size < 1 or target_shift < 1:
        raise ValueError("window_size and target_shift must both be >= 1")

    # Ignore level values that no longer occur in any column (e.g. after slicing).
    tickers = df.columns.remove_unused_levels().levels[0]
    num_tickers = len(tickers)

    # Resolve the target columns once instead of on every loop iteration.
    target_columns = [
        (ticker, field) for ticker in tickers for field in ('Open', 'Close')
    ]
    target_values = df.loc[:, target_columns].to_numpy()
    feature_values = df.to_numpy()

    X, y = [], []
    # The target row is i + target_shift - 1, so the last valid i is
    # len(df) - target_shift; the "+ 1" keeps that final sample in the range.
    for i in range(window_size, len(df) - target_shift + 1):
        X.append(feature_values[i - window_size:i].flatten())
        y.append(target_values[i + target_shift - 1])

    if not X:
        # Too few rows for even one window: return empty arrays that still
        # have the expected number of columns.
        return (
            np.empty((0, window_size * df.shape[1])),
            np.empty((0, 2 * num_tickers)),
            num_tickers,
        )

    return np.array(X), np.array(y), num_tickers

# NOTE(review): StandardScaler, Sequential, Dense, mean_squared_error and
# mean_absolute_error are not imported in any cell visible in this notebook -
# confirm an import cell provides them before a fresh "Run All".

# Create sequences from the preprocessed DataFrame.
# Rows containing NaN are dropped first: the split cell adds a log_return column
# whose first row is NaN, and a NaN feature would carry through scaling into training.
window_size = 10
model_df = stocks_df_leveled.dropna()
X, y, num_tickers = create_rolling_window_data(model_df, window_size=window_size)

# Split data into train, validation, and test sets (60%, 20%, 20%) in time order
total_size = len(X)
train_size = int(0.6 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size

X_train, y_train = X[:train_size], y[:train_size]
X_val, y_val = X[train_size:train_size + val_size], y[train_size:train_size + val_size]
X_test, y_test = X[train_size + val_size:], y[train_size + val_size:]

# Normalize the features (scaler statistics come from the training slice only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Build the MLP model
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(256, activation='relu'),
    Dense(num_tickers * 2)  # Output layer: number of tickers * 2 (open and close)
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=100,
    batch_size=32,
    verbose=1
)

# Evaluate the model
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Test Mean Squared Error (MSE): {mse:.4f}")
print(f"Test Mean Absolute Error (MAE): {mae:.4f}")

# Calculate directional accuracy for close prices
actual_close = y_test[:, 1::2]  # targets are [Open, Close] per ticker, so Close is every second element
predicted_close = y_pred[:, 1::2]

# Each X row is the window flattened row by row, so the last day of the window
# starts at offset (window_size - 1) * n_columns. The Close offsets are looked up
# from the frame's real column order rather than a hard-coded feature list,
# which did not match the frame's actual columns.
n_columns = model_df.shape[1]
column_position = {column: pos for pos, column in enumerate(model_df.columns)}
positions = [
    (window_size - 1) * n_columns + column_position[(ticker, 'Close')]
    for ticker in model_df.columns.levels[0]
]
last_close = X_test[:, positions]

actual_change = actual_close - last_close
predicted_change = predicted_close - last_close
directional_accuracy = np.mean(np.sign(actual_change) == np.sign(predicted_change))

print(f"Directional Accuracy for Close Prices: {directional_accuracy:.4f}")

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 7971.8940 - val_loss: 16212.7705
Epoch 2/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 8118.3633 - val_loss: 16183.0166
Epoch 3/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 7890.3804 - val_loss: 16124.9229
Epoch 4/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 7971.2441 - val_loss: 16017.2490
Epoch 5/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 7909.8320 - val_loss: 15840.0723
Epoch 6/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 7580.3950 - val_loss: 15581.1104
Epoch 7/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 7383.7891 - val_loss: 15234.1221
Epoch 8/100
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 7265.6162 - val_loss: 14802.7

In [21]:
# Function to create rolling window sequences (redefined here so this cell is self-contained)
def create_rolling_window_data(df, window_size=10, target_shift=1):
    """Build flattened rolling-window features and Open/Close targets.

    Each sample is `window_size` consecutive rows of `df` flattened row by row
    (all columns, in `df`'s column order). Its target is the row
    `target_shift` steps after the last row of the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with two-level columns: ticker on level 0, feature on level 1.
        Every ticker must have 'Open' and 'Close' columns. NaN values are
        passed through unchanged.
    window_size : int, default 10
        Number of consecutive rows in each input window (>= 1).
    target_shift : int, default 1
        How many rows after the window the target row lies (>= 1).

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size * n_columns)
    y : numpy.ndarray, shape (n_samples, 2 * num_tickers)
        For each ticker in sorted level-0 order: Open, then Close.
    num_tickers : int

    Raises
    ------
    ValueError
        If `window_size` or `target_shift` is smaller than 1.
    KeyError
        If a ticker lacks an 'Open' or 'Close' column.
    """
    if window_size < 1 or target_shift < 1:
        raise ValueError("window_size and target_shift must both be >= 1")

    # Ignore level values that no longer occur in any column (e.g. after slicing).
    tickers = df.columns.remove_unused_levels().levels[0]
    num_tickers = len(tickers)

    # Resolve the target columns once instead of on every loop iteration.
    target_columns = [
        (ticker, field) for ticker in tickers for field in ('Open', 'Close')
    ]
    target_values = df.loc[:, target_columns].to_numpy()
    feature_values = df.to_numpy()

    X, y = [], []
    # The target row is i + target_shift - 1, so the last valid i is
    # len(df) - target_shift; the "+ 1" keeps that final sample in the range.
    for i in range(window_size, len(df) - target_shift + 1):
        X.append(feature_values[i - window_size:i].flatten())
        y.append(target_values[i + target_shift - 1])

    if not X:
        # Too few rows for even one window: return empty arrays that still
        # have the expected number of columns.
        return (
            np.empty((0, window_size * df.shape[1])),
            np.empty((0, 2 * num_tickers)),
            num_tickers,
        )

    return np.array(X), np.array(y), num_tickers

# Function to train and evaluate a model for a given window size
def evaluate_window_size(df, window_size):
    """Train an MLP for one window size and score it on the validation slice.

    The samples produced by `create_rolling_window_data` are split in order:
    the first 60% are used for training and the next 20% for validation. The
    remaining samples are left untouched here, so this function never looks
    at the test slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed straight to `create_rolling_window_data`.
    window_size : int
        Number of rows per input window.

    Returns
    -------
    val_mpe : float
        Mean percentage error on the validation slice, in percent. It is
        signed, so positive and negative errors can cancel each other.
    X, y : numpy.ndarray
        The full (unsplit, unscaled) feature and target arrays.
    num_tickers : int
    """
    X, y, num_tickers = create_rolling_window_data(df, window_size=window_size)

    # Split boundaries (60% train, 20% val; the trailing rest is the test slice)
    total_size = len(X)
    train_size = int(0.6 * total_size)
    val_end = train_size + int(0.2 * total_size)

    X_train, y_train = X[:train_size], y[:train_size]
    X_val, y_val = X[train_size:val_end], y[train_size:val_end]

    # Normalize with statistics from the training slice only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Build and train model
    model = Sequential([
        Dense(512, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dense(256, activation='relu'),
        Dense(num_tickers * 2)
    ])
    model.compile(optimizer='adam', loss='mse')
    model.fit(X_train_scaled, y_train, validation_data=(X_val_scaled, y_val),
              epochs=50, batch_size=32, verbose=0)  # Silent training

    # Evaluate on validation set; the 1e-10 term guards against division by zero
    y_val_pred = model.predict(X_val_scaled, verbose=0)
    val_mpe = np.mean((y_val - y_val_pred) / (y_val + 1e-10)) * 100
    return val_mpe, X, y, num_tickers

# NOTE(review): StandardScaler, Sequential and Dense are not imported in any cell
# visible in this notebook - confirm an import cell provides them before a fresh
# "Run All".

# Rows containing NaN are dropped before windowing: the split cell adds a
# log_return column whose first row is NaN, and a NaN feature would carry
# through scaling into training.
model_df = stocks_df_leveled.dropna()

# Test window sizes and find the best
window_sizes = range(5, 21)  # Test 5 to 20 days
mpe_scores = {}
for window_size in window_sizes:
    val_mpe, X, y, num_tickers = evaluate_window_size(model_df, window_size)
    mpe_scores[window_size] = val_mpe
    print(f"Window Size {window_size}: Validation MPE = {val_mpe:.4f}%")

# MPE is signed, so "best" means closest to zero; a plain min() would instead
# reward the most negative error.
best_window_size = min(mpe_scores, key=lambda size: abs(mpe_scores[size]))
print(f"Best Window Size: {best_window_size} with Validation MPE: {mpe_scores[best_window_size]:.4f}%")

# Final model with best window size
X, y, num_tickers = create_rolling_window_data(model_df, window_size=best_window_size)
total_size = len(X)
train_size = int(0.6 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size

X_train, y_train = X[:train_size], y[:train_size]
X_val, y_val = X[train_size:train_size + val_size], y[train_size:train_size + val_size]
X_test, y_test = X[train_size + val_size:], y[train_size + val_size:]

# Scaler statistics come from the training slice only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(256, activation='relu'),
    Dense(num_tickers * 2)
])
model.compile(optimizer='adam', loss='mse')
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=50,
    batch_size=32,
    verbose=1
)

# Evaluate with MPE
y_pred = model.predict(X_test_scaled)
mpe = np.mean((y_test - y_pred) / (y_test + 1e-10)) * 100
print(f"Test Mean Percentage Error (MPE): {mpe:.4f}%")

# Directional accuracy
actual_close = y_test[:, 1::2]  # targets are [Open, Close] per ticker
predicted_close = y_pred[:, 1::2]

# Each X row is the window flattened row by row, so the last day of the window
# starts at offset (best_window_size - 1) * n_columns. The Close offsets are
# looked up from the frame's real column order rather than a hard-coded feature
# list, which did not match the frame's actual columns.
n_columns = model_df.shape[1]
column_position = {column: pos for pos, column in enumerate(model_df.columns)}
positions = [
    (best_window_size - 1) * n_columns + column_position[(ticker, 'Close')]
    for ticker in model_df.columns.levels[0]
]
last_close = X_test[:, positions]
actual_change = actual_close - last_close
predicted_change = predicted_close - last_close
directional_accuracy = np.mean(np.sign(actual_change) == np.sign(predicted_change))
print(f"Directional Accuracy for Close Prices: {directional_accuracy:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 5: Validation MPE = 24.3605%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 6: Validation MPE = 25.4227%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 7: Validation MPE = 24.4748%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 8: Validation MPE = 25.6162%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 9: Validation MPE = 25.2775%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 10: Validation MPE = 25.5835%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 11: Validation MPE = 24.7604%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 12: Validation MPE = 25.0203%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 13: Validation MPE = 25.2813%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 14: Validation MPE = 25.2146%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 15: Validation MPE = 24.3716%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 16: Validation MPE = 25.1255%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 17: Validation MPE = 24.8159%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 18: Validation MPE = 25.2200%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 19: Validation MPE = 24.7027%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 20: Validation MPE = 24.6962%
Best Window Size: 5 with Validation MPE: 24.3605%
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 6173.6865 - val_loss: 16180.3271
Epoch 2/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 7760.9883 - val_loss: 16158.6445
Epoch 3/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 7966.4126 - val_loss: 16135.2588
Epoch 4/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 7821.6406 - val_loss: 16108.7266
Epoch 5/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 7789.2051 - val_loss: 16076.6318
Epoch 6/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 7881.1211 - val_loss: 16036.3887
Epoch 7/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 7868.3691 - val_loss: 15985.8799
Epoch 8/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 7704.4951 - val_loss: 15922.5547
Epoch 9/50