In [1]:
import pandas as pd
import numpy as np
import os
import time
from sklearn.model_selection import train_test_split


In [2]:
combined_stocks_df = pd.read_csv("filtered_stocks_combined.csv")

combined_stocks_df

Unnamed: 0,Date,index,Open,High,Low,Close,Volume,OpenInt,ticker
0,2011-01-03,45725,41.600,42.293,41.600,42.204,123930383,0,AAPL
1,2011-01-04,45726,42.579,42.579,42.023,42.426,86135637,0,AAPL
2,2011-01-05,45727,42.260,42.817,42.196,42.772,70669988,0,AAPL
3,2011-01-06,45728,42.902,42.932,42.632,42.739,83619699,0,AAPL
4,2011-01-07,45729,42.787,43.073,42.503,43.045,86506108,0,AAPL
...,...,...,...,...,...,...,...,...,...
75495,2016-12-23,14745947,87.556,87.625,87.201,87.383,4428429,0,XOM
75496,2016-12-27,14745948,87.499,87.768,87.257,87.423,5100402,0,XOM
75497,2016-12-28,14745949,87.354,87.768,86.949,86.989,6834213,0,XOM
75498,2016-12-29,14745950,86.797,87.277,86.728,87.036,6938299,0,XOM


In [3]:
# Convert Date to datetime and set as index
combined_stocks_df["Date"] = pd.to_datetime(combined_stocks_df["Date"])
combined_stocks_df.set_index("Date", inplace=True)

# Drop unnecessary columns if any
combined_stocks_df = combined_stocks_df.drop(columns=["index"])  # Optional

# Pivot to multi-level columns: Ticker as level 1, feature as level 2
stocks_df = combined_stocks_df.pivot_table(
    index=combined_stocks_df.index,
    columns="ticker",
    values=[col for col in combined_stocks_df.columns if col != "ticker"]
)

# Sort columns for clarity
stocks_df = stocks_df.sort_index(axis=1, level=0)

# Swap the column MultiIndex levels
stocks_df_leveled = stocks_df.swaplevel(axis=1)

# Sort by ticker (Level 0)
stocks_df_leveled = stocks_df_leveled.sort_index(axis=1, level=0)

# Preview the new structure
stocks_df_leveled.head()

ticker,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,ABT,ABT,ABT,ABT,...,WMT,WMT,WMT,WMT,XOM,XOM,XOM,XOM,XOM,XOM
Unnamed: 0_level_1,Close,High,Low,Open,OpenInt,Volume,Close,High,Low,Open,...,Low,Open,OpenInt,Volume,Close,High,Low,Open,OpenInt,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2011-01-03,42.204,42.293,41.6,41.6,0.0,123930383.0,19.358,19.549,19.338,19.549,...,46.027,46.089,0.0,16789275.0,60.341,60.535,59.604,59.668,0.0,28807445.0
2011-01-04,42.426,42.579,42.023,42.579,0.0,86135637.0,19.54,19.561,19.35,19.448,...,46.139,46.419,0.0,14296931.0,60.625,60.665,60.235,60.469,0.0,24744869.0
2011-01-05,42.772,42.817,42.196,42.26,0.0,70669988.0,19.54,19.764,19.46,19.52,...,46.174,46.539,0.0,16723328.0,60.462,60.6,60.009,60.43,0.0,20448359.0
2011-01-06,42.739,42.932,42.632,42.902,0.0,83619699.0,19.5,19.682,19.342,19.634,...,45.712,46.166,0.0,18335156.0,60.853,61.052,60.439,60.625,0.0,27829692.0
2011-01-07,43.045,43.073,42.503,42.787,0.0,86506108.0,19.582,19.615,19.444,19.489,...,45.698,45.81,0.0,9374462.0,61.182,61.431,60.777,60.876,0.0,23838996.0


In [4]:
# 3. Time-based split into 60% train, 20% val, 20% test
train_dict, val_dict, test_dict = {}, {}, {}

for ticker in stocks_df_leveled.columns.levels[0]:
    stocks_df_leveled.loc[:, (ticker, 'log_return')] = np.log(
    stocks_df_leveled[ticker]['Close'] / stocks_df_leveled[ticker]['Close'].shift(1)
)

    df = stocks_df_leveled[ticker].dropna().sort_index()
    total_len = len(df)
    train_end = int(total_len * 0.6)
    val_end = train_end + int(total_len * 0.2)

    train_dict[ticker] = df.iloc[:train_end]
    val_dict[ticker] = df.iloc[train_end:val_end]
    test_dict[ticker] = df.iloc[val_end:]

train_df = pd.concat(train_dict, names=["Ticker", "Date"])
val_df = pd.concat(val_dict, names=["Ticker", "Date"])
test_df = pd.concat(test_dict, names=["Ticker", "Date"])

train_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,High,Low,Open,OpenInt,Volume,log_return
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAPL,2011-01-04,42.426,42.579,42.023,42.579,0.0,86135637.0,0.005246
AAPL,2011-01-05,42.772,42.817,42.196,42.260,0.0,70669988.0,0.008122
AAPL,2011-01-06,42.739,42.932,42.632,42.902,0.0,83619699.0,-0.000772
AAPL,2011-01-07,43.045,43.073,42.503,42.787,0.0,86506108.0,0.007134
AAPL,2011-01-10,43.855,43.956,43.179,43.393,0.0,124888228.0,0.018643
...,...,...,...,...,...,...,...,...
XOM,2014-08-04,88.145,88.411,86.542,86.921,0.0,13804459.0,0.013374
XOM,2014-08-05,86.445,87.625,86.077,87.492,0.0,14847864.0,-0.019475
XOM,2014-08-06,87.133,87.723,86.445,86.445,0.0,11264688.0,0.007927
XOM,2014-08-07,86.507,87.889,85.944,87.828,0.0,11379351.0,-0.007210


## LSTM Model

In [9]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [10]:
import tensorflow as tf
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

In [6]:
# Function to create sequences for LSTM input
def create_sequences(df, seq_length=10):
    X, y = [], []
    for i in range(len(df) - seq_length):
        X.append(df.iloc[i:i+seq_length, :-1].values)  # Features (excluding target)
        y.append(1 if df.iloc[i+seq_length, -1] > 0 else 0)  # Label: 1 (Long) or 0 (Short)
    return np.array(X), np.array(y)

# Select relevant columns for modeling
features = ["Close", "High", "Low", "Open", "Volume", "log_return"]
target = ["log_return"]

# Normalize features
scaler = StandardScaler()
train_df[features] = scaler.fit_transform(train_df[features])
val_df[features] = scaler.transform(val_df[features])
test_df[features] = scaler.transform(test_df[features])

# Create sequences
seq_length = 10  # Use past 10 days to predict next day
X_train, y_train = create_sequences(train_df[features + target], seq_length)
X_val, y_val = create_sequences(val_df[features + target], seq_length)
X_test, y_test = create_sequences(test_df[features + target], seq_length)

# Shuffle training data
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Build LSTM model
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(seq_length, len(features))),
    Dropout(0.2),
    LSTM(32, return_sequences=False),
    Dropout(0.2),
    Dense(16, activation="relu"),
    Dense(1, activation="sigmoid")  # Binary classification: Long (1) / Short (0)
])

# Compile model
model.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

# Train model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

# Evaluate model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

# Predictions
y_pred = (model.predict(X_test) > 0.5).astype(int)

Epoch 1/20


  super().__init__(**kwargs)


[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 15ms/step - accuracy: 0.5080 - loss: 0.6940 - val_accuracy: 0.5009 - val_loss: 0.6935
Epoch 2/20
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 14ms/step - accuracy: 0.5144 - loss: 0.6929 - val_accuracy: 0.4995 - val_loss: 0.6934
Epoch 3/20
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 21ms/step - accuracy: 0.5086 - loss: 0.6929 - val_accuracy: 0.5047 - val_loss: 0.6932
Epoch 4/20
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 14ms/step - accuracy: 0.5123 - loss: 0.6929 - val_accuracy: 0.5005 - val_loss: 0.6932
Epoch 5/20
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 14ms/step - accuracy: 0.5105 - loss: 0.6926 - val_accuracy: 0.5078 - val_loss: 0.6927
Epoch 6/20
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 13ms/step - accuracy: 0.5150 - loss: 0.6924 - val_accuracy: 0.5064 - val_loss: 0.6932
Epoch 7/20
[1m

In [12]:
# Select features and target
features = ["Close", "High", "Low", "Open", "Volume", "log_return"]
target = ["log_return"]

# Normalize features
scaler = StandardScaler()
train_df[features] = scaler.fit_transform(train_df[features])
val_df[features] = scaler.transform(val_df[features])
test_df[features] = scaler.transform(test_df[features])

# Create sequences
seq_length = 10
X_train, y_train = create_sequences(train_df[features + target], seq_length)
X_val, y_val = create_sequences(val_df[features + target], seq_length)
X_test, y_test = create_sequences(test_df[features + target], seq_length)

# Shuffle training data
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Define model builder function for Keras Tuner
def build_model(hp):
    model = Sequential()
    model.add(LSTM(
        units=hp.Int("lstm_units_1", min_value=32, max_value=128, step=32),
        return_sequences=True, input_shape=(seq_length, len(features))
    ))
    model.add(Dropout(hp.Float("dropout_1", 0.1, 0.5, step=0.1)))

    model.add(LSTM(
        units=hp.Int("lstm_units_2", min_value=16, max_value=64, step=16),
        return_sequences=False
    ))
    model.add(Dropout(hp.Float("dropout_2", 0.1, 0.5, step=0.1)))

    model.add(Dense(
        units=hp.Int("dense_units", min_value=8, max_value=32, step=8),
        activation="relu"
    ))

    model.add(Dense(1, activation="sigmoid"))  # Binary classification

    model.compile(
        loss="binary_crossentropy",
        optimizer=Adam(hp.Choice("learning_rate", [0.001, 0.0005, 0.0001])),
        metrics=["accuracy"]
    )

    return model

# Initialize Keras Tuner
tuner = kt.Hyperband(
    build_model,
    objective="val_accuracy",
    max_epochs=20,
    factor=3,
    directory="kt_lstm_tuning",
    project_name="lstm_stock_prediction"
)

# Perform hyperparameter search
tuner.search(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)

# Get the best model
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build and train the final model
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

# Evaluate on test data
test_loss, test_acc = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

# Predictions
y_pred = (best_model.predict(X_test) > 0.5).astype(int)

# Calculate model accuracy based on actual next day Open & Close prices
def calculate_accuracy(test_df, y_pred, seq_length):
    correct_predictions = 0
    total_samples = len(y_pred)

    for i in range(seq_length, len(test_df) - 1):
        actual_position = 1 if test_df.iloc[i + 1]['Close'] > test_df.iloc[i + 1]['Open'] else 0
        predicted_position = y_pred[i - seq_length]
        if actual_position == predicted_position:
            correct_predictions += 1

    return correct_predictions / total_samples

# Compute and print accuracy
final_accuracy = calculate_accuracy(test_df, y_pred, seq_length)
print(f"Final Model Accuracy: {final_accuracy:.2f}")

Trial 30 Complete [00h 11m 07s]
val_accuracy: 0.5132313966751099

Best val_accuracy So Far: 0.519481360912323
Total elapsed time: 02h 01m 30s
Epoch 1/20
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 27ms/step - accuracy: 0.5000 - loss: 0.6948 - val_accuracy: 0.4938 - val_loss: 0.6938
Epoch 2/20
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 26ms/step - accuracy: 0.5037 - loss: 0.6935 - val_accuracy: 0.4970 - val_loss: 0.6934
Epoch 3/20
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 26ms/step - accuracy: 0.5087 - loss: 0.6931 - val_accuracy: 0.5066 - val_loss: 0.6929
Epoch 4/20
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 27ms/step - accuracy: 0.5124 - loss: 0.6926 - val_accuracy: 0.5095 - val_loss: 0.6930
Epoch 5/20
[1m1414/1414[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 27ms/step - accuracy: 0.5088 - loss: 0.6932 - val_accuracy: 0.5116 - val_loss: 0.6924
Epoch 6/20
[1m1414/1414[0m