In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam

In [2]:
# Load data
df = pd.read_csv("filtered_stocks_combined.csv")
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['ticker', 'Date'])

# Create target variable (1 if current close > previous close)
df['prev_close'] = df.groupby('ticker')['Close'].shift(1)
df['target'] = (df['Close'] > df['prev_close']).astype(int)
df = df.dropna(subset=['prev_close'])

# Features to use
features = ['Open', 'High', 'Low', 'Volume']

In [3]:
def create_sequences(data, features, window_size=14):
    """
    data: DataFrame containing columns 'ticker', 'Date', <features>, 'target'
    features: list of columns to be used as model features
    window_size: number of days to include in each sequence
    
    Returns:
        X: 3D NumPy array of shape (num_samples, window_size, num_features)
        y: 1D NumPy array of shape (num_samples,)
        groups: a list (or array) storing the group/ticker for each sample
    """
    X, y, groups = [], [], []
    
    # Group data by ticker
    for ticker, group in data.groupby('ticker'):
        group = group.sort_values('Date')
        
        # We only iterate up to len(group) - window_size
        for i in range(len(group) - window_size):
            seq_features = group.iloc[i : i + window_size][features].values
            seq_target = group.iloc[i + window_size]['target']  # target is the label AFTER the 14th day
            X.append(seq_features)
            y.append(seq_target)
            groups.append(ticker)
    
    return np.array(X), np.array(y), np.array(groups)

In [4]:
window_size = 10
X_all, y_all, all_groups = create_sequences(df, features, window_size=window_size)

print("X_all shape:", X_all.shape)  # (num_samples, 14, num_features)
print("y_all shape:", y_all.shape)  # (num_samples,)
print("Num tickers in sequences:", len(np.unique(all_groups)))

X_all shape: (74950, 10, 4)
y_all shape: (74950,)
Num tickers in sequences: 50


In [5]:
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X_all, y_all, groups=all_groups))

X_train, X_test = X_all[train_idx], X_all[test_idx]
y_train, y_test = y_all[train_idx], y_all[test_idx]

In [6]:
n_train_samples, seq_len, n_features = X_train.shape

# Reshape to 2D for fitting
X_train_2d = X_train.reshape(-1, n_features)  # shape: (n_train_samples*seq_len, n_features)
scaler = StandardScaler()
X_train_2d_scaled = scaler.fit_transform(X_train_2d)

# Reshape back to 3D
X_train_scaled = X_train_2d_scaled.reshape(n_train_samples, seq_len, n_features)

# Now do the same for X_test
n_test_samples = X_test.shape[0]
X_test_2d = X_test.reshape(-1, n_features)
X_test_2d_scaled = scaler.transform(X_test_2d)
X_test_scaled = X_test_2d_scaled.reshape(n_test_samples, seq_len, n_features)

In [7]:
model = Sequential()
model.add(SimpleRNN(32, input_shape=(window_size, n_features), return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=32,
    verbose=1
)

Epoch 1/10


  super().__init__(**kwargs)


[1m1687/1687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.5059 - loss: 0.6967 - val_accuracy: 0.5162 - val_loss: 0.6933
Epoch 2/10
[1m1687/1687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5109 - loss: 0.6937 - val_accuracy: 0.5123 - val_loss: 0.6931
Epoch 3/10
[1m1687/1687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5131 - loss: 0.6932 - val_accuracy: 0.5055 - val_loss: 0.6936
Epoch 4/10
[1m1687/1687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5114 - loss: 0.6933 - val_accuracy: 0.5120 - val_loss: 0.6928
Epoch 5/10
[1m1687/1687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5147 - loss: 0.6931 - val_accuracy: 0.5128 - val_loss: 0.6929
Epoch 6/10
[1m1687/1687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.5181 - loss: 0.6928 - val_accuracy: 0.5137 - val_loss: 0.6927
Epoch 7/10
[1m1687/1687[0

In [8]:
# Predict probabilities
y_proba = model.predict(X_test_scaled).ravel()  # shape: (n_test_samples,)
y_pred = (y_proba >= 0.5).astype(int)

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC: {auc:.4f}\n")

print("Confusion Matrix:")
print(pd.DataFrame(cm,
                   index=['Actual Down', 'Actual Up'],
                   columns=['Predicted Down', 'Predicted Up']))

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 0.5121
F1 Score: 0.6648
AUC-ROC: 0.5082

Confusion Matrix:
             Predicted Down  Predicted Up
Actual Down             426          6857
Actual Up               456          7251


In [9]:
# # Load and prepare data
# df = pd.read_csv("filtered_stocks_combined.csv")
# df['Date'] = pd.to_datetime(df['Date'])
# df = df.sort_values(['ticker', 'Date'])

# # Create target variable (1 if current close > previous close)
# df['prev_close'] = df.groupby('ticker')['Close'].shift(1)
# df['target'] = (df['Close'] > df['prev_close']).astype(int)
# df = df.dropna(subset=['prev_close'])

# # Feature engineering
# features = ['Open', 'High', 'Low', 'Volume']
# X = df[features]
# y = df['target']

# # Time-aware split preserving ticker groups
# splitter = GroupShuffleSplit(test_size=0.2, random_state=42)
# train_idx, test_idx = next(splitter.split(X, y, groups=df['ticker']))

# X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
# y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# # Feature scaling
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)