**Import libraries**

In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout



**Dataset**

In [2]:
df = pd.read_csv("C:/Users/ashua/Desktop/Stock Market Recommendation System/data/final_processed_data.csv")

print(df.shape)
print(df.columns)

(2445, 20)
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Return', 'SMA_20', 'SMA_50', 'Volatility', 'RSI_14',
       'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Lower', 'BB_Width',
       'Future_Return', 'Signal'],
      dtype='object')


**Identify & drop leakage columns (NO MORE KeyError)**

In [3]:
#Feature–Target Separation (NO LEAKAGE)
TARGET = "Signal"

# Remove target, date, and any future-looking columns
drop_cols = [TARGET, "Date", "Future_Return"]

X = df.drop(columns=drop_cols, errors="ignore")
y = df[TARGET]

**Encode Signal (handles -1/0/1 or Buy/Sell/Hold)**

In [4]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

**TIME-SERIES TRAIN–TEST SPLIT**

In [5]:
split_idx = int(len(X) * 0.8)

X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]

y_train = y_encoded[:split_idx]
y_test  = y_encoded[split_idx:]

**Scale features (for LR & LSTM)**

In [6]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

**Model 1: Logistic Regression**

In [7]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_scaled, y_train)

lr_preds = lr_model.predict(X_test_scaled)

print("Logistic Regression Results")
print(classification_report(y_test, lr_preds))

Logistic Regression Results
              precision    recall  f1-score   support

           0       0.41      0.38      0.39       220
           1       0.52      0.55      0.53       269

    accuracy                           0.47       489
   macro avg       0.46      0.46      0.46       489
weighted avg       0.47      0.47      0.47       489



**Model 2: Random Forest**

In [8]:
rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

print("Random Forest Results")
print(classification_report(y_test, rf_preds))

Random Forest Results
              precision    recall  f1-score   support

           0       0.44      0.88      0.59       220
           1       0.47      0.09      0.15       269

    accuracy                           0.44       489
   macro avg       0.46      0.48      0.37       489
weighted avg       0.46      0.44      0.35       489



**Model 3: XGBoost (auto binary/multiclass)**

In [9]:
xgb_model = xgb.XGBClassifier(
    eval_metric="logloss",
    random_state=42
)

xgb_model.fit(X_train_scaled, y_train)
xgb_preds = xgb_model.predict(X_test_scaled)

print("XGBoost Results")
print(classification_report(y_test, xgb_preds))


XGBoost Results
              precision    recall  f1-score   support

           0       0.45      0.81      0.58       220
           1       0.55      0.19      0.28       269

    accuracy                           0.47       489
   macro avg       0.50      0.50      0.43       489
weighted avg       0.50      0.47      0.42       489



**Model 4: LSTM (safe version, timesteps=1)**

In [10]:
def create_sequences(X, y, lookback=30):
    Xs, ys = [], []
    for i in range(lookback, len(X)):
        Xs.append(X[i-lookback:i])
        ys.append(y[i])
    return np.array(Xs), np.array(ys)

X_train_lstm, y_train_lstm = create_sequences(X_train_scaled, y_train, 30)
X_test_lstm, y_test_lstm   = create_sequences(X_test_scaled, y_test, 30)

In [11]:
lstm_model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(16, activation="relu"),
    Dense(len(le.classes_), activation="softmax")
])

lstm_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

history = lstm_model.fit(
    X_train_lstm,
    y_train_lstm,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_lstm, y_test_lstm),
    verbose=1
)


Epoch 1/20


  super().__init__(**kwargs)


[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.5296 - loss: 0.6935 - val_accuracy: 0.4793 - val_loss: 0.7138
Epoch 2/20
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.5384 - loss: 0.6923 - val_accuracy: 0.5076 - val_loss: 0.6940
Epoch 3/20
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.5436 - loss: 0.6908 - val_accuracy: 0.5599 - val_loss: 0.6924
Epoch 4/20
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.5291 - loss: 0.6932 - val_accuracy: 0.5142 - val_loss: 0.6926
Epoch 5/20
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.5265 - loss: 0.6898 - val_accuracy: 0.5011 - val_loss: 0.6943
Epoch 6/20
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.5571 - loss: 0.6884 - val_accuracy: 0.5425 - val_loss: 0.6890
Epoch 7/20
[1m61/61[0m [32m━━━━━━━━━━━━━━━

**Final comparison table**

In [16]:
lr_acc = accuracy_score(y_test, lr_preds)
lr_f1  = f1_score(y_test, lr_preds, average="weighted")

rf_acc = accuracy_score(y_test, rf_preds)
rf_f1  = f1_score(y_test, rf_preds, average="weighted")

xgb_acc = accuracy_score(y_test, xgb_preds)
xgb_f1  = f1_score(y_test, xgb_preds, average="weighted")

lstm_preds = np.argmax(lstm_model.predict(X_test_lstm), axis=1)

lstm_acc = accuracy_score(y_test_lstm, lstm_preds)
lstm_f1  = f1_score(y_test_lstm, lstm_preds, average="weighted")

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step


In [17]:
results = pd.DataFrame({
    "Model": [
        "Logistic Regression",
        "Random Forest",
        "XGBoost",
        "LSTM"
    ],
    "Accuracy": [
        lr_acc,
        rf_acc,
        xgb_acc,
        lstm_acc
    ],
    "F1 Score": [
        lr_f1,
        rf_f1,
        xgb_f1,
        lstm_f1
    ]
}).sort_values(by="F1 Score", ascending=False)

results


Unnamed: 0,Model,Accuracy,F1 Score
0,Logistic Regression,0.472393,0.470057
2,XGBoost,0.468303,0.415007
1,Random Forest,0.443763,0.346437
3,LSTM,0.433551,0.342201


In [18]:
final_test_df = df.iloc[split_idx:].copy()
final_test_df["Predicted_Signal"] = lr_preds

final_test_df.to_csv("C:/Users/ashua/Desktop/Stock Market Recommendation System/data/backtest_input.csv")

print("Saved Logistic Regression predictions for backtesting")

Saved Logistic Regression predictions for backtesting
