In [6]:
!pip -q install xgboost


**Import libraries**

In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout



**Dataset**

In [10]:
# Load the pre-processed price/indicator table from its hard-coded location.
# No date parsing is requested, so the Date column stays a plain string column.
df = pd.read_csv("/content/final_processed_data.csv")

# Sanity check: table dimensions and (at most) the first 30 column names.
print("Shape:", df.shape)
print("Columns sample:", df.columns.tolist()[:30])

# Bare trailing expression so the notebook renders the first rows as a table.
df.head()



Shape: (2445, 20)
Columns sample: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits', 'Return', 'SMA_20', 'SMA_50', 'Volatility', 'RSI_14', 'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Lower', 'BB_Width', 'Future_Return', 'Signal']


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Return,SMA_20,SMA_50,Volatility,RSI_14,MACD,MACD_Signal,BB_Upper,BB_Lower,BB_Width,Future_Return,Signal
0,2016-04-06 00:00:00-04:00,24.988303,25.158322,24.754809,25.153788,105616400,0.0,0.0,0.010473,24.109527,22.81388,0.009507,70.501084,0.504108,0.409157,25.504253,22.714802,2.789451,-0.02181,0
1,2016-04-07 00:00:00-04:00,24.924822,25.031367,24.509976,24.605186,127207600,0.0,0.0,-0.02181,24.19363,22.855091,0.011196,59.50052,0.469311,0.421188,25.486253,22.901008,2.585245,0.001106,1
2,2016-04-08 00:00:00-04:00,24.689068,24.884022,24.521314,24.632395,94326800,0.0,0.0,0.001106,24.278527,22.926473,0.011188,59.500544,0.438871,0.424725,25.439245,23.117808,2.321436,0.003313,1
3,2016-04-11 00:00:00-04:00,24.702667,25.074442,24.67093,24.714001,117630000,0.0,0.0,0.003313,24.355149,22.996465,0.011061,60.527953,0.416531,0.423086,25.408305,23.301992,2.106313,0.013025,1
4,2016-04-12 00:00:00-04:00,24.786545,25.049509,24.632396,25.035908,108929200,0.0,0.0,0.013025,24.444919,23.058241,0.011272,62.093523,0.41996,0.422461,25.399489,23.490349,1.90914,0.014487,1


**Identify and drop leakage columns (only names actually present are dropped, so no KeyError)**

In [11]:
# Name of the label column that every model below is trained to predict.
TARGET = "Signal"

# Candidate spellings of a forward-looking return column. Such a column carries
# information from after the row's date, so it must not be used as a feature.
possible_future_cols = [
    "future_returns", "future_return", "Future_Return", "Future_Returns",
    "futureReturns", "returns_future", "FutureReturn"
]

# Keep only the spellings that really exist in this dataset.
found_future_cols = [name for name in possible_future_cols if name in df.columns]

# The target itself is removed from the features together with any leakage columns.
drop_cols = [TARGET, *found_future_cols]

print("Found leakage columns:", found_future_cols)
print("Dropping columns:", drop_cols)

# Feature matrix (everything except the dropped columns) and label vector.
# errors="ignore" makes the drop tolerant of names missing from the frame.
X = df.drop(columns=drop_cols, errors="ignore")
y = df[TARGET]
print("X shape:", X.shape, " | y shape:", y.shape)



Found leakage columns: ['Future_Return']
Dropping columns: ['Signal', 'Future_Return']
X shape: (2445, 18)  | y shape: (2445,)


**Encode Signal (handles -1/0/1 or Buy/Sell/Hold)**

In [12]:
# Learn the mapping from raw Signal labels to consecutive integer codes,
# then apply it to the full label vector.
le = LabelEncoder()
le.fit(y)
y_enc = le.transform(y)

# Show the original labels alongside the integer codes actually produced.
print("Signal classes:", list(le.classes_))
print("Encoded classes:", np.unique(y_enc))



Signal classes: [np.int64(0), np.int64(1)]
Encoded classes: [0 1]


**Train-test split (time-series safe, no shuffle)**

In [13]:
# Ordered hold-out: with shuffle=False the leading 80% of rows form the training
# set and the trailing 20% the test set. Assuming the rows are already sorted by
# date, no later observation is used to predict an earlier one.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    shuffle=False,
)

print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (1956, 18) Test: (489, 18)


**Scale features (for LR & LSTM)**

In [15]:
# Standardise the numeric columns only (select_dtypes discards non-numeric
# columns such as the string-typed Date). The scaler's statistics come from the
# training rows alone and are then re-applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train.select_dtypes(include=[np.number]))
X_train_scaled = scaler.transform(X_train.select_dtypes(include=[np.number]))
X_test_scaled = scaler.transform(X_test.select_dtypes(include=[np.number]))


**Model 1: Logistic Regression**

In [16]:
# Linear baseline fitted on the standardised features; max_iter is raised
# well above the default to give the solver room to converge.
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train_scaled, y_train)
pred_lr = lr.predict(X_test_scaled)

# Overall hit rate on the held-out rows.
lr_acc = accuracy_score(y_test, pred_lr)
print("Logistic Regression Accuracy:", lr_acc)

# Per-class precision/recall, with rows labelled by the original Signal values
# rather than by their encoded integers.
class_labels = list(map(str, le.classes_))
print(classification_report(y_test, pred_lr, target_names=class_labels))


Logistic Regression Accuracy: 0.4723926380368098
              precision    recall  f1-score   support

           0       0.41      0.38      0.39       220
           1       0.52      0.55      0.53       269

    accuracy                           0.47       489
   macro avg       0.46      0.46      0.46       489
weighted avg       0.47      0.47      0.47       489



**Model 2: Random Forest**

In [18]:
# The forest is trained on the raw (unscaled) numeric columns; select_dtypes
# drops non-numeric columns such as Date.
X_train_num = X_train.select_dtypes(include=[np.number])
X_test_num  = X_test.select_dtypes(include=[np.number])

# Construct the model exactly once. random_state pins the bootstrap sampling
# and feature sub-sampling so the reported accuracy is repeatable.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    random_state=42
)
rf.fit(X_train_num, y_train)

pred_rf = rf.predict(X_test_num)

rf_acc = accuracy_score(y_test, pred_rf)
print("Random Forest Accuracy:", rf_acc)
# Label the report rows with the original Signal values, matching the
# logistic regression report.
print(classification_report(y_test, pred_rf, target_names=[str(c) for c in le.classes_]))

Random Forest Accuracy: 0.4376278118609407
              precision    recall  f1-score   support

           0       0.44      0.84      0.57       220
           1       0.45      0.11      0.17       269

    accuracy                           0.44       489
   macro avg       0.44      0.47      0.37       489
weighted avg       0.45      0.44      0.35       489



**Model 3: XGBoost (auto binary/multiclass)**

In [20]:
# Numeric-only views of the split (recomputed here so the cell does not rely
# on the Random Forest cell having been run).
X_train_num = X_train.select_dtypes(include=[np.number])
X_test_num  = X_test.select_dtypes(include=[np.number])

# Number of distinct encoded labels present in the training targets.
n_classes = len(np.unique(y_train))

# Hyperparameters common to the binary and the multiclass configuration.
shared_params = dict(
    n_estimators=400, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42,
)

# Objective and evaluation metric depend on how many classes there are.
if n_classes == 2:
    task_params = dict(objective="binary:logistic", eval_metric="logloss")
else:
    task_params = dict(
        objective="multi:softmax",
        num_class=n_classes,
        eval_metric="mlogloss",
    )

xgb_model = xgb.XGBClassifier(**shared_params, **task_params)

xgb_model.fit(X_train_num, y_train)
pred_xgb = xgb_model.predict(X_test_num)

xgb_acc = accuracy_score(y_test, pred_xgb)
print("XGBoost Accuracy:", xgb_acc)
print(classification_report(y_test, pred_xgb))

XGBoost Accuracy: 0.44785276073619634
              precision    recall  f1-score   support

           0       0.44      0.80      0.57       220
           1       0.49      0.16      0.24       269

    accuracy                           0.45       489
   macro avg       0.47      0.48      0.40       489
weighted avg       0.47      0.45      0.39       489



**Model 4: LSTM (safe version, timesteps=1)**

In [21]:
# reshape to (samples, timesteps, features)
X_train_lstm = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_lstm  = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

n_classes = len(np.unique(y_train))

lstm = Sequential([
    LSTM(64, input_shape=(1, X_train_scaled.shape[1])),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dense(n_classes, activation="softmax")
])

lstm.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

history = lstm.fit(
    X_train_lstm, y_train,
    epochs=15,
    batch_size=32,
    validation_split=0.1,
    verbose=1
)

lstm_loss, lstm_acc = lstm.evaluate(X_test_lstm, y_test, verbose=0)
print("LSTM Accuracy:", lstm_acc)

  super().__init__(**kwargs)


Epoch 1/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 37ms/step - accuracy: 0.4902 - loss: 0.6940 - val_accuracy: 0.5357 - val_loss: 0.6896
Epoch 2/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.5576 - loss: 0.6854 - val_accuracy: 0.5561 - val_loss: 0.6908
Epoch 3/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5345 - loss: 0.6903 - val_accuracy: 0.5204 - val_loss: 0.6903
Epoch 4/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.5367 - loss: 0.6901 - val_accuracy: 0.5816 - val_loss: 0.6898
Epoch 5/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.5516 - loss: 0.6877 - val_accuracy: 0.5561 - val_loss: 0.6891
Epoch 6/15
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.5554 - loss: 0.6820 - val_accuracy: 0.5459 - val_loss: 0.6898
Epoch 7/15
[1m55/55[0m [32m━━━━

**Final comparison table**

In [22]:
# Test-set accuracy of each model, keyed by display name (insertion order kept).
model_scores = {
    "Logistic Regression": lr_acc,
    "Random Forest": rf_acc,
    "XGBoost": xgb_acc,
    "LSTM": lstm_acc,
}

# Two-column summary frame, ranked from best to worst accuracy.
results = pd.DataFrame({
    "Model": list(model_scores.keys()),
    "Accuracy": list(model_scores.values()),
}).sort_values("Accuracy", ascending=False)

# Bare trailing expression so the notebook renders the table.
results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.472393
3,LSTM,0.456033
2,XGBoost,0.447853
1,Random Forest,0.437628



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

