In [None]:
import sys
# Print the path of the Python interpreter that is running this kernel.
print(sys.executable)

In [None]:
import sys
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"


import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import matplotlib.dates as mdates  # For formatting x-axis with years
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
import tensorflow as tf
from keras.api.models import Sequential
from keras.api.layers import *
from keras.api.optimizers import Adam
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
def load_data(ticker, start, end):
    """Download price history via ``yf.download`` and return it with a flat row index.

    Args:
        ticker: Symbol (or collection of symbols) forwarded to ``yf.download``.
        start: Forwarded as the ``start`` argument of the download.
        end: Forwarded as the ``end`` argument of the download.

    Returns:
        The downloaded DataFrame after ``reset_index``, i.e. the former index
        is available as an ordinary column.

    Exits:
        Prints an error and calls ``sys.exit(1)`` when the download is empty.
    """
    frame = yf.download(ticker, start=start, end=end, auto_adjust=False)

    if not frame.empty:
        # Move the index into a regular column so callers can select it by name.
        return frame.reset_index()

    print(f"Error: No data retrieved for {ticker}. Check API availability.")
    sys.exit(1)  # Stop the script if data is missing
def create_sequences(X, y, dates, time_steps):
    """Slice aligned series into sliding windows with one label per window.

    For every start position ``s`` in ``range(len(X) - time_steps)`` the
    window is ``X[s:s + time_steps]`` and its label and date are the entries
    at position ``s + time_steps`` of ``y`` and ``dates``.

    Args:
        X: Indexable/sliceable sequence of feature rows.
        y: Indexable sequence of labels, aligned with ``X``.
        dates: Indexable sequence of dates, aligned with ``X``.
        time_steps: Number of consecutive rows in each window.

    Returns:
        Tuple ``(windows, labels, label_dates)`` of NumPy arrays. All three
        are empty when ``len(X) <= time_steps``.
    """
    starts = range(len(X) - time_steps)
    windows = [X[s:s + time_steps] for s in starts]
    labels = [y[s + time_steps] for s in starts]
    label_dates = [dates[s + time_steps] for s in starts]  # date of each label row
    return np.array(windows), np.array(labels), np.array(label_dates)


In [None]:
# --- Download ---
# main_ticker is the series being modelled; `tickers` supply auxiliary feature columns.
main_ticker = "^GSPC"
tickers = ["AAPL", "MSFT", "AMZN", "GOOG", "TSLA", "^IRX", "^VIX", "^TNX"]
TRAIN_START = "2015-01-01"
TRAIN_END = "2025-02-01"
data = load_data(tickers, TRAIN_START, TRAIN_END)

main_data = load_data(main_ticker, TRAIN_START, TRAIN_END)
# Label = the next row's adjusted close. The last row has no successor, so its
# Target is NaN and the row is removed by the dropna() at the end of this cell.
main_data["Target"] = main_data["Adj Close"].shift(-1)
data_adj_close = data["Adj Close"]
# Print the structure of the dataset
print(data_adj_close.head())

# load_data() resets the index, so both frames carry a positional 0..n-1 index.
# Assigning the auxiliary columns directly would pair rows by position: if the
# two downloads differ by even one row, every later row is silently shifted.
# Re-key the auxiliary prices by Date and look them up by the main frame's dates.
aux_dates = pd.to_datetime(np.asarray(data["Date"]).ravel())
main_dates = pd.to_datetime(np.asarray(main_data["Date"]).ravel())
aux_by_date = data_adj_close.set_axis(aux_dates, axis=0)
for ticker in tickers:
    # Dates missing from the auxiliary download become NaN (dropped below).
    main_data[ticker] = aux_by_date[ticker].reindex(main_dates).to_numpy()

# From here on `data` is the modelling frame (same object as main_data).
data = main_data

# --- Technical Indicators ---
# Simple moving averages of the adjusted close over 10 and 50 rows.
data['SMA10'] = data['Adj Close'].rolling(window=10).mean()
data['SMA50'] = data['Adj Close'].rolling(window=50).mean()

# RSI (14-day), using plain rolling means of gains and losses.
delta = data['Adj Close'].diff()
gain = delta.clip(lower=0)    # positive moves, zero otherwise
loss = -delta.clip(upper=0)   # magnitude of negative moves, zero otherwise
avg_gain = gain.rolling(window=14, min_periods=14).mean()
avg_loss = loss.rolling(window=14, min_periods=14).mean()
rs = avg_gain / avg_loss
data['RSI'] = 100 - (100 / (1 + rs))

# MACD: difference between the 12- and 26-span exponential moving averages.
data['EMA12'] = data['Adj Close'].ewm(span=12, adjust=False).mean()
data['EMA26'] = data['Adj Close'].ewm(span=26, adjust=False).mean()
data['MACD'] = data['EMA12'] - data['EMA26']

# Volatility (20-day standard deviation of row-over-row returns)
data['Returns'] = data['Adj Close'].pct_change()
data['Volatility'] = data['Returns'].rolling(window=20).std()

# Drop every row that has a NaN in any column: the indicator warm-up rows,
# the final row without a Target, and rows with missing auxiliary prices.
data.dropna(inplace=True)


In [None]:


# ---------------------------
# 1. Build the feature matrix
# ---------------------------

# If MultiIndex, flatten it so every column has a single string name
# (levels are joined with "_", e.g. ("Date", "") becomes "Date_").
if isinstance(data.columns, pd.MultiIndex):
    data.columns = ['_'.join(col).strip() for col in data.columns.values]  # Convert MultiIndex to single level

# Check if 'Adj Close' is renamed
print("Available columns in data after flattening:", data.columns)

# Locate the adjusted-close column under whatever name flattening gave it.
adj_close_col = next((col for col in data.columns if "Adj Close" in col), None)
if adj_close_col is None:
    raise KeyError("Could not find 'Adj Close' in data. Check available columns.")

# Candidate features are chosen AFTER flattening so the names match `data`.
# Excluded by substring so both flat and flattened names are caught:
#   * "Target"    - the next-row price; keeping it would let feature selection
#                   (and the model fed from it) see the future.
#   * "Adj Close" - used as `y` below.
#   * "Date"      - not a numeric feature; MinMaxScaler cannot scale it.
features = [
    col for col in data.columns
    if "Target" not in col and "Adj Close" not in col and "Date" not in col
]

# NOTE(review): RFE ranks features against the same-row adjusted close, while
# the later model is trained on the shifted Target column - confirm that this
# is the intended selection criterion.
y = data[adj_close_col].values.ravel()
# Select features (excluding non-numeric columns)
X = data[features].select_dtypes(include="number")

# Chronological split first (no shuffling), so the scalers below only ever
# see the training portion.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Normalize features: fit on the training rows, apply to both parts.
scaler_X = MinMaxScaler()
X_train = scaler_X.fit_transform(X_train_raw)
X_test = scaler_X.transform(X_test_raw)

# Normalize target variable y (reshape required for a single feature).
scaler_y = MinMaxScaler()
y_train = scaler_y.fit_transform(y_train_raw.reshape(-1, 1)).ravel()
y_test = scaler_y.transform(y_test_raw.reshape(-1, 1)).ravel()

# Full-length scaled arrays, kept under their previous names.
X_scaled = scaler_X.transform(X)
y_scaled = scaler_y.transform(y.reshape(-1, 1)).ravel()

# ---------------------------
# 2. Feature Selection using RFE
# ---------------------------

# Define the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Use RFE to select the top 10 features
rfe = RFE(model, n_features_to_select=10)
X_train_selected = rfe.fit_transform(X_train, y_train)
X_test_selected = rfe.transform(X_test)


print(rfe)
# Print selected features (rfe.support_ is a boolean mask over X's columns)
selected_features = np.array(X.columns)[rfe.support_]

print(f"Selected Features: {selected_features}")

In [None]:

# ---------------------------
# 3. LSTM training and evaluation
# ---------------------------

# Seed Python, NumPy and TensorFlow so a re-run reproduces the same weights.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

time_steps = 90      # rows per input window
VAL_FRACTION = 0.1   # tail of the training windows held out for early stopping

target_col = next((col for col in data.columns if "Target" in col), None)
if target_col is None:
    raise KeyError("Could not find 'Target' in data. Check available columns.")

date_col = next((col for col in data.columns if "Date" in col), None)
if date_col is None:
    raise KeyError("Could not find 'Date' column in data. Check available columns.")

# Never feed the label column back in as an input, whatever feature selection
# returned: it holds the next row's price.
feature_cols = [col for col in selected_features if col != target_col]
n_features = len(feature_cols)

X_raw = data[feature_cols].values
y_raw = data[target_col].values.reshape(-1, 1)
dates = pd.to_datetime(data[date_col]).values

# Window the UNSCALED data, then split chronologically, then scale - so the
# scalers are fitted on training windows only.
# NOTE(review): create_sequences labels each window with y[i + time_steps].
# Because Target is already the adjusted close shifted by -1, the label
# appears to sit one row after the date stored in dates_seq (two rows after
# the window's last row) - confirm that this horizon is intended.
X_seq_raw, y_seq_raw, dates_seq = create_sequences(X_raw, y_raw, dates, time_steps)

X_train_raw, X_test_raw, y_train_raw, y_test_raw, dates_train, dates_test = train_test_split(
    X_seq_raw, y_seq_raw, dates_seq, test_size=0.2, shuffle=False
)

# MinMaxScaler works on 2-D input: flatten (windows, steps, features) to
# (windows * steps, features), scale, and restore the original shape.
scaler_X = MinMaxScaler()
scaler_X.fit(X_train_raw.reshape(-1, n_features))
X_train = scaler_X.transform(X_train_raw.reshape(-1, n_features)).reshape(X_train_raw.shape)
X_test = scaler_X.transform(X_test_raw.reshape(-1, n_features)).reshape(X_test_raw.shape)

scaler_y = MinMaxScaler()
y_train = scaler_y.fit_transform(y_train_raw)
y_test = scaler_y.transform(y_test_raw)

model = Sequential([
    Input(shape=(time_steps, n_features)),
    LSTM(64, return_sequences=True, activation='tanh'),
    LSTM(32, return_sequences=False, activation='tanh'),
    Dropout(0.2),
    Dense(1)
])


model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Early Stopping callback (stops training if validation loss doesn't improve for 20 epochs)
early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=1)

# Training parameters
epochs = 100
batch_size = 32

# Validation uses the last VAL_FRACTION of the training windows. The test set
# is not shown to fit(), so early stopping / restore_best_weights cannot pick
# the checkpoint that happens to score best on the test data.
history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=VAL_FRACTION,
    verbose=1,
    callbacks=[early_stop]
)

y_pred = model.predict(X_test)
mae_scaled = mean_absolute_error(y_test, y_pred)  # error in scaled units

# Back to price units for reporting and plotting.
y_pred_actual = scaler_y.inverse_transform(y_pred)
y_test_actual = y_test_raw

# Both reported metrics are computed in price units so they describe the same
# quantity. mae_percentage is a fraction (0.05 means 5%).
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
mae_percentage = mean_absolute_percentage_error(y_test_actual, y_pred_actual)


In [None]:

# Actual vs. predicted series for the test period, one tick per year.
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(dates_test, y_test_actual, label="Actual Adj Close", color="blue")
ax.plot(dates_test, y_pred_actual, label="Predicted Adj Close", color="red", linestyle="dashed")
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax.tick_params(axis="x", labelrotation=45)
ax.set_xlabel("Year")
ax.set_ylabel("Adjusted Close Price")
ax.set_title("S&P500 Adjusted Close Price Prediction Over Time")
ax.legend()
ax.grid()
plt.show()

# mean_absolute_percentage_error returns a fraction (0.05 means 5%), so format
# it as a percentage and label it as the absolute percentage error it is.
print(f"Mean Absolute Percentage Error (MAPE): {mae_percentage:.2%}")
print(f"🔹 Root Mean Squared Error (RMSE): {rmse:.4f}")
