# Data Preprocessing
This notebook handles downloading, cleaning, and preparing data for the LSTM model.

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [None]:
# Output column name -> ticker symbol passed to yf.download.
tickers = {
    "TD_Close": "TD.TO",
    "BMO_Close": "BMO.TO",
    "RBC_Close": "RY.TO",
    "BNS_Close": "BNS.TO",
    "Financials_Close": "XLF",
}
# Same mapping for the index / macro series downloaded alongside the stocks.
economic_tickers = {
    "Interest_Rates": "^IRX",
    "Volatility_Index": "^VIX",
    "Bank_Index": "^BKX",
    "SP500": "^GSPC",
}

start_date = "2015-01-01"
end_date = "2024-01-01"

# Build one frame with a closing-price column per ticker.
data = pd.DataFrame()
for column_name, ticker in {**tickers, **economic_tickers}.items():
    stock_data = yf.download(ticker, start=start_date, end=end_date)

    # Fail fast: an empty download would become an all-NaN column, and the
    # dropna() below would then silently discard every row of the dataset.
    if stock_data.empty:
        raise ValueError(
            f"No data downloaded for {ticker!r} ({column_name}) "
            f"between {start_date} and {end_date}"
        )

    close = stock_data['Close']
    # NOTE(review): some yfinance versions appear to return MultiIndex columns
    # even for a single ticker, in which case 'Close' is a one-column DataFrame.
    # Reduce it to a Series so the assignment is unambiguous either way.
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]
    data[column_name] = close

# Keep only the dates on which every series has a closing value.
data.dropna(inplace=True)
data.head()

In [None]:
# Moving averages of the TD close over 20- and 200-row windows.
data['TD_20_MA'] = data['TD_Close'].rolling(window=20).mean()
data['TD_200_MA'] = data['TD_Close'].rolling(window=200).mean()
# Rows without a full window have NaN averages. Drop them explicitly instead of
# letting a later fillna(0) turn them into fake zero prices that would distort
# any scaling done afterwards.
data = data.dropna(subset=['TD_20_MA', 'TD_200_MA'])

# Sentiment analysis
# Placeholder headlines scored with VADER; only the 'compound' score is kept.
political_news = [
    "New policy to boost economic growth signed into law",
    "Political unrest causes market uncertainty",
    "Election results indicate stable government",
    "Trade agreement reached between major countries"
]
analyzer = SentimentIntensityAnalyzer()
sentiment_scores = [analyzer.polarity_scores(news)['compound'] for news in political_news]
# One month-end date per headline, starting in 2023.
sentiment_dates = pd.date_range(start='2023-01-01', periods=len(sentiment_scores), freq='ME')

sentiment_df = pd.DataFrame({'Date': sentiment_dates, 'Sentiment_Score': sentiment_scores})
# Left merge on the exact date: a sentiment date that is not present in `data`
# contributes nothing. Dates without a score get a neutral 0.
data = pd.merge(data.reset_index(), sentiment_df, on='Date', how='left')
data['Sentiment_Score'] = data['Sentiment_Score'].fillna(0)

# Event indicators
# 1 on the listed dates, 0 on every other date.
political_events = pd.DataFrame({
    'Date': pd.to_datetime([
        '2020-11-03', '2021-01-20', '2022-11-08',
        '2016-06-23', '2017-01-20', '2019-12-12'
    ]),
    'Event_Indicator': 1
})
data = pd.merge(data, political_events, on='Date', how='left')
# Fill only the indicator column so no other feature is silently zeroed.
data['Event_Indicator'] = data['Event_Indicator'].fillna(0)

In [None]:
# Normalize data
# 'Date' is not a feature, so it is excluded before scaling every remaining
# column into the [0, 1] range.
features = data.drop(columns=['Date'])
scaler = MinMaxScaler(feature_range=(0, 1))
data_scaled = scaler.fit_transform(features)
# Take the labels from the frame that was actually scaled. Slicing
# data.columns[1:] would mislabel columns if 'Date' were not the first column.
data_scaled_df = pd.DataFrame(data_scaled, columns=features.columns)

# Save preprocessed data
# The row index is omitted from the CSV.
data_scaled_df.to_csv('../data/preprocessed_data.csv', index=False)