In [3]:
import yfinance as yf 
import pandas as pd
import requests
import io
import numpy as np
from datetime import datetime, timezone


In [107]:
# Directory the raw parquet downloads in this notebook are written to.
DATA_OUTPUT_DIR = "../data/raw"

In [2]:
# Directory the processed dataset is read back from further down.
PROCESSED_DATA_OUTPUT_DIR = "../data/processed"

In [101]:
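# Pointing pd.read_html straight at the URL is refused by the site ("forbidden"), so the
# page is fetched with requests and a browser User-Agent, then parsed from the response text.
# Ref: https://stackoverflow.com/a/75845569/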
def normalize_ticker(ticker):
    """Return ``ticker`` with every ``.`` replaced by ``-``.

    Example: ``"BRK.B"`` becomes ``"BRK-B"``. Symbols without a dot are
    returned unchanged.
    """
    # NOTE(review): presumably the dash form is what yfinance expects for
    # share-class symbols -- confirm.
    return "-".join(ticker.split("."))

def list_slickcharts_sp500(timeout=30):
    """Scrape the S&P 500 constituent symbols listed on slickcharts.com.

    The page is fetched with ``requests`` using a browser-like User-Agent and
    the first HTML table containing the text ``Symbol`` is parsed with
    ``pd.read_html``.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        list: Values of the table's ``Symbol`` column, each passed through
        ``normalize_ticker``.

    Raises:
        requests.HTTPError: If the site answers with an error status (for
            example when the request is rejected), instead of failing later
            with an unrelated HTML-parsing error.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = 'https://www.slickcharts.com/sp500'
    # Default user-agent fails.
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0'
    response = requests.get(url, headers={'User-Agent': user_agent}, timeout=timeout)
    # Surface HTTP errors here rather than as "no tables found" from read_html.
    response.raise_for_status()

    constituents = pd.read_html(io.StringIO(response.text), match='Symbol')[0]
    return [normalize_ticker(symbol) for symbol in constituents['Symbol'].tolist()]

In [95]:
def fix_indexes_yfinance(df):
    """Flatten a frame with multi-level columns into long format.

    Column level 0 is moved into the row index with ``stack`` and the whole
    row index is then converted back into ordinary columns by
    ``reset_index``.
    """
    stacked = df.stack(level=0)
    return stacked.reset_index()


In [116]:
def get_stock_market_data(start_date, end_date, tickers=None):
    """Download auto-adjusted price data and return it in long format.

    Args:
        start_date: Passed to ``yf.download`` as ``start``.
        end_date: Passed to ``yf.download`` as ``end``.
        tickers: Optional symbols to download. When ``None`` (the default)
            the list returned by ``list_slickcharts_sp500()`` is used, which
            matches the previous behaviour.

    Returns:
        The downloaded frame after ``fix_indexes_yfinance``: the ticker
        column level is stacked into rows and the index is reset, so the
        date and ticker become ordinary columns next to the price fields.
    """
    if tickers is None:
        tickers = list_slickcharts_sp500()
    data = yf.download(tickers, start=start_date, end=end_date,
                       group_by='ticker', auto_adjust=True)
    # Use the shared helper instead of repeating the stack/reset_index chain.
    return fix_indexes_yfinance(data)

In [4]:
# Small single-ticker download using the same options as the bulk download.
d = yf.download(
    "AAPL",
    start="2025-09-25",
    end="2025-10-08",
    group_by='ticker',
    auto_adjust=True,
)

[*********************100%***********************]  1 of 1 completed


Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Price,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-09-25,253.210007,257.170013,251.710007,256.869995,55202100
2025-09-26,254.100006,257.600006,253.779999,255.460007,46076300
2025-09-29,254.559998,255.0,253.009995,254.429993,40127700
2025-09-30,254.860001,255.919998,253.110001,254.630005,37704300
2025-10-01,255.039993,258.790009,254.929993,255.449997,48713900
2025-10-02,256.579987,258.179993,254.149994,257.130005,42630200
2025-10-03,254.669998,259.23999,253.949997,258.019989,49155600
2025-10-06,257.98999,259.070007,255.050003,256.690002,44664100
2025-10-07,256.809998,257.399994,255.429993,256.480011,31955800


In [None]:
# Download the stock history from 2015-01-01 up to now (UTC) and persist it.
# NOTE(review): the file name hard-codes "2015_2025" while the end date is
# the current time -- confirm the name is still accurate when re-running.
df = get_stock_market_data("2015-01-01", datetime.now(timezone.utc))
df.to_parquet(
    f"{DATA_OUTPUT_DIR}/sp500_stocks_2015_2025.parquet",
    index=False,
)
print("Data saved successfully")

[*********************100%***********************]  503 of 503 completed
  data=data.stack(level=0).reset_index()


In [113]:
def get_df_details(df):
    """Print a short diagnostic summary of ``df``.

    After a header line the function prints, in order: the shape, the
    per-column missing-value counts, the column index and the number of
    duplicated rows. Nothing is returned.
    """
    # Each value is computed lazily, right before its line is printed.
    sections = (
        ("Shape", lambda: df.shape),
        ("Missing", lambda: df.isna().sum()),
        ("Columns", lambda: df.columns),
        ("Duplicated", lambda: df.duplicated().sum()),
    )
    print("Printing DF details \n")
    for label, compute in sections:
        print(f"{label} {compute()}")


In [114]:
# Print shape, missing values, columns and duplicate count for the stock frame.
get_df_details(df)

Printing DF details 

Shape (1316543, 7)
Missing Price
Date      0
Ticker    0
Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64
Columns Index(['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object', name='Price')
Duplicated 0


In [128]:
# Fetch SP500 index data for the same time period
def get_sp500_index_data(start_date, end_date):
    """Download ``^GSPC`` price data and return it with ``sp500_``-prefixed columns.

    Args:
        start_date: Passed to ``yf.download`` as ``start``.
        end_date: Passed to ``yf.download`` as ``end``.

    Returns:
        Long-format frame with the ``Ticker`` column dropped; ``Date`` keeps
        its name and every other column is renamed to ``sp500_<original>``
        (e.g. ``Close`` becomes ``sp500_Close``). The column index carries
        no name.
    """
    print("Fetching SP500 index data...")
    index_raw = yf.download("^GSPC", start=start_date, end=end_date,
                            auto_adjust=True, group_by='ticker')
    index_long = fix_indexes_yfinance(index_raw).drop(columns=["Ticker"])

    # Rename by column name rather than by position, so a change in the
    # column order returned by yfinance cannot silently mislabel a field.
    prefixed = index_long.rename(
        columns=lambda name: name if name == "Date" else f"sp500_{name}"
    )
    # Drop the leftover name on the column index, as a plain list assignment would.
    return prefixed.rename_axis(None, axis="columns")

In [129]:
# Index history requested with the same start date and "now (UTC)" end as the stock download.
sp500_data = get_sp500_index_data("2015-01-01", datetime.now(timezone.utc))

Fetching SP500 index data...


[*********************100%***********************]  1 of 1 completed
  sp500_data=sp500_data.stack(level=0).reset_index()


In [134]:
# Print shape, missing values, columns and duplicate count for the index frame.
get_df_details(sp500_data)

Printing DF details 

Shape (2699, 6)
Missing Date            0
sp500_Open      0
sp500_High      0
sp500_Low       0
sp500_Close     0
sp500_Volume    0
dtype: int64
Columns Index(['Date', 'sp500_Open', 'sp500_High', 'sp500_Low', 'sp500_Close',
       'sp500_Volume'],
      dtype='object')
Duplicated 0


In [136]:
# Persist the index frame next to the raw stock data.
# NOTE(review): this file name uses "2015-2025" (hyphen) while the other
# parquet files written by this notebook use "2015_2025" -- confirm which
# spelling downstream readers expect before changing it.
sp500_data.to_parquet(f"{DATA_OUTPUT_DIR}/sp500_index_2015-2025.parquet", index=False)

In [137]:
# Compare the number of distinct dates in both frames before merging on Date.
print(df["Date"].nunique())
print(sp500_data["Date"].nunique())

2699
2699


In [138]:
# Attach the index values for each date to every stock row. validate="many_to_one"
# makes pandas raise if the index frame has duplicate dates, instead of
# silently multiplying stock rows.
merged_df = pd.merge(df, sp500_data, on="Date", how="inner", validate="many_to_one")

In [139]:
# Print shape, missing values, columns and duplicate count for the merged frame.
get_df_details(merged_df)

Printing DF details 

Shape (1316543, 12)
Missing Date            0
Ticker          0
Open            0
High            0
Low             0
Close           0
Volume          0
sp500_Open      0
sp500_High      0
sp500_Low       0
sp500_Close     0
sp500_Volume    0
dtype: int64
Columns Index(['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume',
       'sp500_Open', 'sp500_High', 'sp500_Low', 'sp500_Close', 'sp500_Volume'],
      dtype='object')
Duplicated 0


In [None]:
# Persist the merged stock + index frame as parquet, without the row index.
merged_df.to_parquet(f"{DATA_OUTPUT_DIR}/stock_and_market_data_2015_2025.parquet", index=False)


In [141]:
# Preview the first rows of the merged frame.
merged_df.head()

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,sp500_Open,sp500_High,sp500_Low,sp500_Close,sp500_Volume
0,2015-01-02,A,37.764061,37.883278,37.02125,37.195492,1529200.0,2058.899902,2072.360107,2046.040039,2058.199951,2708700000
1,2015-01-02,AAPL,24.718174,24.72927,23.821672,24.261047,212818400.0,2058.899902,2072.360107,2046.040039,2058.199951,2708700000
2,2015-01-02,ABBV,42.105342,42.723023,42.105342,42.394878,5086100.0,2058.899902,2072.360107,2046.040039,2058.199951,2708700000
3,2015-01-02,ABT,36.86576,37.028704,36.368785,36.580612,3216600.0,2058.899902,2072.360107,2046.040039,2058.199951,2708700000
4,2015-01-02,ACGL,18.764398,18.884845,18.472788,18.539352,1101600.0,2058.899902,2072.360107,2046.040039,2058.199951,2708700000


In [5]:
# Load the processed dataset from disk.
# NOTE(review): no cell shown here writes this file -- presumably it comes
# from a separate processing step; confirm.
processed_df = pd.read_parquet(
    f"{PROCESSED_DATA_OUTPUT_DIR}/swing_trading_model_data.parquet"
)

In [8]:
# Count missing values per column of the processed dataset.
processed_df.isna().sum()

Date                    0
Ticker                  0
Open                    0
High                    0
Low                     0
Close                   0
Volume                  0
sp500_Open              0
sp500_High              0
sp500_Low               0
sp500_Close             0
sp500_Volume            0
return_5d               0
rsi_14d                 0
daily_return            0
volatility_10d          0
volatility_20d          0
sp500_return_5d         0
relative_strength_5d    0
target_5d               0
dtype: int64