In [1]:
import sys
# The '!' tells Jupyter to run a command in the terminal
# We use sys.executable to ensure we're using the Python interpreter this notebook is running on
!{sys.executable} -m pip install yfinance pandas_ta lxml

# pandas_ta and lxml are other libraries we will need very soon, so let's install them now.


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import sys
!{sys.executable} -m pip install --upgrade yfinance


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Standard library
import os
import time

# Third-party
import pandas as pd
import yfinance as yf
from tqdm import tqdm

# Visible confirmation that every import above resolved.
print("Libraries imported successfully!")

Libraries imported successfully!


In [4]:
# Wikipedia page that lists the S&P 500 companies.
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# pandas reads the page and returns a list with one DataFrame per HTML table.
payload = pd.read_html(url)

# Pick the companies table by its content (it must have a 'Symbol' column)
# instead of by position, so an unrelated table placed first on the page cannot
# be picked up silently. Fail with a clear message when no such table exists.
sp500_table = next((table for table in payload if 'Symbol' in table.columns), None)
if sp500_table is None:
    raise ValueError(f"No table with a 'Symbol' column was found at {url}")

# The stock tickers are in the 'Symbol' column.
# Some tickers on Wikipedia contain a '.', like 'BRK.B'; yfinance needs a '-' instead.
# Missing entries are dropped and surrounding whitespace is stripped so that only
# usable ticker strings end up in the list.
sp500_tickers = (
    sp500_table['Symbol']
    .dropna()
    .astype(str)
    .str.strip()
    .str.replace('.', '-', regex=False)
    .tolist()
)
if not sp500_tickers:
    raise ValueError("The S&P 500 table was found but it contains no ticker symbols")

print("First 10 tickers from S&P 500 list (formatted for yfinance):")
print(sp500_tickers[:10])

First 10 tickers from S&P 500 list (formatted for yfinance):
['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A']


In [5]:
# Define the universe of stocks: the first 100 tickers of the S&P 500 list, plus SPY and VIX.
# NOTE: the slice keeps the first UNIVERSE_SIZE entries in list order; it does not
# rank companies in any way. The variable name `top_100_tickers` is kept unchanged.
UNIVERSE_SIZE = 100
top_100_tickers = sp500_tickers[:UNIVERSE_SIZE]
tickers_to_download = top_100_tickers + ['SPY', '^VIX']

# Define the time period.
# NOTE(review): yfinance documents the `end` argument of download() as exclusive,
# so 2024-12-31 itself would not be fetched - confirm whether that is intended.
START_DATE = "2021-01-01"
END_DATE = "2024-12-31"

# Define the path to save the raw data
RAW_DATA_DIR = "../data/raw"

# Create the directory (including missing parents) if it doesn't exist;
# exist_ok=True replaces the separate check-then-create step.
os.makedirs(RAW_DATA_DIR, exist_ok=True)

print(f"Number of tickers to download: {len(tickers_to_download)}")
print(f"Data will be saved in: {RAW_DATA_DIR}")

Number of tickers to download: 102
Data will be saved in: ../data/raw


In [6]:
# Download the price history of every ticker in `tickers_to_download` and write
# one CSV file per ticker into RAW_DATA_DIR. Tickers that return no rows, or that
# raise while downloading or saving, are collected in `failed_tickers`.

failed_tickers = []

# Make sure the raw data directory exists (does nothing when it is already there).
os.makedirs(RAW_DATA_DIR, exist_ok=True)

print("Starting data download with corrected method...")

for ticker in tqdm(tickers_to_download):
    try:
        # auto_adjust=True gives clean OHLCV data without Adjustments columns
        data = yf.download(ticker, start=START_DATE, end=END_DATE, auto_adjust=True, progress=False)

        if data.empty:
            # tqdm.write keeps the message from interleaving with the progress bar.
            tqdm.write(f"⚠️ No data found for {ticker}, skipping.")
            failed_tickers.append(ticker)
            continue

        file_path = os.path.join(RAW_DATA_DIR, f"{ticker}.csv")

        # Save the index as a column labelled 'Date'.
        # NOTE(review): recent yfinance releases may return MultiIndex columns even
        # for a single ticker, in which case to_csv writes several header rows -
        # confirm the layout that the code reading these files expects.
        data.to_csv(file_path, index=True, index_label='Date')

    except Exception as e:
        # Deliberately best-effort: one bad ticker must not abort the whole run.
        tqdm.write(f"❌ Failed to download data for {ticker}: {e}")
        failed_tickers.append(ticker)

print("\n--- Download Summary ---")
print("Data download process complete.")
if failed_tickers:
    print(f"Failed to download the following tickers: {failed_tickers}")
else:
    print("All tickers were downloaded successfully!")

Starting data download with corrected method...


100%|█████████████████████████████████████████| 102/102 [01:11<00:00,  1.43it/s]


--- Download Summary ---
Data download process complete.
All tickers were downloaded successfully!



