In [1]:
from pathlib import Path
from datetime import datetime
import os, sys, json
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Load .env
load_dotenv(dotenv_path=Path("..") / ".env")  # notebook is under notebooks/, .env is one level up

DATA_DIR = Path("..") / "data"
RAW_DIR = DATA_DIR / "raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

def ts():
    """Return a timestamp like 20250817-1615 for filenames."""
    return datetime.now().strftime("%Y%m%d-%H%M")

def validate_df(df: pd.DataFrame, required_cols: list[str]) -> dict:
    """Basic validation: required columns present, shapes/NA counts."""
    report = {"shape": df.shape, "missing": {}, "required_missing": []}
    for c in required_cols:
        if c not in df.columns:
            report["required_missing"].append(c)
    for c in df.columns:
        report["missing"][c] = int(df[c].isna().sum())
    return report

def assert_ok(report: dict):
    """Raise if required columns missing."""
    if report["required_missing"]:
        raise ValueError(f"Missing required columns: {report['required_missing']}")

print("Loaded env — DATA_DIR:", os.getenv("DATA_DIR"), "| TICKER:", os.getenv("TICKER"))


Loaded env — DATA_DIR: ./data | TICKER: AAPL


In [2]:
import yfinance as yf

ticker = os.getenv("TICKER", "AAPL")
print("Pulling daily prices for:", ticker)

data = yf.Ticker(ticker).history(period="1y", interval="1d")
# move index (DatetimeIndex) to a column for CSV friendliness
data = data.reset_index().rename(columns={"Date": "date", "Open":"open","High":"high","Low":"low","Close":"close","Volume":"volume"})
# parse dtypes
data["date"] = pd.to_datetime(data["date"])
for col in ["open","high","low","close"]:
    data[col] = pd.to_numeric(data[col], errors="coerce")
data["volume"] = pd.to_numeric(data["volume"], errors="coerce").astype("Int64")

# validate
req_cols = ["date","open","high","low","close","volume"]
report = validate_df(data, req_cols)
assert_ok(report)
print("API validation:", json.dumps(report, indent=2))

# save
api_path = RAW_DIR / f"api_yfinance_{ticker}_{ts()}.csv"
data.to_csv(api_path, index=False)
api_path


Pulling daily prices for: AAPL
API validation: {
  "shape": [
    250,
    8
  ],
  "missing": {
    "date": 0,
    "open": 0,
    "high": 0,
    "low": 0,
    "close": 0,
    "volume": 0,
    "Dividends": 0,
    "Stock Splits": 0
  },
  "required_missing": []
}


PosixPath('../data/raw/api_yfinance_AAPL_20250817-2248.csv')

**Source:**  
- Yahoo Finance via `yfinance` Python library  

**Parameters:**  
- `ticker`: Loaded from `.env` file (example: AAPL)  
- `period`: "1y" (1 year of data)  
- `interval`: "1d" (daily frequency)  

**Validation Performed:**  
- Required columns: `date, open, high, low, close, volume`  
- Checked for missing values (all zero above)  
- Confirmed dataset shape: 250 rows × 8 columns  

In [3]:
URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {"User-Agent": "Mozilla/5.0 (educational project)"}

resp = requests.get(URL, headers=headers, timeout=30)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")
# The table we want has id="constituents"
table = soup.find("table", {"id": "constituents"})
rows = table.find_all("tr")

# Parse header
header = [th.get_text(strip=True) for th in rows[0].find_all(["th","td"])]

records = []
for r in rows[1:]:
    cells = [td.get_text(strip=True) for td in r.find_all(["td","th"])]
    if len(cells) == len(header):
        records.append(cells)

scrape_df = pd.DataFrame(records, columns=header)

# Keep a small subset of columns commonly present
keep = [c for c in ["Symbol","Security","GICS Sector","GICS Sub-Industry","Headquarters Location"] if c in scrape_df.columns]
scrape_df = scrape_df[keep].copy()

# basic validation
req_cols = ["Symbol","Security"]
report2 = validate_df(scrape_df, req_cols)
assert_ok(report2)
print("Scrape validation:", json.dumps(report2, indent=2))

# save
scrape_path = RAW_DIR / f"scrape_wikipedia_sp500_constituents_{ts()}.csv"
scrape_df.to_csv(scrape_path, index=False)
scrape_path


Scrape validation: {
  "shape": [
    503,
    4
  ],
  "missing": {
    "Symbol": 0,
    "Security": 0,
    "GICS Sub-Industry": 0,
    "Headquarters Location": 0
  },
  "required_missing": []
}


PosixPath('../data/raw/scrape_wikipedia_sp500_constituents_20250817-2258.csv')

**Source:**  
- Wikipedia S&P 500 list  (URL: <https://en.wikipedia.org/wiki/List_of_S%26P_500_companies>)

**Parameters:**  
- Extracted `<table>` element using CSS selectors  
- Parsed rows into pandas DataFrame  
- Columns standardized to expected names  

**Validation Performed:**  
- Checked required numeric/text columns exist  
- Counted missing values (NA)  
- Confirmed shape matches expected rows × columns 

## Assumptions & Risks

- Assumes Yahoo Finance API remains available and consistent  
- Assumes scraping selectors will continue to work (site structure unchanged)  
- Risk of incomplete or delayed data (weekends, holidays, exchange outages)  
- Public website may update irregularly; not guaranteed to be authoritative  