In [1]:
from pathlib import Path
from datetime import datetime
import os, json
import pandas as pd
from dotenv import load_dotenv

load_dotenv(dotenv_path=Path("..") / ".env")

# Directories
RAW_DIR = Path("..") / "data" / "raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

def ts():
    """Timestamp like 20250817-1615 for filenames."""
    return datetime.now().strftime("%Y%m%d-%H%M")

def validate_df(df: pd.DataFrame, required_cols: list[str]) -> dict:
    """Basic validation report: shape, NA counts, missing required cols."""
    report = {"shape": list(df.shape), "missing": {}, "required_missing": []}
    for c in required_cols:
        if c not in df.columns:
            report["required_missing"].append(c)
    for c in df.columns:
        report["missing"][c] = int(df[c].isna().sum())
    return report

def assert_ok(report: dict):
    if report["required_missing"]:
        raise ValueError(f"Missing required columns: {report['required_missing']}")

print("Loaded .env — TICKER:", os.getenv("TICKER", "<not set>"))

Loaded .env — TICKER: AAPL


In [2]:
import yfinance as yf

ticker = os.getenv("TICKER", "AAPL")
print("Pulling daily prices for:", ticker)

df = yf.Ticker(ticker).history(period="1y", interval="1d").reset_index()

# Standardize column names
df = df.rename(columns={
    "Date": "date",
    "Open": "open", "High": "high", "Low": "low", "Close": "close", "Volume": "volume"
})

# Dtypes
df["date"] = pd.to_datetime(df["date"])
for c in ["open","high","low","close"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df["volume"] = pd.to_numeric(df["volume"], errors="coerce").astype("Int64")

# Validate
req_cols = ["date","open","high","low","close","volume"]
report = validate_df(df, req_cols)
assert_ok(report)
print("API validation:", json.dumps(report, indent=2))

# Save
api_path = RAW_DIR / f"api_yfinance_{ticker}_{ts()}.csv"
df.to_csv(api_path, index=False)
api_path


Pulling daily prices for: AAPL
API validation: {
  "shape": [
    250,
    8
  ],
  "missing": {
    "date": 0,
    "open": 0,
    "high": 0,
    "low": 0,
    "close": 0,
    "volume": 0,
    "Dividends": 0,
    "Stock Splits": 0
  },
  "required_missing": []
}


PosixPath('../data/raw/api_yfinance_AAPL_20250817-2321.csv')

In [8]:
import requests
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {"User-Agent": "Mozilla/5.0 (educational project)"}

resp = requests.get(URL, headers=headers, timeout=30)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")
table = soup.find("table", {"id": "constituents"})

# Use pandas to parse the HTML table for resilience
sp500 = pd.read_html(str(table))[0]

# Keep a consistent subset if present
keep = [c for c in ["Symbol","Security","GICS Sector","GICS Sub-Industry","Headquarters Location"] if c in sp500.columns]
sp500 = sp500[keep].copy() if keep else sp500.copy()

# Validate
req_cols2 = ["Symbol","Security"]
report2 = validate_df(sp500, req_cols2)
assert_ok(report2)
print("Scrape validation:", json.dumps(report2, indent=2))

# Save
scrape_path = RAW_DIR / f"scrape_wikipedia_sp500_constituents_{ts()}.csv"
sp500.to_csv(scrape_path, index=False)
scrape_path


Scrape validation: {
  "shape": [
    503,
    5
  ],
  "missing": {
    "Symbol": 0,
    "Security": 0,
    "GICS Sector": 0,
    "GICS Sub-Industry": 0,
    "Headquarters Location": 0
  },
  "required_missing": []
}


  sp500 = pd.read_html(str(table))[0]


PosixPath('../data/raw/scrape_wikipedia_sp500_constituents_20250817-2325.csv')

## Assumptions & Risks

- Assumes Yahoo Finance remains available and consistent; occasional gaps/adjustments possible  
- Wikipedia table structure may change; selectors may need updates  
- `.env` is not committed; `.env.example` shared for reproducibility  
- Saved raw files are timestamped; later stages should reference a specific snapshot