In [1]:
# !pip install sqlalchemy psycopg2-binary
import os, pandas as pd
from sqlalchemy import create_engine

pg_user = os.getenv("PGUSER", "postgres")
pg_pass = os.getenv("PGPASSWORD", "CSDBMS623")
pg_host = os.getenv("PGHOST", "localhost")
pg_port = os.getenv("PGPORT", "5432")
pg_db   = os.getenv("PGDATABASE", "SP500_ML")

engine = create_engine(f"postgresql+psycopg2://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}")

# Use only the most recent membership date to define the current universe
universe = pd.read_sql_query("""
    SELECT DISTINCT UPPER(TRIM(latest_ticker)) AS latest_ticker
    FROM sp500_long_latest_profiles
    WHERE latest_ticker IS NOT NULL

""", engine)["latest_ticker"].tolist()

print("Universe size:", len(universe))

# Now run your Yahoo fetcher
#prices_df = fetch_prices_for_universe(universe, checkpoint_path="prices_checkpoint.parquet")

Universe size: 679


# Balance Sheet Pipeline (Quarterly pull → SCD windows → Signed YoY growth)

This block pulls **quarterly balance sheets** from FMP, builds **as-of validity windows**, and computes **YoY growth** on selected items using **symmetric percent change (SPC)**, which is stable for zeros and sign flips.

## What it does

- **Fetch & clean**
  - Endpoint: `/api/v3/balance-sheet-statement/{ticker}?period=quarter&limit=120&apikey=...`
  - Robust retries for 429/5xx, normalize JSON, uppercase `symbol`, coerce `date`, de-dupe `(symbol, date)`, sort by `["symbol","date"]`.

- **As-of windows (SCD)**
  - `date_start = date`
  - `date_end   = next(date)` within the same `symbol` (last row uses `2100-01-01`).
  - Use `[date_start, date_end)` for leakage-free joins to prices.

- **Signed YoY growth (default = t vs t−4)**
  - For each target column (e.g., `totalAssets`, `totalLiabilities`, `netDebt`, …):
    \[
    \text{spc}(a,b)=\frac{2(a-b)}{|a|+|b|}\in[-2,+2]
    \]
    Returns −200%…+200%, well-behaved at zero; if `a=b=0`, growth = 0.

- **Exact same-quarter YoY (optional)**
  - Deduplicate to the **latest** filing for each `(symbol, calendarYear, period∈{Q1..Q4})`.
  - Join Qx(Y) to Qx(Y−1) and apply SPC. This is more robust than a raw `shift(4)` if quarters are missing/re-filed.

## Outputs

- Base BS fields from FMP (assets/liabilities/equity, current/non-current splits, debt, etc.)
- `date_start`, `date_end` validity window
- `*_yoy` columns for your selected targets (SPC YoY)
- (Optional) `*_q_yoy` if you enable the exact same-quarter YoY helper.


In [24]:
# ===================== FMP Balance Sheet Pull + SCD Windows + Signed YoY Growth =====================
import time
from typing import Iterable, List, Optional, Tuple, Union
import requests
import pandas as pd

FMP_BS_BASE = "https://financialmodelingprep.com/api/v3/balance-sheet-statement"
MAX_VALID_DATE = pd.Timestamp("2100-01-01")

# -------- Robust GET with retries --------
def _get_with_retries(
    session: requests.Session,
    url: str,
    params: dict,
    timeout: int = 60,
    max_retries: int = 4,
    base_sleep: float = 1.0,
):
    last = None
    for attempt in range(1, max_retries + 1):
        resp = session.get(url, params=params, timeout=timeout)
        if resp.status_code == 200:
            return resp
        last = resp
        if resp.status_code in (429, 500, 502, 503, 504):
            time.sleep(base_sleep * (2 ** (attempt - 1)))
            continue
        resp.raise_for_status()
    if last is not None:
        last.raise_for_status()
    raise RuntimeError("Request failed without response.")

# -------- Normalize API JSON --------
def _normalize_fmp_json(j):
    if isinstance(j, list):
        return j
    if isinstance(j, dict):
        for k in ("error", "Error", "message", "Note", "Error Message"):
            if k in j and isinstance(j[k], str):
                raise RuntimeError(f"API message: {j[k]}")
        for k in ("financials", "items", "data", "results", "financialStatements"):
            if k in j and isinstance(j[k], list):
                return j[k]
        return [j]
    raise RuntimeError(f"Unexpected JSON type: {type(j)}")

# -------- Signed symmetric YoY % change (robust to zeros/negatives) --------
def _signed_pct_change(curr: pd.Series, prev: pd.Series) -> pd.Series:
    """
    growth = 2*(curr - prev) / (|curr| + |prev|)
    Returns Float64; if both are 0 -> 0.0
    """
    a = pd.to_numeric(curr, errors="coerce").astype("Float64")
    b = pd.to_numeric(prev, errors="coerce").astype("Float64")
    denom = a.abs() + b.abs()
    out = pd.Series(pd.NA, index=a.index, dtype="Float64")
    valid = denom.notna() & (denom != 0)
    out.loc[valid] = 2.0 * (a[valid] - b[valid]) / denom[valid]
    both_zero = (a == 0) & (b == 0)
    out.loc[both_zero] = 0.0
    return out

# -------- SCD windows --------
def _add_validity_windows(df: pd.DataFrame) -> pd.DataFrame:
    df = df.sort_values(["symbol", "date"]).reset_index(drop=True)
    if "date_start" not in df.columns:
        df["date_start"] = df["date"]
    if "date_end" not in df.columns:
        df["date_end"] = df.groupby("symbol")["date"].shift(-1).fillna(MAX_VALID_DATE)
    return df

# -------- One-ticker pull (fixed symbol handling) --------
def fetch_balance_sheets_one(
    ticker: str,
    api_key: str,
    session: Optional[requests.Session] = None,
    period: str = "quarter",   # use "quarter" for analysis snapshots
    limit: int = 120,
    timeout: int = 30,
) -> pd.DataFrame:
    if session is None:
        session = requests.Session()
    url = f"{FMP_BS_BASE}/{ticker}"
    params = {"period": period, "apikey": api_key, "limit": limit}

    r = _get_with_retries(session, url, params, timeout=timeout)
    try:
        data = r.json()
    except ValueError as e:
        raise RuntimeError(f"Non-JSON response for {ticker}: {r.text[:300]}") from e

    records = _normalize_fmp_json(data)
    if not records:
        raise RuntimeError(f"No balance sheet records for {ticker}.")

    df = pd.DataFrame.from_records(records)

    # robust symbol handling (no ambiguous truth)
    if "symbol" in df.columns:
        df["symbol"] = df["symbol"].astype(str)
        mask_missing = df["symbol"].isin(["", "None", "nan", "NaN"]) | df["symbol"].isna()
        df.loc[mask_missing, "symbol"] = ticker
        df["symbol"] = df["symbol"].str.upper()
    else:
        df["symbol"] = ticker.upper()

    if "date" not in df.columns:
        raise RuntimeError(f"'date' missing for {ticker}.")
    df["date"] = pd.to_datetime(df["date"], errors="coerce")

    df = (
        df.dropna(subset=["date"])
          .drop_duplicates(subset=["symbol", "date"], keep="first")
          .sort_values(["symbol", "date"])
          .reset_index(drop=True)
    )
    return df

# -------- Multi-ticker orchestrator --------
def fetch_balance_sheets(
    tickers: Union[str, Iterable[str]],
    api_key: str,
    period: str = "quarter",
    limit: int = 120,
    batch_size: int = 25,
    sleep_between_batches: float = 1.0,
    timeout: int = 30,
    skip_errors: bool = True,
    verbose: bool = True,
) -> pd.DataFrame:
    if isinstance(tickers, str):
        tickers = [tickers]
    tickers = [t.upper().strip() for t in tickers if str(t).strip()]
    session = requests.Session()
    frames: List[pd.DataFrame] = []
    skipped: List[Tuple[str, str]] = []

    for i in range(0, len(tickers), batch_size):
        batch = tickers[i:i + batch_size]
        if verbose:
            print(f"Batch {i//batch_size + 1}: {len(batch)} tickers "
                  f"({i+1}–{min(i+len(batch), len(tickers))} of {len(tickers)})")
        for t in batch:
            try:
                frames.append(fetch_balance_sheets_one(
                    t, api_key, session, period, limit, timeout
                ))
            except Exception as e:
                if skip_errors:
                    skipped.append((t, str(e)))
                    if verbose:
                        print(f"  [skip] {t}: {e}")
                else:
                    raise
        if i + batch_size < len(tickers):
            time.sleep(sleep_between_batches)

    if not frames:
        if verbose:
            print("No successful balance-sheet pulls.")
            if skipped:
                print(f"Skipped {len(skipped)} tickers. Examples: {skipped[:5]}")
        return pd.DataFrame()

    df_all = pd.concat(frames, ignore_index=True)
    df_all = _add_validity_windows(df_all)

    if verbose:
        print(f"✅ Success: {len(frames)} tickers; rows: {len(df_all)}")
        if skipped:
            print(f"⚠️ Skipped {len(skipped)} tickers.")

    return df_all

# -------- Optional: add signed YoY growth (t vs t-4) --------
BS_YOY_TARGETS = [
    # liquidity / current
    "cashAndCashEquivalents", "shortTermInvestments", "cashAndShortTermInvestments",
    "netReceivables", "inventory", "otherCurrentAssets", "totalCurrentAssets",
    # long-lived
    "propertyPlantEquipmentNet", "longTermInvestments", "totalNonCurrentAssets",
    # totals
    "totalAssets",
    # liabilities
    "accountPayables", "shortTermDebt", "totalCurrentLiabilities",
    "longTermDebt", "totalNonCurrentLiabilities", "totalLiabilities",
    # equity & leverage
    "commonStock", "retainedEarnings", "totalStockholdersEquity", "totalEquity",
    "totalDebt", "netDebt",
]

def add_bs_yoy_growth(df: pd.DataFrame, targets: Optional[List[str]] = None) -> pd.DataFrame:
    if df.empty:
        return df
    if targets is None:
        targets = BS_YOY_TARGETS

    df = df.sort_values(["symbol", "date"]).reset_index(drop=True).copy()

    for col in targets:
        if col not in df.columns:
            continue
        s = pd.to_numeric(df[col], errors="coerce")
        prev = df.groupby("symbol")[col].shift(4)
        df[f"{col}_yoy"] = _signed_pct_change(s, prev)
    return df

# -------------------------- Example usage --------------------------
if __name__ == "__main__":
    API_KEY = ""
    tickers = universe # or a longer list

    # 1) Pull balance sheets (quarterly recommended)
    df_bs = fetch_balance_sheets(
        tickers=tickers,
        api_key=API_KEY,
        period="quarter",    # use "annual" if you truly want FY snapshots
        limit=120,
        batch_size=5,
        sleep_between_batches=1.0,
        skip_errors=True,
        verbose=True,
    )

    # 2) (Optional) add signed YoY growth per selected BS fields
    if not df_bs.empty:
        df_bs = add_bs_yoy_growth(df_bs)
        # Peek
        cols_show = [
            "symbol","date","totalAssets","totalAssets_yoy",
            "totalLiabilities","totalLiabilities_yoy",
            "totalStockholdersEquity","totalStockholdersEquity_yoy",
            "cashAndShortTermInvestments","cashAndShortTermInvestments_yoy",
            "totalDebt","totalDebt_yoy","netDebt","netDebt_yoy",
            "date_start","date_end"
        ]
        print(df_bs[[c for c in cols_show if c in df_bs.columns]].head(12))


Batch 1: 5 tickers (1–5 of 679)
Batch 2: 5 tickers (6–10 of 679)
Batch 3: 5 tickers (11–15 of 679)
Batch 4: 5 tickers (16–20 of 679)
Batch 5: 5 tickers (21–25 of 679)
Batch 6: 5 tickers (26–30 of 679)
Batch 7: 5 tickers (31–35 of 679)
Batch 8: 5 tickers (36–40 of 679)
Batch 9: 5 tickers (41–45 of 679)
Batch 10: 5 tickers (46–50 of 679)
Batch 11: 5 tickers (51–55 of 679)
Batch 12: 5 tickers (56–60 of 679)
Batch 13: 5 tickers (61–65 of 679)
Batch 14: 5 tickers (66–70 of 679)
Batch 15: 5 tickers (71–75 of 679)
Batch 16: 5 tickers (76–80 of 679)
Batch 17: 5 tickers (81–85 of 679)
Batch 18: 5 tickers (86–90 of 679)
Batch 19: 5 tickers (91–95 of 679)
Batch 20: 5 tickers (96–100 of 679)
Batch 21: 5 tickers (101–105 of 679)
Batch 22: 5 tickers (106–110 of 679)
Batch 23: 5 tickers (111–115 of 679)
Batch 24: 5 tickers (116–120 of 679)
Batch 25: 5 tickers (121–125 of 679)
Batch 26: 5 tickers (126–130 of 679)
Batch 27: 5 tickers (131–135 of 679)
Batch 28: 5 tickers (136–140 of 679)
Batch 29: 5 tic

In [30]:
df_bs

Unnamed: 0,date,symbol,reportedCurrency,cik,fillingDate,acceptedDate,calendarYear,period,cashAndCashEquivalents,shortTermInvestments,...,totalCurrentLiabilities_yoy,longTermDebt_yoy,totalNonCurrentLiabilities_yoy,totalLiabilities_yoy,commonStock_yoy,retainedEarnings_yoy,totalStockholdersEquity_yoy,totalEquity_yoy,totalDebt_yoy,netDebt_yoy
0,1999-10-31,A,USD,0001090872,2000-01-25,2000-01-25 00:00:00,1999,Q4,0.000000e+00,0.0,...,,,,,,,,,,
1,2000-01-31,A,USD,0001090872,2000-03-15,2000-03-15 00:00:00,2000,Q1,1.368000e+09,42000000.0,...,,,,,,,,,,
2,2000-04-30,A,USD,0001090872,2000-06-12,2000-06-12 00:00:00,2000,Q2,9.780000e+08,0.0,...,,,,,,,,,,
3,2000-07-31,A,USD,0001090872,2000-09-01,2000-09-01 00:00:00,2000,Q3,7.030000e+08,0.0,...,,,,,,,,,,
4,2000-10-31,A,USD,0001090872,2001-01-17,2001-01-17 00:00:00,2000,Q4,9.960000e+08,0.0,...,0.452118,0.0,0.05364,0.391262,0.222222,2.0,0.435527,0.435527,2.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66984,2024-06-30,ZTS,USD,0001555280,2024-08-06,2024-08-06 12:41:47,2024,Q2,1.573000e+09,0.0,...,0.015704,0.000148,0.006636,0.0084,0.0,0.152998,0.071108,0.070765,-0.000148,0.027446
66985,2024-09-30,ZTS,USD,0001555280,2024-11-04,2024-11-04 13:52:13,2024,Q3,1.710000e+09,0.0,...,0.056798,0.001778,-0.000404,0.010025,0.0,0.151827,0.030256,0.031044,0.001481,0.01016
66986,2024-12-31,ZTS,USD,0001555280,2025-02-13,2025-02-13 14:51:25,2024,Q4,1.985000e+09,0.0,...,0.574609,-0.223613,-0.200728,0.018335,0.0,0.150294,-0.046483,-0.045282,-0.00163,0.009077
66987,2025-03-31,ZTS,USD,0001555280,2025-05-06,2025-05-06 13:29:54,2025,Q1,1.721000e+09,0.0,...,0.557612,-0.221637,-0.197694,0.015689,0.0,0.145551,-0.082982,-0.081797,-0.002959,0.047222


In [31]:
# ===================== Add Balance-Sheet Ratios onto df_bs =====================
import pandas as pd

def _to_num(s):
    return pd.to_numeric(s, errors="coerce").astype("Float64")

def _safe_div(a, b):
    a = _to_num(a); b = _to_num(b)
    out = a / b
    return out.mask(b.isna() | (b == 0)).astype("Float64")

def add_bs_ratios_inplace(df_bs: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a NEW DataFrame equal to df_bs with ratio columns appended.
    It does NOT drop any existing columns.
    """
    df = df_bs.copy()

    # Ensure keys / windows exist (no look-ahead alignment)
    df["symbol"] = df["symbol"].astype(str).str.upper().str.strip()
    df["date"]   = pd.to_datetime(df["date"], errors="coerce")
    if "date_start" not in df.columns:
        df["date_start"] = df["date"]
    if "date_end" not in df.columns:
        df["date_end"] = df.groupby("symbol")["date"].shift(-1).fillna(pd.Timestamp("2100-01-01"))

    # Numeric building blocks
    ca = _to_num(df.get("totalCurrentAssets"))
    cl = _to_num(df.get("totalCurrentLiabilities"))
    cash = _to_num(df.get("cashAndCashEquivalents"))
    sti  = _to_num(df.get("shortTermInvestments"))
    ar   = _to_num(df.get("netReceivables"))
    inv  = _to_num(df.get("inventory"))

    ta = _to_num(df.get("totalAssets"))
    tl = _to_num(df.get("totalLiabilities"))
    eq = _to_num(df.get("totalStockholdersEquity"))
    # Equity fallback if missing
    eq = eq.where(eq.notna(), ta - tl)

    sd = _to_num(df.get("shortTermDebt"))
    ld = _to_num(df.get("longTermDebt"))
    td = _to_num(df.get("totalDebt"))
    # totalDebt fallback if missing
    td = td.where(td.notna(), (sd.fillna(0) + ld.fillna(0)).where(sd.notna() | ld.notna()))

    # Net debt (can be negative). NA if we have no info at all.
    cash_like = (cash.fillna(0) + sti.fillna(0)).astype("Float64")
    has_any_cashbits = cash.notna() | sti.notna()
    net_debt = (td - cash_like).where(td.notna() | has_any_cashbits)

    # ---- Liquidity ----
    current_ratio = _safe_div(ca, cl)
    quick_assets  = (cash.fillna(0) + sti.fillna(0) + ar.fillna(0)).astype("Float64")
    quick_ratio   = _safe_div(quick_assets, cl)
    cash_ratio    = _safe_div(cash.fillna(0) + sti.fillna(0), cl)
    working_capital = (ca - cl).astype("Float64")
    working_capital_to_assets = _safe_div(working_capital, ta)
    inventory_to_current      = _safe_div(inv, ca)

    # ---- Capital structure ----
    debt_to_equity         = _safe_div(td, eq)
    debt_to_assets         = _safe_div(td, ta)
    net_debt_to_equity     = _safe_div(net_debt, eq)
    liabilities_to_assets  = _safe_div(tl, ta)
    equity_ratio           = _safe_div(eq, ta)
    long_term_capital      = (ld + eq).astype("Float64")
    lt_debt_to_capital     = _safe_div(ld, long_term_capital)
    total_debt_to_capital  = _safe_div(td, (td + eq))

    # Append columns (keeps all originals)
    df["current_ratio"]              = current_ratio
    df["quick_ratio"]                = quick_ratio
    df["cash_ratio"]                 = cash_ratio
    df["working_capital"]            = working_capital
    df["working_capital_to_assets"]  = working_capital_to_assets
    df["inventory_to_current"]       = inventory_to_current

    df["debt_to_equity"]             = debt_to_equity
    df["debt_to_assets"]             = debt_to_assets
    df["net_debt_to_equity"]         = net_debt_to_equity
    df["liabilities_to_assets"]      = liabilities_to_assets
    df["equity_ratio"]               = equity_ratio
    df["lt_debt_to_capital"]         = lt_debt_to_capital
    df["total_debt_to_capital"]      = total_debt_to_capital

    # Keep handy components (easy to drop later)
    df["total_debt"]                 = td
    df["net_debt"]                   = net_debt
    df["total_assets"]               = ta
    df["total_equity"]               = eq
    df["total_liabilities"]          = tl
    df["total_current_assets"]       = ca
    df["total_current_liabilities"]  = cl
    df["cash_and_st_investments"]    = cash_like

    # Sorted output
    return df.sort_values(["symbol","date"]).reset_index(drop=True)

# ---- Run it on your balance sheet frame ----
df_bs = add_bs_ratios_inplace(df_bs)
print(df_bs.filter(regex="^(symbol|date$|current_ratio|quick_ratio|debt_to_equity|total_debt$)").head())


        date symbol  current_ratio  quick_ratio  debt_to_equity   total_debt
0 1999-10-31      A         2.1047     0.972635             0.0          0.0
1 2000-01-31      A       2.410256     1.381229        0.222247  997000000.0
2 2000-04-30      A       2.313358     1.311528        0.021112   98000000.0
3 2000-07-31      A       2.129084     1.143426        0.026316  129000000.0
4 2000-10-31      A       1.929778     1.101765        0.020893  110000000.0


In [32]:
df_bs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66989 entries, 0 to 66988
Data columns (total 100 columns):
 #   Column                                   Non-Null Count  Dtype         
---  ------                                   --------------  -----         
 0   date                                     66989 non-null  datetime64[ns]
 1   symbol                                   66989 non-null  object        
 2   reportedCurrency                         66989 non-null  object        
 3   cik                                      66989 non-null  object        
 4   fillingDate                              66989 non-null  object        
 5   acceptedDate                             66989 non-null  object        
 6   calendarYear                             66989 non-null  object        
 7   period                                   66989 non-null  object        
 8   cashAndCashEquivalents                   66989 non-null  float64       
 9   shortTermInvestments                  

In [33]:
chk = (df_bs.sort_values(["symbol","date"])
          .assign(lag4_date = lambda d: d.groupby("symbol")["date"].shift(4),
                  days_diff = lambda d: (d["date"] - d["lag4_date"]).dt.days))

# If you have 'period' like Q1/Q2/Q3/Q4:
has_q = df_bs["period"].str.upper().isin(["Q1","Q2","Q3","Q4"])
same_q = (df_bs.loc[has_q, "period"].str.upper().values ==
          df_bs.loc[has_q].groupby("symbol")["period"].shift(4).str.upper().values)

print("Share with ~1-year spacing (±30 days):",
      (chk["days_diff"].between(335, 395)).mean())

# If 'period' exists:
if "period" in df_bs.columns:
    print("Share with exact same quarter label vs lag-4:", pd.Series(same_q).mean())


Share with ~1-year spacing (±30 days): 0.9507083252474287
Share with exact same quarter label vs lag-4: 0.9544402812402036


In [39]:
# ===================== ADD AFTER YOUR API PULL =====================
import pandas as pd

def add_financial_leverage(df_bs: pd.DataFrame) -> pd.DataFrame:
    """
    Adds balance-sheet leverage metrics per symbol/date:
      - financialLeverage = totalAssets / totalStockholdersEquity
      - financialLeverage_avg4q = avg(assets,4q) / avg(equity,4q)
      - financialLeverage_yoy = signed pct change of financialLeverage vs t-4
      - debtToEquity, liabilitiesToEquity, debtToAssets, netDebtToEquity (if inputs exist)
    """
    df = df_bs.sort_values(["symbol", "date"]).reset_index(drop=True).copy()

    def _pick(*names):
        for n in names:
            if n in df.columns: return n
        return None

    # Column picks (CamelCase from FMP, fall back to lowercase if needed)
    assets_col  = _pick("totalAssets", "totalassets")
    equity_col  = _pick("totalStockholdersEquity", "totalstockholdersequity", "totalEquity", "totalequity")
    debt_col    = _pick("totalDebt", "totaldebt")
    liab_col    = _pick("totalLiabilities", "totalliabilities")
    cash_col    = _pick("cashAndCashEquivalents", "cashandcashequivalents")
    std_col     = _pick("shortTermDebt", "shorttermdebt")
    ltd_col     = _pick("longTermDebt", "longtermdebt")

    # If totalDebt missing but short/long exist, synthesize
    if debt_col is None and std_col and ltd_col:
        debt_col = "__totalDebt_synth"
        df[debt_col] = pd.to_numeric(df[std_col], errors="coerce") + pd.to_numeric(df[ltd_col], errors="coerce")

    # If netDebt missing but totalDebt & cash exist, synthesize
    net_debt_col = _pick("netDebt", "netdebt")
    if net_debt_col is None and debt_col and cash_col:
        net_debt_col = "__netDebt_synth"
        df[net_debt_col] = pd.to_numeric(df[debt_col], errors="coerce") - pd.to_numeric(df[cash_col], errors="coerce")

    def _safe_ratio(num_s, den_s):
        n = pd.to_numeric(num_s, errors="coerce").astype("Float64")
        d = pd.to_numeric(den_s, errors="coerce").astype("Float64")
        out = pd.Series(pd.NA, index=n.index, dtype="Float64")
        ok = d.notna() & (d != 0)
        out.loc[ok] = n[ok] / d[ok]
        return out

    # Equity multiplier (period-end)
    if assets_col and equity_col:
        df["financialLeverage"] = _safe_ratio(df[assets_col], df[equity_col])

        # 4-quarter averages
        A_avg = df.groupby("symbol")[assets_col].transform(
            lambda s: pd.to_numeric(s, errors="coerce").rolling(4, min_periods=2).mean()
        )
        E_avg = df.groupby("symbol")[equity_col].transform(
            lambda s: pd.to_numeric(s, errors="coerce").rolling(4, min_periods=2).mean()
        )
        df["financialLeverage_avg4q"] = _safe_ratio(A_avg, E_avg).astype("Float64")

        # YoY on period-end multiplier (uses your _signed_pct_change defined above)
        prev = df.groupby("symbol")["financialLeverage"].shift(4)
        df["financialLeverage_yoy"] = _signed_pct_change(df["financialLeverage"], prev)

    # Helpful variants (only if inputs exist)
    if debt_col and equity_col:
        df["debtToEquity"] = _safe_ratio(df[debt_col], df[equity_col]).astype("Float64")
    if liab_col and equity_col:
        df["liabilitiesToEquity"] = _safe_ratio(df[liab_col], df[equity_col]).astype("Float64")
    if debt_col and assets_col:
        df["debtToAssets"] = _safe_ratio(df[debt_col], df[assets_col]).astype("Float64")
    if net_debt_col and equity_col:
        df["netDebtToEquity"] = _safe_ratio(df[net_debt_col], df[equity_col]).astype("Float64")

    # Clean up temp synthesized columns from output
    for tmp in ("__totalDebt_synth", "__netDebt_synth"):
        if tmp in df.columns:
            # keep them if you want to inspect; otherwise drop:
            df.drop(columns=[tmp], inplace=True)

    return df

# --- call it right after your API pull (and after add_bs_yoy_growth if you use it) ---
# df_bs = fetch_balance_sheets(...)

df_bs = add_financial_leverage(df_bs)
# (optional) peek:
# print(df_bs[["symbol","date","financialLeverage","financialLeverage_avg4q","financialLeverage_yoy",
#              "debtToEquity","liabilitiesToEquity","debtToAssets","netDebtToEquity"]].head(12))


In [53]:
df_bs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66989 entries, 0 to 66988
Columns: 107 entries, date to netDebtToEquity
dtypes: Float64(51), datetime64[ns](3), float64(42), int64(2), object(9)
memory usage: 57.9+ MB


In [57]:
df_bs.info(verbose=True)          # force full list
# or
#df_bs.info(max_cols=10_000)       # raise the threshold for this call


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66989 entries, 0 to 66988
Data columns (total 107 columns):
 #    Column                                   Dtype         
---   ------                                   -----         
 0    date                                     datetime64[ns]
 1    symbol                                   object        
 2    reportedCurrency                         object        
 3    cik                                      object        
 4    fillingDate                              object        
 5    acceptedDate                             object        
 6    calendarYear                             object        
 7    period                                   object        
 8    cashAndCashEquivalents                   float64       
 9    shortTermInvestments                     float64       
 10   cashAndShortTermInvestments              float64       
 11   netReceivables                           float64       
 12   inventory       

In [59]:
# ===================== DB INGEST — Balance Sheet (auto-add missing columns) =====================
import math
from typing import Sequence, Set, Dict
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
from sqlalchemy.types import BigInteger, Float, Text, DateTime

# ---------- Configure ----------
PG_CONN_STR = "postgresql://postgres:CSDBMS623@localhost:5432/SP500_ML"
SCHEMA      = "public"
TABLE       = "balance_sheets_q"
CHUNK_ROWS  = 25_000

# ---------- Engine ----------
def _get_engine(conn_str: str) -> Engine:
    return create_engine(conn_str, pool_pre_ping=True)

# ---------- Create minimal table + indexes (rest added dynamically) ----------
def ensure_table_and_indexes(engine: Engine, schema: str, table: str):
    ddl = f'''
    CREATE TABLE IF NOT EXISTS "{schema}"."{table}" (
      symbol      TEXT,
      date        TIMESTAMP,
      date_start  TIMESTAMP,
      date_end    TIMESTAMP
    );
    '''
    uq  = f"""
    DO $$
    BEGIN
      IF NOT EXISTS (
        SELECT 1 FROM pg_constraint WHERE conname = '{table}_symbol_date_key'
      ) THEN
        ALTER TABLE "{schema}"."{table}"
        ADD CONSTRAINT {table}_symbol_date_key UNIQUE (symbol, date);
      END IF;
    END$$;
    """
    idx1 = f'CREATE INDEX IF NOT EXISTS {table}_symbol_idx ON "{schema}"."{table}" (symbol);'
    idx2 = f'CREATE INDEX IF NOT EXISTS {table}_date_idx   ON "{schema}"."{table}" (date);'
    with engine.begin() as conn:
        conn.execute(text(ddl)); conn.execute(text(uq)); conn.execute(text(idx1)); conn.execute(text(idx2))

# ---------- Introspection + migration ----------
def _existing_columns(engine: Engine, schema: str, table: str) -> Set[str]:
    sql = """
    SELECT lower(column_name) FROM information_schema.columns
    WHERE table_schema = :schema AND table_name = :table
    """
    with engine.begin() as conn:
        rows = conn.execute(text(sql), {"schema": schema, "table": table}).fetchall()
    return {r[0] for r in rows}

def _infer_sql_type_from_series(s: pd.Series) -> str:
    if pd.api.types.is_datetime64_any_dtype(s): return "TIMESTAMP"
    if pd.api.types.is_integer_dtype(s):        return "BIGINT"
    if pd.api.types.is_float_dtype(s):          return "DOUBLE PRECISION"  # covers float64 & pandas Float64
    return "TEXT"

def ensure_missing_columns(engine: Engine, schema: str, table: str, df: pd.DataFrame):
    have = _existing_columns(engine, schema, table)
    missing = [c for c in df.columns if c not in have]
    if not missing: return
    alters = []
    for c in missing:
        sql_t = _infer_sql_type_from_series(df[c])
        alters.append(f'ADD COLUMN IF NOT EXISTS {c} {sql_t}')
    with engine.begin() as conn:
        conn.execute(text(f'ALTER TABLE "{schema}"."{table}" ' + ", ".join(alters) + ";"))

# ---------- Staging + merge ----------
def _build_dtype_map(df: pd.DataFrame) -> Dict[str, object]:
    """Build a dtype map for pandas.to_sql so timestamps/numerics land correctly."""
    dmap: Dict[str, object] = {}
    for c in df.columns:
        s = df[c]
        if pd.api.types.is_datetime64_any_dtype(s):
            dmap[c] = DateTime(timezone=False)
        elif pd.api.types.is_integer_dtype(s):
            dmap[c] = BigInteger()
        elif pd.api.types.is_float_dtype(s):
            dmap[c] = Float()
        else:
            dmap[c] = Text()
    return dmap

def _to_sql_staging(engine: Engine, df: pd.DataFrame, schema: str, staging: str):
    df.to_sql(
        name=staging,
        con=engine,
        schema=schema,
        if_exists="replace",
        index=False,
        dtype=_build_dtype_map(df),
        chunksize=10_000,
        method=None,
    )

def _merge_from_staging(engine: Engine, schema: str, table: str, staging: str, cols: Sequence[str]):
    non_key_cols = [c for c in cols if c not in ("symbol", "date")]
    set_clause = ", ".join([f"{c}=EXCLUDED.{c}" for c in non_key_cols]) or "symbol=EXCLUDED.symbol"
    sql = f"""
    INSERT INTO "{schema}"."{table}" ({", ".join(cols)})
    SELECT {", ".join(cols)} FROM "{schema}"."{staging}"
    ON CONFLICT (symbol, date)
    DO UPDATE SET {set_clause};
    """
    with engine.begin() as conn:
        conn.execute(text(sql))

# ---------- Public API ----------
def upsert_balance_sheets_postgres(
    df: pd.DataFrame,
    conn_str: str = PG_CONN_STR,
    schema: str = SCHEMA,
    table: str = TABLE,
    chunk_rows: int = CHUNK_ROWS,
):
    """
    Idempotent upsert for balance sheets:
      - lowercases column names once (so no quoted identifiers)
      - ensures base table & unique(symbol,date) & indexes
      - AUTO-ADDS any missing columns to match df (including YoY and ratios)
      - stages & upserts in chunks
    """
    if df.empty:
        print("DataFrame is empty; nothing to ingest."); return

    # --- 1) Normalize names and datetimes ---
    df = df.copy()
    # Keep your columns but lowercased (underscores stay as-is; camelCase becomes flat lowercase)
    df.columns = df.columns.str.lower()

    # Make sure SCD window columns exist
    for c in ("date_start", "date_end"):
        if c not in df.columns:
            df[c] = pd.to_datetime(df["date"], errors="coerce")

    # tz-naive datetimes
    for dcol in ("date", "date_start", "date_end"):
        if dcol in df.columns:
            df[dcol] = pd.to_datetime(df[dcol], errors="coerce").dt.tz_localize(None)

    # --- 2) Ensure table & migrate any missing columns ---
    engine = _get_engine(conn_str)
    ensure_table_and_indexes(engine, schema, table)
    ensure_missing_columns(engine, schema, table, df)

    # --- 3) Keys first for readability ---
    key_first = [c for c in ("symbol","date","date_start","date_end") if c in df.columns]
    rest = [c for c in df.columns if c not in key_first]
    df = df[key_first + rest]

    # --- 4) Chunked stage + merge ---
    n = len(df); n_chunks = math.ceil(n / chunk_rows)
    for i in range(n_chunks):
        lo, hi = i * chunk_rows, min((i + 1) * chunk_rows, n)
        staging = f"stg_{table}"
        chunk = df.iloc[lo:hi].copy()
        _to_sql_staging(engine, chunk, schema, staging)
        _merge_from_staging(engine, schema, table, staging, chunk.columns.tolist())
        print(f"Upserted rows {lo}–{hi} / {n}")

    print("✅ Balance sheet ingestion complete.")


In [61]:
upsert_balance_sheets_postgres(df_bs)

Upserted rows 0–25000 / 66989
Upserted rows 25000–50000 / 66989
Upserted rows 50000–66989 / 66989
✅ Balance sheet ingestion complete.
