In [1]:
# !pip install sqlalchemy psycopg2-binary
import os, pandas as pd
from sqlalchemy import create_engine

# Connection settings come from the standard PG* environment variables; the literals are
# local-development fallbacks.
# NOTE(review): the PGPASSWORD fallback embeds a real-looking password in the notebook —
# prefer exporting PGPASSWORD and removing the literal before sharing.
from urllib.parse import quote_plus

pg_user = os.getenv("PGUSER", "postgres")
pg_pass = os.getenv("PGPASSWORD", "CSDBMS623")
pg_host = os.getenv("PGHOST", "localhost")
pg_port = os.getenv("PGPORT", "5432")
pg_db   = os.getenv("PGDATABASE", "SP500_ML")

# User and password are percent-encoded so characters such as '@', ':' or '/' in a
# credential cannot be mis-parsed as URL delimiters.
engine = create_engine(
    f"postgresql+psycopg2://{quote_plus(pg_user)}:{quote_plus(pg_pass)}@{pg_host}:{pg_port}/{pg_db}"
)

# The universe is every distinct, non-null ticker in sp500_long_latest_profiles
# (upper-cased and trimmed). The query itself applies no membership-date filter.
universe = pd.read_sql_query("""
    SELECT DISTINCT UPPER(TRIM(latest_ticker)) AS latest_ticker
    FROM sp500_long_latest_profiles
    WHERE latest_ticker IS NOT NULL

""", engine)["latest_ticker"].tolist()

print("Universe size:", len(universe))

# Now run your Yahoo fetcher
#prices_df = fetch_prices_for_universe(universe, checkpoint_path="prices_checkpoint.parquet")

Universe size: 679


In [7]:
# ===================== FMP Historical Market Cap — Batched, 2012→Present =====================
import time
from datetime import date
from typing import Iterable, List, Optional, Tuple, Union
import requests
import pandas as pd

# Base URL of the FMP v3 historical-market-capitalization endpoint; the upper-cased
# ticker is appended as the final path segment when a request is built.
FMP_HMC_BASE = "https://financialmodelingprep.com/api/v3/historical-market-capitalization"

# -------- Robust GET with simple retries --------
def _get_with_retries(
    session: requests.Session,
    url: str,
    params: dict,
    timeout: int = 30,
    max_retries: int = 4,
    base_sleep: float = 1.0,
):
    last = None
    for attempt in range(1, max_retries + 1):
        resp = session.get(url, params=params, timeout=timeout)
        if resp.status_code == 200:
            return resp
        last = resp
        # handle rate limits / transient
        if resp.status_code in (429, 500, 502, 503, 504):
            time.sleep(base_sleep * (2 ** (attempt - 1)))
            continue
        resp.raise_for_status()
    if last is not None:
        last.raise_for_status()
    raise RuntimeError("Request failed without response.")

# -------- Normalize JSON -> list[dict] --------
def _normalize_fmp_json(j):
    if isinstance(j, list):
        return j
    if isinstance(j, dict):
        for k in ("error", "Error", "message", "Note", "Error Message"):
            if k in j and isinstance(j[k], str):
                raise RuntimeError(f"API message: {j[k]}")
        for k in ("historical", "data", "results", "items"):
            if k in j and isinstance(j[k], list):
                return j[k]
        return [j]
    raise RuntimeError(f"Unexpected JSON type: {type(j)}")

# -------- Date windowing (≤5y per call) --------
def _year_windows(start: pd.Timestamp, end: pd.Timestamp, years_per_call: int = 5) -> List[Tuple[str, str]]:
    """
    Build [from, to] windows no longer than `years_per_call` years, inclusive.
    Returns ISO date strings YYYY-MM-DD.
    """
    windows = []
    cur_start = start.normalize()
    while cur_start <= end:
        cur_end = min(cur_start + pd.DateOffset(years=years_per_call) - pd.Timedelta(days=1), end)
        windows.append((cur_start.date().isoformat(), cur_end.date().isoformat()))
        cur_start = cur_end + pd.Timedelta(days=1)
    return windows

# -------- One-ticker fetch over windows --------
def fetch_market_cap_one(
    ticker: str,
    api_key: str,
    start: Union[str, pd.Timestamp] = "2012-01-01",
    end: Optional[Union[str, pd.Timestamp]] = None,
    session: Optional[requests.Session] = None,
    timeout: int = 30,
    per_call_limit: int = 200000,  # generous; API also enforces ~5y span per call
) -> pd.DataFrame:
    """
    Fetch the historical market-cap series for one ticker, one request per 5-year window.

    Args:
        ticker: Symbol to query; upper-cased before use.
        api_key: FMP API key sent as the `apikey` query parameter.
        start, end: Inclusive date range; `end` defaults to today.
        session: Optional session to reuse. When omitted, a private session is created
            and closed before returning.
        timeout: Per-request timeout passed through to the HTTP helper.
        per_call_limit: Value of the `limit` query parameter.

    Returns:
        DataFrame with columns symbol/date/marketCap, de-duplicated on (symbol, date)
        and sorted; an empty frame with those columns if no window returned rows.

    Raises:
        RuntimeError: on a non-JSON body or a payload without a 'date' field, plus
        whatever the HTTP/normalization helpers raise.
    """
    symbol = ticker.upper()
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    if end is None:
        end = pd.Timestamp.today().normalize()
    start = pd.to_datetime(start)
    end = pd.to_datetime(end)

    url = f"{FMP_HMC_BASE}/{symbol}"
    frames: List[pd.DataFrame] = []
    try:
        for f, t in _year_windows(start, end, years_per_call=5):
            params = {"from": f, "to": t, "apikey": api_key, "limit": per_call_limit}
            r = _get_with_retries(session, url, params, timeout=timeout)
            try:
                data = r.json()
            except ValueError as e:
                raise RuntimeError(f"Non-JSON response for {ticker}: {r.text[:300]}") from e
            records = _normalize_fmp_json(data)
            if not records:
                continue
            df = pd.DataFrame.from_records(records)

            # Harmonize keys; FMP uses 'marketCap'; guard for variants.
            if "marketCap" not in df.columns and "marketcap" in df.columns:
                df = df.rename(columns={"marketcap": "marketCap"})

            # Ensure required columns
            if "symbol" not in df.columns:
                df["symbol"] = symbol
            else:
                df["symbol"] = df["symbol"].astype(str).str.upper().replace({"": symbol})
            if "date" not in df.columns:
                raise RuntimeError(f"'date' missing for {ticker} window {f}→{t}")

            df["date"] = pd.to_datetime(df["date"], errors="coerce")
            if "marketCap" in df.columns:
                df["marketCap"] = pd.to_numeric(df["marketCap"], errors="coerce")
            else:
                # Keep the schema stable even if the payload lacks the value column.
                df["marketCap"] = float("nan")

            frames.append(df[["symbol", "date", "marketCap"]])
    finally:
        # Only close a session this function created; a caller-supplied one is reused.
        if owns_session:
            session.close()

    if not frames:
        # Return empty frame with expected columns to keep pipeline stable
        return pd.DataFrame(columns=["symbol", "date", "marketCap"])

    out = (
        pd.concat(frames, ignore_index=True)
          .dropna(subset=["date"])
          .drop_duplicates(subset=["symbol", "date"], keep="last")
          .sort_values(["symbol", "date"])
          .reset_index(drop=True)
    )
    # Nullable Float64 keeps missing caps as <NA> rather than NaN.
    out["marketCap"] = out["marketCap"].astype("Float64")
    return out

# -------- Multi-ticker orchestrator (batched) --------
def fetch_market_caps(
    tickers: Union[str, Iterable[str]],
    api_key: str,
    start: Union[str, pd.Timestamp] = "2012-01-01",
    end: Optional[Union[str, pd.Timestamp]] = None,
    batch_size: int = 25,
    sleep_between_batches: float = 1.0,
    timeout: int = 30,
    skip_errors: bool = True,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Fetch historical market caps for many tickers, processed in batches.

    Tickers are stringified, trimmed, upper-cased and de-duplicated (first occurrence
    wins); null or blank entries are dropped. One HTTP session is shared by all
    requests and closed when the run ends. The function sleeps
    `sleep_between_batches` seconds between batches.

    Args:
        tickers: One symbol or an iterable of symbols.
        api_key: FMP API key.
        start, end: Inclusive date range forwarded to `fetch_market_cap_one`.
        batch_size: Number of tickers per batch (must be >= 1).
        skip_errors: When True, a failing ticker is recorded and skipped; otherwise
            the exception propagates.
        verbose: Print progress and a summary.

    Returns:
        Concatenated symbol/date/marketCap frame, or an empty frame with those
        columns if nothing succeeded.

    Raises:
        ValueError: if `batch_size` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if isinstance(tickers, str):
        tickers = [tickers]
    cleaned = (str(t).strip().upper() for t in tickers if pd.notna(t))
    # dict.fromkeys de-duplicates while preserving input order.
    tickers = list(dict.fromkeys(s for s in cleaned if s))

    frames: List[pd.DataFrame] = []
    skipped: List[Tuple[str, str]] = []

    total = len(tickers)
    with requests.Session() as session:
        for i in range(0, total, batch_size):
            batch = tickers[i:i + batch_size]
            if verbose:
                print(f"Batch {i//batch_size + 1}: {len(batch)} tickers ({i+1}–{min(i+len(batch), total)} of {total})")

            for t in batch:
                try:
                    df_t = fetch_market_cap_one(
                        t, api_key=api_key, start=start, end=end,
                        session=session, timeout=timeout
                    )
                    if not df_t.empty:
                        frames.append(df_t)
                except Exception as e:
                    if skip_errors:
                        skipped.append((t, str(e)))
                        if verbose:
                            print(f"  [skip] {t}: {e}")
                    else:
                        raise

            # No pause after the last batch.
            if i + batch_size < total:
                time.sleep(sleep_between_batches)

    if not frames:
        if verbose:
            print("No successful market-cap pulls.")
            if skipped:
                print(f"Skipped {len(skipped)} tickers. Examples: {skipped[:5]}")
        return pd.DataFrame(columns=["symbol", "date", "marketCap"])

    df_all = pd.concat(frames, ignore_index=True)
    if verbose:
        # One frame per successful ticker, so this counts tickers (not windows).
        print(f"✅ Success: {len(frames)} tickers; rows: {len(df_all)}")
        if skipped:
            print(f"⚠️ Skipped {len(skipped)} tickers.")
    return df_all

# -------------------------- Example --------------------------
if __name__ == "__main__":
    import os

    # The key is read from the environment so it never has to be pasted into the notebook.
    API_KEY = os.getenv("FMP_API_KEY", "")
    if not API_KEY:
        # Fail fast: with an empty key every request would be rejected and skipped.
        raise RuntimeError("FMP_API_KEY is not set; export it before running this cell.")

    tickers = universe  # built by the universe cell at the top of the notebook

    df_mcap = fetch_market_caps(
        tickers=tickers,
        api_key=API_KEY,
        start="2012-01-01",
        end=None,                 # defaults to today
        batch_size=5,
        sleep_between_batches=3.0,
        skip_errors=True,
        verbose=True,
    )
    print(df_mcap.tail(12))


Batch 1: 5 tickers (1–5 of 679)
Batch 2: 5 tickers (6–10 of 679)
Batch 3: 5 tickers (11–15 of 679)
Batch 4: 5 tickers (16–20 of 679)
Batch 5: 5 tickers (21–25 of 679)
Batch 6: 5 tickers (26–30 of 679)
Batch 7: 5 tickers (31–35 of 679)
Batch 8: 5 tickers (36–40 of 679)
Batch 9: 5 tickers (41–45 of 679)
Batch 10: 5 tickers (46–50 of 679)
Batch 11: 5 tickers (51–55 of 679)
Batch 12: 5 tickers (56–60 of 679)
Batch 13: 5 tickers (61–65 of 679)
Batch 14: 5 tickers (66–70 of 679)
Batch 15: 5 tickers (71–75 of 679)
Batch 16: 5 tickers (76–80 of 679)
Batch 17: 5 tickers (81–85 of 679)
Batch 18: 5 tickers (86–90 of 679)
Batch 19: 5 tickers (91–95 of 679)
Batch 20: 5 tickers (96–100 of 679)
Batch 21: 5 tickers (101–105 of 679)
Batch 22: 5 tickers (106–110 of 679)
Batch 23: 5 tickers (111–115 of 679)
Batch 24: 5 tickers (116–120 of 679)
Batch 25: 5 tickers (121–125 of 679)
Batch 26: 5 tickers (126–130 of 679)
Batch 27: 5 tickers (131–135 of 679)
Batch 28: 5 tickers (136–140 of 679)
Batch 29: 5 tic

In [9]:
# ===================== DB INGEST — Historical Market Cap (auto-add columns) =====================
import math
from typing import Sequence, Set, Dict
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
from sqlalchemy.types import BigInteger, Float, Text, DateTime

# ---------- Configure ----------
import os

# The connection string can be overridden with the PG_CONN_STR environment variable.
# NOTE(review): the fallback embeds a password — prefer setting the variable and
# removing the literal before sharing this notebook.
PG_CONN_STR = os.getenv("PG_CONN_STR", "postgresql://postgres:CSDBMS623@localhost:5432/SP500_ML")
SCHEMA      = "public"
TABLE       = "market_caps_d"   # daily series
CHUNK_ROWS  = 25_000            # rows staged and merged per round trip

# ---------- Engine ----------
def _get_engine(conn_str: str) -> Engine:
    """Create a SQLAlchemy engine for `conn_str` with `pool_pre_ping` enabled."""
    engine_options = {"pool_pre_ping": True}
    return create_engine(conn_str, **engine_options)

# ---------- Create base table + indexes ----------
def ensure_table_and_indexes(engine: Engine, schema: str, table: str):
    """
    Idempotently create the market-cap table, its UNIQUE(symbol, date) constraint and
    two single-column indexes, all inside one transaction.

    The constraint lookup is scoped to this table through `conrelid`: constraint names
    in `pg_constraint` are not unique across tables or schemas, so matching on the name
    alone could wrongly skip creating the constraint that the upsert's ON CONFLICT
    clause depends on.
    """
    qualified = f'"{schema}"."{table}"'
    constraint = f"{table}_symbol_date_key"

    create_table = f'''
    CREATE TABLE IF NOT EXISTS {qualified} (
      symbol     TEXT,
      date       TIMESTAMP,
      marketcap  DOUBLE PRECISION
    );
    '''
    add_unique = f"""
    DO $$
    BEGIN
      IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = '{constraint}'
          AND conrelid = CAST('{qualified}' AS regclass)
      ) THEN
        ALTER TABLE {qualified}
        ADD CONSTRAINT {constraint} UNIQUE (symbol, date);
      END IF;
    END$$;
    """
    symbol_index = f'CREATE INDEX IF NOT EXISTS {table}_symbol_idx ON {qualified} (symbol);'
    date_index = f'CREATE INDEX IF NOT EXISTS {table}_date_idx   ON {qualified} (date);'

    with engine.begin() as conn:
        # Order matters: the table must exist before the constraint and indexes.
        for statement in (create_table, add_unique, symbol_index, date_index):
            conn.execute(text(statement))

# ---------- Introspection + migration ----------
def _existing_columns(engine: Engine, schema: str, table: str) -> Set[str]:
    """Return the lower-cased column names that information_schema lists for schema.table."""
    query = text("""
    SELECT lower(column_name) FROM information_schema.columns
    WHERE table_schema = :schema AND table_name = :table
    """)
    bind_values = {"schema": schema, "table": table}
    with engine.begin() as conn:
        found = {row[0] for row in conn.execute(query, bind_values).fetchall()}
    return found

def _infer_sql_type_from_series(s: pd.Series) -> str:
    if pd.api.types.is_datetime64_any_dtype(s): return "TIMESTAMP"
    if pd.api.types.is_integer_dtype(s):        return "BIGINT"
    if pd.api.types.is_float_dtype(s):          return "DOUBLE PRECISION"
    return "TEXT"

def ensure_missing_columns(engine: Engine, schema: str, table: str, df: pd.DataFrame):
    """
    Add to schema.table every column of `df` that the table does not already have.

    The SQL type of each new column is inferred from the corresponding Series' dtype;
    all additions are issued as one ALTER TABLE statement. Nothing is executed when no
    column is missing.
    """
    present = _existing_columns(engine, schema, table)
    add_clauses = [
        f'ADD COLUMN IF NOT EXISTS {name} {_infer_sql_type_from_series(df[name])}'
        for name in df.columns
        if name not in present
    ]
    if not add_clauses:
        return
    statement = f'ALTER TABLE "{schema}"."{table}" ' + ", ".join(add_clauses) + ";"
    with engine.begin() as conn:
        conn.execute(text(statement))

# ---------- Staging + merge ----------
def _build_dtype_map(df: pd.DataFrame) -> Dict[str, object]:
    """Build the column -> SQLAlchemy type mapping passed to `DataFrame.to_sql(dtype=...)`."""

    def _sqlalchemy_type(column):
        # Same precedence as the SQL-type inference: datetime, integer, float, else text.
        if pd.api.types.is_datetime64_any_dtype(column):
            return DateTime(timezone=False)
        if pd.api.types.is_integer_dtype(column):
            return BigInteger()
        if pd.api.types.is_float_dtype(column):
            return Float()
        return Text()

    return {name: _sqlalchemy_type(df[name]) for name in df.columns}

def _to_sql_staging(engine: Engine, df: pd.DataFrame, schema: str, staging: str):
    """Write `df` to schema.staging, replacing any existing table of that name."""
    column_types = _build_dtype_map(df)
    write_options = {
        "schema": schema,
        "if_exists": "replace",   # each chunk starts from a fresh staging table
        "index": False,
        "dtype": column_types,
        "chunksize": 10_000,
    }
    df.to_sql(name=staging, con=engine, **write_options)

def _merge_from_staging(engine: Engine, schema: str, table: str, staging: str, cols: Sequence[str]):
    """
    Upsert every staged row into schema.table, keyed on (symbol, date).

    On conflict, each non-key column in `cols` is overwritten with the staged value.
    If `cols` holds only key columns, a harmless self-assignment of `symbol` keeps the
    DO UPDATE clause syntactically valid.
    """
    key_cols = ("symbol", "date")
    col_list = ", ".join(cols)
    set_clause = ", ".join(f"{c}=EXCLUDED.{c}" for c in cols if c not in key_cols)
    if not set_clause:
        set_clause = "symbol=EXCLUDED.symbol"
    sql = f"""
    INSERT INTO "{schema}"."{table}" ({col_list})
    SELECT {col_list} FROM "{schema}"."{staging}"
    ON CONFLICT (symbol, date)
    DO UPDATE SET {set_clause};
    """
    with engine.begin() as conn:
        conn.execute(text(sql))

# ---------- Public API ----------
def upsert_market_caps_postgres(
    df: pd.DataFrame,
    conn_str: str = PG_CONN_STR,
    schema: str = SCHEMA,
    table: str = TABLE,
    chunk_rows: int = CHUNK_ROWS,
):
    """
    Upsert a market-cap frame into Postgres.

    Steps:
      - lowercase column names (symbol/date/marketCap -> symbol/date/marketcap)
      - coerce `date` to tz-naive timestamps and upper-case `symbol`
      - drop rows whose key (symbol/date) is missing, and rows that repeat a key
        (last occurrence wins): NULL keys never conflict and would be re-inserted on
        every run, and duplicate keys inside one statement make ON CONFLICT fail
      - ensure table + unique(symbol,date) + indexes, auto-add extra columns
      - stage & upsert in chunks of `chunk_rows`

    The engine created for the run is disposed before returning.

    Raises:
        ValueError: if `chunk_rows` is less than 1.
    """
    if df.empty:
        print("df_mcap is empty; nothing to ingest.")
        return
    if chunk_rows < 1:
        raise ValueError("chunk_rows must be >= 1")

    df = df.copy()
    df.columns = df.columns.str.lower()

    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.tz_localize(None)
    if "symbol" in df.columns:
        # Upper-case real symbols but leave missing ones missing, so they are dropped
        # below instead of becoming the literal string "NAN".
        sym = df["symbol"]
        df["symbol"] = sym.where(sym.isna(), sym.astype(str).str.upper())

    key_first = [c for c in ("symbol", "date") if c in df.columns]
    before = len(df)
    if key_first:
        df = df.dropna(subset=key_first)
    if len(key_first) == 2:
        df = df.drop_duplicates(subset=key_first, keep="last")
    if len(df) != before:
        print(f"Dropped {before - len(df)} rows with missing or duplicate (symbol, date) keys.")
    if df.empty:
        print("df_mcap is empty; nothing to ingest.")
        return

    # keys first for readability
    rest = [c for c in df.columns if c not in key_first]
    df = df[key_first + rest]

    engine = _get_engine(conn_str)
    try:
        ensure_table_and_indexes(engine, schema, table)
        ensure_missing_columns(engine, schema, table, df)

        # chunked stage + merge
        n = len(df)
        staging = f"stg_{table}"
        for i in range(math.ceil(n / chunk_rows)):
            lo, hi = i * chunk_rows, min((i + 1) * chunk_rows, n)
            chunk = df.iloc[lo:hi].copy()
            _to_sql_staging(engine, chunk, schema, staging)
            _merge_from_staging(engine, schema, table, staging, chunk.columns.tolist())
            print(f"Upserted rows {lo}–{hi} / {n}")
    finally:
        # Release pooled connections even if a chunk fails.
        engine.dispose()

    print("✅ Market cap ingestion complete.")


In [11]:
# Ingest the market-cap frame built by the fetch cell (`df_mcap`) using the default
# connection string, schema and table from the configuration block.
upsert_market_caps_postgres(df_mcap)

Upserted rows 0–25000 / 2124458
Upserted rows 25000–50000 / 2124458
Upserted rows 50000–75000 / 2124458
Upserted rows 75000–100000 / 2124458
Upserted rows 100000–125000 / 2124458
Upserted rows 125000–150000 / 2124458
Upserted rows 150000–175000 / 2124458
Upserted rows 175000–200000 / 2124458
Upserted rows 200000–225000 / 2124458
Upserted rows 225000–250000 / 2124458
Upserted rows 250000–275000 / 2124458
Upserted rows 275000–300000 / 2124458
Upserted rows 300000–325000 / 2124458
Upserted rows 325000–350000 / 2124458
Upserted rows 350000–375000 / 2124458
Upserted rows 375000–400000 / 2124458
Upserted rows 400000–425000 / 2124458
Upserted rows 425000–450000 / 2124458
Upserted rows 450000–475000 / 2124458
Upserted rows 475000–500000 / 2124458
Upserted rows 500000–525000 / 2124458
Upserted rows 525000–550000 / 2124458
Upserted rows 550000–575000 / 2124458
Upserted rows 575000–600000 / 2124458
Upserted rows 600000–625000 / 2124458
Upserted rows 625000–650000 / 2124458
Upserted rows 650000–67

In [13]:
# ===================== FMP Enterprise Value — Batched, 2012→Present =====================
import time
from typing import Iterable, List, Optional, Tuple, Union
import requests
import pandas as pd

import warnings
# NOTE(review): this installs a blanket "ignore" filter for every warning category,
# so deprecation and data-quality warnings from later code are hidden too —
# consider narrowing it to a specific category or module.
warnings.filterwarnings('ignore')

# Base URL of the FMP v3 enterprise-values endpoint; the upper-cased ticker is
# appended as the final path segment when a request is built.
FMP_EV_BASE = "https://financialmodelingprep.com/api/v3/enterprise-values"

# -------- Robust GET with simple retries --------
def _get_with_retries(
    session: requests.Session,
    url: str,
    params: dict,
    timeout: int = 30,
    max_retries: int = 4,
    base_sleep: float = 1.0,
):
    last = None
    for attempt in range(1, max_retries + 1):
        resp = session.get(url, params=params, timeout=timeout)
        if resp.status_code == 200:
            return resp
        last = resp
        if resp.status_code in (429, 500, 502, 503, 504):
            time.sleep(base_sleep * (2 ** (attempt - 1)))
            continue
        resp.raise_for_status()
    if last is not None:
        last.raise_for_status()
    raise RuntimeError("Request failed without response.")

# -------- Normalize Enterprise Values payload -> list[dict] --------
def _normalize_ev_json(j, ticker: str) -> List[dict]:
    """
    FMP often returns:
      { "symbol": "AAPL", "enterpriseValues": [ {...}, {...} ] }
    but sometimes we may get a raw list of dicts already.
    This function returns a flat list of dicts, each with 'symbol' set.
    """
    if isinstance(j, list):
        # already a list of records (rare for this endpoint)
        out = []
        for rec in j:
            rec = dict(rec)
            rec["symbol"] = (str(rec.get("symbol", ticker)).upper() or ticker.upper())
            out.append(rec)
        return out

    if isinstance(j, dict):
        # common case: {symbol, enterpriseValues: [...]}
        symbol = str(j.get("symbol", ticker)).upper()
        evs = j.get("enterpriseValues")
        if isinstance(evs, list):
            out = []
            for rec in evs:
                r = dict(rec)
                r["symbol"] = symbol
                out.append(r)
            return out

        # fallback: any list-like under other keys
        for k in ("items", "data", "results", "historical"):
            if isinstance(j.get(k), list):
                out = []
                for rec in j[k]:
                    r = dict(rec)
                    r["symbol"] = symbol
                    out.append(r)
                return out

        # single record fallback
        one = dict(j)
        one["symbol"] = symbol
        return [one]

    raise RuntimeError(f"Unexpected JSON type from API: {type(j)}")

# -------- One-ticker fetch (quarterly by default) --------
def fetch_enterprise_values_one(
    ticker: str,
    api_key: str,
    period: str = "quarter",   # "quarter" or "annual"
    start: Union[str, pd.Timestamp] = "2012-01-01",
    end: Optional[Union[str, pd.Timestamp]] = None,
    session: Optional[requests.Session] = None,
    timeout: int = 30,
    per_call_limit: int = 5000,  # large to get full history
) -> pd.DataFrame:
    """
    Pull enterprise values for one ticker and return them as a tidy DataFrame.

    The request carries only period/apikey/limit, so rows are filtered to
    [start, end] after the fetch (`end` defaults to today).

    Args:
        ticker: Symbol to query; upper-cased before use.
        api_key: FMP API key.
        period: Reporting period forwarded to the API.
        session: Optional session to reuse. When omitted, a private session is created
            and closed before returning.
        timeout: Per-request timeout.
        per_call_limit: Value of the `limit` query parameter.

    Returns:
        DataFrame whose first columns are symbol/date/enterpriseValue, followed by any
        other fields the payload contained; de-duplicated on (symbol, date) and sorted.

    Raises:
        RuntimeError: on a non-JSON body or a payload without a 'date' field, plus
        whatever the HTTP/normalization helpers raise.
    """
    symbol = ticker.upper()
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    url = f"{FMP_EV_BASE}/{symbol}"
    params = {"period": period, "apikey": api_key, "limit": per_call_limit}
    try:
        r = _get_with_retries(session, url, params, timeout=timeout)
        try:
            data = r.json()
        except ValueError as e:
            raise RuntimeError(f"Non-JSON response for {ticker}: {r.text[:300]}") from e
    finally:
        # Only close a session this function created; a caller-supplied one is reused.
        if owns_session:
            session.close()

    rows = _normalize_ev_json(data, ticker)
    if not rows:
        return pd.DataFrame(columns=["symbol", "date", "enterpriseValue"])

    df = pd.DataFrame.from_records(rows)

    # robust symbol handling: blank / stringified-null symbols fall back to the
    # requested ticker. The comparison is case-insensitive because upstream
    # normalization may already have upper-cased the values.
    if "symbol" in df.columns:
        sym = df["symbol"].astype(str).str.strip()
        missing = sym.str.lower().isin(["", "none", "nan"])
        df["symbol"] = sym.mask(missing, symbol).str.upper()
    else:
        df["symbol"] = symbol

    # date
    if "date" not in df.columns:
        raise RuntimeError(f"'date' missing in enterprise-values payload for {ticker}.")
    df["date"] = pd.to_datetime(df["date"], errors="coerce")

    # map lowercase variants of the main fields onto their canonical camelCase names
    rename_map = {
        "marketcap": "marketCapitalization",
        "stockprice": "stockPrice",
        "numberofshares": "numberOfShares",
        "enterprisevalue": "enterpriseValue",
    }
    df = df.rename(columns={
        k: v for k, v in rename_map.items() if k in df.columns and v not in df.columns
    })

    primary_numeric = ("enterpriseValue", "marketCapitalization", "stockPrice", "numberOfShares")
    for c in list(df.columns):
        if c in ("symbol", "date"):
            continue
        if c in primary_numeric:
            # Main fields are always numeric; unparseable values become <NA>.
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("Float64")
        else:
            # Other fields are converted only if the whole column parses; otherwise
            # they are left untouched (replaces the deprecated errors="ignore").
            try:
                df[c] = pd.to_numeric(df[c])
            except (ValueError, TypeError):
                pass

    # de-dup & sort
    df = (
        df.dropna(subset=["date"])
          .drop_duplicates(subset=["symbol", "date"], keep="last")
          .sort_values(["symbol", "date"])
          .reset_index(drop=True)
    )

    # date filter; .copy() so the column assignment below never targets a view
    start_ts = pd.to_datetime(start)
    end_ts = pd.to_datetime(end) if end is not None else pd.Timestamp.today().normalize()
    df = df[(df["date"] >= start_ts) & (df["date"] <= end_ts)].copy()

    # ensure the main column exists even if the payload lacked it
    if "enterpriseValue" not in df.columns:
        df["enterpriseValue"] = pd.Series(pd.NA, index=df.index, dtype="Float64")

    # keep a tidy base + any extras FMP returned
    base = ["symbol", "date", "enterpriseValue"]
    extras = [c for c in df.columns if c not in base]
    return df[base + extras]

# -------- Multi-ticker orchestrator (batched) --------
def fetch_enterprise_values(
    tickers: Union[str, Iterable[str]],
    api_key: str,
    start: Union[str, pd.Timestamp] = "2012-01-01",
    end: Optional[Union[str, pd.Timestamp]] = None,
    period: str = "quarter",
    batch_size: int = 25,
    sleep_between_batches: float = 1.0,
    timeout: int = 30,
    skip_errors: bool = True,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    Fetch enterprise values for many tickers, processed in batches.

    Tickers are stringified, trimmed, upper-cased and de-duplicated (first occurrence
    wins); null or blank entries are dropped. One HTTP session is shared by all
    requests and closed when the run ends. The function sleeps
    `sleep_between_batches` seconds between batches.

    Args:
        tickers: One symbol or an iterable of symbols.
        api_key: FMP API key.
        start, end, period: Forwarded to `fetch_enterprise_values_one`.
        batch_size: Number of tickers per batch (must be >= 1).
        skip_errors: When True, a failing ticker is recorded and skipped; otherwise
            the exception propagates.
        verbose: Print progress and a summary.

    Returns:
        Concatenated per-ticker frames, or an empty symbol/date/enterpriseValue frame
        if nothing succeeded.

    Raises:
        ValueError: if `batch_size` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if isinstance(tickers, str):
        tickers = [tickers]
    cleaned = (str(t).strip().upper() for t in tickers if pd.notna(t))
    # dict.fromkeys de-duplicates while preserving input order.
    tickers = list(dict.fromkeys(s for s in cleaned if s))

    frames: List[pd.DataFrame] = []
    skipped: List[Tuple[str, str]] = []
    total = len(tickers)

    with requests.Session() as session:
        for i in range(0, total, batch_size):
            batch = tickers[i:i + batch_size]
            if verbose:
                print(f"Batch {i//batch_size + 1}: {len(batch)} tickers ({i+1}–{min(i+len(batch), total)} of {total})")

            for t in batch:
                try:
                    df_t = fetch_enterprise_values_one(
                        t, api_key=api_key, period=period,
                        start=start, end=end,
                        session=session, timeout=timeout
                    )
                    if not df_t.empty:
                        frames.append(df_t)
                except Exception as e:
                    if skip_errors:
                        skipped.append((t, str(e)))
                        if verbose:
                            print(f"  [skip] {t}: {e}")
                    else:
                        raise

            # No pause after the last batch.
            if i + batch_size < total:
                time.sleep(sleep_between_batches)

    if not frames:
        if verbose:
            print("No successful enterprise-value pulls.")
            if skipped:
                print(f"Skipped {len(skipped)} tickers. Examples: {skipped[:5]}")
        return pd.DataFrame(columns=["symbol", "date", "enterpriseValue"])

    df_all = pd.concat(frames, ignore_index=True)
    if verbose:
        # One frame per successful ticker, so this counts tickers.
        print(f"✅ Success: {len(frames)} tickers; rows: {len(df_all)}")
        if skipped:
            print(f"⚠️ Skipped {len(skipped)} tickers.")
    return df_all

# -------------------------- Example --------------------------
if __name__ == "__main__":
    import os

    # The key is read from the environment so it never has to be pasted into the notebook.
    API_KEY = os.getenv("FMP_API_KEY", "")
    if not API_KEY:
        # Fail fast: with an empty key every request would be rejected and skipped.
        raise RuntimeError("FMP_API_KEY is not set; export it before running this cell.")

    tickers = universe  # built by the universe cell at the top of the notebook

    df_ev = fetch_enterprise_values(
        tickers=tickers,
        api_key=API_KEY,
        start="2012-01-01",
        end=None,           # defaults to today
        period="quarter",   # or "annual"
        batch_size=5,
        sleep_between_batches=3.0,
        skip_errors=True,
        verbose=True,
    )
    print(df_ev.head(12))

Batch 1: 5 tickers (1–5 of 679)
Batch 2: 5 tickers (6–10 of 679)
Batch 3: 5 tickers (11–15 of 679)
Batch 4: 5 tickers (16–20 of 679)
Batch 5: 5 tickers (21–25 of 679)
Batch 6: 5 tickers (26–30 of 679)
Batch 7: 5 tickers (31–35 of 679)
Batch 8: 5 tickers (36–40 of 679)
Batch 9: 5 tickers (41–45 of 679)
Batch 10: 5 tickers (46–50 of 679)
Batch 11: 5 tickers (51–55 of 679)
Batch 12: 5 tickers (56–60 of 679)
Batch 13: 5 tickers (61–65 of 679)
Batch 14: 5 tickers (66–70 of 679)
Batch 15: 5 tickers (71–75 of 679)
Batch 16: 5 tickers (76–80 of 679)
Batch 17: 5 tickers (81–85 of 679)
Batch 18: 5 tickers (86–90 of 679)
Batch 19: 5 tickers (91–95 of 679)
Batch 20: 5 tickers (96–100 of 679)
Batch 21: 5 tickers (101–105 of 679)
Batch 22: 5 tickers (106–110 of 679)
Batch 23: 5 tickers (111–115 of 679)
Batch 24: 5 tickers (116–120 of 679)
Batch 25: 5 tickers (121–125 of 679)
Batch 26: 5 tickers (126–130 of 679)
Batch 27: 5 tickers (131–135 of 679)
Batch 28: 5 tickers (136–140 of 679)
Batch 29: 5 tic

In [15]:
# ===================== DB INGEST — Enterprise Values (auto-add columns) =====================
import math
from typing import Sequence, Set, Dict
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
from sqlalchemy.types import BigInteger, Float, Text, DateTime

# ---------- Configure ----------
import os

# The connection string can be overridden with the PG_CONN_STR environment variable.
# NOTE(review): the fallback embeds a password — prefer setting the variable and
# removing the literal before sharing this notebook.
PG_CONN_STR = os.getenv("PG_CONN_STR", "postgresql://postgres:CSDBMS623@localhost:5432/SP500_ML")
SCHEMA      = "public"
TABLE       = "enterprise_values_q"   # quarterly EV series
CHUNK_ROWS  = 25_000                  # rows staged and merged per round trip

# ---------- Engine ----------
def _get_engine(conn_str: str) -> Engine:
    """Create a SQLAlchemy engine for `conn_str` with `pool_pre_ping` enabled."""
    engine_options = {"pool_pre_ping": True}
    return create_engine(conn_str, **engine_options)

# ---------- Create base table + indexes ----------
def ensure_table_and_indexes(engine: Engine, schema: str, table: str):
    """
    Idempotently create the enterprise-values table, its UNIQUE(symbol, date)
    constraint and two single-column indexes, all inside one transaction.

    The constraint lookup is scoped to this table through `conrelid`: constraint names
    in `pg_constraint` are not unique across tables or schemas, so matching on the name
    alone could wrongly skip creating the constraint that the upsert's ON CONFLICT
    clause depends on.
    """
    qualified = f'"{schema}"."{table}"'
    constraint = f"{table}_symbol_date_key"

    create_table = f'''
    CREATE TABLE IF NOT EXISTS {qualified} (
      symbol          TEXT,
      date            TIMESTAMP,
      enterprisevalue DOUBLE PRECISION
    );
    '''
    add_unique = f"""
    DO $$
    BEGIN
      IF NOT EXISTS (
        SELECT 1 FROM pg_constraint
        WHERE conname = '{constraint}'
          AND conrelid = CAST('{qualified}' AS regclass)
      ) THEN
        ALTER TABLE {qualified}
        ADD CONSTRAINT {constraint} UNIQUE (symbol, date);
      END IF;
    END$$;
    """
    symbol_index = f'CREATE INDEX IF NOT EXISTS {table}_symbol_idx ON {qualified} (symbol);'
    date_index = f'CREATE INDEX IF NOT EXISTS {table}_date_idx   ON {qualified} (date);'

    with engine.begin() as conn:
        # Order matters: the table must exist before the constraint and indexes.
        for statement in (create_table, add_unique, symbol_index, date_index):
            conn.execute(text(statement))

# ---------- Introspection + migration ----------
def _existing_columns(engine: Engine, schema: str, table: str) -> Set[str]:
    """Return the lower-cased column names that information_schema lists for schema.table."""
    lookup = text("""
    SELECT lower(column_name) FROM information_schema.columns
    WHERE table_schema = :schema AND table_name = :table
    """)
    names: Set[str] = set()
    with engine.begin() as conn:
        for record in conn.execute(lookup, {"schema": schema, "table": table}).fetchall():
            names.add(record[0])
    return names

def _infer_sql_type_from_series(s: pd.Series) -> str:
    if pd.api.types.is_datetime64_any_dtype(s): return "TIMESTAMP"
    if pd.api.types.is_integer_dtype(s):        return "BIGINT"
    if pd.api.types.is_float_dtype(s):          return "DOUBLE PRECISION"  # covers float64 & pandas Float64
    return "TEXT"

def ensure_missing_columns(engine: Engine, schema: str, table: str, df: pd.DataFrame):
    """
    Add to schema.table every column of `df` that the table does not already have.

    The SQL type of each new column is inferred from the corresponding Series' dtype;
    all additions are issued as one ALTER TABLE statement. Nothing is executed when no
    column is missing.
    """
    present = _existing_columns(engine, schema, table)
    to_add = [name for name in df.columns if name not in present]
    if len(to_add) == 0:
        return
    add_clauses = ", ".join(
        f'ADD COLUMN IF NOT EXISTS {name} {_infer_sql_type_from_series(df[name])}'
        for name in to_add
    )
    with engine.begin() as conn:
        conn.execute(text(f'ALTER TABLE "{schema}"."{table}" ' + add_clauses + ";"))

# ---------- Staging + merge ----------
def _build_dtype_map(df: pd.DataFrame) -> Dict[str, object]:
    """Build the column -> SQLAlchemy type mapping passed to `DataFrame.to_sql(dtype=...)`."""

    def _sqlalchemy_type(column):
        # Same precedence as the SQL-type inference: datetime, integer, float, else text.
        if pd.api.types.is_datetime64_any_dtype(column):
            return DateTime(timezone=False)
        if pd.api.types.is_integer_dtype(column):
            return BigInteger()
        if pd.api.types.is_float_dtype(column):
            return Float()
        return Text()

    return {name: _sqlalchemy_type(df[name]) for name in df.columns}

def _to_sql_staging(engine: Engine, df: pd.DataFrame, schema: str, staging: str):
    """Write `df` to schema.staging, replacing any existing table of that name."""
    column_types = _build_dtype_map(df)
    write_options = {
        "schema": schema,
        "if_exists": "replace",   # each chunk starts from a fresh staging table
        "index": False,
        "dtype": column_types,
        "chunksize": 10_000,
    }
    df.to_sql(name=staging, con=engine, **write_options)

def _merge_from_staging(engine: Engine, schema: str, table: str, staging: str, cols: Sequence[str]):
    """
    Upsert every staged row into schema.table, keyed on (symbol, date).

    On conflict, each non-key column in `cols` is overwritten with the staged value.
    If `cols` holds only key columns, a harmless self-assignment of `symbol` keeps the
    DO UPDATE clause syntactically valid.
    """
    key_cols = ("symbol", "date")
    col_list = ", ".join(cols)
    set_clause = ", ".join(f"{c}=EXCLUDED.{c}" for c in cols if c not in key_cols)
    if not set_clause:
        set_clause = "symbol=EXCLUDED.symbol"
    sql = f"""
    INSERT INTO "{schema}"."{table}" ({col_list})
    SELECT {col_list} FROM "{schema}"."{staging}"
    ON CONFLICT (symbol, date)
    DO UPDATE SET {set_clause};
    """
    with engine.begin() as conn:
        conn.execute(text(sql))

# ---------- Public API ----------
def upsert_enterprise_values_postgres(
    df: pd.DataFrame,
    conn_str: str = PG_CONN_STR,
    schema: str = SCHEMA,
    table: str = TABLE,
    chunk_rows: int = CHUNK_ROWS,
):
    """
    Upsert an enterprise-values frame into Postgres.

    Steps:
      - lowercase column names (e.g. enterpriseValue -> enterprisevalue)
      - coerce `date` to tz-naive timestamps and upper-case `symbol`
      - drop rows whose key (symbol/date) is missing, and rows that repeat a key
        (last occurrence wins): NULL keys never conflict and would be re-inserted on
        every run, and duplicate keys inside one statement make ON CONFLICT fail
      - ensure table + unique(symbol,date) + indexes, auto-add extra columns
      - stage & upsert in chunks of `chunk_rows`

    The engine created for the run is disposed before returning.

    Raises:
        ValueError: if `chunk_rows` is less than 1.
    """
    if df.empty:
        print("df_ev is empty; nothing to ingest.")
        return
    if chunk_rows < 1:
        raise ValueError("chunk_rows must be >= 1")

    df = df.copy()
    # Lowercasing alone yields the table's column names; no further renaming is needed.
    df.columns = df.columns.str.lower()

    # clean types
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.tz_localize(None)
    if "symbol" in df.columns:
        # Upper-case real symbols but leave missing ones missing, so they are dropped
        # below instead of becoming the literal string "NAN".
        sym = df["symbol"]
        df["symbol"] = sym.where(sym.isna(), sym.astype(str).str.upper())

    key_first = [c for c in ("symbol", "date") if c in df.columns]
    before = len(df)
    if key_first:
        df = df.dropna(subset=key_first)
    if len(key_first) == 2:
        df = df.drop_duplicates(subset=key_first, keep="last")
    if len(df) != before:
        print(f"Dropped {before - len(df)} rows with missing or duplicate (symbol, date) keys.")
    if df.empty:
        print("df_ev is empty; nothing to ingest.")
        return

    # keys first for readability
    rest = [c for c in df.columns if c not in key_first]
    df = df[key_first + rest]

    engine = _get_engine(conn_str)
    try:
        ensure_table_and_indexes(engine, schema, table)
        ensure_missing_columns(engine, schema, table, df)

        # chunked stage + merge
        n = len(df)
        staging = f"stg_{table}"
        for i in range(math.ceil(n / chunk_rows)):
            lo, hi = i * chunk_rows, min((i + 1) * chunk_rows, n)
            chunk = df.iloc[lo:hi].copy()
            _to_sql_staging(engine, chunk, schema, staging)
            _merge_from_staging(engine, schema, table, staging, chunk.columns.tolist())
            print(f"Upserted rows {lo}–{hi} / {n}")
    finally:
        # Release pooled connections even if a chunk fails.
        engine.dispose()

    print("✅ Enterprise values ingestion complete.")


In [17]:
# Ingest the enterprise-values frame built by the fetch cell (`df_ev`).
# df_ev has columns like:
# ['symbol','date','enterpriseValue','stockPrice','numberOfShares',
#  'marketCapitalization','minusCashAndCashEquivalents','addTotalDebt']
# The upsert lowercases these names before writing, so the table columns are
# e.g. enterprisevalue, stockprice, numberofshares.
upsert_enterprise_values_postgres(df_ev)

Upserted rows 0–25000 / 34027
Upserted rows 25000–34027 / 34027
✅ Enterprise values ingestion complete.


In [21]:
# ===================== FMP Analyst Estimates — Full History + Postgres Upsert =====================
import time
from typing import Iterable, List, Optional, Tuple, Union, Sequence, Set, Dict
import requests
import pandas as pd
import numpy as np

# -------------------------- Config --------------------------
import os
from urllib.parse import quote_plus

FMP_AE_BASE = "https://financialmodelingprep.com/api/v3/analyst-estimates"

# --- Postgres (optional; used by upsert_analyst_estimates_postgres at the bottom) ---
# Connection settings are read from the standard PG* environment variables, the
# same ones the first cell of this notebook uses, so credentials can be supplied
# without editing the notebook. The literal fallbacks reproduce the previous
# hard-coded URL when no variable is set.
# NOTE(review): the password fallback is still a secret in source control —
# prefer exporting PGPASSWORD and removing the default.
_pg_user = quote_plus(os.getenv("PGUSER", "postgres"))      # URL-escape: '@', ':' or '/' would corrupt the URL
_pg_pass = quote_plus(os.getenv("PGPASSWORD", "CSDBMS623"))
_pg_host = os.getenv("PGHOST", "localhost")
_pg_port = os.getenv("PGPORT", "5432")
_pg_db   = os.getenv("PGDATABASE", "SP500_ML")

PG_CONN_STR = f"postgresql://{_pg_user}:{_pg_pass}@{_pg_host}:{_pg_port}/{_pg_db}"
SCHEMA      = "public"
TABLE       = "analyst_estimates_q"      # change if you prefer another table name
CHUNK_ROWS  = 25_000                     # rows staged + merged per upsert round-trip

# -------------------------- HTTP helpers --------------------------
def _get_with_retries(
    session: requests.Session,
    url: str,
    params: dict,
    timeout: int = 30,
    max_retries: int = 4,
    base_sleep: float = 1.0,
):
    """GET with simple exponential backoff on transient errors/rate-limits."""
    last = None
    for attempt in range(1, max_retries + 1):
        resp = session.get(url, params=params, timeout=timeout)
        if resp.status_code == 200:
            return resp
        last = resp
        if resp.status_code in (429, 500, 502, 503, 504):
            time.sleep(base_sleep * (2 ** (attempt - 1)))
            continue
        resp.raise_for_status()
    if last is not None:
        last.raise_for_status()
    raise RuntimeError("Request failed without response.")

def _normalize_fmp_json(j):
    """Normalize possible JSON shapes to list[dict]."""
    if isinstance(j, list):
        return j
    if isinstance(j, dict):
        # surface API messages as exceptions
        for k in ("error", "Error", "message", "Note", "Error Message"):
            if k in j and isinstance(j[k], str):
                raise RuntimeError(f"API message: {j[k]}")
        # common list-bearing keys
        for k in ("items", "data", "results", "analystEstimates", "historical"):
            if k in j and isinstance(j[k], list):
                return j[k]
        # fallback: single record
        return [j]
    raise RuntimeError(f"Unexpected JSON type from API: {type(j)}")

# -------------------------- Core fetchers --------------------------
def fetch_analyst_estimates_one(
    ticker: str,
    api_key: str,
    session: Optional[requests.Session] = None,
    timeout: int = 30,
    per_call_limit: int = 50000,
    start: Optional[Union[str, pd.Timestamp]] = None,   # None => no lower bound
    end:   Optional[Union[str, pd.Timestamp]] = None,   # None => no upper bound
    include_future: bool = True,                         # keep forward dates by default
) -> pd.DataFrame:
    """
    Fetch *all available* analyst estimates for a single ticker.

    Parameters
      ticker         : symbol to request; surrounding whitespace is stripped and
                       the symbol is upper-cased before it goes into the URL.
      api_key        : sent as the ``apikey`` query parameter.
      session        : optional requests session to reuse. When omitted, a
                       temporary session is created and closed before returning.
      timeout        : per-request timeout passed to ``_get_with_retries``.
      per_call_limit : sent as the ``limit`` query parameter.
      start, end     : optional inclusive date bounds applied after parsing.
      include_future : when ``end`` is None and this is False, rows dated after
                       today are dropped.

    Returns
      DataFrame with ``symbol`` and ``date`` first, then every other field of
      the payload. Columns that parse cleanly as numbers are stored as nullable
      ``Float64``; everything else is left as delivered. Rows with an
      unparseable date are dropped, (symbol, date) duplicates keep the last
      occurrence, and the result is sorted by symbol/date with a fresh index.
      An empty payload yields an empty frame with just ``symbol``/``date``.

    Raises
      RuntimeError if the body is not JSON, if the payload is an API message
      (see ``_normalize_fmp_json``) or if it has no ``date`` field; plus
      anything ``_get_with_retries`` raises for HTTP failures.
    """
    symbol = ticker.strip().upper()

    owns_session = session is None
    if owns_session:
        session = requests.Session()

    url = f"{FMP_AE_BASE}/{symbol}"
    params = {"apikey": api_key, "limit": per_call_limit}

    try:
        r = _get_with_retries(session, url, params, timeout=timeout)
        try:
            data = r.json()
        except ValueError as e:
            raise RuntimeError(f"Non-JSON response for {ticker}: {r.text[:300]}") from e
    finally:
        # Only close a session we opened; a caller-supplied one stays usable.
        if owns_session:
            session.close()

    rows = _normalize_fmp_json(data)
    if not rows:
        return pd.DataFrame(columns=["symbol", "date"])

    df = pd.DataFrame.from_records(rows)

    # Robust symbol/date handling: fall back to the requested ticker when the
    # payload omits the symbol or carries a blank / stringified-null value.
    if "symbol" in df.columns:
        df["symbol"] = df["symbol"].astype(str)
        miss = df["symbol"].isin(["", "None", "nan", "NaN"]) | df["symbol"].isna()
        df.loc[miss, "symbol"] = symbol
        df["symbol"] = df["symbol"].str.upper()
    else:
        df["symbol"] = symbol

    if "date" not in df.columns:
        raise RuntimeError(f"'date' missing in analyst-estimates payload for {ticker}.")
    df["date"] = pd.to_datetime(df["date"], errors="coerce")

    # Coerce numerics where possible (leave text cols as-is).
    # pd.to_numeric(errors="ignore") is deprecated, so attempt a strict parse
    # and keep the column untouched when any value refuses to convert.
    for c in df.columns:
        if c in ("symbol", "date"):
            continue
        try:
            ser = pd.to_numeric(df[c])
        except (ValueError, TypeError):
            continue
        if pd.api.types.is_float_dtype(ser) or pd.api.types.is_integer_dtype(ser):
            # nullable float keeps missing estimates as <NA>
            df[c] = ser.astype("Float64")
        else:
            df[c] = ser

    # Drop bad dates, de-dup, sort
    df = (
        df.dropna(subset=["date"])
          .drop_duplicates(subset=["symbol", "date"], keep="last")
          .sort_values(["symbol", "date"])
    )

    # Date filtering
    if start is not None:
        df = df[df["date"] >= pd.to_datetime(start)]
    if end is not None:
        df = df[df["date"] <= pd.to_datetime(end)]
    elif not include_future:
        df = df[df["date"] <= pd.Timestamp.today().normalize()]

    # Keep base + all provided estimate fields; re-index last so the date
    # filters above cannot leave gaps in the returned index.
    base = ["symbol", "date"]
    extras = [c for c in df.columns if c not in base]
    return df[base + extras].reset_index(drop=True)


def fetch_analyst_estimates(
    tickers: Union[str, Iterable[str]],
    api_key: str,
    batch_size: int = 25,
    sleep_between_batches: float = 1.0,
    timeout: int = 30,
    skip_errors: bool = True,
    verbose: bool = True,
    start: Optional[Union[str, pd.Timestamp]] = None,  # None => no lower bound
    end:   Optional[Union[str, pd.Timestamp]] = None,  # None => no upper bound
    include_future: bool = True,                        # keep forward dates
) -> pd.DataFrame:
    """
    Batched pull across many tickers; returns one concatenated DataFrame.

    Parameters
      tickers               : one symbol or an iterable of symbols. Entries are
                              stripped and upper-cased; ``None``/blank entries
                              and repeats are dropped (first occurrence wins).
      api_key               : forwarded to ``fetch_analyst_estimates_one``.
      batch_size            : tickers per batch; must be >= 1.
      sleep_between_batches : pause in seconds between batches (not after the last).
      timeout               : per-request timeout.
      skip_errors           : when True, a failing ticker is recorded and skipped;
                              when False, the first failure is re-raised.
      verbose               : print progress, skips and a final summary.
      start, end, include_future : forwarded to ``fetch_analyst_estimates_one``.

    Returns
      Concatenation of every non-empty per-ticker frame, or an empty frame with
      ``symbol``/``date`` columns when nothing was fetched.

    Raises
      ValueError if ``batch_size`` is below 1; any per-ticker exception when
      ``skip_errors`` is False.
    """
    if isinstance(tickers, str):
        tickers = [tickers]
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # str() guards against non-string entries; dict.fromkeys de-duplicates while
    # keeping order, so no symbol is fetched (and later upserted) twice.
    normalized = (str(t).strip().upper() for t in tickers if t is not None)
    tickers = list(dict.fromkeys(t for t in normalized if t))

    frames: List[pd.DataFrame] = []
    skipped: List[Tuple[str, str]] = []
    total = len(tickers)

    # One shared session for connection reuse; closed when the pull finishes.
    with requests.Session() as session:
        for i in range(0, total, batch_size):
            batch = tickers[i:i + batch_size]
            if verbose:
                print(f"Batch {i//batch_size + 1}: {len(batch)} tickers ({i+1}–{min(i+len(batch), total)} of {total})")

            for t in batch:
                try:
                    df_t = fetch_analyst_estimates_one(
                        t,
                        api_key=api_key,
                        session=session,
                        timeout=timeout,
                        start=start,
                        end=end,
                        include_future=include_future,
                    )
                    if not df_t.empty:
                        frames.append(df_t)
                except Exception as e:
                    # Deliberate best-effort: one bad ticker must not abort the run.
                    if skip_errors:
                        skipped.append((t, str(e)))
                        if verbose:
                            print(f"  [skip] {t}: {e}")
                    else:
                        raise

            # throttle between batches, but not after the final one
            if i + batch_size < total:
                time.sleep(sleep_between_batches)

    if not frames:
        if verbose:
            print("No successful analyst-estimates pulls.")
            if skipped:
                print(f"Skipped {len(skipped)} tickers. Examples: {skipped[:5]}")
        return pd.DataFrame(columns=["symbol", "date"])

    df_all = pd.concat(frames, ignore_index=True)
    if verbose:
        print(f"✅ Success: {len(frames)} ticker chunks; rows: {len(df_all)}")
        if skipped:
            print(f"⚠️ Skipped {len(skipped)} tickers.")
    return df_all


# -------------------------- Optional: Postgres Upsert --------------------------
import math
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
from sqlalchemy.types import BigInteger, Float, Text, DateTime

def _pg_engine(conn_str: str) -> Engine:
    """Create a SQLAlchemy engine for ``conn_str`` with ``pool_pre_ping`` turned on."""
    engine = create_engine(conn_str, pool_pre_ping=True)
    return engine

def _existing_columns(engine: Engine, schema: str, table: str) -> Set[str]:
    """
    Return the lower-cased column names that ``information_schema.columns``
    lists for ``schema.table`` (an empty set when it lists none).
    """
    sql = """
    SELECT lower(column_name) FROM information_schema.columns
    WHERE table_schema = :schema AND table_name = :table
    """
    bind_params = {"schema": schema, "table": table}
    with engine.begin() as conn:
        fetched = conn.execute(text(sql), bind_params).fetchall()
    # each row is a 1-tuple holding the column name
    names = set()
    for row in fetched:
        names.add(row[0])
    return names

def _infer_sql_type_from_series(s: pd.Series) -> str:
    if pd.api.types.is_datetime64_any_dtype(s): return "TIMESTAMP"
    if pd.api.types.is_integer_dtype(s):        return "BIGINT"
    if pd.api.types.is_float_dtype(s):          return "DOUBLE PRECISION"
    return "TEXT"

def ensure_table_and_indexes(engine: Engine, schema: str, table: str):
    """
    Idempotently prepare ``schema.table`` for upserts.

    Inside a single ``engine.begin()`` block this runs, in order:
      1. CREATE TABLE IF NOT EXISTS with ``symbol TEXT`` and ``date TIMESTAMP``;
      2. a DO block that adds UNIQUE (symbol, date) as
         ``{table}_symbol_date_key`` unless ``pg_constraint`` already holds a
         constraint with that name;
      3. CREATE INDEX IF NOT EXISTS on ``symbol`` and on ``date``.
    """
    create_table_sql = f'''
    CREATE TABLE IF NOT EXISTS "{schema}"."{table}" (
      symbol TEXT,
      date   TIMESTAMP
    );'''
    unique_key_sql = f"""
    DO $$
    BEGIN
      IF NOT EXISTS (
        SELECT 1 FROM pg_constraint WHERE conname = '{table}_symbol_date_key'
      ) THEN
        ALTER TABLE "{schema}"."{table}"
        ADD CONSTRAINT {table}_symbol_date_key UNIQUE (symbol, date);
      END IF;
    END$$;"""
    symbol_index_sql = f'CREATE INDEX IF NOT EXISTS {table}_symbol_idx ON "{schema}"."{table}" (symbol);'
    date_index_sql = f'CREATE INDEX IF NOT EXISTS {table}_date_idx   ON "{schema}"."{table}" (date);'

    # Order matters: the table must exist before the constraint and indexes.
    statements = (create_table_sql, unique_key_sql, symbol_index_sql, date_index_sql)
    with engine.begin() as conn:
        for statement in statements:
            conn.execute(text(statement))

def ensure_missing_columns(engine: Engine, schema: str, table: str, df: pd.DataFrame):
    """
    Add to ``schema.table`` every column of ``df`` the table does not have yet.

    Existing names come from ``_existing_columns`` (lower-cased); each new
    column gets the SQL type inferred from the matching Series. All additions
    go out as one ALTER TABLE statement; nothing is executed when no column is
    missing.
    """
    known = _existing_columns(engine, schema, table)
    add_clauses = [
        f'ADD COLUMN IF NOT EXISTS {col} {_infer_sql_type_from_series(df[col])}'
        for col in df.columns
        if col not in known
    ]
    if add_clauses:
        with engine.begin() as conn:
            conn.execute(text(f'ALTER TABLE "{schema}"."{table}" ' + ", ".join(add_clauses) + ";"))

def _dtype_map_for_to_sql(df: pd.DataFrame) -> Dict[str, object]:
    """
    Build the ``dtype`` mapping passed to ``DataFrame.to_sql``.

    Each column maps to a SQLAlchemy type chosen from its pandas dtype:
    datetime -> DateTime (no timezone), integer -> BigInteger, float -> Float,
    anything else -> Text.
    """
    def _sqlalchemy_type(series):
        # checks are ordered; the first matching dtype family wins
        if pd.api.types.is_datetime64_any_dtype(series):
            return DateTime(timezone=False)
        if pd.api.types.is_integer_dtype(series):
            return BigInteger()
        if pd.api.types.is_float_dtype(series):
            return Float()
        return Text()

    return {col: _sqlalchemy_type(df[col]) for col in df.columns}

def _stage_to_sql(engine: Engine, df: pd.DataFrame, schema: str, staging: str):
    """
    Write ``df`` to the staging table ``schema.staging``.

    The table is replaced on every call, the DataFrame index is not written,
    and column types come from ``_dtype_map_for_to_sql``.
    """
    to_sql_options = {
        "name": staging,
        "con": engine,
        "schema": schema,
        "if_exists": "replace",
        "index": False,
        "dtype": _dtype_map_for_to_sql(df),
        "chunksize": 10_000,
    }
    df.to_sql(**to_sql_options)

def _merge_from_staging(engine: Engine, schema: str, table: str, staging: str, cols: Sequence[str]):
    """
    Upsert every row of ``schema.staging`` into ``schema.table``.

    Rows are inserted with the listed ``cols``; on a (symbol, date) conflict
    all non-key columns are overwritten with the staged values.
    """
    key_cols = ("symbol", "date")
    assignments = [f"{col}=EXCLUDED.{col}" for col in cols if col not in key_cols]
    if assignments:
        set_clause = ", ".join(assignments)
    else:
        # DO UPDATE needs at least one assignment; with key-only data fall
        # back to re-assigning the symbol.
        set_clause = "symbol=EXCLUDED.symbol"

    sql = f"""
    INSERT INTO "{schema}"."{table}" ({", ".join(cols)})
    SELECT {", ".join(cols)} FROM "{schema}"."{staging}"
    ON CONFLICT (symbol, date)
    DO UPDATE SET {set_clause};
    """
    statement = text(sql)
    with engine.begin() as conn:
        conn.execute(statement)

def upsert_analyst_estimates_postgres(
    df: pd.DataFrame,
    conn_str: str = PG_CONN_STR,
    schema: str = SCHEMA,
    table: str = TABLE,
    chunk_rows: int = CHUNK_ROWS,
):
    """
    Upsert the analyst-estimates DataFrame into Postgres.

    Steps
      - lower-case column names, upper-case ``symbol``, parse ``date`` to a
        timezone-naive timestamp;
      - drop rows whose date cannot be parsed and collapse duplicate
        (symbol, date) pairs (last one wins);
      - ensure the table, its UNIQUE (symbol, date) constraint and indexes exist;
      - add any column present in ``df`` but missing from the table;
      - stage and merge the data in chunks of ``chunk_rows`` rows.

    Parameters
      df         : frame with at least ``symbol`` and ``date`` columns.
      conn_str   : SQLAlchemy connection URL.
      schema     : target schema.
      table      : target table; the staging table is ``stg_{table}``.
      chunk_rows : rows per stage+merge round; must be >= 1.

    Raises
      ValueError if ``chunk_rows`` is below 1; database errors propagate.
    """
    if df.empty:
        print("DataFrame is empty; nothing to ingest.")
        return
    if chunk_rows < 1:
        raise ValueError(f"chunk_rows must be >= 1, got {chunk_rows}")

    d = df.copy()
    d.columns = d.columns.str.lower()
    d["symbol"] = d["symbol"].astype(str).str.upper()
    d["date"]   = pd.to_datetime(d["date"], errors="coerce").dt.tz_localize(None)

    # A NULL date never conflicts with the UNIQUE (symbol, date) constraint, so
    # such rows would be re-inserted on every run; and two rows with the same
    # key inside one INSERT make ON CONFLICT DO UPDATE fail outright.
    n_before = len(d)
    d = (
        d.dropna(subset=["date"])
         .drop_duplicates(subset=["symbol", "date"], keep="last")
         .reset_index(drop=True)
    )
    if len(d) < n_before:
        print(f"Dropped {n_before - len(d)} rows with an invalid date or a duplicate (symbol, date).")
    if d.empty:
        print("No rows with a valid date; nothing to ingest.")
        return

    # keys first
    key_first = [c for c in ("symbol", "date") if c in d.columns]
    rest = [c for c in d.columns if c not in key_first]
    d = d[key_first + rest]

    eng = _pg_engine(conn_str)
    try:
        ensure_table_and_indexes(eng, schema, table)
        ensure_missing_columns(eng, schema, table, d)

        # chunked upsert: the staging table is replaced by every chunk
        staging = f"stg_{table}"
        cols = d.columns.tolist()
        n = len(d)
        for lo in range(0, n, chunk_rows):
            hi = min(lo + chunk_rows, n)
            _stage_to_sql(eng, d.iloc[lo:hi].copy(), schema, staging)
            _merge_from_staging(eng, schema, table, staging, cols)
            print(f"Upserted rows {lo}–{hi} / {n}")
    finally:
        # release pooled connections; each call builds its own engine
        eng.dispose()
    print("✅ Analyst estimates ingestion complete.")


# -------------------------- Example usage --------------------------
if __name__ == "__main__":
    import os

    # Read the key from the environment so it never has to be typed into
    # (and saved with) the notebook.
    API_KEY = os.getenv("FMP_API_KEY", "")
    if not API_KEY:
        # Without a key every request fails and is skipped one ticker at a
        # time, so stop here with a clear message instead.
        raise RuntimeError("FMP_API_KEY is not set; export it before running the analyst-estimates pull.")

    # `universe` is the ticker list built by the first cell of this notebook
    tickers = universe

    # Pull *all* available data (no start/end caps, include future periods)
    df_ae = fetch_analyst_estimates(
        tickers=tickers,
        api_key=API_KEY,
        start=None,
        end=None,
        include_future=True,
        batch_size=5,
        sleep_between_batches=3.0,
        skip_errors=True,
        verbose=True,
    )
    print(df_ae.head(10))
    print(df_ae.tail(10))

    # Ingest to Postgres (optional)
    # upsert_analyst_estimates_postgres(df_ae)


Batch 1: 5 tickers (1–5 of 679)
Batch 2: 5 tickers (6–10 of 679)
Batch 3: 5 tickers (11–15 of 679)
Batch 4: 5 tickers (16–20 of 679)
Batch 5: 5 tickers (21–25 of 679)
Batch 6: 5 tickers (26–30 of 679)
Batch 7: 5 tickers (31–35 of 679)
Batch 8: 5 tickers (36–40 of 679)
Batch 9: 5 tickers (41–45 of 679)
Batch 10: 5 tickers (46–50 of 679)
Batch 11: 5 tickers (51–55 of 679)
Batch 12: 5 tickers (56–60 of 679)
Batch 13: 5 tickers (61–65 of 679)
Batch 14: 5 tickers (66–70 of 679)
Batch 15: 5 tickers (71–75 of 679)
Batch 16: 5 tickers (76–80 of 679)
Batch 17: 5 tickers (81–85 of 679)
Batch 18: 5 tickers (86–90 of 679)
Batch 19: 5 tickers (91–95 of 679)
Batch 20: 5 tickers (96–100 of 679)
Batch 21: 5 tickers (101–105 of 679)
Batch 22: 5 tickers (106–110 of 679)
Batch 23: 5 tickers (111–115 of 679)
Batch 24: 5 tickers (116–120 of 679)
Batch 25: 5 tickers (121–125 of 679)
Batch 26: 5 tickers (126–130 of 679)
Batch 27: 5 tickers (131–135 of 679)
Batch 28: 5 tickers (136–140 of 679)
Batch 29: 5 tic