# 5. Functions and Classes — Finance Practice (Assignments Only)

This notebook follows the structure of **Lecture 3 (Part II)**: **Functions** and **Classes**.

Complete the TODO blocks. No plotting.

Data sources:
- **BCRPData API (Peru, official)**
- **FRED (US macro series via CSV download)**
- **U.S. Treasury Fiscal Data API (official)**


In [25]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 150)


In [26]:
from __future__ import annotations

from pathlib import Path
import hashlib
import re
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

CACHE_DIR = Path(".cache")
CACHE_DIR.mkdir(exist_ok=True)

DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

_ES_TO_EN_MONTH = {
    "Ene": "Jan", "Feb": "Feb", "Mar": "Mar", "Abr": "Apr", "May": "May", "Jun": "Jun",
    "Jul": "Jul", "Ago": "Aug", "Set": "Sep", "Sep": "Sep", "Oct": "Oct", "Nov": "Nov", "Dic": "Dec"
}

def _hash_key(*parts: str) -> str:
    """Build a short, deterministic cache key from ``parts``.

    Each part is stringified, UTF-8 encoded and followed by a ``|``
    separator; the concatenation is hashed with SHA-256 and the first
    24 hexadecimal characters of the digest are returned.
    """
    payload = b"".join(str(part).encode("utf-8") + b"|" for part in parts)
    return hashlib.sha256(payload).hexdigest()[:24]

def _to_float(x):
    """Parse a scalar into a ``float``, tolerating locale-formatted text.

    Handles ``None``, real numbers, the missing-value markers used by the
    data sources ("n.d.", "nd", "na", "nan", empty string) and numbers
    written with either Spanish ("1.234,56") or English ("1,234.56")
    separators.

    Returns:
        The parsed value as a Python float, or NaN when the input is
        missing or cannot be interpreted as a number.
    """
    if x is None:
        return np.nan
    if isinstance(x, (int, float, np.number)):
        return float(x)

    s = str(x).strip()
    if s.lower() in {"n.d.", "nd", "n.d", "na", "nan", ""}:
        return np.nan

    s = s.replace(" ", "").replace("\u00a0", "")
    if "," in s and "." in s:
        # With both separators present, the one appearing last is the
        # decimal mark; the other one only groups thousands.
        if s.rfind(",") > s.rfind("."):
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    elif "," in s:
        # A lone comma is read as a decimal comma (Spanish formatting).
        s = s.replace(",", ".")

    # float(...) guarantees a float even when the text is integer-like.
    return float(pd.to_numeric(s, errors="coerce"))

def _parse_bcrp_period(name: str) -> pd.Timestamp:
    """Convert a BCRP period label into a timestamp (NaT when unparseable).

    Label shapes, tried in this order:

    - daily: ``18Nov25``, ``02Ene97``
    - monthly with four-digit year: ``Mar.2020``
    - monthly with two-digit year: ``Ene92`` or ``Ene.92``
    - numeric month: ``2022-5``
    - year only: ``2022``
    - anything else (or a label whose specific parse failed) is handed to
      ``pd.to_datetime``

    The BCRP-specific shapes are tried before the generic parser so that
    short labels are always read with the rules below rather than with
    whatever a free-form date parser guesses for them.

    Two-digit years 00-69 map to 2000-2069, 70-99 to 1970-1999. Monthly
    and yearly labels resolve to the first day of the period.
    """
    s = str(name).strip()

    def _month(token: str) -> str:
        # Capitalise so that "ENE"/"ene" also hit the translation table;
        # unknown tokens pass through for %b to accept or reject.
        key = token[:3].capitalize()
        return _ES_TO_EN_MONTH.get(key, key)

    def _year(two_digits: str) -> int:
        yy = int(two_digits)
        return 2000 + yy if yy <= 69 else 1900 + yy

    def _strict(day: str, month_token: str, year) -> pd.Timestamp:
        return pd.to_datetime(
            f"{day}{_month(month_token)}{year}", format="%d%b%Y", errors="coerce"
        )

    # Daily: 18Nov25, 02Ene97
    m = re.fullmatch(r"(\d{2})([A-Za-zÁÉÍÓÚÑñ]{3})(\d{2})", s)
    if m:
        d, mon_es, yy = m.groups()
        parsed = _strict(d, mon_es, _year(yy))
        if pd.notna(parsed):
            return parsed

    # Monthly: Mar.2020
    m = re.fullmatch(r"([A-Za-zÁÉÍÓÚÑñ]{3})\.(\d{4})", s)
    if m:
        mon_es, y = m.groups()
        parsed = _strict("01", mon_es, y)
        if pd.notna(parsed):
            return parsed

    # Monthly: Ene92 or Ene.92
    m = re.fullmatch(r"([A-Za-zÁÉÍÓÚÑñ]{3})\.?(\d{2})", s)
    if m:
        mon_es, yy = m.groups()
        parsed = _strict("01", mon_es, _year(yy))
        if pd.notna(parsed):
            return parsed

    # Numeric month: 2022-5
    m = re.fullmatch(r"(\d{4})-(\d{1,2})", s)
    if m:
        y, mo = m.groups()
        parsed = pd.to_datetime(f"{int(y):04d}-{int(mo):02d}-01", errors="coerce")
        if pd.notna(parsed):
            return parsed

    # Year only
    m = re.fullmatch(r"(\d{4})", s)
    if m:
        parsed = pd.to_datetime(f"{m.group(1)}-01-01", errors="coerce")
        if pd.notna(parsed):
            return parsed

    # Generic fallback (ISO dates and anything the patterns above missed).
    return pd.to_datetime(s, errors="coerce")

def _normalize_bcrp_period(code: str, period: str | None) -> str | None:
    """
    Normalize a start/end period for the BCRP API.

    Heuristic based on the series-code prefix:
    - codes starting with PD: daily   -> YYYY-MM-DD (zero-padded)
    - codes starting with PN: monthly -> YYYY-m (month not zero-padded)
    - any other prefix: the stripped period is returned unchanged

    Accepted inputs are ``YYYY``, ``YYYY-M`` and ``YYYY-M-D`` (month and
    day with one or two digits). Missing parts default to the first
    month / first day. Text in any other shape is returned stripped but
    otherwise untouched. ``None`` is passed through.
    """
    if period is None:
        return None
    p = str(period).strip()

    full_date = re.fullmatch(r"(\d{4})-(\d{1,2})-(\d{1,2})", p)
    year_month = re.fullmatch(r"(\d{4})-(\d{1,2})", p)
    year_only = re.fullmatch(r"\d{4}", p)

    if code.startswith("PD"):
        if full_date:
            # Pad so that e.g. "2020-1-5" also honours YYYY-MM-DD.
            y, mo, d = (int(g) for g in full_date.groups())
            return f"{y:04d}-{mo:02d}-{d:02d}"
        if year_month:
            y, mo = (int(g) for g in year_month.groups())
            return f"{y:04d}-{mo:02d}-01"
        if year_only:
            return f"{int(p):04d}-01-01"
        return p

    if code.startswith("PN"):
        if full_date:
            y, mo, _ = (int(g) for g in full_date.groups())
            return f"{y:04d}-{mo}"
        if year_month:
            y, mo = (int(g) for g in year_month.groups())
            return f"{y:04d}-{mo}"
        if year_only:
            return f"{int(p):04d}-1"
        return p

    return p

def _read_parquet_safe(path: Path) -> pd.DataFrame | None:
    """Load a parquet file, returning ``None`` when it cannot be read.

    Every exception raised by ``pd.read_parquet`` is swallowed, so the
    caller cannot tell a missing file from an unreadable one.
    """
    frame = None
    try:
        frame = pd.read_parquet(path)
    except Exception:
        frame = None
    return frame

def _write_parquet_safe(df: pd.DataFrame, path: Path) -> None:
    """Write ``df`` to ``path`` as parquet on a best-effort basis.

    A failed write never raises: it is reported through
    ``warnings.warn`` and the function returns normally.
    """
    try:
        df.to_parquet(path)
    except Exception as exc:
        # Still best-effort, but no longer silent.
        warnings.warn(f"Could not write cache file {path} ({exc!r}); continuing without cache.")

def bcrp_get(series_codes, start: str | None = None, end: str | None = None, lang: str = "esp") -> pd.DataFrame:
    """
    Fetch BCRPData series using the official BCRP API (JSON).

    Parameters:
        series_codes: one code, several codes joined by "-", or a
            list/tuple of codes.
        start, end: optional window bounds. They are normalized with the
            frequency heuristic of the FIRST code and are only sent when
            BOTH are given.
        lang: language segment appended to the URL (skipped when empty).

    Returns columns: ['date', <code1>, <code2>, ...], sorted by date.
    Every failure path (missing ``requests``, HTTP/JSON error, unexpected
    or empty payload) emits a warning where applicable and returns an
    EMPTY frame with those same columns, so callers can rely on the
    schema.

    Successful, non-empty results are cached as parquet under CACHE_DIR,
    keyed by codes, normalized window and language.
    """
    if isinstance(series_codes, (list, tuple)):
        codes_list = [str(c).strip() for c in series_codes]
        codes = "-".join(codes_list)
    else:
        codes = str(series_codes).strip()
        codes_list = codes.split("-")

    def _empty() -> pd.DataFrame:
        # Single definition of the failure-path schema.
        return pd.DataFrame(columns=["date"] + codes_list)

    if not codes_list:
        # An empty list/tuple has no first code to infer a frequency from.
        warnings.warn("No BCRP series code given; returning empty DataFrame.")
        return _empty()

    try:
        import requests
    except Exception:
        warnings.warn("requests not available; returning empty DataFrame.")
        return _empty()

    first_code = codes_list[0]
    start_n = _normalize_bcrp_period(first_code, start)
    end_n = _normalize_bcrp_period(first_code, end)
    if bool(start_n) != bool(end_n):
        # Only sent as a pair (see the URL assembly below); make the
        # dropped bound visible instead of silently ignoring it.
        warnings.warn("BCRP window needs both start and end; the single bound given is ignored.")

    key = _hash_key("bcrp", codes, start_n or "", end_n or "", lang)
    cache_path = CACHE_DIR / f"bcrp_{key}.parquet"
    cached = _read_parquet_safe(cache_path)
    if cached is not None:
        return cached

    base_url = "https://estadisticas.bcrp.gob.pe/estadisticas/series/api"
    parts = [base_url, codes, "json"]
    if start_n and end_n:
        parts += [start_n, end_n]
    if lang:
        parts += [lang]
    url = "/".join(parts)

    try:
        r = requests.get(url, timeout=45)
        r.raise_for_status()
        obj = r.json()
    except Exception as e:
        warnings.warn(f"BCRP request failed ({repr(e)}). Returning empty DataFrame.")
        return _empty()

    # Tolerate a payload that is not the expected {"periods": [...]} dict.
    periods = (obj.get("periods") or []) if isinstance(obj, dict) else []
    rows = []
    for p in periods:
        if not isinstance(p, dict):
            continue
        name = p.get("name")
        vals = p.get("values", [])
        if isinstance(vals, str):
            vals = [vals]
        if name is None or not isinstance(vals, list):
            continue
        # Pad/truncate so each row has exactly one value per requested code.
        vals = (vals + [None] * len(codes_list))[:len(codes_list)]
        rows.append([name] + vals)

    df = pd.DataFrame(rows, columns=["date"] + codes_list)
    if df.shape[0] == 0:
        return _empty()

    df["date"] = df["date"].apply(_parse_bcrp_period)
    for c in codes_list:
        df[c] = df[c].apply(_to_float)

    df = df.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)
    _write_parquet_safe(df, cache_path)
    return df

def fred_get(series_ids, start: str | None = None, end: str | None = None) -> pd.DataFrame:
    """
    Download FRED series through the key-less CSV export behind FRED graphs.

    ``start`` / ``end`` are forwarded as the ``cosd`` / ``coed`` query
    parameters to limit the download window. Each series is fetched and
    cached (parquet, under CACHE_DIR) on its own; a list/tuple of ids is
    combined with an outer merge on ``date``.

    Returns columns: ['date', <series1>, <series2>, ...]. A series that
    cannot be fetched or validated yields an empty frame plus a warning.
    """
    many = isinstance(series_ids, (list, tuple))

    try:
        import requests
    except Exception as e:
        warnings.warn(f"requests not available ({e}); returning empty DataFrame.")
        if many:
            names = [str(s).strip() for s in series_ids]
        else:
            names = [str(series_ids).strip()]
        return pd.DataFrame(columns=["date"] + names)

    from io import StringIO

    def _blank(sid: str) -> pd.DataFrame:
        # Shape returned for a series that could not be retrieved.
        return pd.DataFrame(columns=["date", sid])

    def _fetch_one(sid: str) -> pd.DataFrame:
        sid = str(sid).strip()
        key = _hash_key("fred_v3", sid, start or "", end or "")
        cache_path = CACHE_DIR / f"fred_{key}.parquet"
        hit = _read_parquet_safe(cache_path)
        if hit is not None:
            return hit

        query = {"id": sid}
        if start:
            query["cosd"] = str(start)
        if end:
            query["coed"] = str(end)

        try:
            resp = requests.get(
                "https://fred.stlouisfed.org/graph/fredgraph.csv",
                params=query,
                headers={"User-Agent": "python-finance-course/1.0 (contact: student@example.com)"},
                timeout=60,
            )
            resp.raise_for_status()
            body = resp.text
        except Exception as e:
            warnings.warn(f"FRED request failed for {sid} ({repr(e)}). Returning empty.")
            return _blank(sid)

        # Guardrails: inspect only the first line of the payload.
        first_line = body.splitlines()[0].strip() if body else ""
        lowered = first_line.lower()
        if lowered.startswith("<!doctype") or "<html" in lowered:
            warnings.warn(f"FRED returned HTML for {sid}. Returning empty.")
            return _blank(sid)

        uppered = first_line.upper()
        if not ("DATE" in uppered and sid.upper() in uppered):
            warnings.warn(f"FRED response header unexpected for {sid}: {first_line[:120]} ... Returning empty.")
            return _blank(sid)

        table = pd.read_csv(StringIO(body))
        # First column becomes 'date', second one takes the series id.
        table = table.rename(columns={table.columns[0]: "date", table.columns[1]: sid})
        table["date"] = pd.to_datetime(table["date"], errors="coerce")

        # Missing-value markers ("." and friends) become NaN before conversion.
        raw = table[sid].astype(str).str.strip()
        raw = raw.replace({".": np.nan, "": np.nan, "NA": np.nan, "NaN": np.nan})
        table[sid] = pd.to_numeric(raw, errors="coerce")

        table = table.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)
        _write_parquet_safe(table, cache_path)
        return table

    if not many:
        return _fetch_one(series_ids)

    names = [str(s).strip() for s in series_ids]
    merged = None
    for sid in names:
        piece = _fetch_one(sid)
        if merged is None:
            merged = piece
        else:
            merged = merged.merge(piece, on="date", how="outer")
    if merged is None:
        return pd.DataFrame(columns=["date"] + names)
    return merged.sort_values("date").reset_index(drop=True)



def treasury_get_debt_to_penny(start_date: str = "2024-01-01") -> dict:
    """
    Download "Debt to the Penny" from the U.S. Treasury Fiscal Data API.

    Requests records with ``record_date >= start_date``, newest first,
    one page of up to 1000 rows, and returns the decoded JSON as-is.
    Any failure (missing ``requests``, HTTP or decoding error) produces
    a warning and an empty dict.

    API docs: https://fiscaldata.treasury.gov/api-documentation/
    """
    try:
        import requests
    except Exception:
        warnings.warn("requests not available; returning empty dict.")
        return {}

    url = "/".join([
        "https://api.fiscaldata.treasury.gov/services/api/fiscal_service/v2",
        "accounting/od/debt_to_penny",
    ])
    query = {
        "fields": "record_date,tot_pub_debt_out_amt,debt_held_public_amt,intragov_hold_amt",
        "filter": f"record_date:gte:{start_date}",
        "sort": "-record_date",
        "page[size]": 1000,
    }
    agent = {"User-Agent": "python-finance-course/1.0 (contact: student@example.com)"}

    try:
        response = requests.get(url, params=query, headers=agent, timeout=45)
        response.raise_for_status()
        payload = response.json()
    except Exception as e:
        warnings.warn(f"Treasury API request failed ({repr(e)}). Returning empty dict.")
        return {}
    return payload

def save_pickle(obj, path: Path) -> None:
    """Pickle ``obj`` to ``path``, creating missing parent folders first."""
    import pickle

    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    with open(path, mode="wb") as sink:
        pickle.dump(obj, sink)

def load_pickle(path: Path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source (e.g. ones written by this notebook).
    """
    import pickle

    with open(path, mode="rb") as source:
        payload = pickle.load(source)
    return payload

def safe_head(df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
    """Return the first ``n`` rows of ``df``, or an empty DataFrame when
    ``df`` is not a DataFrame at all."""
    if not isinstance(df, pd.DataFrame):
        return pd.DataFrame()
    return df.head(n)

## Real data setup

We fetch a small amount of real data.

**Peru (BCRPData):**
- `PD04650MD`: Net International Reserves (daily)
- `PN01652XM`: Copper price (LME, monthly)

**US (FRED):**
- `DGS10`: 10-year Treasury yield (daily)
- `FEDFUNDS`: Effective federal funds rate (monthly)

**US Treasury Fiscal Data:**
- Debt to the Penny: total public debt outstanding


In [27]:
# Time windows (adjust if you want).
# START_DAILY / END_DAILY bound the daily downloads; START_MONTHLY is the
# earlier start used for the monthly copper series.
START_DAILY = "2020-01-01"
END_DAILY = "2025-12-18"
START_MONTHLY = "2015-01-01"

# Peru (BCRP): each call returns ['date', <code>]; the code column is
# renamed to a descriptive name right away.
rin = bcrp_get("PD04650MD", start=START_DAILY, end=END_DAILY).rename(columns={"PD04650MD": "RIN_USD_mn"})
copper = bcrp_get("PN01652XM", start=START_MONTHLY, end=END_DAILY).rename(columns={"PN01652XM": "Copper_LME_cents_per_lb"})

# US (FRED): the two series are outer-merged on 'date' inside fred_get;
# no end date is passed, so no upper bound is sent with the request.
macro_us = fred_get(["DGS10", "FEDFUNDS"], start=START_DAILY)

# US Treasury Fiscal Data (raw dict, then saved for the dictionary exercise)
treasury_raw = treasury_get_debt_to_penny(start_date="2024-01-01")
save_pickle(treasury_raw, Path("data/treasury_debt_to_penny_raw.pkl"))

# Last expression of the cell: shown by the notebook as a quick sanity check.
rin.shape, copper.shape, macro_us.shape, list(treasury_raw.keys())

((877, 2), (131, 2), (1576, 3), ['data', 'meta', 'links'])

In [28]:
# Preview the first rows of each dataset. `display` is not imported in this
# file; presumably it is the function the notebook environment provides.
display(safe_head(rin))
display(safe_head(copper))
display(safe_head(macro_us))


Unnamed: 0,date,RIN_USD_mn
0,2020-02-03,68820.232306
1,2020-02-04,68648.663705
2,2020-02-05,68790.147192
3,2020-02-06,68812.399398
4,2020-02-07,68976.74011


Unnamed: 0,date,Copper_LME_cents_per_lb
0,2015-01-01,265.576789
1,2015-02-01,259.875545
2,2015-03-01,269.418923
3,2015-04-01,273.904333
4,2015-05-01,285.478621


Unnamed: 0,date,DGS10,FEDFUNDS
0,2020-01-01,,1.55
1,2020-01-02,1.88,
2,2020-01-03,1.8,
3,2020-01-06,1.81,
4,2020-01-07,1.83,


##  <a id='#5.1.'>5.1. Functions</a>

### <a id = '#5.1.1.'> 5.1.1. The importance of Python functions </a>

**Assignment:** write a function that standardizes a single-series DataFrame.

Requirements:
1. Input: a DataFrame with a `date` column and exactly **one value column**.
2. Output: a DataFrame with columns `date` and `value`.
3. Must:
   - convert `date` to datetime
   - sort by date
   - drop rows where `date` is missing
4. Use the function for both `rin` and `copper`.


In [29]:
# TODO (students)
def standardize_single_series(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a one-series frame into the canonical ['date', 'value'] layout.

    The single non-'date' column is renamed to 'value' and coerced to
    numeric, 'date' is coerced to datetime, rows without a valid date are
    dropped and the result is sorted by date with a fresh index.

    Returns an empty ['date', 'value'] frame when ``df`` is not a
    DataFrame or is empty.

    Raises:
        ValueError: if 'date' is missing, or if the number of other
            columns is not exactly one.
    """
    if not (isinstance(df, pd.DataFrame) and not df.empty):
        return pd.DataFrame(columns=["date", "value"])

    if "date" not in df.columns:
        raise ValueError("Input DataFrame must contain a 'date' column.")

    value_cols = [name for name in df.columns if name != "date"]
    if len(value_cols) != 1:
        raise ValueError(f"Expected exactly 1 value column besides 'date', got {len(value_cols)}: {value_cols}")
    (source_col,) = value_cols

    tidy = df[["date", source_col]].copy()
    tidy["date"] = pd.to_datetime(tidy["date"], errors="coerce")
    tidy[source_col] = pd.to_numeric(tidy[source_col], errors="coerce")
    tidy = tidy.dropna(subset=["date"])
    tidy = tidy.sort_values("date").reset_index(drop=True)
    return tidy.rename(columns={source_col: "value"})

# Apply the same helper to both Peruvian series (requirement 4).
rin_std = standardize_single_series(rin)
copper_std = standardize_single_series(copper)

# Both results should list the columns ['date', 'value'].
print(rin_std.columns.tolist(), rin_std.shape)
print(copper_std.columns.tolist(), copper_std.shape)

['date', 'value'] (0, 2)
['date', 'value'] (0, 2)


### <a id='#5.1.2.'> 5.1.2. Basic structure of a function </a>

**Assignment:** create a function that computes percent changes (in %).

Requirements:
- Input: `pd.Series`
- Parameter: `periods: int = 1`
- Output: `pd.Series`

Use it to compute:
- daily % change of `RIN_USD_mn`
- daily % change of `DGS10`


In [30]:
# Solution
def pct_change_percent(x: pd.Series, periods: int = 1) -> pd.Series:
    """Percent change of ``x`` over ``periods`` steps, in percent units.

    A result of 1.2 means +1.2%. Values are coerced to numeric first
    (unparseable entries become NaN) and the input's name is kept.
    Anything that is not a non-empty Series yields an empty float Series.
    """
    usable = isinstance(x, pd.Series) and not x.empty
    if not usable:
        return pd.Series(dtype=float)

    numeric = pd.to_numeric(x, errors="coerce")
    change = numeric.pct_change(periods=periods).mul(100.0)
    change.name = getattr(x, "name", None)
    return change

# Index by date first so the resulting change series are date-indexed.
rin_ret = pct_change_percent(rin.set_index("date")["RIN_USD_mn"])
dgs10_ret = pct_change_percent(macro_us.set_index("date")["DGS10"])

# The first observation has no predecessor, hence the dropna() before head().
print(rin_ret.dropna().head())
print(dgs10_ret.dropna().head())


Series([], dtype: float64)
Series([], dtype: float64)


### <a id='#5.1.3.'>5.1.3. Function without `return` </a>

**Assignment:** write a function that prints a compact report:
- number of observations
- number of missing values
- min / max

Call it for `RIN_USD_mn` and for `DGS10`.


In [31]:
# Solution
def print_series_report(x: pd.Series, name: str) -> None:
    """Print a one-line summary of ``x``: size, missing count, min and max.

    Values are coerced to numeric, so unparseable entries count as
    missing. Min/max ignore missing values and are printed as "NA" when
    nothing is left. Nothing is returned; a non-Series input only prints
    an "Invalid input" line.
    """
    if not isinstance(x, pd.Series):
        print(f"[{name}] Invalid input (expected a pandas Series).")
        return

    s = pd.to_numeric(x, errors="coerce")
    n_obs = len(s)
    n_missing = int(s.isna().sum())

    if n_obs - n_missing == 0:
        print(f"[{name}] n={n_obs} | missing={n_missing} | min=NA | max=NA")
    else:
        s_min = float(s.min(skipna=True))
        s_max = float(s.max(skipna=True))
        print(f"[{name}] n={n_obs} | missing={n_missing} | min={s_min:.4f} | max={s_max:.4f}")

# Only report on datasets that actually contain rows.
if rin.shape[0] > 0:
    print_series_report(rin["RIN_USD_mn"], "Peru: Net International Reserves (RIN)")
if macro_us.shape[0] > 0:
    print_series_report(macro_us["DGS10"], "US: 10Y Treasury Yield (DGS10)")


### <a id='#5.1.4.'>5.1.4. Returning multiple objects </a>

**Assignment:** create a function that returns **two objects**:
1. a cleaned return series (drop NaNs)
2. a scalar volatility estimate (standard deviation)

Use it on `DGS10` percent changes.


In [32]:
# Solution
def returns_and_volatility(x: pd.Series) -> tuple[pd.Series, float]:
    """Return ``(clean_returns, volatility)`` for a return series.

    ``clean_returns`` is ``x`` coerced to numeric with missing values
    dropped; ``volatility`` is its sample standard deviation (ddof=1), or
    NaN when fewer than two observations remain. A missing or empty input
    gives an empty float Series and NaN.
    """
    if not isinstance(x, pd.Series) or x.empty:
        return pd.Series(dtype=float), float("nan")

    clean = pd.to_numeric(x, errors="coerce").dropna()
    if len(clean) < 2:
        return clean, float("nan")
    return clean, float(clean.std(ddof=1))

# Unpack the two returned objects: the cleaned series and the scalar volatility.
dgs10_clean_ret, dgs10_vol = returns_and_volatility(dgs10_ret)
print(dgs10_clean_ret.head())
print("Vol:", dgs10_vol)


Series([], dtype: float64)
Vol: nan


### <a id='#5.1.5.'>5.1.5. If condition with return </a>

**Assignment:** write a validation function.

Requirements:
- Input: DataFrame with `date` and `value`
- If there are fewer than `min_n` rows, return `None`
- Otherwise, return the DataFrame

Test it with `copper_std` (monthly series can be shorter).


In [33]:
# TODO (students)
def validate_min_rows(df: pd.DataFrame, min_n: int = 24):
    """Return ``df`` itself when it is a DataFrame with at least ``min_n``
    rows; otherwise return ``None``."""
    if not isinstance(df, pd.DataFrame):
        return None
    return df if len(df) >= min_n else None

# copper_ok is either the validated frame or None (too few rows).
copper_ok = validate_min_rows(copper_std, min_n=24)
print("Copper validated:", copper_ok is not None)

Copper validated: False


### <a id='#5.1.6.'>5.1.6. Default values to parameters </a>

**Assignment:** implement winsorization with default quantiles.

Write a function:
`winsorize(x, lower_q=0.01, upper_q=0.99)`

Use it on `RIN_USD_mn`.


In [34]:
# Solution
def winsorize(x: pd.Series, lower_q: float = 0.01, upper_q: float = 0.99) -> pd.Series:
    """Clip ``x`` to the range spanned by its ``lower_q`` and ``upper_q`` quantiles.

    Values are coerced to numeric first; values outside the two quantile
    bounds are replaced by the nearest bound. A missing or empty input
    gives an empty float Series.
    """
    if not isinstance(x, pd.Series) or x.empty:
        return pd.Series(dtype=float)

    numeric = pd.to_numeric(x, errors="coerce")
    floor, ceiling = (numeric.quantile(q) for q in (lower_q, upper_q))
    return numeric.clip(lower=floor, upper=ceiling)

# Default quantiles (1% / 99%) are used because none are passed.
rin_w = winsorize(rin.set_index("date")["RIN_USD_mn"])
print(rin_w.dropna().head())


Series([], dtype: float64)


### <a id='#5.1.7.'>5.1.7. Type hints for parameters and return types </a>

**Assignment:** add type hints and implement a merge helper.

Write:
`merge_on_date(left: pd.DataFrame, right: pd.DataFrame, how: str = 'inner') -> pd.DataFrame`

Then merge `rin` with `macro_us[['date','DGS10']]`.


In [35]:
# Solution
def merge_on_date(left: pd.DataFrame, right: pd.DataFrame, how: str = "inner") -> pd.DataFrame:
    """Join two frames on their 'date' column.

    Both inputs are copied, their 'date' columns coerced to datetime and
    rows with an invalid date dropped before merging with the requested
    ``how``. The result is sorted by date with a fresh index; the inputs
    are not modified.

    Returns an empty DataFrame when either argument is not a DataFrame.

    Raises:
        ValueError: if either frame lacks a 'date' column.
    """
    if not (isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame)):
        return pd.DataFrame()

    if "date" not in left.columns or "date" not in right.columns:
        raise ValueError("Both DataFrames must contain a 'date' column.")

    def _with_clean_dates(frame: pd.DataFrame) -> pd.DataFrame:
        prepared = frame.copy()
        prepared["date"] = pd.to_datetime(prepared["date"], errors="coerce")
        return prepared.dropna(subset=["date"])

    merged = _with_clean_dates(left).merge(_with_clean_dates(right), on="date", how=how)
    return merged.sort_values("date").reset_index(drop=True)

# Inner join: keeps only the dates present in both rin and the DGS10 slice.
rin_dgs10 = merge_on_date(rin, macro_us[["date", "DGS10"]], how="inner")
print(rin_dgs10.head())


Empty DataFrame
Columns: []
Index: []


### <a id='#5.1.8.'>5.1.8. Local variables vs Global variables </a>

**Assignment:** demonstrate the difference clearly.

1. Create a global variable `BASE_CCY = 'USD'`.
2. Write `format_label(series_name: str) -> str` that uses the global variable.
3. Inside the function, create a local variable `suffix = '(global ccy)'`.
4. Return a label like: `'{series_name} - USD (global ccy)'`.


In [36]:
# TODO (students)
BASE_CCY = "USD"  # global: defined at module level, readable inside functions


def format_label(series_name: str) -> str:
    """Return '<series_name> - <BASE_CCY> (global ccy)'.

    ``BASE_CCY`` is read from the global scope; ``suffix`` exists only
    inside this function.
    """
    suffix = "(global ccy)"  # local: not visible outside the function
    head = f"{series_name} - {BASE_CCY}"
    return f"{head} {suffix}"


print(format_label("RIN"))

RIN - USD (global ccy)


### <a id='#5.1.9.'>5.1.9. `*args` </a>

**Assignment:** implement a function that accepts multiple series codes.

Write:
`fetch_bcrp_many(*codes: str, start: str, end: str) -> pd.DataFrame`

Requirements:
- Use the provided `bcrp_get` inside.
- Return a DataFrame with `date` and one column per code.
- Call it with two codes: `PD04650MD` and `PD04649MD`.


In [37]:
# Solution
def fetch_bcrp_many(*codes: str, start: str, end: str) -> pd.DataFrame:
    """Download several BCRP series in one ``bcrp_get`` call.

    ``codes`` are positional; ``start`` and ``end`` are keyword-only.
    With no codes, an empty frame with only a 'date' column is returned.
    Otherwise the frame from ``bcrp_get`` is returned with 'date' coerced
    to datetime, undated rows dropped and rows sorted by date (when a
    'date' column is present).
    """
    if not codes:
        return pd.DataFrame(columns=["date"])

    frame = bcrp_get(list(codes), start=start, end=end)
    if "date" not in frame.columns:
        return frame

    frame["date"] = pd.to_datetime(frame["date"], errors="coerce")
    return frame.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)

# Both codes are daily ("PD...") series, as the assignment asks. bcrp_get
# formats the window from the FIRST code's frequency, so a monthly ("PN...")
# code must not be mixed into the same request.
two_series = fetch_bcrp_many("PD04650MD", "PD04649MD", start=START_DAILY, end=END_DAILY)
print(two_series.head())


Empty DataFrame
Columns: []
Index: []


### <a id='#5.1.10.'>5.1.10. `**kwargs` </a>

**Assignment:** write a wrapper around `DataFrame.describe`.

Write:
`describe_df(df: pd.DataFrame, **kwargs) -> pd.DataFrame`

Then call it on `macro_us[['DGS10']]` with custom percentiles.


In [38]:
# Solution
def describe_df(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Thin wrapper around ``DataFrame.describe(**kwargs)``.

    Each column is converted to numeric when the whole column can be
    parsed; columns that cannot are left exactly as they are. All keyword
    arguments are forwarded to ``describe``. The input is not modified.

    Returns an empty DataFrame when ``df`` is not a DataFrame or is empty.
    """
    if not isinstance(df, pd.DataFrame) or df.empty:
        return pd.DataFrame()

    tmp = df.copy()
    for c in tmp.columns:
        # Explicit replacement for the deprecated errors="ignore":
        # keep the original column when conversion is not possible.
        try:
            tmp[c] = pd.to_numeric(tmp[c])
        except (ValueError, TypeError):
            continue
    return tmp.describe(**kwargs)

# `percentiles` travels through **kwargs straight into DataFrame.describe.
desc_yields = describe_df(macro_us[["DGS10"]], percentiles=[0.05, 0.5, 0.95])
print(desc_yields)


Empty DataFrame
Columns: []
Index: []


## Exercise
#### Importing a Dictionary

We use a **real** nested dictionary from the **U.S. Treasury Fiscal Data API** (saved as `data/treasury_debt_to_penny_raw.pkl`).


In [39]:
# Reload the raw Treasury response that the data-setup cell pickled.
raw_dict = load_pickle(Path('data/treasury_debt_to_penny_raw.pkl'))
# Last expression of the cell: object type and top-level keys.
type(raw_dict), list(raw_dict.keys())


(dict, ['data', 'meta', 'links'])

### Part A — Extract to lists

**Assignment:** using a `for` loop, extract these into Python lists:
- `record_date`
- `tot_pub_debt_out_amt`
- `debt_held_public_amt`
- `intragov_hold_amt`

Then build a DataFrame with those columns and convert numeric columns to floats.


In [40]:
# Solution
# The records live under the "data" key; fall back to an empty list when
# raw_dict is not a dict (e.g. the download failed).
data = raw_dict.get("data", []) if isinstance(raw_dict, dict) else []

# One plain Python list per field, filled in parallel by the loop below.
record_dates = []
tot_debt = []
public_debt = []
intragov = []

for row in data:
    # .get() yields None for a missing field instead of raising KeyError.
    record_dates.append(row.get("record_date"))
    tot_debt.append(row.get("tot_pub_debt_out_amt"))
    public_debt.append(row.get("debt_held_public_amt"))
    intragov.append(row.get("intragov_hold_amt"))

# Build the frame, coercing dates and amounts; rows without a valid date
# are dropped and the rest sorted chronologically.
debt_df = pd.DataFrame({
    "record_date": pd.to_datetime(record_dates, errors="coerce"),
    "tot_pub_debt_out_amt": pd.to_numeric(pd.Series(tot_debt), errors="coerce"),
    "debt_held_public_amt": pd.to_numeric(pd.Series(public_debt), errors="coerce"),
    "intragov_hold_amt": pd.to_numeric(pd.Series(intragov), errors="coerce"),
}).dropna(subset=["record_date"]).sort_values("record_date").reset_index(drop=True)

print(debt_df.head())


Empty DataFrame
Columns: []
Index: []


### Part B — Turn it into a function

**Assignment:** define:
`treasury_debt_dict_to_df(obj: dict) -> pd.DataFrame`

Return a clean DataFrame sorted by date.


In [41]:
# Solution
def treasury_debt_dict_to_df(obj: dict) -> pd.DataFrame:
    """Convert a Treasury Fiscal Data response dict into a clean DataFrame.

    Reads the list under ``obj["data"]`` and returns exactly these
    columns, sorted by ``record_date``:

    - ``record_date`` (datetime)
    - ``tot_pub_debt_out_amt``, ``debt_held_public_amt``,
      ``intragov_hold_amt`` (numeric; unparseable values become NaN)

    Fields missing from the payload are filled with NaN and rows without
    a valid date are dropped. Any unusable input (not a dict, no "data"
    list, empty list) yields an EMPTY frame with the same four columns,
    so callers can always rely on the schema.
    """
    keep = ["record_date", "tot_pub_debt_out_amt", "debt_held_public_amt", "intragov_hold_amt"]

    data = obj.get("data", []) if isinstance(obj, dict) else None
    if not isinstance(data, list) or len(data) == 0:
        return pd.DataFrame(columns=keep)

    # reindex selects the wanted columns and creates absent ones as NaN.
    df = pd.DataFrame(data).reindex(columns=keep)

    df["record_date"] = pd.to_datetime(df["record_date"], errors="coerce")
    for c in keep[1:]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    df = df.dropna(subset=["record_date"]).sort_values("record_date").reset_index(drop=True)
    return df

# Same cleaning as Part A, now through the reusable function.
debt_df2 = treasury_debt_dict_to_df(raw_dict)
print(debt_df2.head())


Empty DataFrame
Columns: []
Index: []


### Part C — Row iteration

**Assignment:**
1. Create `raw_table = pd.DataFrame(raw_dict['data'])`.
2. Iterate over rows (e.g. `.iterrows()`), and build `debt_small` with only:
   `record_date` and `tot_pub_debt_out_amt`.


In [42]:
# Solution
# Start again from the raw records so this cell does not depend on Part A/B.
data = raw_dict.get("data", []) if isinstance(raw_dict, dict) else []
raw_table = pd.DataFrame(data)

# Collect one small dict per row, keeping only the two requested fields.
rows = []
if not raw_table.empty:
    for _, row in raw_table.iterrows():
        # `row` is a Series; .get() returns None when a column is absent.
        rows.append({
            "record_date": row.get("record_date"),
            "tot_pub_debt_out_amt": row.get("tot_pub_debt_out_amt"),
        })

debt_small = pd.DataFrame(rows)
# Type conversion is skipped for an empty frame, which has no columns to convert.
if not debt_small.empty:
    debt_small["record_date"] = pd.to_datetime(debt_small["record_date"], errors="coerce")
    debt_small["tot_pub_debt_out_amt"] = pd.to_numeric(debt_small["tot_pub_debt_out_amt"], errors="coerce")
    debt_small = debt_small.dropna(subset=["record_date"]).sort_values("record_date").reset_index(drop=True)

print(debt_small.head())


Empty DataFrame
Columns: []
Index: []


### <a id='#5.2.'>5.2. Class </a>

Practice class basics using the same datasets.

###  <a id='#5.2.2.'> 5.2.2. Defining a class </a>

**Assignment:** define a class `MacroSeries`.

Attributes:
- `name` (str)
- `source` (str)
- `data` (pd.DataFrame with columns: date, value)

Methods:
- `latest_value()` → float
- `n_obs()` → int


In [43]:
# Solution
class MacroSeries:
    """Represents one macro/financial time series.

    Attributes:
        name: display name of the series.
        source: name of the data provider.
        data: copy of the input frame with 'date' coerced to datetime,
            'value' coerced to numeric, undated rows dropped and rows
            sorted by date.
    """

    def __init__(self, name: str, source: str, data: pd.DataFrame):
        """Validate ``data`` and store a cleaned copy.

        Raises:
            TypeError: if ``data`` is not a DataFrame.
            ValueError: if ``data`` lacks a 'date' or 'value' column.
        """
        if not isinstance(data, pd.DataFrame):
            raise TypeError("data must be a pandas DataFrame.")
        if "date" not in data.columns or "value" not in data.columns:
            raise ValueError("data must contain columns ['date', 'value'].")

        self.name = name
        self.source = source

        df = data.copy()  # never mutate the caller's frame
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
        df["value"] = pd.to_numeric(df["value"], errors="coerce")
        df = df.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)

        self.data = df

    def __repr__(self) -> str:
        return f"MacroSeries(name={self.name!r}, source={self.source!r}, n_obs={self.n_obs()})"

    def latest_value(self) -> float:
        """Latest non-missing value by date (NaN when there is none)."""
        if self.data.empty:
            return float("nan")
        s = self.data["value"].dropna()
        if s.empty:
            return float("nan")
        return float(s.iloc[-1])

    def n_obs(self) -> int:
        """Number of non-missing observations."""
        if self.data.empty:
            return 0
        return int(self.data["value"].notna().sum())

    def pct_change(self, periods: int = 1) -> pd.Series:
        """Percent change in percent units (%). Returns a Series indexed by date,
        named '<name>_pct_change'; an empty float Series when there is no data."""
        if self.data.empty:
            return pd.Series(dtype=float)

        s = self.data.set_index("date")["value"]
        out = s.pct_change(periods=periods) * 100.0
        out.name = f"{self.name}_pct_change"
        return out

    def label(self) -> str:
        """Human-readable label in the form '{name} [{source}]'.

        Falls back to whichever of name/source is non-empty, or to
        'MacroSeries' when both are empty.
        """
        nm = getattr(self, "name", "")
        src_ = getattr(self, "source", "")
        if nm and src_:
            # Square brackets, as the assignment text specifies.
            return f"{nm} [{src_}]"
        return str(nm or src_ or "MacroSeries")

# Start with the short name/source: the Attributes exercise (5.2.3) renames
# rin_obj afterwards, which only demonstrates something if the name differs.
rin_obj = MacroSeries("RIN", "BCRP", rin_std)

# MacroSeries expects columns ['date', 'value'], so rename DGS10 first.
dgs10_df = macro_us[["date", "DGS10"]].rename(columns={"DGS10": "value"})
dgs10_obj = MacroSeries("DGS10", "FRED", dgs10_df)

print(rin_obj.name, rin_obj.source)
print(dgs10_obj.name, dgs10_obj.source)

RIN BCRP
DGS10 FRED


### <a id='#5.2.3.'> 5.2.3. Attributes </a>

**Assignment:** access and modify attributes.

1. Print `rin_obj.name` and `rin_obj.source`.
2. Update `rin_obj.name` to `Peru RIN (USD mn)`.
3. Print again.


In [44]:
# Read the attributes, overwrite one by plain assignment, then read again.
print(rin_obj.name, rin_obj.source)
rin_obj.name = 'Peru RIN (USD mn)'
print(rin_obj.name, rin_obj.source)


RIN BCRP
Peru RIN (USD mn) BCRP


### <a id='#5.2.5.'> 5.2.5. Method </a>

**Assignment:** implement a method:
`pct_change(self, periods: int = 1) -> pd.Series`
that returns percent changes of the `value` column.


In [45]:
# Solution: pct_change() inside MacroSeries
# Call the method on the instance; the first row is NaN and is dropped for display.
out = rin_obj.pct_change(periods=1)
print(out.dropna().head())


Implement pct_change inside MacroSeries. Current error: AttributeError("'MacroSeries' object has no attribute 'pct_change'")


###  <a id='#5.2.6.'> 5.2.6. __init__() </a>

**Assignment:** validate inputs inside `__init__`.

Modify `__init__` so that if `data` does not have columns `date` and `value`, it raises `ValueError`.
Then intentionally try to create a bad object and confirm it raises an error.


In [46]:
# Deliberately pass a frame without 'date'/'value' columns: the constructor
# is expected to raise, so the first print should never run.
try:
    bad = MacroSeries("BAD", "TEST", pd.DataFrame({"x":[1,2], "y":[3,4]}))
    print("If you see this, you did not validate columns yet.")
except Exception as e:
    print("Expected error:", repr(e))

If you see this, you did not validate columns yet.


### <a id='#5.2.7.'> 5.2.7. Self</a>

**Assignment:** use `self` to build a label.

Add `label(self) -> str` returning `'{name} [{source}]'`.


In [47]:
# label() builds its text from the instance's own name and source via `self`.
print(rin_obj.label())

Implement label() inside MacroSeries. Current error: AttributeError("'MacroSeries' object has no attribute 'label'")


## Methods

**Mini-project (assigned):** create `MacroDashboard` to manage multiple series.

Class requirements:
- attribute `series_list` (list)
- method `add(self, s: MacroSeries) -> None`
- method `to_wide(self) -> pd.DataFrame` that merges all series on date (wide format)

Test it by adding `rin_obj` and `dgs10_obj`.


In [48]:
# TODO (students)
class MacroDashboard:
    """Container that collects series objects and merges them into one wide table.

    Attributes:
        series_list: the added series, in insertion order.
    """

    def __init__(self):
        self.series_list = []

    def add(self, s: MacroSeries) -> None:
        """Register a series after checking that it looks like a MacroSeries.

        Raises:
            ValueError: if ``s`` is None, or ``s.data`` lacks 'date' or 'value'.
            TypeError: if ``s`` has no 'name'/'data' attribute, or ``s.data``
                is not a DataFrame.
        """
        if s is None:
            raise ValueError("Cannot add None.")
        if not (hasattr(s, "name") and hasattr(s, "data")):
            raise TypeError("Expected a MacroSeries-like object with attributes 'name' and 'data'.")

        frame = s.data
        if not isinstance(frame, pd.DataFrame):
            raise TypeError("s.data must be a pandas DataFrame.")
        if not ("date" in frame.columns and "value" in frame.columns):
            raise ValueError("s.data must contain columns ['date', 'value'].")

        self.series_list.append(s)

    def to_wide(self) -> pd.DataFrame:
        """Outer-merge every series on 'date', one column per series name.

        Returns a frame with only a 'date' column when nothing was added.
        """
        if not self.series_list:
            return pd.DataFrame(columns=["date"])

        wide = None
        for item in self.series_list:
            column = str(getattr(item, "name", "series"))

            piece = item.data[["date", "value"]].copy()
            piece["date"] = pd.to_datetime(piece["date"], errors="coerce")
            piece = piece.dropna(subset=["date"])
            piece = piece.sort_values("date").reset_index(drop=True)
            piece = piece.rename(columns={"value": column})

            if wide is None:
                wide = piece
            else:
                wide = wide.merge(piece, on="date", how="outer")

        return wide.sort_values("date").reset_index(drop=True)

# Build a dashboard from the two series objects created earlier.
dash = MacroDashboard()
dash.add(rin_obj)
dash.add(dgs10_obj)

# One row per date, one column per series (named after each object's `name`).
wide = dash.to_wide()
print(wide.head())

Empty DataFrame
Columns: []
Index: []


## Homework (assigned, not solved)

Submit one completed notebook with:

1. All function TODOs completed (5.1.1 to 5.1.10).
2. The dictionary exercise completed (Parts A–C), producing a clean debt DataFrame with numeric columns.
3. `MacroSeries` implemented with:
   - `latest_value`, `n_obs`, `pct_change`, `label`, and input validation in `__init__`.
4. `MacroDashboard` implemented and tested, producing a wide merged dataset.
5. A short section **Data Sources** explaining:
   - BCRPData (Peru)
   - FRED (US)
   - U.S. Treasury Fiscal Data API

No plots.

# <a id='#5.3.'>5.3. References </a>

- BCRPData API Guide: https://estadisticas.bcrp.gob.pe/estadisticas/series/ayuda/api
- FRED: https://fred.stlouisfed.org/
- U.S. Treasury Fiscal Data API: https://fiscaldata.treasury.gov/api-documentation/
- Type hints (typing): https://docs.python.org/3/library/typing.html
