# 3. <a id='intro'>Pandas</a>

This practice notebook is **guided by the original Lecture 2** structure. All exercises use **real financial / economic data** from Peru and the US.



## 3.1. <a id='def'>Definition</a>

Pandas is a Python library for working with tabular data (Series and DataFrames), including importing, cleaning, reshaping, and merging datasets.

In [870]:
# Use postponed evaluation of type annotations (helps with `str | None` on older Python versions).
from __future__ import annotations

# Path: cross-platform file/folder paths.
from pathlib import Path
# hashlib: create stable hashes for cache filenames.
import hashlib
# re: regular expressions for validating/parsing date strings.
import re
# warnings: control warning messages.
import warnings

# numpy: numeric operations + NaN handling.
import numpy as np
# pandas: tables (Series/DataFrame) + parsing dates + IO (parquet).
import pandas as pd

# Suppress every warning so the notebook output stays uncluttered
# (comment this line out while debugging).
warnings.filterwarnings("ignore")

# Downloads are cached in a local ".cache" folder, created here if it is missing.
CACHE_DIR = Path(".cache")
CACHE_DIR.mkdir(exist_ok=True)

# Spanish -> English 3-letter month abbreviations, used when parsing BCRP date labels.
# September appears under both "Set" and "Sep".
_ES_TO_EN_MONTH = dict(
    Ene="Jan",
    Feb="Feb",
    Mar="Mar",
    Abr="Apr",
    May="May",
    Jun="Jun",
    Jul="Jul",
    Ago="Aug",
    Set="Sep",
    Sep="Sep",
    Oct="Oct",
    Nov="Nov",
    Dic="Dec",
)

def _hash_key(*parts: str) -> str:
    """Return a 24-character hex key derived from the SHA-256 of *parts*.

    Each part is converted with ``str``, encoded as UTF-8 and followed by a
    ``|`` separator, so ("ab", "c") and ("a", "bc") produce different keys.
    """
    payload = b"".join(str(part).encode("utf-8") + b"|" for part in parts)
    # Only a prefix of the digest is kept: short enough for a filename.
    return hashlib.sha256(payload).hexdigest()[:24]

def _normalize_period(code: str, period: str | None) -> str | None:
    """Rewrite *period* into the form used for the frequency implied by *code*.

    The last two characters of *code* (case-insensitive) select the rule:

    * ``PD`` (daily):   "YYYY" -> "YYYY-01-01", "YYYY-M[M]" -> "YYYY-MM-01".
    * ``PM`` (monthly): "YYYY" -> "YYYY-1", "YYYY-M[M]" and "YYYY-M[M]-D[D]" -> "YYYY-M".
    * ``PA`` (annual):  the leading four digits, when present.

    ``None`` is passed through; any text that matches no rule is returned
    stripped but otherwise unchanged.
    """
    if period is None:
        return None

    text = str(period).strip()
    suffix = code[-2:].upper() if len(code) >= 2 else ""
    is_bare_year = re.fullmatch(r"\d{4}", text) is not None

    if suffix == "PD":
        if re.fullmatch(r"\d{4}-\d{1,2}", text):
            year, month = text.split("-")
            return f"{int(year):04d}-{int(month):02d}-01"
        if is_bare_year:
            return f"{int(text):04d}-01-01"
        # Anything else (for example a full "YYYY-MM-DD") is left alone.
        return text

    if suffix == "PM":
        # A full date and a year-month both collapse to "YYYY-M".
        hit = re.fullmatch(r"(\d{4})-(\d{1,2})-(\d{1,2})", text)
        if hit is None:
            hit = re.fullmatch(r"(\d{4})-(\d{1,2})", text)
        if hit is not None:
            year, month = hit.groups()[:2]
            return f"{int(year):04d}-{int(month)}"
        if is_bare_year:
            # A bare year defaults to January.
            return f"{int(text):04d}-1"
        return text

    if suffix == "PA":
        leading_year = re.match(r"(\d{4})", text)
        return text if leading_year is None else leading_year.group(1)

    # Unrecognized frequency suffix: nothing to normalize.
    return text

def _parse_bcrp_period_name(name: str) -> pd.Timestamp:
    """Turn a BCRP period label into a timestamp (``NaT`` when it cannot be parsed).

    Formats tried, in order:

    1. ISO-like text: "YYYY", "YYYY-MM" or "YYYY-MM-DD".
    2. Month label such as "Mar.2020" (three letters, a dot, four digits).
    3. Day label such as "18Nov25" or "02Ene97" (DDMonYY).
    4. Whatever ``pd.to_datetime`` can infer, with failures coerced to ``NaT``.

    Spanish month abbreviations are translated through ``_ES_TO_EN_MONTH``;
    abbreviations missing from that map are passed to the parser unchanged.
    """
    label = str(name).strip()

    # 1) ISO-like labels. A parse failure falls through to the other formats.
    if re.fullmatch(r"\d{4}(-\d{1,2}){0,2}", label):
        try:
            return pd.to_datetime(label, errors="raise")
        except Exception:
            pass

    # 2) Month label, e.g. "Mar.2020".
    monthly = re.fullmatch(r"([A-Za-zÁÉÍÓÚÑñ]{3})\.(\d{4})", label)
    if monthly:
        abbr, year_text = monthly.groups()
        abbr_en = _ES_TO_EN_MONTH.get(abbr, abbr)
        return pd.to_datetime(f"{abbr_en}.{year_text}", format="%b.%Y", errors="coerce")

    # 3) Day label, e.g. "18Nov25".
    daily = re.fullmatch(r"(\d{2})([A-Za-zÁÉÍÓÚÑñ]{3})(\d{2})", label)
    if daily:
        day, abbr, yy_text = daily.groups()
        abbr_en = _ES_TO_EN_MONTH.get(abbr, abbr)
        # Two-digit years: 00-69 map to 2000-2069, 70-99 map to 1970-1999.
        yy = int(yy_text)
        century = 2000 if yy <= 69 else 1900
        return pd.to_datetime(f"{day}{abbr_en}{century + yy}", format="%d%b%Y", errors="coerce")

    # 4) Last resort: let pandas infer the format.
    return pd.to_datetime(label, errors="coerce")

def bcrp_get(series_codes, start: str | None = None, end: str | None = None, lang: str = "esp") -> pd.DataFrame:
    """
    Fetch BCRPData series (JSON API) into a DataFrame.

    Parameters
    ----------
    series_codes : str, list or tuple
        A single code, several codes joined by "-", or a list/tuple of codes.
    start, end : str or None
        Optional period bounds. Both are normalized with `_normalize_period`
        using the first code. They are added to the URL only when BOTH are
        given; a lone bound is ignored.
    lang : str
        Language segment appended to the URL (skipped when empty).

    Returns
    -------
    DataFrame with a parsed "date" column plus one numeric column per code,
    sorted by date. An empty frame with those same columns is returned when
    `requests` cannot be imported or the response holds no usable periods.

    Raises
    ------
    Whatever `requests.get`, `raise_for_status` or `r.json()` raise (network
    errors, HTTP error statuses, invalid JSON). Cache problems never raise:
    an unreadable cache file is refetched and a failed cache write only warns.
    """
    # Resolve the codes first so every return path shares the same columns.
    if isinstance(series_codes, (list, tuple)):
        codes_list = [str(c).strip() for c in series_codes]
        # The URL carries multiple codes joined by "-".
        codes = "-".join(codes_list)
    else:
        codes = str(series_codes).strip()
        codes_list = codes.split("-")
    columns = ["date"] + codes_list

    # requests is imported lazily; without it we return the empty result shape.
    try:
        import requests
    except Exception:
        return pd.DataFrame(columns=columns)

    # The first code decides how start/end are normalized (daily/monthly/annual).
    freq_code = codes_list[0]
    start_n = _normalize_period(freq_code, start)
    end_n = _normalize_period(freq_code, end)

    # Deterministic cache file for this exact request.
    key = _hash_key("bcrp", codes, start_n or "", end_n or "", lang)
    cache_path = CACHE_DIR / f"bcrp_{key}.parquet"
    if cache_path.exists():
        try:
            return pd.read_parquet(cache_path)
        except Exception as exc:
            # A corrupt cache file or missing parquet engine is a cache miss, not a failure.
            warnings.warn(f"Ignoring unreadable cache file {cache_path}: {exc!r}")

    # URL layout: base / codes / json [/ start / end] [/ lang]
    base_url = "https://estadisticas.bcrp.gob.pe/estadisticas/series/api"
    parts = [base_url, codes, "json"]
    if start_n and end_n:
        parts += [start_n, end_n]
    if lang:
        parts += [lang]
    url = "/".join(parts)

    r = requests.get(url, timeout=30)
    r.raise_for_status()
    obj = r.json()

    # One row per period: [label, value1, value2, ...].
    n_codes = len(codes_list)
    rows = []
    for p in obj.get("periods", []):
        name = p.get("name")
        vals = p.get("values", [])
        # A bare string is treated as a single value.
        if isinstance(vals, str):
            vals = [vals]
        # Skip malformed entries.
        if name is None or not isinstance(vals, list):
            continue
        # Pad/truncate so each row has exactly one value per code.
        vals = (vals + [None] * n_codes)[:n_codes]
        rows.append([name] + vals)

    if not rows:
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows, columns=columns)
    df["date"] = df["date"].apply(_parse_bcrp_period_name)
    for c in codes_list:
        # Coercion turns "n.d."-style markers and any other non-numeric text into NaN.
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # Drop rows whose label could not be parsed, then order chronologically.
    df = df.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)

    # Caching is best-effort: a failed write must not discard freshly fetched data.
    try:
        df.to_parquet(cache_path)
    except Exception as exc:
        warnings.warn(f"Could not write cache file {cache_path}: {exc!r}")
    return df

def bcrp_get_cached_or_empty(series_codes, start: str | None = None, end: str | None = None) -> pd.DataFrame:
    """Call `bcrp_get`, returning an empty frame instead of raising.

    Parameters
    ----------
    series_codes : str, list or tuple
        Passed through to `bcrp_get` unchanged.
    start, end : str or None
        Passed through to `bcrp_get`; both are optional, as they are there.

    Returns
    -------
    The `bcrp_get` result, or on any exception an empty DataFrame with
    columns ["date", code1, code2, ...]. A "-"-joined string is split into
    one column per code, the same way `bcrp_get` names its columns.
    """
    try:
        return bcrp_get(series_codes, start=start, end=end)
    except Exception:
        # Deliberate catch-all: the notebook must keep running without network access.
        if isinstance(series_codes, (list, tuple)):
            codes_list = [str(c).strip() for c in series_codes]
        else:
            codes_list = str(series_codes).strip().split("-")
        return pd.DataFrame(columns=["date"] + codes_list)

def yf_download_close_volume(tickers, start: str, end: str) -> pd.DataFrame:
    """
    Download Close and Volume (real market data) using yfinance.

    Parameters
    ----------
    tickers : str, list or tuple
        One ticker symbol or several.
    start, end : str
        Date bounds forwarded to `yf.download`.

    Returns
    -------
    Long-format DataFrame with columns ["date", "ticker", "close", "volume"].
    Rows with a missing close are dropped. If yfinance is unavailable, the
    download fails, or nothing is returned, the result is an empty DataFrame
    with those columns.

    Caching is best-effort: an unreadable cache file triggers a fresh
    download, and a failed cache write only warns instead of discarding data.
    """
    expected_cols = ["date", "ticker", "close", "volume"]

    # yfinance is imported lazily so the notebook still runs without it.
    try:
        import yfinance as yf
    except Exception:
        return pd.DataFrame(columns=expected_cols)

    # Accept a single ticker or a list/tuple of tickers.
    cols = list(tickers) if isinstance(tickers, (list, tuple)) else [tickers]

    # Deterministic cache file for this exact request.
    key = _hash_key("yf_long", ",".join(str(c) for c in cols), start, end)
    cache_path = CACHE_DIR / f"yf_long_{key}.parquet"
    if cache_path.exists():
        try:
            return pd.read_parquet(cache_path)
        except Exception as exc:
            # A corrupt cache file or missing parquet engine is a cache miss.
            warnings.warn(f"Ignoring unreadable cache file {cache_path}: {exc!r}")

    try:
        # auto_adjust=True is passed through to yfinance (adjusted prices).
        data = yf.download(cols, start=start, end=end, auto_adjust=True, progress=False)
        if data.empty:
            return pd.DataFrame(columns=expected_cols)

        if isinstance(data.columns, pd.MultiIndex):
            # MultiIndex columns: ("Close", ticker), ("Volume", ticker), ...
            close = data["Close"].copy()
            vol = data["Volume"].copy()
        else:
            # Flat columns (single ticker): relabel so the column is the ticker.
            close = data[["Close"]].rename(columns={"Close": cols[0]})
            vol = data[["Volume"]].rename(columns={"Volume": cols[0]})

        # Name the index so it becomes a "date" column after reset_index().
        close.index.name = "date"
        vol.index.name = "date"

        # Wide -> long, then join close and volume on (date, ticker).
        long_close = close.reset_index().melt(id_vars="date", var_name="ticker", value_name="close")
        long_vol = vol.reset_index().melt(id_vars="date", var_name="ticker", value_name="volume")
        out = long_close.merge(long_vol, on=["date", "ticker"], how="inner").dropna(subset=["close"])
    except Exception:
        # Download/reshape failures yield the documented empty result.
        return pd.DataFrame(columns=expected_cols)

    # Kept outside the download try-block: a failed write must not throw away `out`.
    try:
        out.to_parquet(cache_path)
    except Exception as exc:
        warnings.warn(f"Could not write cache file {cache_path}: {exc!r}")
    return out

def safe_head(df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
    """Return the first *n* rows of *df*, or an empty DataFrame when *df* is not a DataFrame."""
    if not isinstance(df, pd.DataFrame):
        return pd.DataFrame()
    return df.head(n)


## 3.2. <a id='series'>Pandas Series</a>

We will use:
- **BCRPData API**: daily PEN/USD exchange rate (buy/sell)
- **Yahoo Finance** via `yfinance`: close/volume for US tickers

Data sources:
- BCRP API help: https://estadisticas.bcrp.gob.pe/estadisticas/series/ayuda/api
- yfinance: https://ranaroussi.github.io/yfinance/


In [871]:
# Date window shared by both downloads below.
START, END = "2022-01-01", "2025-12-18"

# BCRP daily PEN/USD quotes: PD04637PD is renamed to the buy column, PD04638PD to the sell column.
fx = bcrp_get_cached_or_empty(["PD04637PD", "PD04638PD"], start=START, end=END)
fx = fx.rename(columns={"PD04637PD": "PEN_USD_buy", "PD04638PD": "PEN_USD_sell"})

# Yahoo Finance: long-format table (date, ticker, close, volume).
tickers = ["SPY", "QQQ", "TLT", "GLD", "EEM"]
us_mkt = yf_download_close_volume(tickers, start=START, end=END)

# Quick look: shapes of both tables and their first rows.
fx.shape, us_mkt.shape, safe_head(fx), safe_head(us_mkt)

((584, 3),
 (4970, 4),
         date  PEN_USD_buy  PEN_USD_sell
 0 2022-02-01     3.871333      3.877667
 1 2022-02-02     3.852000      3.857000
 2 2022-02-03     3.858500      3.860833
 3 2022-02-04     3.863000      3.867833
 4 2022-02-07     3.838500      3.845833,
         date ticker      close    volume
 0 2022-01-03    EEM  44.624969  27572700
 1 2022-01-04    EEM  44.470772  24579500
 2 2022-01-05    EEM  43.745163  46425100
 3 2022-01-06    EEM  43.944714  34288700
 4 2022-01-07    EEM  44.343792  32640900)

### 3.2.1. <a id='3.2.1'>From `lists` to `Series`</a>

**Exercise:** create a Series from a Python list using FX mid-rate.

1. Create `PEN_USD_mid = (buy + sell)/2`.
2. Take the **last 15 values** as a Python list.
3. Build a `pd.Series` with those values (index can be 0..14).
4. Name the Series `PEN_USD_mid_last15`.


In [872]:
# 3.2.1 — From list to Series (FX mid-rate)

# 1) Mid-rate: average of the buy and sell quotes (PEN per USD).
#    If the FX table is empty or lacks the quote columns, use an empty float Series.
has_fx_quotes = {"PEN_USD_buy", "PEN_USD_sell"}.issubset(fx.columns) and fx.shape[0] > 0
if has_fx_quotes:
    PEN_USD_mid = (fx["PEN_USD_buy"] + fx["PEN_USD_sell"]) / 2
else:
    PEN_USD_mid = pd.Series(dtype=float)

# 2) Last 15 non-missing values as a plain Python list.
mid_last15_list = PEN_USD_mid.dropna().tail(15).tolist()

# 3) Build a Series from the list; the default integer index 0..14 is fine.
#    dtype=float keeps the result numeric even when the list is empty
#    (recent pandas versions would otherwise give an empty list object dtype).
PEN_USD_mid_last15 = pd.Series(mid_last15_list, dtype=float, name="PEN_USD_mid_last15")

# Optional self-check
print(PEN_USD_mid_last15.name, PEN_USD_mid_last15.shape)
print(PEN_USD_mid_last15.head())



PEN_USD_mid_last15 (15,)
0    3.370750
1    3.359500
2    3.368214
3    3.364071
4    3.370393
Name: PEN_USD_mid_last15, dtype: float64


### 3.2.2. <a id='3.2.2'>From `NumPy array` to `Series`</a>

**Exercise:** create a Series from a NumPy array using US market close prices.

1. Filter `us_mkt` for ticker `SPY`.
2. Extract the `close` column as a NumPy array.
3. Build a `pd.Series` with:
   - data = the NumPy array
   - index = the corresponding dates
4. Compute `mean`, `min`, `max` using Series methods.


In [873]:
# 3.2.2 — From NumPy array to Series (SPY close)

# 1) Rows for ticker "SPY" only; anything that is not a DataFrame counts as "no data".
if isinstance(us_mkt, pd.DataFrame):
    spy_df = us_mkt.loc[us_mkt["ticker"].eq("SPY")].copy()
else:
    spy_df = pd.DataFrame()

if spy_df.empty:
    # No SPY rows: empty Series and NaN statistics.
    SPY_close_series = pd.Series(dtype=float)
    summary_stats = {"mean": np.nan, "min": np.nan, "max": np.nan}
else:
    # 2) The dates become the Series index, so parse them, drop unparseable ones and sort.
    spy_df["date"] = pd.to_datetime(spy_df["date"], errors="coerce")
    spy_df = spy_df.dropna(subset=["date"]).sort_values("date")

    # 3) The 'close' column as a NumPy array.
    spy_close_np = spy_df["close"].to_numpy()

    # 4) Series with data = the NumPy array and index = the matching dates.
    SPY_close_series = pd.Series(data=spy_close_np, index=spy_df["date"], name="SPY_close")

    # 5) mean / min / max through the Series methods of the same name.
    summary_stats = {
        stat: float(getattr(SPY_close_series, stat)())
        for stat in ("mean", "min", "max")
    }

# Optional self-check
print(SPY_close_series.head())
print(summary_stats)


date
2022-01-03    453.210388
2022-01-04    453.058594
2022-01-05    444.358948
2022-01-06    443.941528
2022-01-07    442.186310
Name: SPY_close, dtype: float64
{'mean': 487.0435106058716, 'min': 342.1902160644531, 'max': 689.1699829101562}


### 3.2.3. <a id='3.2.3'>From `Dictionary` to `Series`</a>

**Exercise:** build a dict and convert to a Series.

1. Using `us_mkt`, compute the **last available close** for each ticker in `tickers`.
2. Store results in a dict: `{ticker: last_close}`.
3. Convert to a Series and sort descending.


In [874]:
# 3.2.3 — From dictionary to Series (last close by ticker)
#
# Plan:
#   1) For every ticker in `tickers`, look up its most recent non-missing close in `us_mkt`.
#   2) Collect the results in a dict {ticker: last_close}.
#   3) Turn the dict into a Series sorted from highest to lowest.

last_close_by_ticker = {}

if isinstance(us_mkt, pd.DataFrame) and not us_mkt.empty:
    # Parse dates, drop rows without a valid date, and order by ticker then date
    # so the final row of each ticker is its latest observation.
    tmp = (
        us_mkt.assign(date=pd.to_datetime(us_mkt["date"], errors="coerce"))
        .dropna(subset=["date"])
        .sort_values(["ticker", "date"])
    )

    for t in tickers:
        sub = tmp.loc[tmp["ticker"].eq(t), ["date", "close"]].dropna(subset=["close"])
        # Tickers with no usable close are simply left out of the dict.
        if not sub.empty:
            last_close_by_ticker[t] = float(sub["close"].iloc[-1])

# dict -> Series (index = ticker, values = last close), highest first.
last_close_series = pd.Series(last_close_by_ticker, dtype=float).sort_values(ascending=False)

# Optional self-check
print(last_close_series)


SPY    671.400024
QQQ    600.409973
GLD    399.290009
TLT     87.800003
EEM     52.599998
dtype: float64


### 3.2.4. <a id='3.2.4'>`Series` vs `NumPy`</a>

1. Create two Series:
   - `fx_mid`: FX mid-rate indexed by date
   - `spy_close`: SPY close indexed by date
2. Create a DataFrame by combining them (pandas aligns on dates).
3. Separately, create two NumPy arrays of the same length by truncating to the same number of rows.
4. Explain in markdown why pandas alignment is safer.


In [875]:
# 3.2.4 — Series vs NumPy (alignment vs truncation)
#
# Plan:
#   1) Build two date-indexed Series: fx_mid (FX mid-rate) and spy_close (SPY close).
#   2) Combine them into a DataFrame; pandas lines the rows up by date label.
#   3) Build two equal-length NumPy arrays by truncation (no date alignment at all).
#   4) Print both results to show why label alignment is the safer option.

# ---------- 1) Date-indexed Series ----------

# FX mid-rate, indexed by the FX dates.
fx_has_data = {"PEN_USD_buy", "PEN_USD_sell", "date"}.issubset(fx.columns) and not fx.empty
if not fx_has_data:
    fx_mid = pd.Series(dtype=float, name="fx_mid")
else:
    fx_tmp = fx[["date", "PEN_USD_buy", "PEN_USD_sell"]].copy()
    fx_tmp["date"] = pd.to_datetime(fx_tmp["date"], errors="coerce")
    fx_tmp = fx_tmp.dropna(subset=["date"]).sort_values("date")

    fx_mid = ((fx_tmp["PEN_USD_buy"] + fx_tmp["PEN_USD_sell"]) / 2).rename("fx_mid")
    fx_mid.index = fx_tmp["date"]  # the date labels now live in the index

# SPY close, indexed by the trading dates.
if isinstance(us_mkt, pd.DataFrame) and not us_mkt.empty:
    spy_tmp = us_mkt.loc[us_mkt["ticker"] == "SPY", ["date", "close"]].copy()
    spy_tmp["date"] = pd.to_datetime(spy_tmp["date"], errors="coerce")
    spy_tmp = spy_tmp.dropna(subset=["date"]).sort_values("date")

    spy_close = pd.Series(spy_tmp["close"].to_numpy(), index=spy_tmp["date"], name="spy_close")
else:
    spy_close = pd.Series(dtype=float, name="spy_close")

# ---------- 2) pandas: alignment by date label ----------
# Values are matched on the index, not on row position; a date present in only
# one of the two Series shows up as NaN in the other column.
aligned_df = pd.concat([fx_mid, spy_close], axis=1)

# Rows where both series have a value.
aligned_common = aligned_df.dropna()

# ---------- 3) NumPy: truncation by position ----------
# Arrays carry no labels. Cutting both to the same length pairs element 0 with
# element 0, and so on, whether or not the underlying dates agree.
fx_vals, spy_vals = fx_mid.to_numpy(), spy_close.to_numpy()
fx_dates, spy_dates = fx_mid.index.to_numpy(), spy_close.index.to_numpy()

n = min(len(fx_vals), len(spy_vals))
fx_np, spy_np = fx_vals[:n], spy_vals[:n]
fx_np_dates, spy_np_dates = fx_dates[:n], spy_dates[:n]

# ---------- 4) Demonstration ----------
print("Aligned (head) — pandas alignment by date (NaNs are expected if calendars differ):")
print(aligned_df.head(), "\n")

print("Aligned (common dates) head — only dates where both exist:")
print(aligned_common.head(), "\n")

print("NumPy shapes (forced equal length by truncation):", fx_np.shape, spy_np.shape)
print("First 5 NumPy 'paired' dates (often NOT the same date):")
for i, (fx_day, spy_day) in enumerate(zip(fx_np_dates[:5], spy_np_dates[:5])):
    print(i, fx_day, spy_day)


Aligned (head) — pandas alignment by date (NaNs are expected if calendars differ):
            fx_mid   spy_close
date                          
2022-01-03     NaN  453.210388
2022-01-04     NaN  453.058594
2022-01-05     NaN  444.358948
2022-01-06     NaN  443.941528
2022-01-07     NaN  442.186310 

Aligned (common dates) head — only dates where both exist:
              fx_mid   spy_close
date                            
2022-02-01  3.874500  429.720306
2022-02-02  3.854500  433.894562
2022-02-03  3.859667  423.695923
2022-02-04  3.865417  425.688232
2022-02-07  3.842167  424.322052 

NumPy shapes (forced equal length by truncation): (584,) (584,)
First 5 NumPy 'paired' dates (often NOT the same date):
0 2022-02-01T00:00:00.000000000 2022-01-03T00:00:00.000000000
1 2022-02-02T00:00:00.000000000 2022-01-04T00:00:00.000000000
2 2022-02-03T00:00:00.000000000 2022-01-05T00:00:00.000000000
3 2022-02-04T00:00:00.000000000 2022-01-06T00:00:00.000000000
4 2022-02-07T00:00:00.000000000 2022-01-07T00:00:00.000000000

### 3.2.5. <a id='3.2.5'>Indexing</a>

**Exercise:** practice `.loc` and `.iloc`.

1. From `last_close_series`, use `.iloc` to take the top 3 tickers.
2. Use `.loc` to select the value for `SPY`.
3. If `SPY` is not present, explain why (in markdown).


In [876]:
# 3.2.5 — Indexing with .iloc and .loc
#
# last_close_series (built in 3.2.3) has ticker symbols as its index and the
# last available close as its values, already sorted from highest to lowest.

# 1) .iloc is position-based: the first three positions are the three highest closes.
top3 = last_close_series.iloc[:3]

# 2) .loc is label-based: look up "SPY" by name.
#    The membership test avoids a KeyError when "SPY" is absent.
spy_value = float(last_close_series.loc["SPY"]) if "SPY" in last_close_series.index else np.nan

# self-check
print(top3)
print("SPY value:", spy_value)


SPY    671.400024
QQQ    600.409973
GLD    399.290009
dtype: float64
SPY value: 671.4000244140625


## 3.3. <a id='3.3'>DataFrame</a>

We now practice DataFrame creation and common methods using the same datasets.

### 3.3.1. <a id='3.3.1'>DataFrame Generation</a>

#### From `lists` and `dict` to `DataFrame`

**Exercise:** create a DataFrame of ticker metadata.

1. Make a list of tickers.
2. Make a list of last closes (same order).
3. Make a dict for an extra column, e.g. `{ticker: 'US'}`.
4. Build a DataFrame with columns: `ticker`, `last_close`, `market`.


In [877]:
# 3.3.1 — DataFrame from lists + dict (ticker metadata)

# Step 1) Tickers, in the order they appear in the index of last_close_series.
tickers_list = last_close_series.index.tolist()

# Step 2) Last closes as plain Python floats, in that same order.
last_close_list = list(map(float, last_close_series.to_numpy()))

# Step 3) Extra metadata held in a dict: every ticker is tagged as "US".
market_dict = dict.fromkeys(tickers_list, "US")

# Step 4) Assemble the table: two columns come from the lists, and "market"
#         is filled by looking each ticker up in market_dict.
ticker_df = pd.DataFrame(
    {
        "ticker": tickers_list,
        "last_close": last_close_list,
        "market": [market_dict[symbol] for symbol in tickers_list],
    }
)
print(ticker_df.head())


  ticker  last_close market
0    SPY  671.400024     US
1    QQQ  600.409973     US
2    GLD  399.290009     US
3    TLT   87.800003     US
4    EEM   52.599998     US


#### From `lists` and `NumPy` to `DataFrame`

**Exercise:** build a DataFrame from NumPy arrays.

1. Take the `close` column for `SPY` and `QQQ` from `us_mkt`.
2. Convert each to a NumPy array.
3. Build a DataFrame with 2 columns: `SPY_close`, `QQQ_close`.
4. Add a column with row index (0..n-1) named `t`.


In [878]:
# 3.3.1 — DataFrame from NumPy arrays (SPY vs QQQ closes)

# Step 1) Non-missing closes of each ticker, converted to NumPy arrays.
is_spy = us_mkt["ticker"].eq("SPY")
is_qqq = us_mkt["ticker"].eq("QQQ")
spy_close = us_mkt.loc[is_spy, "close"].dropna().to_numpy()
qqq_close = us_mkt.loc[is_qqq, "close"].dropna().to_numpy()

# Step 2) Arrays have no date labels, so nothing aligns them for us.
#         Cut both down to the length of the shorter one.
n = min(len(spy_close), len(qqq_close))
spy_close, qqq_close = spy_close[:n], qqq_close[:n]

# Steps 3-4) One column per array, plus a row counter t = 0..n-1
#            (a stand-in time index for this example).
prices_np_df = pd.DataFrame({"SPY_close": spy_close, "QQQ_close": qqq_close}).assign(t=np.arange(n))
print(prices_np_df.head())


    SPY_close   QQQ_close  t
0  453.210388  392.184082  0
1  453.058594  387.097229  1
2  444.358948  375.205200  2
3  443.941528  374.941589  3
4  442.186310  370.879913  4


### 3.3.2. <a id='3.3.2'>Indexing</a>

**Exercise:** `.loc` and `.iloc` on DataFrames.

1. Use `.iloc` to take first 5 rows of `us_mkt`.
2. Use `.loc` with a boolean condition to keep only rows where `ticker == 'SPY'`.
3. Select only the columns `date`, `ticker`, `close`.


In [879]:
# 3.3.2 — Indexing with .iloc and .loc

if isinstance(us_mkt, pd.DataFrame):
    # 1) .iloc picks rows by POSITION: rows 0..4 are the first five rows.
    first5 = us_mkt.iloc[:5].copy()

    # 2) .loc picks rows by LABEL / CONDITION: keep rows whose ticker is "SPY",
    # 3) and in the same call keep only the columns date, ticker, close.
    only_spy = us_mkt.loc[us_mkt["ticker"].eq("SPY"), ["date", "ticker", "close"]].copy()
else:
    # Not a DataFrame: fall back to empty placeholders.
    first5 = pd.DataFrame()
    only_spy = pd.DataFrame()

# Optional self-check: first rows of the filtered table, when there are any.
if isinstance(only_spy, pd.DataFrame) and len(only_spy) > 0:
    print(only_spy.head())


           date ticker       close
2982 2022-01-03    SPY  453.210388
2983 2022-01-04    SPY  453.058594
2984 2022-01-05    SPY  444.358948
2985 2022-01-06    SPY  443.941528
2986 2022-01-07    SPY  442.186310


### 3.3.3. <a id='3.3.3'>General Methods</a>

**Exercise:** basic methods: `.shape`, `.columns`, `.info`, `.describe`, `.sort_values`.

1. Show `us_mkt.shape` and `us_mkt.columns`.
2. Use `.describe()` on `close` and `volume`.
3. Sort `us_mkt` by `volume` descending and keep top 10 rows.


In [880]:
# 3.3.3 — General methods

us_mkt_ready = isinstance(us_mkt, pd.DataFrame) and not us_mkt.empty

if not us_mkt_ready:
    # Nothing to summarize: keep the result names defined as empty placeholders.
    desc = pd.DataFrame()
    top10_volume = pd.DataFrame()
    print("us_mkt is empty or not a DataFrame.")
else:
    # 1) Structure: (rows, columns) and the column names.
    print("Shape:", us_mkt.shape)
    print("Columns:", list(us_mkt.columns), "\n")

    # 2) Summary statistics of "close" and "volume"
    #    (count, mean, std, min, quartiles, max).
    desc = us_mkt[["close", "volume"]].describe()
    print("Describe (close, volume):")
    print(desc, "\n")

    # 3) The ten rows with the largest volume, largest first.
    top10_volume = us_mkt.sort_values("volume", ascending=False).head(10)
    print("Top 10 by volume:")
    print(top10_volume)


Shape: (4970, 4)
Columns: ['date', 'ticker', 'close', 'volume'] 

Describe (close, volume):
             close        volume
count  4970.000000  4.970000e+03
mean    250.238890  4.042105e+07
std     187.757433  2.956145e+07
min      31.036425  1.436500e+06
25%      85.874727  1.882345e+07
50%     187.864998  3.452785e+07
75%     399.523529  5.666980e+07
max     689.169983  2.566114e+08 

Top 10 by volume:
           date ticker       close     volume
3799 2025-04-07    SPY  501.502930  256611400
2996 2022-01-24    SPY  417.282562  251783900
3801 2025-04-09    SPY  545.490601  241867300
3798 2025-04-04    SPY  502.397797  217965100
3018 2022-02-24    SPY  406.334412  213942900
2995 2022-01-21    SPY  415.517944  202271200
2002 2022-01-24    QQQ  344.947845  198685800
3279 2023-03-10    SPY  372.058044  189253000
2998 2022-01-26    SPY  411.153931  186391100
3282 2023-03-15    SPY  375.307159  172996900


### 3.3.4. <a id='3.3.4'>Importing Data</a>

**Exercise:** `to_csv` + `read_csv` using real data.

1. Save a subset of `us_mkt` to `data/us_mkt_sample.csv` (e.g., 500 rows).
2. Read it back using `pd.read_csv`.
3. Rename columns to snake_case.
4. Check dtypes and missing values.


In [881]:
# 3.3.4 — Importing data: to_csv + read_csv

if not (isinstance(us_mkt, pd.DataFrame) and not us_mkt.empty):
    # No data to export: keep both result names defined as empty placeholders.
    us_sample = pd.DataFrame()
    us_from_csv = pd.DataFrame()
else:
    # The output folder "data" is created when it does not exist yet.
    Path("data").mkdir(parents=True, exist_ok=True)
    csv_path = "data/us_mkt_sample.csv"

    # 1) Write the first 500 rows to CSV, leaving the index out of the file.
    us_sample = us_mkt.head(500).copy()
    us_sample.to_csv(csv_path, index=False)

    # 2) Load the file back into a fresh DataFrame.
    us_from_csv = pd.read_csv(csv_path)

    # 3) snake_case column names: trim surrounding spaces, lowercase,
    #    and turn inner spaces into underscores.
    us_from_csv = us_from_csv.rename(columns=lambda c: c.strip().lower().replace(" ", "_"))

    # 4) Inspect dtypes and missing values after the round trip, to see which
    #    columns kept their type and whether anything went missing.
    print("Dtypes:")
    print(us_from_csv.dtypes, "\n")

    print("Missing values per column:")
    print(us_from_csv.isna().sum())


Dtypes:
date       object
ticker     object
close     float64
volume      int64
dtype: object 

Missing values per column:
date      0
ticker    0
close     0
volume    0
dtype: int64


### 3.3.5. <a id='3.3.5'>Filtering data</a>

**Exercise:** filtering with conditions.

1. Filter `us_mkt` for rows where `close` is above the 90th percentile **within each ticker**.
2. Filter rows with `volume` missing (if any) and count them.
3. Create a filtered DataFrame for tickers `['SPY','GLD']` only.


In [882]:
# 3.3.5 — Filtering data with boolean masks

if not isinstance(us_mkt, pd.DataFrame) or us_mkt.empty:
    # Nothing to filter: expose empty placeholders so the summary prints below still work.
    tmp = pd.DataFrame()
    high_close = pd.DataFrame()
    missing_volume = pd.DataFrame()
    missing_volume_count = 0
    spy_gld = pd.DataFrame()

else:
    # Work on a copy so the original table is left untouched.
    tmp = us_mkt.copy()

    # 1) Rows whose close is above the 90th percentile of their own ticker.
    #    Grouping the close column by ticker and calling transform() yields one
    #    threshold per row (same index as tmp), so it can be compared row by row.
    p90 = tmp["close"].groupby(tmp["ticker"]).transform(lambda s: s.quantile(0.90))
    high_close = tmp[tmp["close"].gt(p90)].copy()

    # 2) Rows where volume is missing, plus how many there are.
    missing_volume = tmp[tmp["volume"].isna()].copy()
    missing_volume_count = len(missing_volume)

    # 3) Subset restricted to the SPY and GLD tickers.
    spy_gld = tmp[tmp["ticker"].isin(["SPY", "GLD"])].copy()

# Quick sanity checks on the three results.
print("High-close rows shape:", high_close.shape)
print("Missing volume rows:", missing_volume_count)
print("SPY+GLD subset shape:", spy_gld.shape)
print(spy_gld.head())


High-close rows shape: (500, 4)
Missing volume rows: 0
SPY+GLD subset shape: (1988, 4)
          date ticker       close    volume
994 2022-01-03    GLD  168.330002   9014400
995 2022-01-04    GLD  169.570007   6965600
996 2022-01-05    GLD  169.059998   8715600
997 2022-01-06    GLD  166.990005  10902700
998 2022-01-07    GLD  167.750000   8191900


### 3.3.6. <a id='3.3.6'>Dealing with nulls</a>

**Exercise:** introduce NaNs and handle them.

1. Copy `us_mkt` to `us_mkt_nan`.
2. Set 1% of the `close` values to NaN (fixed random seed).
3. Create two cleaned versions:
   - dropped NaNs
   - filled NaNs with the ticker-specific median close
4. Compare shapes.


In [883]:
# 3.3.6 — Dealing with nulls (introduce NaNs then clean)

# Step 1) Make a copy of the original dataset so we do not modify `us_mkt`.
# The type check runs BEFORE `.copy()` so that a non-DataFrame `us_mkt` falls
# through to the placeholder branch instead of raising on the copy call.
if isinstance(us_mkt, pd.DataFrame):
    us_mkt_nan = us_mkt.copy()
else:
    us_mkt_nan = pd.DataFrame()

if us_mkt_nan.shape[0] > 0:

    # Step 2) Randomly set 1% of the 'close' values to NaN (missing).
    # - A fixed random seed (123) keeps the result reproducible.
    rng = np.random.default_rng(123)
    n = us_mkt_nan.shape[0]                 # total number of rows
    k = max(1, int(0.01 * n))               # number of rows to corrupt (1% of n, at least 1)

    # Choose k distinct row POSITIONS (0..n-1).
    idx = rng.choice(n, size=k, replace=False)

    # Assign by position, not by label: if the index ever contains repeated
    # labels, a label-based assignment would blank every row sharing a label
    # and corrupt more than k rows. Positional assignment always hits exactly k.
    close_pos = us_mkt_nan.columns.get_loc("close")
    us_mkt_nan.iloc[idx, close_pos] = np.nan

    # Step 3a) Clean version 1: drop rows where 'close' is missing.
    # This removes observations and therefore reduces the number of rows.
    us_drop = us_mkt_nan.dropna(subset=["close"]).copy()

    # Step 3b) Clean version 2: fill missing 'close' values using the ticker-specific median.
    # - groupby("ticker") splits the rows by ticker
    # - transform("median") returns one median per row, aligned to the original rows
    med = us_mkt_nan.groupby("ticker")["close"].transform("median")

    # Fill NaN closes with the corresponding ticker median (keeps the same number of rows).
    us_fill = us_mkt_nan.copy()
    us_fill["close"] = us_fill["close"].fillna(med)

else:
    # If input is empty or not a DataFrame, create empty placeholders.
    us_drop = pd.DataFrame()
    us_fill = pd.DataFrame()

# Step 4) Compare shapes to see the effect of dropping vs filling.
# getattr keeps this line from failing when `us_mkt` is not a DataFrame.
print("Original:", getattr(us_mkt, "shape", None), "With NaNs:", us_mkt_nan.shape)
print("Drop:", us_drop.shape, "Fill:", us_fill.shape)


Original: (4970, 4) With NaNs: (4970, 4)
Drop: (4921, 4) Fill: (4970, 4)


### 3.3.7. <a id='3.3.7'>Duplicates</a>

**Exercise:** create duplicates and remove them.

1. Create `dup_df` by stacking the last 5 rows of `us_mkt` twice.
2. Use `.duplicated()` to detect duplicates.
3. Use `.drop_duplicates()` to remove duplicates.
4. Verify row counts before/after.


In [884]:
# 3.3.7 — Duplicates: build a table with repeated rows, flag them, then drop them

if not isinstance(us_mkt, pd.DataFrame) or us_mkt.empty:
    # No usable input: empty placeholders keep the report below working.
    dup_df = pd.DataFrame()
    dup_mask = pd.Series(dtype=bool)
    dedup_df = pd.DataFrame()

else:
    # Step 1) The last 5 rows of us_mkt, stacked on top of themselves.
    # ignore_index=True renumbers the result 0..9, so rows 5-9 repeat rows 0-4.
    last5 = us_mkt.tail(5).copy()
    dup_df = pd.concat([last5] * 2, ignore_index=True)

    # Step 2) Flag repeats: False the first time a row appears,
    # True for every later identical copy.
    dup_mask = dup_df.duplicated()

    # Step 3) Keep only the first occurrences, i.e. the rows that were not flagged.
    dedup_df = dup_df[~dup_mask].copy()

# Step 4) Report row counts before and after removing duplicates.
# The sum of an empty boolean mask is 0, so no separate empty-case check is needed.
print("dup_df rows:", dup_df.shape[0])
print("Number of duplicated rows (True in dup_mask):", int(dup_mask.sum()))
print("dedup_df rows:", dedup_df.shape[0])


dup_df rows: 10
Number of duplicated rows (True in dup_mask): 5
dedup_df rows: 5


### 3.3.8. <a id='3.3.8'>Groupby</a>

**Exercise:** groupby + aggregation.

1. Group `us_mkt` by `ticker` and compute:
   - mean close
   - median close
   - max volume
2. Rename the resulting columns clearly.
3. Sort by mean close descending.


In [885]:
# 3.3.8 — Groupby + aggregation: one summary row per ticker

# Start from the empty placeholder; it is replaced only when there is data to summarise.
ticker_summary = pd.DataFrame()

if isinstance(us_mkt, pd.DataFrame) and not us_mkt.empty:

    # 1) Aggregate within each ticker. Named aggregation gives the output
    #    columns their final names directly:
    #    - mean_close:   mean of `close`
    #    - median_close: median of `close`
    #    - max_volume:   maximum of `volume`
    per_ticker = us_mkt.groupby("ticker").agg(
        mean_close=("close", "mean"),
        median_close=("close", "median"),
        max_volume=("volume", "max"),
    )

    # 2) Turn the group label (ticker) back into an ordinary column, then
    # 3) order by mean close, highest first, with a fresh 0..n-1 row index.
    ticker_summary = per_ticker.reset_index().sort_values(
        "mean_close", ascending=False, ignore_index=True
    )

print(ticker_summary)


  ticker  mean_close  median_close  max_volume
0    SPY  487.043511    462.101456   256611400
1    QQQ  411.806856    400.730148   198685800
2    GLD  220.130422    187.864998    62025000
3    TLT   91.751313     88.894325   131353500
4    EEM   40.462350     39.107405   134225700


### 3.3.9. <a id='3.3.9'>Reshape</a>

#### From Wide to Long

**Exercise:** Melt a wide table.

1. Create a small, wide DataFrame with one row containing the last closing price for each ticker.

2. Melt it into long format with the columns: `ticker`, `last_close`.

#### From Long to Wide

**Exercise:** 

3. Using `us_mkt`, create a pivot table with:
- index = `date`
- columns = `ticker`
- values = `close`
4. Keep only the first 50 dates.

In [886]:
# 3.3.9 — Reshape: wide ↔ long

# Part A) From WIDE to LONG
# 1) One-row WIDE table: one column per entry of `last_close_series`,
#    with the single row labelled "last_close".
if isinstance(last_close_series, pd.Series) and len(last_close_series) > 0:
    # Naming the column before transposing makes that name the row label.
    wide_last = last_close_series.to_frame(name="last_close").T
else:
    wide_last = pd.DataFrame()

# 2) LONG table with columns (ticker, last_close).
#    melt() moves the column labels into "ticker" and the cell values into "last_close".
if wide_last.empty:
    long_last = pd.DataFrame(columns=["ticker", "last_close"])
else:
    long_last = wide_last.reset_index(drop=True).melt(   # row label is not needed
        var_name="ticker", value_name="last_close"
    )
    # Discard entries without a value and renumber the rows from 0.
    long_last = long_last[long_last["last_close"].notna()].reset_index(drop=True)


# Part B) From LONG to WIDE
# 3) Pivot us_mkt (long format) into a table with:
#    - index   = date
#    - columns = ticker
#    - values  = close
#    aggfunc="last" keeps the last value if a (date, ticker) pair appears more than once.
if isinstance(us_mkt, pd.DataFrame) and not us_mkt.empty:
    # Parse dates on a copy; unparseable values become NaT and are removed.
    tmp = us_mkt.assign(
        date=pd.to_datetime(us_mkt["date"], errors="coerce")
    ).dropna(subset=["date"])

    wide_close = tmp.pivot_table(
        index="date", columns="ticker", values="close", aggfunc="last"
    )

    # 4) Chronological order, first 50 dates only, with date back as a regular column.
    wide_close = wide_close.sort_index().head(50).reset_index()
else:
    wide_close = pd.DataFrame()
print("WIDE (last closes):")
print(wide_last, "\n")

print("LONG (ticker, last_close):")
print(long_last.head(), "\n")

print("WIDE (pivot: date x ticker, first 50 dates):")
print(wide_close.head())


WIDE (last closes):
                   SPY         QQQ         GLD        TLT        EEM
last_close  671.400024  600.409973  399.290009  87.800003  52.599998 

LONG (ticker, last_close):
  ticker  last_close
0    SPY  671.400024
1    QQQ  600.409973
2    GLD  399.290009
3    TLT   87.800003
4    EEM   52.599998 

WIDE (pivot: date x ticker, first 50 dates):
ticker       date        EEM         GLD         QQQ         SPY         TLT
0      2022-01-03  44.624969  168.330002  392.184082  453.210388  125.782967
1      2022-01-04  44.470772  169.570007  387.097229  453.058594  125.259987
2      2022-01-05  43.745163  169.059998  375.205200  444.358948  124.580070
3      2022-01-06  43.944714  166.990005  374.941589  443.941528  124.902588
4      2022-01-07  44.343792  167.750000  370.879913  442.186310  124.004715


### 3.3.10. <a id='3.3.10'>Merge</a>

**Exercise:** merge Peru macro data (BCRP) with US market data (Yahoo).

1. Fetch BCRP monthly policy rate: `PD12301MD`.
2. Create a monthly table from US market data by extracting `year` and `month` from the `date` column.
   Hint: you can use `pd.to_datetime` **only here**.
3. Compute the monthly average close for SPY.
4. Merge policy rate with monthly SPY average using `merge`.
5. Save to `outputs/lecture2_policy_spy_monthly.csv`.


In [887]:
# 3.3.10 — Merge: monthly policy rate joined with the monthly SPY average close

from pathlib import Path

# 0) The output folder must exist before the CSV is written at the end.
Path("outputs").mkdir(parents=True, exist_ok=True)
out_path = "outputs/lecture2_policy_spy_monthly.csv"

# 1) Fetch the BCRP policy-rate series and give its value column a readable name.
# NOTE(review): the helper is defined earlier in the notebook; the `.dt` calls in
# step 1b assume it returns `date` already parsed as datetime — confirm there.
policy = bcrp_get_cached_or_empty("PD12301MD", start=START, end=END)
policy = policy.rename(columns={"PD12301MD": "policy_rate"})

# 2) Monthly table from the US market data (SPY only).
if not isinstance(us_mkt, pd.DataFrame) or us_mkt.empty:
    spy_monthly = pd.DataFrame(columns=["year", "month", "spy_close_avg"])
else:
    # pd.to_datetime is used here (per the exercise hint) so year/month can be extracted;
    # values that cannot be parsed become NaT and are dropped.
    tmp = us_mkt.copy()
    tmp["date"] = pd.to_datetime(tmp["date"], errors="coerce")
    tmp = tmp.dropna(subset=["date"])

    tmp["year"], tmp["month"] = tmp["date"].dt.year, tmp["date"].dt.month

    # 3) Average SPY close within each (year, month).
    spy_rows = tmp[tmp["ticker"] == "SPY"]
    spy_monthly = spy_rows.groupby(["year", "month"], as_index=False)["close"].mean()
    spy_monthly = spy_monthly.rename(columns={"close": "spy_close_avg"})

# 1b/3b) Collapse the policy series to one row per (year, month) by averaging.
if isinstance(policy, pd.DataFrame) and not policy.empty and "date" in policy.columns:
    pol = policy.dropna(subset=["date"]).copy()
    pol["year"], pol["month"] = pol["date"].dt.year, pol["date"].dt.month

    policy_monthly = pol.groupby(["year", "month"], as_index=False)["policy_rate"].mean()
else:
    policy_monthly = pd.DataFrame(columns=["year", "month", "policy_rate"])

# 4) Inner join on (year, month): only months present in both tables survive.
if spy_monthly.empty or policy_monthly.empty:
    merged_monthly = pd.DataFrame(columns=["year", "month", "spy_close_avg", "policy_rate"])
else:
    merged_monthly = spy_monthly.merge(
        policy_monthly, on=["year", "month"], how="inner"
    ).sort_values(["year", "month"], ignore_index=True)

# 5) Save the result (written even when empty, so the file always exists).
merged_monthly.to_csv(out_path, index=False)
print("Saved:", out_path, "| shape:", merged_monthly.shape)


Saved: outputs/lecture2_policy_spy_monthly.csv | shape: (28, 4)


## 3.4. <a id='3.4'>References</a>

- Pandas Series: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html
- Pandas DataFrame: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html
- Pandas melt: https://pandas.pydata.org/docs/reference/api/pandas.melt.html
- Pandas pivot_table: https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html
- Pandas merge: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html
- BCRP API help: https://estadisticas.bcrp.gob.pe/estadisticas/series/ayuda/api
- yfinance: https://ranaroussi.github.io/yfinance/
