# 1.0 Feature Engineering

## 1.1 Lending indicator engineering (lending_indicators.csv)

In [2]:
from pathlib import Path

import pandas as pd


def _resolve_data_dir(expected_filenames: list[str]) -> Path:
    """Find the `data` directory regardless of notebook CWD."""
    cwd = Path.cwd().resolve()

    # Common cases:
    # - running from repo root: ./market-type/data
    # - running from market-type/: ./data
    candidates = [cwd / "data", cwd / "market-type" / "data"]

    for d in candidates:
        if d.exists() and all((d / f).exists() for f in expected_filenames):
            return d

    # Try walking up a few parents (handles running from subfolders)
    for parent in [cwd, *cwd.parents][:6]:
        d = parent / "market-type" / "data"
        if d.exists() and all((d / f).exists() for f in expected_filenames):
            return d

    raise FileNotFoundError(
        "Could not locate market-type/data with required input CSVs from cwd="
        f"{cwd}."
    )


def _load_series(path: Path, date_col: str = "date") -> pd.Series:
    df = pd.read_csv(path)
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

    # Standardize dates to date-only to guarantee matching joins.
    df[date_col] = df[date_col].dt.normalize()

    value_cols = [c for c in df.columns if c != date_col]
    if len(value_cols) != 1:
        raise ValueError(f"Expected exactly 1 value column in {path.name}, got {value_cols}")

    s = df.set_index(date_col)[value_cols[0]].sort_index()
    s.name = path.stem
    return s


# Input CSVs, keyed by the column each one supplies.
files = dict(
    fhb_number="first-home-buyer-new-loan-commitments-number.csv",
    investor_value_m="investor-new-loan-commitments-value.csv",
    oo_number="owner-occupier-new-loan-commitments-number.csv",
    total_value_m="total-new-loan-commitments-value.csv",
)

data_dir = _resolve_data_dir([*files.values()])

fhb_number, investor_value_m, oo_number, total_value_m = (
    _load_series(data_dir / files[key])
    for key in ("fhb_number", "investor_value_m", "oo_number", "total_value_m")
)

# Put the four series side by side on one date index. Where a series has no
# value for a date, carry its last earlier value forward (no interpolation).
df = pd.concat(
    dict(
        fhb_number=fhb_number,
        investor_value_m=investor_value_m,
        oo_number=oo_number,
        total_value_m=total_value_m,
    ),
    axis=1,
)
df = df.sort_index().ffill()

features = pd.DataFrame(index=df.index)

# Investor share: investor loan value as a fraction of total loan value.
features["investor_share"] = df["investor_value_m"].div(df["total_value_m"])

# FHB share: first-home-buyer loan count as a fraction of owner-occupier count.
features["fhb_share"] = df["fhb_number"].div(df["oo_number"])

# Lending momentum: 3-month percent change in total loan value.
# The step between rows is estimated from the median gap in the index:
# roughly quarterly spacing (>= 80 days) means one row already spans
# 3 months; anything shorter is treated as monthly, so 3 rows are used.
diffs = df.index.to_series().diff().dropna()
if len(diffs):
    median_days = diffs.dt.days.median()
else:
    median_days = None

if median_days is None or median_days >= 80:
    periods = 1
else:
    periods = 3

features["lending_momentum_3m_pct"] = (
    df["total_value_m"].pct_change(periods=periods).mul(100.0)
)

# Write the features next to the inputs, with the index as a `date` column.
out_path = data_dir / "lending_indicators.csv"
features.reset_index(names="date").to_csv(out_path, index=False)

out_path, features.tail(5)

(WindowsPath('C:/Users/jack2/Desktop/2026-q1-propert-market/market-type/data/lending_indicators.csv'),
             investor_share  fhb_share  lending_momentum_3m_pct
 date                                                          
 2024-12-31        0.371929   0.351131                 0.517117
 2025-03-31        0.379477   0.351470                -0.647289
 2025-06-30        0.376337   0.355131                 2.308218
 2025-09-30        0.402704   0.346149                10.925003
 2025-12-31        0.396906   0.358889                 9.518825)

In [3]:
from pathlib import Path

import pandas as pd


# 2.0 Capital indicators (capital-indicators.csv)
# - Real rate = nominal mortgage rate - CPI proxy
#   (per instructions, the overnight cash rate from overnight_cash_rate.csv
#    stands in for the CPI proxy, so the value computed below is
#    nominal_mortgage_rate - overnight_cash_rate).
# - Lagged real-rate features (t-1, t-3, t-6 months)
# - 12-month percent change in real rate


def _resolve_data_dir(expected_filenames: list[str]) -> Path:
    """Find the `data` directory regardless of notebook CWD."""
    cwd = Path.cwd().resolve()
    candidates = [cwd / "data", cwd / "market-type" / "data"]

    for d in candidates:
        if d.exists() and all((d / f).exists() for f in expected_filenames):
            return d

    for parent in [cwd, *cwd.parents][:6]:
        d = parent / "market-type" / "data"
        if d.exists() and all((d / f).exists() for f in expected_filenames):
            return d

    raise FileNotFoundError(
        "Could not locate market-type/data with required input CSVs from cwd="
        f"{cwd}."
    )


def _load_rate_series(path: Path, value_col: str) -> pd.Series:
    df = pd.read_csv(path)
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["date"] = df["date"].dt.normalize()

    s = df.set_index("date")[value_col].sort_index()
    s.name = value_col
    return s


# Input CSVs for the two rate series.
files = dict(
    nominal="nominal_mortgage_rate.csv",
    overnight="overnight_cash_rate.csv",
)

data_dir = _resolve_data_dir([*files.values()])

nominal = _load_rate_series(data_dir / files["nominal"], "nominal_mortgage_rate")
overnight = _load_rate_series(data_dir / files["overnight"], "overnight_cash_rate")

# Put both rates on one date index. Where a series has no value for a date,
# carry its last earlier value forward.
rates = pd.concat(dict(nominal=nominal, overnight=overnight), axis=1)
rates = rates.sort_index().ffill()

# 1. Real rate, computed here as nominal rate minus overnight rate.
rates["real_rate"] = rates["nominal"].sub(rates["overnight"])

features = pd.DataFrame(index=rates.index)
features["real_rate"] = rates["real_rate"]

# 2. Lagged copies of the real rate. Shifts are counted in rows, so they are
#    1/3/6-month lags when the index has one row per month.
for lag_col, lag_rows in {
    "real_rate_lag1": 1,
    "real_rate_lag3": 3,
    "real_rate_lag6": 6,
}.items():
    features[lag_col] = rates["real_rate"].shift(lag_rows)

# 3. Percent change of the real rate against the value 12 rows earlier.
features["real_rate_12m_pct_change"] = (
    rates["real_rate"].pct_change(periods=12).mul(100.0)
)

# Write the features next to the inputs, with the index as a `date` column.
out_path = data_dir / "capital-indicators.csv"
features.reset_index(names="date").to_csv(out_path, index=False)

out_path, features.tail(5)

(WindowsPath('C:/Users/jack2/Desktop/2026-q1-propert-market/market-type/data/capital-indicators.csv'),
             real_rate  real_rate_lag1  real_rate_lag3  real_rate_lag6  \
 date                                                                    
 2025-10-31       4.42            4.42            4.42            4.42   
 2025-11-30       4.42            4.42            4.42            4.52   
 2025-12-31       4.42            4.42            4.42            4.42   
 2026-01-31       4.42            4.42            4.42            4.42   
 2026-02-28       4.17            4.42            4.42            4.42   
 
             real_rate_12m_pct_change  
 date                                  
 2025-10-31                  0.000000  
 2025-11-30                  0.000000  
 2025-12-31                  0.000000  
 2026-01-31                  0.000000  
 2026-02-28                 -7.743363  )