### 1. Project Proposal

### 2. Tooling Setup

### 3. Python Fundamentals

In [1]:
import numpy as np

# Two equal-length 1-D integer arrays used to demonstrate element-wise operations.
a = np.array([1, 2, 3, 4, 5])
b = np.array([10, 20, 30, 40, 50])

print(a + b)   # element-wise sum
print(b / a)   # element-wise true division (float result)
print(b % a)  # element-wise remainder
print(b == 20) # element-wise comparison, yields a boolean array
print(np.sqrt(a))  # element-wise square root

# One million consecutive integers for the timing comparison below.
big_array = np.arange(1_000_000)

# Python-level list comprehension vs. a single vectorized NumPy expression.
%timeit [x ** 2 for x in big_array]
%timeit big_array ** 2

[11 22 33 44 55]
[10. 10. 10. 10. 10.]
[0 0 0 0 0]
[False  True False False False]
[1.         1.41421356 1.73205081 2.         2.23606798]
53.8 ms ± 1.83 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
741 μs ± 7.03 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### 4. Data Acquisition and Ingestion

In [2]:
import os, pathlib, datetime as dt
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Pull variables from a .env file, then report only whether the key is set
# (its value is never printed).
load_dotenv()
print(
    'ALPHAVANTAGE_API_KEY loaded?',
    bool(os.getenv('ALPHAVANTAGE_API_KEY')),
)

# Raw-data directory: taken from DATA_DIR_RAW when set, created if missing.
RAW = pathlib.Path(os.getenv('DATA_DIR_RAW', '../data/raw'))
RAW.mkdir(parents=True, exist_ok=True)

# Processed-data directory: taken from DATA_DIR_PROCESSED when set, created if missing.
PROC = pathlib.Path(os.getenv('DATA_DIR_PROCESSED', '../data/processed'))
PROC.mkdir(parents=True, exist_ok=True)

ALPHAVANTAGE_API_KEY loaded? True


In [3]:
def ts():
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"

def save_csv(df: pd.DataFrame, prefix: str, **meta):
    """Write ``df`` to a timestamped CSV file under ``RAW`` and return its path.

    The file name is ``<prefix>_<key>-<value>_..._<timestamp>.csv``: one
    ``key-value`` segment per entry of ``meta`` (in the order given), followed
    by the timestamp produced by ``ts()``.

    Args:
        df: Frame to write; its index is not written.
        prefix: Leading segment of the file name.
        **meta: Optional tags embedded in the file name.

    Returns:
        The path of the file that was written.
    """
    # Join only the segments that exist, so a call without ``meta`` produces
    # ``<prefix>_<timestamp>.csv`` instead of a doubled underscore.
    segments = [str(prefix)]
    segments.extend(f"{k}-{v}" for k, v in meta.items())
    segments.append(ts())
    path = RAW / ('_'.join(segments) + '.csv')
    df.to_csv(path, index=False)
    print('Saved', path)
    return path

def validate(df: pd.DataFrame, required):
    """Summarize basic quality facts about ``df``.

    Args:
        df: Frame to inspect.
        required: Iterable of column names expected to be present.

    Returns:
        dict with ``'missing'`` (required names absent from ``df.columns``, in
        the order given), ``'shape'`` (``df.shape``) and ``'na_total'`` (total
        count of missing cells as a plain ``int``).
    """
    absent = []
    for name in required:
        if name not in df.columns:
            absent.append(name)

    na_per_column = df.isna().sum()
    report = {
        'missing': absent,
        'shape': df.shape,
        'na_total': int(na_per_column.sum()),
    }
    return report

In [4]:
import os, requests, pandas as pd
from dotenv import load_dotenv

# Load .env variables, then read the Alpha Vantage key from the environment.
# os.getenv returns None when the variable is unset.
load_dotenv()
ALPHA_KEY = os.getenv("ALPHAVANTAGE_API_KEY")

# Inclusive date window; the fetch helpers below filter on it through
# DataFrame.query ("@START_DATE <= date <= @END_DATE").
START_DATE = "2007-01-01"
END_DATE   = "2025-07-31"

def fetch_alpha_vig(symbol="VIG") -> pd.DataFrame:
    """Fetch the daily close for ``symbol`` (default ``"VIG"``) from Alpha Vantage.

    Requests the ``TIME_SERIES_DAILY`` function with ``outputsize="full"`` and
    keeps its ``"4. close"`` field. This is the close reported by that
    endpoint, not a dividend/split-adjusted series (Alpha Vantage serves
    adjusted prices from a separate function).

    Args:
        symbol: Ticker to request; also used as the name of the value column.

    Returns:
        DataFrame with columns ``date`` (datetime) and ``symbol`` (numeric,
        unparseable values coerced to NaN), restricted to
        ``START_DATE <= date <= END_DATE``. Rows are not re-sorted.

    Raises:
        requests.HTTPError: If the HTTP response status signals an error.
        RuntimeError: If the response contains no time-series entry.
    """
    url = "https://www.alphavantage.co/query"
    params = {
        "function": "TIME_SERIES_DAILY",
        "symbol": symbol,
        "outputsize": "full",
        "apikey": ALPHA_KEY,
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()

    series_keys = [k for k in js if "Time Series" in k]
    if not series_keys:
        # A payload without a time series (presumably an API error or quota
        # notice - confirm against the Alpha Vantage docs) used to surface as a
        # bare IndexError. Only key names are reported: the message text is
        # left out so nothing sensitive ends up in notebook output.
        raise RuntimeError(
            f"No time series returned for {symbol}; response keys: {sorted(js)}"
        )
    key = series_keys[0]

    # The payload maps date -> {field: value}; transpose so dates become rows.
    df = (pd.DataFrame(js[key]).T
            .reset_index()
            .rename(columns={"index": "date", "4. close": symbol}))
    df["date"] = pd.to_datetime(df["date"])
    df[symbol] = pd.to_numeric(df[symbol], errors="coerce")
    return df[["date", symbol]].query("@START_DATE <= date <= @END_DATE")

def fetch_alpha_indicator(function: str, rename: str, **kwargs) -> pd.DataFrame:
    """Fetch an Alpha Vantage indicator series and return ``date`` plus one value column.

    The sampling interval is whatever the request yields: pass e.g.
    ``interval="monthly"`` through ``kwargs`` to set it explicitly.

    Args:
        function: Value of the ``function`` query parameter (e.g. ``"CPI"``).
        rename: Name given to the ``value`` column in the result.
        **kwargs: Extra query parameters forwarded unchanged to the API.

    Returns:
        DataFrame with columns ``date`` (datetime) and ``rename`` (numeric,
        unparseable values coerced to NaN), restricted to
        ``START_DATE <= date <= END_DATE``.

    Raises:
        requests.HTTPError: If the HTTP response status signals an error.
        RuntimeError: If the response has no (or an empty) ``"data"`` entry.
    """
    url = "https://www.alphavantage.co/query"
    params = {"function": function, "apikey": ALPHA_KEY, **kwargs}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    payload = r.json()
    data = payload.get("data", [])
    if not data:
        # Name the keys that did come back so the cause can be diagnosed.
        # Values are omitted on purpose to keep notebook output free of
        # anything sensitive.
        raise RuntimeError(
            f"No data returned for {function}; response keys: {sorted(payload)}"
        )
    df = pd.DataFrame(data).rename(columns={"value": rename})
    df["date"] = pd.to_datetime(df["date"])
    df[rename] = pd.to_numeric(df[rename], errors="coerce")
    return df[["date", rename]].query("@START_DATE <= date <= @END_DATE")

def fetch_all_factors():
    """Fetch the VIG price series and the macro factor table.

    Returns:
        tuple: ``(vig_df, monthly)`` where ``vig_df`` is the frame returned by
        ``fetch_alpha_vig("VIG")`` and ``monthly`` holds ``date``, ``CPI``,
        ``UNEMP`` and ``SPREAD`` (10-year minus 2-year Treasury yield), limited
        to ``START_DATE``..``END_DATE`` and sorted by ``date``.
    """
    # Price series
    prices = fetch_alpha_vig("VIG")

    # Macro series, each as a (date, value) frame
    cpi = fetch_alpha_indicator("CPI", "CPI")
    unemployment = fetch_alpha_indicator("UNEMPLOYMENT", "UNEMP")
    ten_year = fetch_alpha_indicator("TREASURY_YIELD", "Y10", interval="monthly", maturity="10year")
    two_year = fetch_alpha_indicator("TREASURY_YIELD", "Y2", interval="monthly", maturity="2year")

    # Yield spread on the dates where both maturities are available
    yields = pd.merge(ten_year, two_year, on="date", how="inner")
    yields["SPREAD"] = yields["Y10"] - yields["Y2"]
    spread = yields[["date", "SPREAD"]]

    # Outer joins keep every date that appears in any of the macro series
    macro = pd.merge(cpi, unemployment, on="date", how="outer")
    macro = pd.merge(macro, spread, on="date", how="outer")

    # Clamp to the configured window and order chronologically
    macro = macro.query("@START_DATE <= date <= @END_DATE")
    macro = macro.sort_values("date")

    return prices, macro


In [5]:
# Fetch the VIG price frame and the macro factor frame in one call.
vig_df, monthly_df = fetch_all_factors()

# Preview the first rows of each frame under its label.
print("VIG sample (daily):", vig_df.head(), sep="\n")

print("Macro sample (monthly):", monthly_df.head(), sep="\n")

VIG sample (daily):
         date     VIG
20 2025-07-31  206.06
21 2025-07-30  207.67
22 2025-07-29  208.38
23 2025-07-28  208.67
24 2025-07-25  209.14
Macro sample (monthly):
        date      CPI  UNEMP  SPREAD
0 2007-01-01  202.416    4.6   -0.12
1 2007-02-01  203.499    4.5   -0.13
2 2007-03-01  205.352    4.4   -0.01
3 2007-04-01  206.686    4.5    0.02
4 2007-05-01  207.949    4.4   -0.02


In [6]:
# Collapse the daily VIG series to one row per month: each value is the median
# of that month's daily observations, labelled with the month-start date ("MS").
vig_monthly = (
    vig_df.set_index("date")["VIG"]
    .resample("MS")
    .median()
    .reset_index()
)

vig_monthly.head()

Unnamed: 0,date,VIG
0,2007-01-01,54.295
1,2007-02-01,55.1
2,2007-03-01,53.495
3,2007-04-01,55.25925
4,2007-05-01,56.77005


In [7]:
# Combine the raw inputs: keep every VIG month and attach the macro columns
# wherever the dates match, then order chronologically with a fresh index.
merged = pd.merge(vig_monthly, monthly_df, on="date", how="left")
merged = merged.sort_values("date", ignore_index=True)
merged

Unnamed: 0,date,VIG,CPI,UNEMP,SPREAD
0,2007-01-01,54.29500,202.416,4.6,-0.12
1,2007-02-01,55.10000,203.499,4.5,-0.13
2,2007-03-01,53.49500,205.352,4.4,-0.01
3,2007-04-01,55.25925,206.686,4.5,0.02
4,2007-05-01,56.77005,207.949,4.4,-0.02
...,...,...,...,...,...
218,2025-03-01,195.09000,319.799,4.2,0.31
219,2025-04-01,186.02000,320.795,4.2,0.50
220,2025-05-01,195.60000,321.465,4.2,0.50
221,2025-06-01,200.73500,322.561,4.1,0.49


In [8]:
# Persist the merged frame via save_csv (timestamped file under RAW). The
# returned path is bound to `_`, so the cell prints only the "Saved ..." line.
_ = save_csv(merged, prefix='raw', site='project', table='data')

Saved ../data/raw/raw_site-project_table-data_20250829-172058.csv


### 6. Data Preprocessing

In [9]:
import sys, os
sys.path.append(os.path.abspath(".."))
from src import cleaning

In [10]:
# Apply the project's cleaning helpers to the four data columns, rebinding `merged`.
# NOTE(review): `cleaning` comes from src/ and is not shown here. 'ffill'
# presumably selects forward-fill; the meaning of the trailing False and of
# threshold=0.5 should be confirmed in that module.
merged = cleaning.fill_missing(merged, ['VIG', 'CPI','UNEMP', 'SPREAD'],'ffill', False)
merged = cleaning.drop_missing(merged, threshold=0.5)
merged

Unnamed: 0,date,VIG,CPI,UNEMP,SPREAD
0,2007-01-01,54.29500,202.416,4.6,-0.12
1,2007-02-01,55.10000,203.499,4.5,-0.13
2,2007-03-01,53.49500,205.352,4.4,-0.01
3,2007-04-01,55.25925,206.686,4.5,0.02
4,2007-05-01,56.77005,207.949,4.4,-0.02
...,...,...,...,...,...
218,2025-03-01,195.09000,319.799,4.2,0.31
219,2025-04-01,186.02000,320.795,4.2,0.50
220,2025-05-01,195.60000,321.465,4.2,0.50
221,2025-06-01,200.73500,322.561,4.1,0.49


In [11]:
# Write the cleaned frame into PROC, the processed-data directory configured
# (DATA_DIR_PROCESSED, default ../data/processed) and created earlier in this
# notebook, instead of a hardcoded path that ignores that setting.
merged.to_csv(PROC / 'data_cleaned.csv', index=False)

### 7. Outlier Analysis: moved to `../sensitivity_outliers.ipynb`

### 8. EDA: moved to `../project_EDA.ipynb`