### Housekeeping (Importing libraries, loading datasets)

In [7]:
import pandas_datareader as pdr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.decomposition import PCA
from scipy.stats import qmc

In [13]:
# secured overnight financing rate, sourced from the st. louis fred
sofr = pd.read_csv('data/FRED_SOFR.csv', parse_dates=['DATE'], index_col='DATE')

In [14]:
sofr.head()

Unnamed: 0_level_0,SOFR
DATE,Unnamed: 1_level_1
2018-04-03,1.83
2018-04-04,1.74
2018-04-05,1.75
2018-04-06,1.75
2018-04-09,1.75


In [15]:
sofr.tail()

Unnamed: 0_level_0,SOFR
DATE,Unnamed: 1_level_1
2024-10-28,4.82
2024-10-29,4.82
2024-10-30,4.81
2024-10-31,4.9
2024-11-01,4.86


In [16]:
# Load Market Yield on U.S. Treasury Securities at X-Year Constant Maturity, Quoted on an Investment Basis
dgs1 = pd.read_csv('data/DGS1.csv', parse_dates=['DATE'], index_col='DATE')
dgs2 = pd.read_csv('data/DGS2.csv', parse_dates=['DATE'], index_col='DATE')
dgs5 = pd.read_csv('data/DGS5.csv', parse_dates=['DATE'], index_col='DATE')
dgs10 = pd.read_csv('data/DGS10.csv', parse_dates=['DATE'], index_col='DATE')
dgs30 = pd.read_csv('data/DGS30.csv', parse_dates=['DATE'], index_col='DATE')

# combine them all
yc = pd.concat([dgs1.rename(columns={'DGS1': '1Y'}),
                              dgs2.rename(columns={'DGS2': '2Y'}),
                              dgs5.rename(columns={'DGS5': '5Y'}),
                              dgs10.rename(columns={'DGS10': '10Y'}),
                              dgs30.rename(columns={'DGS30': '30Y'})], axis=1)

In [17]:
yc.head()

Unnamed: 0_level_0,1Y,2Y,5Y,10Y,30Y
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1962-01-02,3.22,,3.88,4.06,
1962-01-03,3.24,,3.87,4.03,
1962-01-04,3.24,,3.86,3.99,
1962-01-05,3.26,,3.89,4.02,
1962-01-08,3.31,,3.91,4.03,


In [18]:
yc.tail()

Unnamed: 0_level_0,1Y,2Y,5Y,10Y,30Y
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-10-25,4.29,4.11,4.07,4.25,4.51
2024-10-28,4.28,4.12,4.11,4.28,4.53
2024-10-29,4.29,4.11,4.11,4.28,4.52
2024-10-30,4.28,4.15,4.14,4.29,4.49
2024-10-31,4.27,4.16,4.15,4.28,4.47


### Exploratory Data Analysis

In [20]:
# check formissing values and prepare for imputation
print("\nMissing values in SOFR Data:", sofr.isna().sum())
print("Missing values in Yield Curve Data:", yc.isna().sum())


Missing values in SOFR Data: SOFR    0
dtype: int64
Missing values in Yield Curve Data: 1Y        0
2Y     3760
5Y        0
10Y       0
30Y    3945
dtype: int64


In [21]:
print("\nSOFR Summary Statistics:")
print(sofr.describe())


SOFR Summary Statistics:
        SOFR
count   1719
unique   188
top     0.05
freq     188


In [22]:
print("\nYield Curve Summary Statistics:")
print(yc.describe())


Yield Curve Summary Statistics:
           1Y     2Y     5Y    10Y    30Y
count   16393  12633  16393  16393  12448
unique   1896   1873   1468   1403   1323
top         .      .      .      .      .
freq      698    530    698    698    523
