In [1]:
import pandas as pd
import numpy as np

In [2]:
def cleanMccraken(m_df: pd.DataFrame) -> pd.DataFrame:
    """Tidy the raw McCracken Fed macro table into a float32 frame keyed by date.

    Steps, in order: drop the first row, parse ``sasdate`` into a tz-naive
    ``date`` column, re-stamp every row with the date of the row after it
    (the last row has no successor and is dropped), keep rows dated
    2016-07-01 or later, forward fill gaps, cast every data column to
    float32, order the data columns alphabetically behind ``date`` and rename
    them to ``<lowercase name>_fed``.

    Args:
        m_df: Raw frame with a ``sasdate`` column; every other column must be
            castable to float32. The caller's frame is not modified.

    Returns:
        A new frame sorted by ``date`` with a fresh integer index.

    Raises:
        AssertionError: If any value is still missing after the forward fill.
    """
    # The first row is discarded before any parsing (presumably a non-data
    # row in the raw file -- confirm against the source CSV).
    frame = m_df.iloc[1:].reset_index(drop=True)

    # Parse as UTC, then strip the timezone to obtain naive timestamps.
    stamps = pd.to_datetime(frame['sasdate'], utc=True).dt.tz_localize(None)
    frame = frame.drop(columns='sasdate')

    # Lag adjustment: each observation takes the date of the following row,
    # which leaves the final row without a date, so it is removed.
    frame['date'] = stamps.shift(-1)
    frame = frame.iloc[:-1]

    # Restrict to the period of interest via a label slice on the date index.
    frame = frame.set_index('date').loc['2016-07-01':].reset_index()

    # Forward fill missing observations in chronological order.
    frame = frame.sort_values(by='date').reset_index(drop=True).ffill()

    # Everything except the date is stored as float32.
    value_cols = [name for name in frame.columns if name != 'date']
    frame = frame.astype({name: 'float32' for name in value_cols})

    # Date first, then the data columns alphabetically (by original name).
    frame = frame[['date'] + sorted(value_cols)]

    # Nothing may be left missing at this point.
    assert frame.isnull().sum().sum() == 0

    # Tag the data columns with their source.
    frame = frame.rename(
        columns=lambda name: name if name == 'date' else name.lower() + '_fed'
    )

    return frame.sort_values(by='date').reset_index(drop=True)

In [3]:
def cleanFred(t_df: pd.DataFrame) -> pd.DataFrame:
    """Tidy the raw FRED macro table into a float32 frame keyed by date.

    Steps, in order: move the index into a ``DATE`` column and parse it into
    a tz-naive ``date`` column, re-stamp every row with the date of the row
    after it (the last row has no successor and is dropped), forward fill
    gaps, keep rows dated 2016-07-01 or later, cast every data column to
    float32, order the data columns alphabetically behind ``date`` and rename
    them to ``<lowercase name>_fred``.

    Args:
        t_df: Raw frame whose index becomes a column named ``DATE`` after
            ``reset_index``; every other column must be castable to float32.
            The caller's frame is not modified.

    Returns:
        A new frame sorted by ``date`` with a fresh integer index.

    Raises:
        AssertionError: If any value is still missing after the forward fill
            and the cut to the period of interest.
    """
    # Bring the date index into a regular column and parse it as UTC, then
    # strip the timezone to obtain naive timestamps.
    frame = t_df.reset_index()
    stamps = pd.to_datetime(frame['DATE'], utc=True).dt.tz_localize(None)
    frame = frame.drop(columns='DATE')

    # Lag adjustment: each observation takes the date of the following row,
    # which leaves the final row without a date, so it is removed.
    frame['date'] = stamps.shift(-1)
    frame = frame.iloc[:-1]

    # Forward fill BEFORE cutting, so the first rows of the window can be
    # filled from observations that precede it.
    frame = frame.sort_values(by='date').reset_index(drop=True).ffill()

    # Restrict to the period of interest via a label slice on the date index.
    frame = frame.set_index('date').loc['2016-07-01':].reset_index()

    # Everything except the date is stored as float32.
    value_cols = [name for name in frame.columns if name != 'date']
    frame = frame.astype({name: 'float32' for name in value_cols})

    # Date first, then the data columns alphabetically (by original name).
    frame = frame[['date'] + sorted(value_cols)]

    # Nothing may be left missing at this point.
    assert frame.isnull().sum().sum() == 0

    # Tag the data columns with their source.
    frame = frame.rename(
        columns=lambda name: name if name == 'date' else name.lower() + '_fred'
    )

    return frame.sort_values(by='date').reset_index(drop=True)

In [4]:
def cleanUncertaintyMacro(u_df: pd.DataFrame) -> pd.DataFrame:
    """ Simple cleaning of the uncertainty macro data.

    Parses ``date`` into tz-naive timestamps, re-stamps every row with the
    date of the row after it (dropping the last row, which has no successor),
    keeps rows dated 2016-07-01 through 2023-01-02 inclusive, forward fills
    gaps, casts every data column to float32, orders the data columns
    alphabetically behind ``date`` and renames them to
    ``<lowercase name>_tbill``.

    Args:
        u_df: Raw frame with a ``date`` column; every other column must be
            castable to float32. The caller's frame is left untouched.

    Returns:
        A new frame sorted by ``date`` with a fresh integer index.

    Raises:
        AssertionError: If any value is still missing after the forward fill.
    """
    # Work on a copy: the column assignments below would otherwise overwrite
    # the caller's 'date' column in place (parsed AND shifted), so cleaning
    # the same raw frame twice would shift the dates twice.
    u_df = u_df.copy()

    # clean the date column
    u_df['date'] = pd.to_datetime(u_df['date'], utc=True).dt.tz_localize(None)

    # Shift the obs forward by one row's date to account for the lag; the
    # last row is left without a date and is dropped.
    u_df['date'] = u_df.date.shift(-1)
    u_df = u_df[:-1]

    # Cut to time period of interest (label slice, both ends inclusive)
    u_df = u_df.set_index('date')
    u_df = u_df['2016-07-01':'2023-01-02']
    u_df = u_df.reset_index()

    # forward fill missing obs
    u_df = u_df.sort_values(by='date').reset_index(drop=True)
    u_df = u_df.ffill()

    # Convert all columns to float32
    cols = [col for col in u_df.columns if col != 'date']
    u_df = u_df.astype({col: 'float32' for col in cols})

    # Set column order
    cols.sort()
    u_df = u_df[['date']+cols]

    # Confirm no missing obs
    assert 0 == u_df.isnull().sum().sum()

    # Rename
    # NOTE(review): the suffix is '_tbill' although this function cleans the
    # uncertainty data; kept as-is because downstream code may depend on
    # these column names -- confirm before changing.
    u_df.columns = [col if col == 'date' else col.lower() + '_tbill' for col in u_df.columns]

    # Sort and reset index
    u_df = u_df.sort_values(by='date').reset_index(drop=True)

    return u_df

In [5]:
def mergeMacro(m_df: pd.DataFrame, t_df: pd.DataFrame, u_df: pd.DataFrame) -> pd.DataFrame:
    """ Form a single macro time series from the three cleaned sources.

    The frames are outer-merged on ``date``, forward filled in chronological
    order, and cut to 2016-07-01 through 2023-01-02 inclusive.

    Args:
        m_df: First frame to merge; must have a ``date`` column with unique
            values.
        t_df: Second frame to merge; same requirements.
        u_df: Third frame to merge; same requirements.

    Returns:
        The merged frame sorted by ``date`` with a fresh integer index.

    Raises:
        AssertionError: If the merged frame still has missing values inside
            the window after the forward fill (e.g. one source starts later
            than the first date kept).
        pandas.errors.MergeError: If ``date`` is duplicated in any input
            (enforced by ``validate='one_to_one'``).
    """
    # merge; the outer join keeps every date that appears in any source
    df = m_df.merge(t_df, on='date', how='outer', validate='one_to_one')
    df = df.merge(u_df, on='date', how='outer', validate='one_to_one')

    # forward fill missing obs; leading gaps have nothing to fill from
    df = df.sort_values(by='date').reset_index(drop=True)
    df = df.ffill()

    # cut time frame (label slice, both ends inclusive)
    df = df.set_index('date')
    df = df['2016-07-01':'2023-01-02']
    df = df.reset_index()

    # Confirm no missing obs in the MERGED result. Checking an input frame
    # here would let gaps introduced by the outer join pass unnoticed.
    assert 0 == df.isnull().sum().sum(), "merged macro frame has missing obs"

    # Sort and reset index
    df = df.sort_values(by='date').reset_index(drop=True)

    return df

In [6]:
if __name__ == "__main__":
    # Input and output locations (relative paths).
    T_IN_FP = '../data/raw/treasury_macro.pkl'
    U_IN_FP = '../data/raw/uncertainty_macro.pkl'
    M_IN_FP = '../data/raw/mccraken_macro.csv'
    OUT_FP = '../data/derived/macro.pkl'

    # Load the three raw sources.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    m_df, t_df, u_df = (
        pd.read_csv(M_IN_FP),
        pd.read_pickle(T_IN_FP),
        pd.read_pickle(U_IN_FP),
    )

    # Clean each source with its dedicated routine.
    m_df, t_df, u_df = (
        cleanMccraken(m_df),
        cleanFred(t_df),
        cleanUncertaintyMacro(u_df),
    )

    # Combine into one frame and persist it.
    df = mergeMacro(m_df, t_df, u_df)
    df.to_pickle(OUT_FP)
