In [58]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis as maha

## Data Cleaning

- Load Data
- Calculate yield curve slope and Y/Y changes in relevant columns


#### Data Definitions
- TNX: US 10y Treasury
- US_Corp: ML US Corporate Bond Total Return Index
- LIBOR: 3m LIBOR Rate
- BAA: Moody's long-term corporate bond yields index
- UNRATE: US seasonally-adjusted unemployment rate
- SPY: S\&P 500 Index
- IRX: US 3m treasury rate
- RGDP: US seasonally-adjusted Real GDP


In [7]:
#Missing Commodity Index Data
data = pd.read_csv('data/data.csv', index_col=0)
data.index = pd.to_datetime(data.index)

In [8]:
#Linear Interpolation Forward fill - for GDP data which is quarterly
#Could potentially want to only use quarterly data - set lin_interp = False
lin_interp = False

if lin_interp:
    data['RGDP'] = data['RGDP'].interpolate()
else:
    data = data.dropna(subset = ['RGDP'])

In [9]:
#Yield Curve Slope: 10y yields - 3m yields
data['YC_Slope'] =  pd.eval('data.TNX - data.IRX')
#Credit Spread: long-term BAA (corp bonds) - 10y treasury rate
data['Cred_Spread'] = pd.eval('data.BAA - data.TNX')

In [10]:
#Fill in Y/Y changes
YY_cols = ['CPI', 'RGDP']
data[[x + '_Growth' for x in YY_cols]] = data[YY_cols]/data[YY_cols].shift(1) - 1

#Drop null rows
data = data.dropna()

#Subtract mean
# data = data - data.mean()

## Defining Scenarios

- Define a scenario with a boolean string. Make sure to use spaces between operators (for calculation of mahalanobis distances.
- Filtering on this scenario, we calculate the Mahalanobis distance
- We then convert scenario Mahalanobis distance into likelihood measure:
$$ e^{\frac{-d}{2}}$$
- Rescale probabilities to sum to 1

In [52]:
def get_scenario_vars(scenario):
    #Get the variables in the scenario. Sort alphabetically for consistent replication
    return sorted(list(set([v.split(' ')[0] for v in scenario.split(' & ')])))

def get_scenario_vector(scenario):
    #Get a vector from a scenario.
    vector = {}
    for v in scenario.split(' & '):
        x = v.split(' ')
        cn = x[0]
        val = x[2]
        vector[cn] = float(val)
    vector = pd.DataFrame(vector, index = [0])
    colnames = get_scenario_vars(scenario)
    return vector[colnames].values

In [64]:
#Define scenarios, use spaces between operators for ease of parsing.
scenarios = {'Weak': 'RGDP_Growth = 0.01 & UNRATE = 6', 
             'Normal': 'UNRATE = 3',
             'Strong': 'RGDP_Growth = 0.3'}

likelihoods = {}

l_sum = 0.0

for scenario_name, scenario in scenarios.items():
    #Get relevant variables for scenario defined above
    scenario_vars = get_scenario_vars(scenario)

    #Get the empirical mean & correlation matrix of scenario vars
    v = data[scenario_vars].mean().values
    scen_corr = data[scenario_vars].corr()

    #Encode the scenario as a vector
    u = get_scenario_vector(scenario)

    #Mahalanobis distance, converted to likelihood
    l = np.exp(-maha(u, v, scen_corr)/2)
    likelihoods[scenario_name] = l
    l_sum += l

probs = {sn: l/l_sum for sn, l in likelihoods.items()}
print(likelihoods)
print(probs)
# print(mahala)

{'Weak': 0.8996118267005236, 'Normal': 0.24809954216648772, 'Strong': 0.8634019768227041}
{'Weak': 0.4473203007819532, 'Normal': 0.12336427615988071, 'Strong': 0.42931542305816617}


In [None]:
# scenario = 'RGDP_Growth = 0.01 & UNRATE = 6'

In [255]:
#Define scenario, use spaces between operators for ease of parsing.
scenario = 'RGDP_Growth = 0.01 & UNRATE = 6'

#Get relevant variables from the scenario defined above.
scenario_vars = [v.split(' ')[0] for v in scenario.split(' & ')]

#Filter data on the above scenario - not sure if useful or not...
scen_data = data.query(scenario)[scenario_vars]

#Get the empirical mean & correlation matrix of scenario vars
v = data[scenario_vars].mean().values
scen_corr = data[scenario_vars].corr()

#Calculate mahalanobis distance, transform to likelihood measure
data['L_1'] = data[scenario_vars].apply(lambda x: np.exp(-maha(x ,v, scen_corr)/2), raw = True, axis = 1)

#Rescale for likelihood
data['Prob_1'] = data['L_1'].dropna()/data['L_1'].sum()
# data['Prob_1'] = data['Prob_1'].fillna(0)
data.Prob_1

Date
1993-01-01    0.007402
1993-04-01    0.008183
1993-07-01    0.009043
1993-10-01    0.009513
1994-01-01    0.010511
1994-04-01    0.011619
1994-07-01    0.013492
1994-10-01    0.015674
1995-01-01    0.014360
1995-04-01    0.015669
1995-07-01    0.015091
1995-10-01    0.013657
1996-01-01    0.014356
1996-04-01    0.014343
1996-07-01    0.013654
1996-10-01    0.011751
1997-01-01    0.012357
1997-04-01    0.011172
1997-07-01    0.010112
1997-10-01    0.009153
1998-01-01    0.008706
1998-04-01    0.007493
1998-07-01    0.008279
1998-10-01    0.008277
1999-01-01    0.007493
1999-04-01    0.007494
1999-07-01    0.007491
1999-10-01    0.006776
2000-01-01    0.006453
2000-04-01    0.005832
                ...   
2012-04-01    0.004721
2012-07-01    0.004720
2012-10-01    0.005764
2013-01-01    0.005219
2013-04-01    0.006371
2013-07-01    0.007406
2013-10-01    0.007785
2014-01-01    0.010500
2014-04-01    0.012841
2014-07-01    0.012840
2014-10-01    0.015094
2015-01-01    0.015092
2015-0

In [23]:
scenario_1 = 'RGDP_Growth = 0.01 & UNRATE = 6'
scenario_2 = 'RGDP_Growth '

# l_1 = np.exp(-maha(np.array([float(v.split(' ')[2]) for v in scenario_1.split(' & ')]), v, scen_corr))

In [25]:
get_scenario_vars(scenario_1)

['RGDP_Growth', 'UNRATE']

In [None]:
get_s