In [103]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis as maha

## Data Cleaning

- Load Data
- Calculate yield curve slope and Y/Y changes in relevant columns


#### Data Definitions
- TNX: US 10y Treasury
- US_Corp: ML US Corporate Bond Total Return Index
- LIBOR: 3m LIBOR Rate
- BAA: Moody's long-term corporate bond yields index
- UNRATE: US seasonally-adjusted unemployment rate
- SPY: S\&P 500 Index
- IRX: US 3m treasury rate
- RGDP: US seasonally-adjusted Real GDP


In [267]:
#Missing Commodity Index Data
data = pd.read_csv('data/data.csv', index_col=0)
data.index = pd.to_datetime(data.index)

In [268]:
#Linear Interpolation Forward fill - for GDP data which is quarterly
#Could potentially want to only use quarterly data - set lin_interp = False
lin_interp = True

if lin_interp:
    data['RGDP'] = data['RGDP'].interpolate()
else:
    data = data.dropna(subset = ['RGDP'])

In [269]:
#Yield Curve Slope: 10y yields - 3m yields
data['YC_Slope'] =  pd.eval('data.TNX - data.IRX')
#Credit Spread: long-term BAA (corp bonds) - 10y treasury rate
data['Cred_Spread'] = pd.eval('data.BAA - data.TNX')

In [270]:
#Fill in Y/Y changes
YY_cols = ['CPI', 'RGDP']
data[[x + '_Growth' for x in YY_cols]] = data[YY_cols]/data[YY_cols].shift(12 if lin_interp else 4) - 1

#MISSING COMMODITIES INDEX AND INTEREST RATES INDEX
assets = ['SPY', 'US_Corp', 'LIBOR']
data[[x + '_Return' for x in assets]] = data[assets]/data[assets].shift(1) - 1

#Drop null rows
data = data.dropna()

#Subtract mean
# data = data - data.mean()

In [271]:
data.median()

TNX                   4.119000
US_Corp            1541.775909
LIBOR                 2.397496
CPI                 205.900000
BAA                   6.580000
UNRATE                5.400000
SPY                 125.500000
IRX                   1.867000
RGDP              15189.222000
YC_Slope              1.744000
Cred_Spread           2.340000
CPI_Growth            0.021816
RGDP_Growth           0.025619
SPY_Return            0.012632
US_Corp_Return        0.005864
LIBOR_Return          0.000522
dtype: float64

In [272]:
data.std()

TNX                  1.650079
US_Corp            737.207299
LIBOR                2.203278
CPI                 31.988520
BAA                  1.401699
UNRATE               1.632186
SPY                 62.236253
IRX                  2.062834
RGDP              2536.957367
YC_Slope             1.077613
Cred_Spread          0.779342
CPI_Growth           0.005223
RGDP_Growth          0.016148
SPY_Return           0.041065
US_Corp_Return       0.012934
LIBOR_Return         0.085830
dtype: float64

In [316]:
#Quick function to get num_sd standard deviations away from the median.
#Up pos determines if up is good or bad. Idea being that good scenario is at index 1, bad scenario at index 0.
def get_range(data, variable, num_sd, up_pos = True):
    i = 1 if up_pos else -1
    
    v = data[variable]
    v = v[(v < v.quantile(.85) ) & (v > v.quantile(.15))]
    m = v.mean()
    s = v.std()
    
    return tuple((m - i * num_sd * s, m + i * num_sd * s))

get_range(data, 'UNRATE', 1, False)

(6.267446586719577, 4.695344110954851)

## Defining Scenarios

- Define a scenario with a boolean string. Make sure to use spaces between operators (for calculation of mahalanobis distances.
- Find empirical mean and covariance matrix of the factors in this scenario.
- Encode scenario as a vector, then find mahalanobis distance using scenario vector and empirical mean, covariance.
- We then convert scenario Mahalanobis distance into likelihood measure:
$$ e^{\frac{-d}{2}}$$
- Rescale probabilities to sum to 1

In [317]:
def get_scenario_vars(scenario):
    #Get the variables in the scenario. Sort alphabetically for consistent replication
    return sorted(list(set([v.split(' ')[0] for v in scenario.split(' & ')])))

def get_scenario_vector(scenario):
    #Get a vector from a scenario.
    vector = {}
    for v in scenario.split(' & '):
        x = v.split(' ')
        cn = x[0]
        val = x[2]
        vector[cn] = float(val)
    vector = pd.DataFrame(vector, index = [0])
    colnames = get_scenario_vars(scenario)
    return vector[colnames].values

In [321]:
#Build the scenarios: Weak is >= 1 sd below median, strong is >= 1 sd above median, normal is in range [m - sd, m + sd].

scen_names = ['Weak', 'Strong', 'Normal']
# relevant_vars = ['RGDP_Growth', 'UNRATE', 'CPI_Growth', 'YC_Slope', 'Cred_Spread', 'TNX']
relevant_vars = ['RGDP_Growth', 'UNRATE', 'CPI_Growth']
# relevant_vars = ['RGDP_Growth']

scenarios = {sn: '' for sn in scen_names}

up_pos = {v:True for v in relevant_vars}
up_pos['UNRATE'] = False
up_pos['YC_Slope'] = False
up_pos['Cred_Spread'] = False

first_run = True

for var in relevant_vars:
    if first_run:
        first_run = False
    else:
        for v in scen_names:
            scenarios[v] += ' & '
    
    low_bound, u_bound = get_range(data, var, 1, up_pos[var])
    
    l_sign = '<=' if up_pos[var] else '>='
    u_sign = '>=' if up_pos[var] else '<='
    
    scenarios['Weak'] += ('{} {} {}'.format(var, l_sign, low_bound))
    scenarios['Normal'] += ('{} {} {} & {} {} {}'.format(var, u_sign, low_bound, var, l_sign, u_bound))
    scenarios['Strong'] += ('{} {} {}'.format(var, u_sign, u_bound))
    
    print(var, low_bound, u_bound)
    
scenarios

RGDP_Growth 0.019330980451302004 0.03436341235909478
UNRATE 6.267446586719577 4.695344110954851
CPI_Growth 0.019079318912049274 0.024393502210745426


{'Weak': 'RGDP_Growth <= 0.019330980451302004 & UNRATE >= 6.267446586719577 & CPI_Growth <= 0.019079318912049274',
 'Strong': 'RGDP_Growth >= 0.03436341235909478 & UNRATE <= 4.695344110954851 & CPI_Growth >= 0.024393502210745426',
 'Normal': 'RGDP_Growth >= 0.019330980451302004 & RGDP_Growth <= 0.03436341235909478 & UNRATE <= 6.267446586719577 & UNRATE >= 4.695344110954851 & CPI_Growth >= 0.019079318912049274 & CPI_Growth <= 0.024393502210745426'}

In [338]:
likelihoods = {}
l_sum = 0.0

for scenario_name, scenario in scenarios.items():
    #Get relevant variables for scenario defined above
    scenario_vars = get_scenario_vars(scenario)

    #Get the empirical mean & covariance matrix of scenario vars
    v = data[scenario_vars].mean().values
    scen_cov = data[scenario_vars].corr()

    #Encode the scenario as a vector - take empirical averages after conditioning on scenario
    u = data.query(scenario)[scenario_vars].mean().values
    print(u)
    #u = get_scenario_vector(scenario)

    #Mahalanobis distance, converted to likelihood
    l = np.exp(-maha(u, v, scen_cov)/2)
    likelihoods[scenario_name] = l
    l_sum += l

probs = {sn: l/l_sum for sn, l in likelihoods.items()}
print(likelihoods)
print(probs)
# print(mahala)

[1.60599299e-02 8.65029921e-04 8.55517241e+00]
[0.025002   0.04399006 4.16666667]
[0.02132038 0.0264021  5.092     ]
{'Weak': 0.2456491167361094, 'Strong': 0.4487394697597759, 'Normal': 0.7159236143297024}
{'Weak': 0.1741806648147183, 'Strong': 0.3181844909921978, 'Normal': 0.5076348441930839}


In [347]:
#Get expected historical returns for each scenario.
#Likely want historical variances also.
returns = pd.DataFrame()
covariances = {}
returns_cols = [x for x in scen_data.columns if 'Return' in x]


for scenario_name, scenario in scenarios.items():
    scen_data = data.query(scenario)[returns_cols]
    print(scenario_name, scen_data.shape)
    m = scen_data.mean().rename('{}'.format(scenario_name))
    returns = returns.append(m)
    cov_mat = scen_data.cov()
    covariances[scenario_name] = cov_mat
    print(cov_mat)

    
# x
#Column: Scenario
#Row: Expected return
print("\n *********** Returns *********** \n", returns.T)

Weak (29, 3)
                SPY_Return  US_Corp_Return  LIBOR_Return
SPY_Return        0.001815       -0.000006     -0.000028
US_Corp_Return   -0.000006        0.000297     -0.001422
LIBOR_Return     -0.000028       -0.001422      0.010969
Strong (6, 3)
                SPY_Return  US_Corp_Return  LIBOR_Return
SPY_Return        0.007265        0.000209      0.000100
US_Corp_Return    0.000209        0.000058     -0.000055
LIBOR_Return      0.000100       -0.000055      0.000138
Normal (25, 3)
                SPY_Return  US_Corp_Return  LIBOR_Return
SPY_Return        0.001567       -0.000105     -0.000428
US_Corp_Return   -0.000105        0.000112     -0.000210
LIBOR_Return     -0.000428       -0.000210      0.003164

 *********** Returns *********** 
                     Weak    Strong    Normal
LIBOR_Return   -0.071986 -0.002049  0.019163
SPY_Return      0.009426  0.013629  0.006680
US_Corp_Return  0.010003  0.014642  0.002303


### Disregard below - not right.

In [255]:
#Define scenario, use spaces between operators for ease of parsing.
scenario = 'RGDP_Growth = 0.01 & UNRATE = 6'

#Get relevant variables from the scenario defined above.
scenario_vars = [v.split(' ')[0] for v in scenario.split(' & ')]

#Filter data on the above scenario - not sure if useful or not...
scen_data = data.query(scenario)[scenario_vars]

#Get the empirical mean & correlation matrix of scenario vars
v = data[scenario_vars].mean().values
scen_corr = data[scenario_vars].corr()

#Calculate mahalanobis distance, transform to likelihood measure
data['L_1'] = data[scenario_vars].apply(lambda x: np.exp(-maha(x ,v, scen_corr)/2), raw = True, axis = 1)

#Rescale for likelihood
data['Prob_1'] = data['L_1'].dropna()/data['L_1'].sum()
# data['Prob_1'] = data['Prob_1'].fillna(0)
data.Prob_1

Date
1993-01-01    0.007402
1993-04-01    0.008183
1993-07-01    0.009043
1993-10-01    0.009513
1994-01-01    0.010511
1994-04-01    0.011619
1994-07-01    0.013492
1994-10-01    0.015674
1995-01-01    0.014360
1995-04-01    0.015669
1995-07-01    0.015091
1995-10-01    0.013657
1996-01-01    0.014356
1996-04-01    0.014343
1996-07-01    0.013654
1996-10-01    0.011751
1997-01-01    0.012357
1997-04-01    0.011172
1997-07-01    0.010112
1997-10-01    0.009153
1998-01-01    0.008706
1998-04-01    0.007493
1998-07-01    0.008279
1998-10-01    0.008277
1999-01-01    0.007493
1999-04-01    0.007494
1999-07-01    0.007491
1999-10-01    0.006776
2000-01-01    0.006453
2000-04-01    0.005832
                ...   
2012-04-01    0.004721
2012-07-01    0.004720
2012-10-01    0.005764
2013-01-01    0.005219
2013-04-01    0.006371
2013-07-01    0.007406
2013-10-01    0.007785
2014-01-01    0.010500
2014-04-01    0.012841
2014-07-01    0.012840
2014-10-01    0.015094
2015-01-01    0.015092
2015-0

In [23]:
scenario_1 = 'RGDP_Growth = 0.01 & UNRATE = 6'
scenario_2 = 'RGDP_Growth '

# l_1 = np.exp(-maha(np.array([float(v.split(' ')[2]) for v in scenario_1.split(' & ')]), v, scen_corr))

In [25]:
get_scenario_vars(scenario_1)

['RGDP_Growth', 'UNRATE']

In [None]:
get_s