In [376]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis as maha

## Data Cleaning

- Load Data
- Calculate yield curve slope and Y/Y changes in relevant columns


#### Data Definitions
- TNX: US 10y Treasury
- US_Corp: ML US Corporate Bond Total Return Index
- LIBOR: 3m LIBOR Rate
- BAA: Moody's long-term corporate bond yields index
- UNRATE: US seasonally-adjusted unemployment rate
- SPY: S\&P 500 Index
- IRX: US 3m treasury rate
- RGDP: US seasonally-adjusted Real GDP


In [399]:
#Missing Commodity Index Data
data = pd.read_csv('data/data.csv', index_col=0)
data.index = pd.to_datetime(data.index)

In [400]:
#Linear Interpolation Forward fill - for GDP data which is quarterly
#Could potentially want to only use quarterly data - set lin_interp = False
lin_interp = True

if lin_interp:
    data['RGDP'] = data['RGDP'].interpolate()
else:
    data = data.dropna(subset = ['RGDP'])

In [401]:
#Yield Curve Slope: 10y yields - 3m yields
data['YC_Slope'] =  pd.eval('data.TNX - data.IRX')
#Credit Spread: long-term BAA (corp bonds) - 10y treasury rate
data['Cred_Spread'] = pd.eval('data.BAA - data.TNX')

In [402]:
#Fill in Y/Y changes
YY_cols = ['CPI', 'RGDP']
data[[x + '_Growth' for x in YY_cols]] = data[YY_cols]/data[YY_cols].shift(12 if lin_interp else 4) - 1

#MISSING COMMODITIES INDEX AND INTEREST RATES INDEX
assets = ['SPY', 'US_Corp']
data[[x + '_Return' for x in assets]] = data[assets]/data[assets].shift(1) - 1

assets.append('TreasInd')
data['TreasInd_Return'] = data['TreasInd']
assets.append('Cash')
data['Cash_Return'] = data['Cash']

data = data.drop(columns = ['TreasInd' , 'Cash'])

#Drop null rows
data = data.dropna()

#Subtract mean
# data = data - data.mean()

In [403]:
data.median()

TNX                    3.158000
US_Corp             1969.855909
LIBOR                  1.262410
CPI                  221.194000
BAA                    6.020000
UNRATE                 5.500000
SPY                  133.820007
IRX                    0.932000
RGDP               15725.333000
YC_Slope               2.041000
Cred_Spread            2.615000
CPI_Growth             0.020346
RGDP_Growth            0.021447
SPY_Return             0.009834
US_Corp_Return         0.005527
TreasInd_Return        0.000048
Cash_Return            0.000043
dtype: float64

In [404]:
data.head()

Unnamed: 0_level_0,TNX,US_Corp,LIBOR,CPI,BAA,UNRATE,SPY,IRX,RGDP,YC_Slope,Cred_Spread,CPI_Growth,RGDP_Growth,SPY_Return,US_Corp_Return,TreasInd_Return,Cash_Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2001-02-01,4.908,1118.2795,5.348156,184.4,7.87,4.2,123.949997,4.72,13248.454667,0.188,2.962,0.027871,0.018872,-0.095388,0.014506,0.003573,0.000219
2001-03-01,4.915,1135.408261,4.963977,184.7,7.84,4.3,116.690002,4.18,13274.219333,0.735,2.925,0.026111,0.014697,-0.058572,0.015317,0.001863,4.4e-05
2001-04-01,5.338,1131.094,4.613882,185.1,8.07,4.4,126.660004,3.83,13299.984,1.508,2.732,0.026622,0.010572,0.08544,-0.0038,0.002098,0.000124
2001-05-01,5.413,1137.175217,4.103661,185.3,8.07,4.3,125.949997,3.54,13281.584,1.873,2.657,0.025457,0.008725,-0.005606,0.005376,0.002226,0.000106
2001-06-01,5.39,1153.920455,3.834078,186.0,7.97,4.5,122.599998,3.56,13263.184,1.83,2.58,0.027057,0.00688,-0.026598,0.014725,0.001951,0.000228


In [405]:
#Quick function to get num_sd standard deviations away from the median.
#Up pos determines if up is good or bad. Idea being that good scenario is at index 1, bad scenario at index 0.
def get_range(data, variable, num_sd, up_pos = True):
    i = 1 if up_pos else -1
    
    v = data[variable]
    v = v[(v < v.quantile(.85) ) & (v > v.quantile(.15))]
    m = v.mean()
    s = v.std()
    
    return tuple((m - i * num_sd * s, m + i * num_sd * s))

get_range(data, 'UNRATE', 1, False)

(6.8475652661926105, 4.7206466543372)

## Defining Scenarios

- Define a scenario with a boolean string. Make sure to use spaces between operators (for calculation of mahalanobis distances.
- Find empirical mean and covariance matrix of the factors in this scenario.
- Encode scenario as a vector, then find mahalanobis distance using scenario vector and empirical mean, covariance.
- We then convert scenario Mahalanobis distance into likelihood measure:
$$ e^{\frac{-d}{2}}$$
- Rescale probabilities to sum to 1

In [406]:
def get_scenario_vars(scenario):
    #Get the variables in the scenario. Sort alphabetically for consistent replication
    return sorted(list(set([v.split(' ')[0] for v in scenario.split(' & ')])))

def get_scenario_vector(scenario):
    #Get a vector from a scenario.
    vector = {}
    for v in scenario.split(' & '):
        x = v.split(' ')
        cn = x[0]
        val = x[2]
        vector[cn] = float(val)
    vector = pd.DataFrame(vector, index = [0])
    colnames = get_scenario_vars(scenario)
    return vector[colnames].values

In [407]:
#Build the scenarios: Weak is >= 1 sd below median, strong is >= 1 sd above median, normal is in range [m - sd, m + sd].

scen_names = ['Weak', 'Strong', 'Normal']
relevant_vars = ['RGDP_Growth', 'UNRATE', 'CPI_Growth', 'YC_Slope', 'Cred_Spread', 'TNX']
# relevant_vars = ['RGDP_Growth', 'UNRATE', 'CPI_Growth']
# relevant_vars = ['RGDP_Growth']

scenarios = {sn: '' for sn in scen_names}
rgdp_scen = {sn: '' for sn in scen_names}

up_pos = {v:True for v in relevant_vars}
up_pos['UNRATE'] = False
up_pos['YC_Slope'] = False
up_pos['Cred_Spread'] = False

first_run = True

for var in relevant_vars:
    if first_run:
        first_run = False
    else:
        for v in scen_names:
            scenarios[v] += ' & '
    
    low_bound, u_bound = get_range(data, var, 1, up_pos[var])
    
    l_sign = '<=' if up_pos[var] else '>='
    u_sign = '>=' if up_pos[var] else '<='
    
    scenarios['Weak'] += ('{} {} {}'.format(var, l_sign, low_bound))
    scenarios['Normal'] += ('{} {} {} & {} {} {}'.format(var, u_sign, low_bound, var, l_sign, u_bound))
    scenarios['Strong'] += ('{} {} {}'.format(var, u_sign, u_bound))
    
    if var == 'RGDP_Growth':
        rgdp_scen['Weak'] += ('{} {} {}'.format(var, l_sign, low_bound))
        rgdp_scen['Normal'] += ('{} {} {} & {} {} {}'.format(var, u_sign, low_bound, var, l_sign, u_bound))
        rgdp_scen['Strong'] += ('{} {} {}'.format(var, u_sign, u_bound))
    
    print(var, low_bound, u_bound)
    
scenarios

RGDP_Growth 0.016503513200072085 0.02755979252992184
UNRATE 6.8475652661926105 4.7206466543372
CPI_Growth 0.01786440182195505 0.022288559408427003
YC_Slope 2.6131647971651435 1.308007177357148
Cred_Spread 2.999529962512296 2.213743922838018
TNX 2.438264827611647 4.032308420795997


{'Weak': 'RGDP_Growth <= 0.016503513200072085 & UNRATE >= 6.8475652661926105 & CPI_Growth <= 0.01786440182195505 & YC_Slope >= 2.6131647971651435 & Cred_Spread >= 2.999529962512296 & TNX <= 2.438264827611647',
 'Strong': 'RGDP_Growth >= 0.02755979252992184 & UNRATE <= 4.7206466543372 & CPI_Growth >= 0.022288559408427003 & YC_Slope <= 1.308007177357148 & Cred_Spread <= 2.213743922838018 & TNX >= 4.032308420795997',
 'Normal': 'RGDP_Growth >= 0.016503513200072085 & RGDP_Growth <= 0.02755979252992184 & UNRATE <= 6.8475652661926105 & UNRATE >= 4.7206466543372 & CPI_Growth >= 0.01786440182195505 & CPI_Growth <= 0.022288559408427003 & YC_Slope <= 2.6131647971651435 & YC_Slope >= 1.308007177357148 & Cred_Spread <= 2.999529962512296 & Cred_Spread >= 2.213743922838018 & TNX >= 2.438264827611647 & TNX <= 4.032308420795997'}

In [408]:
for theta in [0, 0.5, 1]:

    likelihoods = {}
    l_sum = 0.0

    for scenario_name, scenario in scenarios.items():
        #Get relevant variables for scenario defined above
        scenario_vars = get_scenario_vars(scenario)

        #Get the empirical mean & covariance matrix of scenario vars
        v = (1-theta) * data[scenario_vars].mean().values + theta*data[scenario_vars].tail(1).values

        scen_cov = data[scenario_vars].corr()

        #Encode the scenario as a vector - take empirical averages after conditioning on scenario
        u = data.query(rgdp_scen[scenario_name])[scenario_vars].mean().values
    #     print(u)
        #u = get_scenario_vector(scenario)

        #Mahalanobis distance, converted to likelihood
        l = np.exp(-maha(u, v, scen_cov)/2)
        likelihoods[scenario_name] = l
        l_sum += l

    print('*********** Using Theta = {} ********************'.format(theta))
    probs = {sn: l/l_sum for sn, l in likelihoods.items()}
    print('Likelihoods:\t', likelihoods)
    print('Probs: \t\t', probs)
# print(mahala)

*********** Using Theta = 0 ********************
Likelihoods:	 {'Weak': 0.4972047808129311, 'Strong': 0.6478172894675673, 'Normal': 0.7726989488988671}
Probs: 		 {'Weak': 0.2592685671379332, 'Strong': 0.33780580334087507, 'Normal': 0.40292562952119165}
*********** Using Theta = 0.5 ********************
Likelihoods:	 {'Weak': 0.18732420883673773, 'Strong': 0.42176984617775043, 'Normal': 0.4610411794786621}
Probs: 		 {'Weak': 0.1750472302927773, 'Strong': 0.39412761357915094, 'Normal': 0.4308251561280717}
*********** Using Theta = 1 ********************
Likelihoods:	 {'Weak': 0.06811843607922399, 'Strong': 0.1583276238637126, 'Normal': 0.1658795549119014}
Probs: 		 {'Weak': 0.1736272970716635, 'Strong': 0.40356178100248297, 'Normal': 0.42281092192585357}


In [410]:
#Get expected historical returns for each scenario.
#Likely want historical variances also.
returns = pd.DataFrame()
covariances = {}
returns_cols = [x for x in data.columns if 'Return' in x]


for scenario_name, scenario in scenarios.items():
    scen_data = data.query(rgdp_scen[scenario_name])[returns_cols]
    print(scenario_name, scen_data.shape)
    m = scen_data.mean().rename('{}'.format(scenario_name))
    
    m.to_csv('{}_returns'.format(m))
    
    returns = returns.append(m)
    cov_mat = scen_data.cov()
    covariances[scenario_name] = cov_mat
    print(cov_mat)

    
# x
#Column: Scenario
#Row: Expected return
print("\n *********** Returns *********** \n", returns.T)

Weak (64, 4)
                   SPY_Return  US_Corp_Return  TreasInd_Return   Cash_Return
SPY_Return       2.785571e-03    3.204680e-04    -7.261031e-06 -5.917289e-07
US_Corp_Return   3.204680e-04    3.455868e-04    -5.248170e-07  4.202677e-07
TreasInd_Return -7.261031e-06   -5.248170e-07     5.933857e-06  7.523053e-09
Cash_Return     -5.917289e-07    4.202677e-07     7.523053e-09  1.023196e-08
Strong (66, 4)
                   SPY_Return  US_Corp_Return  TreasInd_Return   Cash_Return
SPY_Return       8.942520e-04    1.187977e-06    -1.028115e-05 -1.942357e-07
US_Corp_Return   1.187977e-06    1.161285e-04     8.698091e-06 -3.433970e-08
TreasInd_Return -1.028115e-05    8.698091e-06     4.543612e-06 -3.736356e-08
Cash_Return     -1.942357e-07   -3.433970e-08    -3.736356e-08  4.903151e-09
Normal (95, 4)
                   SPY_Return  US_Corp_Return  TreasInd_Return   Cash_Return
SPY_Return       1.580289e-03   -5.124242e-06    -1.400020e-05 -1.543577e-07
US_Corp_Return  -5.124242e-06    

### Disregard below - not right.

In [255]:
#Define scenario, use spaces between operators for ease of parsing.
scenario = 'RGDP_Growth = 0.01 & UNRATE = 6'

#Get relevant variables from the scenario defined above.
scenario_vars = [v.split(' ')[0] for v in scenario.split(' & ')]

#Filter data on the above scenario - not sure if useful or not...
scen_data = data.query(scenario)[scenario_vars]

#Get the empirical mean & correlation matrix of scenario vars
v = data[scenario_vars].mean().values
scen_corr = data[scenario_vars].corr()

#Calculate mahalanobis distance, transform to likelihood measure
data['L_1'] = data[scenario_vars].apply(lambda x: np.exp(-maha(x ,v, scen_corr)/2), raw = True, axis = 1)

#Rescale for likelihood
data['Prob_1'] = data['L_1'].dropna()/data['L_1'].sum()
# data['Prob_1'] = data['Prob_1'].fillna(0)
data.Prob_1

Date
1993-01-01    0.007402
1993-04-01    0.008183
1993-07-01    0.009043
1993-10-01    0.009513
1994-01-01    0.010511
1994-04-01    0.011619
1994-07-01    0.013492
1994-10-01    0.015674
1995-01-01    0.014360
1995-04-01    0.015669
1995-07-01    0.015091
1995-10-01    0.013657
1996-01-01    0.014356
1996-04-01    0.014343
1996-07-01    0.013654
1996-10-01    0.011751
1997-01-01    0.012357
1997-04-01    0.011172
1997-07-01    0.010112
1997-10-01    0.009153
1998-01-01    0.008706
1998-04-01    0.007493
1998-07-01    0.008279
1998-10-01    0.008277
1999-01-01    0.007493
1999-04-01    0.007494
1999-07-01    0.007491
1999-10-01    0.006776
2000-01-01    0.006453
2000-04-01    0.005832
                ...   
2012-04-01    0.004721
2012-07-01    0.004720
2012-10-01    0.005764
2013-01-01    0.005219
2013-04-01    0.006371
2013-07-01    0.007406
2013-10-01    0.007785
2014-01-01    0.010500
2014-04-01    0.012841
2014-07-01    0.012840
2014-10-01    0.015094
2015-01-01    0.015092
2015-0

In [23]:
scenario_1 = 'RGDP_Growth = 0.01 & UNRATE = 6'
scenario_2 = 'RGDP_Growth '

# l_1 = np.exp(-maha(np.array([float(v.split(' ')[2]) for v in scenario_1.split(' & ')]), v, scen_corr))

In [25]:
get_scenario_vars(scenario_1)

['RGDP_Growth', 'UNRATE']

In [None]:
get_s