In [339]:
import os
from ast import literal_eval

import numpy as np
import pandas as pd

from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [340]:
# Participant-key workbook. Each data sheet's name is stored below as MTGDATE.
excel_file = '../working-csvs/20240114 Participants Key.xlsx'
all_sheets = pd.read_excel(excel_file, sheet_name=None)

# The first sheet is skipped; every remaining sheet is read as data.
data_sheet_names = list(all_sheets)[1:]

dfs = []

for sheet_name in data_sheet_names:
    # Re-read with skiprows=1 so the header row is the sheet's second row.
    sheet = pd.read_excel(excel_file, sheet_name=sheet_name, skiprows=1)
    for col in ('Name', 'Number', 'Affiliation'):
        # Some sheets spell the header with an embedded line break
        # ('Participant\nName'); copy those into the single-line spelling.
        alt_name = f'Participant\n{col}'
        if alt_name not in sheet.columns:
            continue
        sheet[f'Participant {col}'] = sheet[alt_name]
        sheet = sheet.drop(columns=alt_name)
    sheet['MTGDATE'] = sheet_name
    dfs.append(sheet)

# Row indices are kept as-is, so they repeat across sheets.
pkey = pd.concat(dfs)

In [341]:
# Preview the combined participant key (one row per participant per sheet).
pkey

Unnamed: 0,Participant Number,Participant Name,Participant Affiliation,Vote,Note,Chair,MTGDATE
0,1,Frederic Mishkin,Board,1.0,,0.0,20071031
1,2,Gary Stern,Minneapolis,0.0,,0.0,20071031
2,3,Timothy Geithner,New York,1.0,,0.0,20071031
3,4,William Poole,St. Louis,1.0,,0.0,20071031
4,5,Randall Kroszner,Board,1.0,,0.0,20071031
...,...,...,...,...,...,...,...
12,13,Eric Rosengren,Boston,0.0,,0.0,20181219
13,14,Loretta Mester,Cleveland,1.0,,0.0,20181219
14,15,Patrick Harker,Philadelphia,0.0,,0.0,20181219
15,16,Charles Evans,Chicago,0.0,,0.0,20181219


In [342]:
# Load the projections workbook (read_excel reads the first sheet by default).
proj = pd.read_excel('../working-csvs/20240114 FomcProjections.xlsx')

In [343]:
# Check column dtypes; MTGDATE and ID are the join keys used against pkey.
proj.dtypes

MTGDATE      int64
ID           int64
TARGET      object
HORIZON      int64
GDP        float64
UN         float64
PCE        float64
COREPCE    float64
FFD        float64
dtype: object

In [344]:
# MTGDATE was filled from the sheet names; cast it to int so it compares equal
# to proj's int64 MTGDATE in the merge.
pkey['MTGDATE'] = pkey['MTGDATE'].astype(int)

In [345]:
# Confirm the cast: MTGDATE should now be an integer column.
pkey.dtypes

Participant Number           int64
Participant Name            object
Participant Affiliation     object
Vote                       float64
Note                        object
Chair                      float64
MTGDATE                      int64
dtype: object

In [346]:
# Attach participant metadata to every projection row.
# merge() defaults to how='inner', so projection rows without a matching
# (MTGDATE, Participant Number) in pkey are dropped here.
proj = proj.merge(pkey, left_on=['MTGDATE', 'ID'], right_on=['MTGDATE', 'Participant Number'])

In [347]:
# Inspect the projections with participant columns attached.
proj

Unnamed: 0,MTGDATE,ID,TARGET,HORIZON,GDP,UN,PCE,COREPCE,FFD,Participant Number,Participant Name,Participant Affiliation,Vote,Note,Chair
0,20071031,1,2007,1,2.4,4.7,3.0,1.8,,1,Frederic Mishkin,Board,1.0,,0.0
1,20071031,1,2008,5,1.7,4.8,1.8,1.9,,1,Frederic Mishkin,Board,1.0,,0.0
2,20071031,1,2009,9,2.2,4.8,1.9,2.0,,1,Frederic Mishkin,Board,1.0,,0.0
3,20071031,1,2010,13,2.2,4.8,2.0,2.0,,1,Frederic Mishkin,Board,1.0,,0.0
4,20071031,2,2007,1,2.5,4.7,3.0,1.9,,2,Gary Stern,Minneapolis,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2747,20181219,17,2018,1,3.1,3.7,1.8,1.9,2.38,17,Raphael Bostic,Atlanta,1.0,,0.0
2748,20181219,17,2019,5,2.3,3.5,2.0,2.1,2.63,17,Raphael Bostic,Atlanta,1.0,,0.0
2749,20181219,17,2020,9,1.8,3.7,2.0,2.0,2.88,17,Raphael Bostic,Atlanta,1.0,,0.0
2750,20181219,17,2021,13,1.8,3.9,2.0,2.0,2.88,17,Raphael Bostic,Atlanta,1.0,,0.0


In [348]:
# MTGDATE is an integer of the form YYYYMMDD; peel off the pieces arithmetically.
proj['year'] = (proj['MTGDATE'] // 10000).astype(int)   # YYYY
proj['md'] = proj['MTGDATE'] % 10000                     # MMDD
proj['m'] = (proj['md'] // 100).astype(int)              # MM

In [349]:
# Join key: last whitespace-separated token of the participant's name,
# lower-cased. It is matched against mfdff['lname'] in the merge further down.
proj['speaker'] = proj['Participant Name'].map(lambda x : x.split()[-1].lower())

In [350]:
# Load the speaker-level frame from pickle; the CSV variant is kept commented
# out for reference.
# NOTE: read_pickle can execute arbitrary code -- only load trusted files.
# mfdff = pd.read_csv('../working-csvs/mfdff.csv', index_col=0)
mfdff = pd.read_pickle('../working-csvs/mfdff.pkl')

In [351]:
# Keep only rows with section == 2 (the meaning of the section codes is not
# shown in this notebook -- TODO document). .copy() yields an independent frame
# so the column assignments below do not write into a slice of the original.
mfdff = mfdff[mfdff['section'] == 2].copy()

In [352]:
# Make sure `date` is a datetime column so the .dt accessors below work.
mfdff['date'] = pd.to_datetime(mfdff['date'])

In [353]:
# Calendar year and month of each record; together with `lname` these form the
# key that is matched against proj's (year, m, speaker).
mfdff['year'] = mfdff['date'].dt.year
mfdff['m'] = mfdff['date'].dt.month

In [302]:
# mfdff['norm_svect'] = mfdff['norm_svect'].map(lambda x : [float(item[:-1]) for item in x[1:-1].split()])

In [303]:
# mfdff['diff_exp_norm'] = mfdff['diff_exp_norm'].map(lambda x : [float(item[:-1]) for item in x[1:-1].split()])
# mfdff['norm_svect_diff_avg_prior1'] = mfdff['norm_svect_diff_avg_prior'].map(lambda x : [float(item[:-1]) for item in x[1:-1].split()])

In [354]:
# Spread each list-valued vector column into 45 scalar columns:
#   norm_svect -> tprob_0..tprob_44, svect -> svect_tprob_0..svect_tprob_44.
for prefix, source in (('tprob', 'norm_svect'), ('svect_tprob', 'svect')):
    mfdff[[f'{prefix}_{i}' for i in range(45)]] = mfdff[source].to_list()

In [355]:
# Match every projection row to the speaker's record for the same year and
# month. how='left' keeps all projection rows; indicator=True adds a `_merge`
# column so rows without a match can be audited.
df = proj.merge(mfdff, left_on=['year', 'm', 'speaker'], right_on=['year', 'm', 'lname'], how='left', indicator=True)

In [356]:
# Count matched ('both') versus unmatched ('left_only') projection rows.
df['_merge'].value_counts()

_merge
both          2572
left_only      180
right_only       0
Name: count, dtype: int64

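Check the unmatched rows: projection rows with no corresponding speaker record in `mfdff` (`_merge == 'left_only'`).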

In [357]:
# Speakers of the unmatched projection rows from meetings before 2018.
df[(df['_merge'] == 'left_only') & (df['year'] < 2018)].speaker

221     pianalto
222     pianalto
223     pianalto
224     pianalto
322         duke
          ...   
2387    mullinix
2388    mullinix
2389    mullinix
2390    mullinix
2391    mullinix
Name: speaker, Length: 160, dtype: object

In [358]:
# Distribution of raw horizon codes. 99 is the code horiz_mapper labels 'LR'.
df['HORIZON'].value_counts()

HORIZON
99    548
1     191
5     191
9     191
13    191
4     153
8     153
12    153
2     153
6     153
10    153
3     152
7     152
11    152
14     66
Name: count, dtype: int64

In [359]:
def horiz_mapper(horiz):
    """Collapse a numeric horizon code into a coarse bucket label.

    Buckets (inclusive upper bounds):
        <= 3 -> '1Q', <= 6 -> '2Q', <= 9 -> '3Q', <= 12 -> '4Q', <= 14 -> '5Q',
        >= 99 -> 'LR'.

    Values that fall in none of these ranges (15..98, or NaN) return None.
    """
    # Ordered ascending, so the first bound that admits `horiz` decides the label.
    upper_bounds = ((3, '1Q'), (6, '2Q'), (9, '3Q'), (12, '4Q'), (14, '5Q'))
    for bound, label in upper_bounds:
        if horiz <= bound:
            return label
    return 'LR' if horiz >= 99 else None


In [360]:
# Bucket each raw horizon code into '1Q'..'5Q' / 'LR' (None if outside all buckets).
df['HORIZON_condensed'] = df['HORIZON'].map(horiz_mapper)

In [361]:
# Inspect the merged frame with the new HORIZON_condensed column.
df

Unnamed: 0,MTGDATE,ID,TARGET,HORIZON,GDP,UN,PCE,COREPCE,FFD,Participant Number,...,svect_tprob_37,svect_tprob_38,svect_tprob_39,svect_tprob_40,svect_tprob_41,svect_tprob_42,svect_tprob_43,svect_tprob_44,_merge,HORIZON_condensed
0,20071031,1,2007,1,2.4,4.7,3.0,1.8,,1,...,0.0,0.000000,-1.519062,0.0,0.007806,0.0,0.000000,-0.007806,both,1Q
1,20071031,1,2008,5,1.7,4.8,1.8,1.9,,1,...,0.0,0.000000,-1.519062,0.0,0.007806,0.0,0.000000,-0.007806,both,2Q
2,20071031,1,2009,9,2.2,4.8,1.9,2.0,,1,...,0.0,0.000000,-1.519062,0.0,0.007806,0.0,0.000000,-0.007806,both,3Q
3,20071031,1,2010,13,2.2,4.8,2.0,2.0,,1,...,0.0,0.000000,-1.519062,0.0,0.007806,0.0,0.000000,-0.007806,both,5Q
4,20071031,2,2007,1,2.5,4.7,3.0,1.9,,2,...,0.0,0.000000,0.000000,0.0,0.000000,0.0,-0.590667,0.000000,both,1Q
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2747,20181219,17,2018,1,3.1,3.7,1.8,1.9,2.38,17,...,0.0,0.479137,0.000000,0.0,0.000000,0.0,0.000000,0.954020,both,1Q
2748,20181219,17,2019,5,2.3,3.5,2.0,2.1,2.63,17,...,0.0,0.479137,0.000000,0.0,0.000000,0.0,0.000000,0.954020,both,2Q
2749,20181219,17,2020,9,1.8,3.7,2.0,2.0,2.88,17,...,0.0,0.479137,0.000000,0.0,0.000000,0.0,0.000000,0.954020,both,3Q
2750,20181219,17,2021,13,1.8,3.9,2.0,2.0,2.88,17,...,0.0,0.479137,0.000000,0.0,0.000000,0.0,0.000000,0.954020,both,5Q


In [362]:
# Rows in the shortest horizon bucket (HORIZON <= 3 per horiz_mapper).
fore1Q = df[df['HORIZON_condensed'] == '1Q']

In [363]:
# Names of the 45 expanded columns: tprob_* came from norm_svect, svect_tprob_*
# from svect.
tprob_cols = [f'tprob_{i}' for i in range(45)]
svect_tprob_cols = [f'svect_tprob_{i}' for i in range(45)]

In [364]:
# 'tprob_0 + tprob_1 + ...' string, presumably the right-hand side for a
# formula-API model. NOTE(review): not referenced by any later cell shown here.
tprob_part = ' + '.join(tprob_cols)

In [365]:
# Estimation sample: drop rows missing any tprob_* value or any of the three
# projection targets. Rows unmatched in the merge have NaN tprob_* and fall out here.
panel = fore1Q.dropna(subset=tprob_cols + ['GDP', 'PCE', 'UN'])

In [366]:
# Inspect the tprob_* columns of the estimation sample.
panel[tprob_cols]

Unnamed: 0,tprob_0,tprob_1,tprob_2,tprob_3,tprob_4,tprob_5,tprob_6,tprob_7,tprob_8,tprob_9,...,tprob_35,tprob_36,tprob_37,tprob_38,tprob_39,tprob_40,tprob_41,tprob_42,tprob_43,tprob_44
0,1.605910,0.026694,0.018633,0.0,0.000000,0.25,3.087431,0.347848,0.479184,0.000000,...,-2.842843,0.250000,0.25,0.066608,-1.179394,0.0,0.341441,0.0,0.036598,0.320277
4,-0.344124,0.026694,0.018633,0.0,0.000000,0.25,-0.122998,-3.495635,0.479184,0.000000,...,-0.267616,0.250000,0.25,0.066608,0.597211,0.0,0.243834,0.0,-1.150294,0.337896
8,-0.344124,0.026694,-0.816672,0.0,0.000000,0.25,-0.122998,0.347848,0.479184,0.000000,...,-0.720643,0.250000,0.25,-1.480633,-0.685982,0.0,0.243834,0.0,0.036598,2.594874
12,-0.344124,0.026694,-1.746232,0.0,0.000000,0.25,-1.208972,0.347848,0.090989,0.000000,...,0.247095,0.250000,0.25,0.066608,0.253032,0.0,0.243834,0.0,-0.604500,-0.811436
16,-0.344124,2.707474,0.018633,0.0,0.000000,0.25,-0.122998,0.347848,0.170581,0.000000,...,-0.283126,0.250000,-4.00,0.066608,1.226576,0.0,-3.998949,0.0,0.036598,0.337896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2727,0.045921,0.000000,-0.462789,0.0,-0.153957,0.00,0.322942,0.142239,-0.857850,-0.267261,...,-1.006333,0.086662,0.00,-1.363386,-0.581952,0.0,-0.267261,0.0,-0.805703,-0.510234
2732,0.045921,0.000000,-0.462789,0.0,-0.153957,0.00,0.322942,-3.476746,0.544832,-0.267261,...,-0.268797,-1.267793,0.00,-2.844671,-0.619099,0.0,-0.267261,0.0,-2.225310,0.421751
2737,0.045921,0.000000,-0.462789,0.0,-0.153957,0.00,-0.477940,0.142239,-2.056210,-0.267261,...,-0.268797,0.086662,0.00,1.317250,-0.036089,0.0,-0.267261,0.0,0.114897,-1.340421
2742,0.045921,0.000000,-0.462789,0.0,-0.153957,0.00,-0.025140,0.142239,0.544832,-0.267261,...,-0.268797,0.086662,0.00,-0.553019,-1.106685,0.0,-0.267261,0.0,0.114897,0.446309


In [367]:
# Inspect the full estimation sample.
panel

Unnamed: 0,MTGDATE,ID,TARGET,HORIZON,GDP,UN,PCE,COREPCE,FFD,Participant Number,...,svect_tprob_37,svect_tprob_38,svect_tprob_39,svect_tprob_40,svect_tprob_41,svect_tprob_42,svect_tprob_43,svect_tprob_44,_merge,HORIZON_condensed
0,20071031,1,2007,1,2.4,4.7,3.0,1.8,,1,...,0.000000,0.000000,-1.519062,0.0,0.007806,0.0,0.000000,-0.007806,both,1Q
4,20071031,2,2007,1,2.5,4.7,3.0,1.9,,2,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,-0.590667,0.000000,both,1Q
8,20071031,3,2007,1,2.4,4.7,2.9,1.8,,3,...,0.000000,-0.493631,-1.097176,0.0,0.000000,0.0,0.000000,1.000000,both,1Q
12,20071031,4,2007,1,2.3,4.8,2.7,1.9,,4,...,0.000000,0.000000,-0.294286,0.0,0.000000,0.0,-0.319048,-0.509235,both,1Q
16,20071031,5,2007,1,2.5,4.7,2.9,1.9,,5,...,-0.538129,0.000000,0.538129,0.0,-0.339330,0.0,0.000000,0.000000,both,1Q
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2727,20181219,13,2018,1,3.0,3.7,1.8,1.8,2.38,13,...,0.000000,-1.000000,-0.238095,0.0,0.000000,0.0,-0.408730,0.000000,both,1Q
2732,20181219,14,2018,1,3.0,3.7,1.8,1.9,2.38,14,...,0.000000,-1.902390,-0.254298,0.0,0.000000,0.0,-1.039010,0.358434,both,1Q
2737,20181219,15,2018,1,3.0,3.7,1.9,1.9,2.13,15,...,0.000000,0.633028,0.000000,0.0,0.000000,0.0,0.000000,-0.319283,both,1Q
2742,20181219,16,2018,1,3.1,3.7,1.9,1.9,2.38,16,...,0.000000,-0.506329,-0.466974,0.0,0.000000,0.0,0.000000,0.367879,both,1Q


In [252]:
# NOTE(review): duplicate of the previous cell. Its execution count (252)
# predates the surrounding cells, so it looks like a leftover from an earlier
# run and can probably be deleted.
panel

Unnamed: 0,MTGDATE,ID,TARGET,HORIZON,GDP,UN,PCE,COREPCE,FFD,Participant Number,...,svect_tprob_37,svect_tprob_38,svect_tprob_39,svect_tprob_40,svect_tprob_41,svect_tprob_42,svect_tprob_43,svect_tprob_44,_merge,HORIZON_condensed
0,20071031,1,2007,1,2.4,4.7,3.0,1.8,,1,...,0.000000,0.000000,-1.519062,0.0,0.007806,0.0,0.000000,-0.007806,both,1Q
4,20071031,2,2007,1,2.5,4.7,3.0,1.9,,2,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,-0.590667,0.000000,both,1Q
8,20071031,3,2007,1,2.4,4.7,2.9,1.8,,3,...,0.000000,-0.493631,-1.097176,0.0,0.000000,0.0,0.000000,1.000000,both,1Q
12,20071031,4,2007,1,2.3,4.8,2.7,1.9,,4,...,0.000000,0.000000,-0.294286,0.0,0.000000,0.0,-0.319048,-0.509235,both,1Q
16,20071031,5,2007,1,2.5,4.7,2.9,1.9,,5,...,-0.538129,0.000000,0.538129,0.0,-0.339330,0.0,0.000000,0.000000,both,1Q
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2727,20181219,13,2018,1,3.0,3.7,1.8,1.8,2.38,13,...,0.000000,-1.000000,-0.238095,0.0,0.000000,0.0,-0.408730,0.000000,both,1Q
2732,20181219,14,2018,1,3.0,3.7,1.8,1.9,2.38,14,...,0.000000,-1.902390,-0.254298,0.0,0.000000,0.0,-1.039010,0.358434,both,1Q
2737,20181219,15,2018,1,3.0,3.7,1.9,1.9,2.13,15,...,0.000000,0.633028,0.000000,0.0,0.000000,0.0,0.000000,-0.319283,both,1Q
2742,20181219,16,2018,1,3.1,3.7,1.9,1.9,2.38,16,...,0.000000,-0.506329,-0.466974,0.0,0.000000,0.0,0.000000,0.367879,both,1Q


In [368]:
# Baseline OLS: PCE projection (shortest horizon bucket) on the 45 svect_tprob_*
# columns plus an intercept.
# NOTE(review): the summary below reports Cond. No. ~1e16 and Df Model = 42 for
# 45 regressors, i.e. the design matrix is rank-deficient (some topic columns
# look constant or collinear). Individual coefficients should be read with care.
X = sm.add_constant(panel[svect_tprob_cols])
y = panel['PCE']
model = sm.OLS(y, X)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,PCE,R-squared:,0.097
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,1.074
Date:,"Sun, 24 Mar 2024",Prob (F-statistic):,0.354
Time:,17:45:23,Log-Likelihood:,-524.74
No. Observations:,461,AIC:,1135.0
Df Residuals:,418,BIC:,1313.0
Df Model:,42,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.7621,0.041,42.565,0.000,1.681,1.843
svect_tprob_0,0.4227,0.185,2.289,0.023,0.060,0.786
svect_tprob_1,-0.1647,0.274,-0.602,0.548,-0.703,0.373
svect_tprob_2,0.1616,0.141,1.147,0.252,-0.115,0.438
svect_tprob_3,-0.9967,1.260,-0.791,0.429,-3.474,1.480
svect_tprob_4,0.0300,0.071,0.424,0.671,-0.109,0.169
svect_tprob_5,0.2553,0.308,0.828,0.408,-0.351,0.862
svect_tprob_6,0.0006,0.048,0.012,0.990,-0.094,0.095
svect_tprob_7,-0.0098,0.219,-0.045,0.964,-0.440,0.421

0,1,2,3
Omnibus:,71.677,Durbin-Watson:,0.388
Prob(Omnibus):,0.0,Jarque-Bera (JB):,110.342
Skew:,0.989,Prob(JB):,1.1e-24
Kurtosis:,4.352,Cond. No.,1.33e+16


In [369]:
# Stability selection with LassoCV.
# For each target, refit LassoCV under `n_repeats` differently-shuffled 10-fold
# splits and count how often each topic column gets a non-zero coefficient.
# A column is selected for that target if it is non-zero in more than half of
# the fits; `keep_cols` is the union over targets.
# NOTE: the ElasticNet cell further down rebinds `keep_cols`, so when the
# notebook is run top to bottom this Lasso selection is informational only.
n_repeats = 100
n_topics = len(svect_tprob_cols)

X = panel[svect_tprob_cols]  # same regressors for every target
keep_cols = np.zeros(n_topics, dtype=bool)

for col in ['PCE', 'GDP', 'UN']:
    y = panel[col]

    counts = np.zeros(n_topics)

    for i in range(n_repeats):
        # random_state=i makes each repeat reproducible but differently shuffled.
        kfold = KFold(n_splits=10, shuffle=True, random_state=i)
        reg = LassoCV(cv=kfold).fit(X, y)
        counts += reg.coef_ != 0
    cols_for_topic = counts > n_repeats / 2
    print(np.flatnonzero(cols_for_topic).tolist())

    keep_cols = np.logical_or(keep_cols, cols_for_topic)

print(np.flatnonzero(keep_cols).tolist())
int(keep_cols.sum())


[]
[2, 6, 10, 14, 18, 19, 22, 23, 34, 44]
[10, 18, 23, 27, 30, 33, 44]
[2, 6, 10, 14, 18, 19, 22, 23, 27, 30, 33, 34, 44]


13

In [370]:
# Stability selection with ElasticNetCV (l1_ratio fixed at 0.1).
# Same procedure as the Lasso cell: for each target, refit under `n_repeats`
# differently-shuffled 10-fold splits and keep the topic columns whose
# coefficient is non-zero in more than half of the fits. `keep_cols` is the
# union over targets and is what the following cells turn into `keep`.
n_repeats = 100
n_topics = len(svect_tprob_cols)

X = panel[svect_tprob_cols]  # same regressors for every target
keep_cols = np.zeros(n_topics, dtype=bool)

for col in ['PCE', 'GDP', 'UN']:
    y = panel[col]

    counts = np.zeros(n_topics)

    for i in range(n_repeats):
        # random_state=i makes each repeat reproducible but differently shuffled.
        kfold = KFold(n_splits=10, shuffle=True, random_state=i)
        reg = ElasticNetCV(cv=kfold, l1_ratio=[0.1]).fit(X, y)
        counts += reg.coef_ != 0
    cols_for_topic = counts > n_repeats / 2
    print(np.flatnonzero(cols_for_topic).tolist())

    keep_cols = np.logical_or(keep_cols, cols_for_topic)

print(np.flatnonzero(keep_cols).tolist())
int(keep_cols.sum())

[8, 18, 20, 23, 33, 34, 35, 39]
[2, 4, 6, 10, 12, 14, 18, 19, 20, 22, 23, 27, 30, 33, 34, 36, 44]
[2, 6, 7, 8, 10, 12, 13, 18, 19, 22, 23, 26, 27, 30, 33, 35, 36, 38, 43, 44]
[2, 4, 6, 7, 8, 10, 12, 13, 14, 18, 19, 20, 22, 23, 26, 27, 30, 33, 34, 35, 36, 38, 39, 43, 44]


25

In [371]:
# Positions of the selected topic columns, as a plain list of ints.
keep = np.flatnonzero(keep_cols).tolist()

In [372]:
def trim_svect(svect):
    """Return the entries of `svect` at the selected topic positions.

    Reads the module-level list `keep` (selected indices) and returns a float
    NumPy array of length len(keep), with entries in the order given by `keep`.
    """
    return np.array([svect[idx] for idx in keep], dtype=float)

In [373]:
# List mfdff's columns (source vector columns plus the expanded per-topic ones).
mfdff.columns

Index(['date', 'section', 'lname', 'svect', 'voter', 'sent', 'region',
       'female', 'chair', 'exp',
       ...
       'svect_tprob_35', 'svect_tprob_36', 'svect_tprob_37', 'svect_tprob_38',
       'svect_tprob_39', 'svect_tprob_40', 'svect_tprob_41', 'svect_tprob_42',
       'svect_tprob_43', 'svect_tprob_44'],
      dtype='object', length=120)

In [374]:
# Reduce each stored vector to the selected topic positions (see trim_svect)
# and store the result as a plain list in a new final_* column.
for target, source in (
    ('final_norm_svect', 'norm_svect'),
    ('final_diff_exp_norm', 'diff_exp_norm'),
    ('final_diff_prior', 'norm_svect_diff_avg_prior'),
):
    mfdff[target] = mfdff[source].map(lambda vec: trim_svect(vec).tolist())

In [375]:
# Spot-check one of the trimmed vector columns.
mfdff['final_diff_exp_norm']

19      [0.5116545513891234, 0.0, -0.46909709371657066...
25      [0.5116545513891234, 0.0, -0.46909709371657066...
26      [0.5116545513891234, 0.0, -0.46909709371657066...
20      [1.7939576251925706, 0.0, -0.46909709371657066...
16      [-2.874030966120643, 0.0, 3.2836796560159947, ...
                              ...                        
5885    [-0.34867969519931247, -0.43871591198175014, 0...
5891    [-0.34867969519931247, -0.43871591198175014, 0...
5888    [-0.34867969519931247, -2.5165185489496475, 0....
5893    [1.7081359776490264, 2.5959033184644893, -0.09...
5890    [-0.34867969519931247, -0.43871591198175014, 0...
Name: final_diff_exp_norm, Length: 2564, dtype: object

In [376]:
# Persist mfdff, now including the final_* trimmed vector columns, to pickle.
mfdff.to_pickle('../working-csvs/mfdff_selected.pkl')