In [1]:
import pandas as pd
import os
from ast import literal_eval

from sklearn.linear_model import LassoCV

import statsmodels.formula.api as smf

In [2]:
excel_file = '../working-csvs/20240114 Participants Key.xlsx'

# Open the workbook a single time and reuse the handle for every parse,
# instead of re-opening the file once per sheet.
with pd.ExcelFile(excel_file) as workbook:
    all_sheets = workbook.parse(sheet_name=None)

    # The first sheet is skipped; each remaining sheet name is stored below as
    # that sheet's MTGDATE value.
    data_sheet_names = list(all_sheets.keys())[1:]

    dfs = []

    for sheet_name in data_sheet_names:
        # The header row of each data sheet is on the second line (skiprows=1).
        sheet = workbook.parse(sheet_name=sheet_name, skiprows=1)
        for col in ['Name', 'Number', 'Affiliation']:
            # Some sheets have a line break inside the header
            # ("Participant\nName"); normalise those to "Participant Name".
            alt_name = f'Participant\n{col}'
            if alt_name in sheet.columns:
                sheet[f'Participant {col}'] = sheet[alt_name]
                sheet = sheet.drop(columns=alt_name)
        sheet['MTGDATE'] = sheet_name
        dfs.append(sheet)

# Each sheet's own row index is preserved, so index values repeat across meetings.
pkey = pd.concat(dfs)

In [3]:
# Preview the combined participant key (one row per participant per meeting sheet).
pkey

Unnamed: 0,Participant Number,Participant Name,Participant Affiliation,Vote,Note,Chair,MTGDATE
0,1,Frederic Mishkin,Board,1.0,,0.0,20071031
1,2,Gary Stern,Minneapolis,0.0,,0.0,20071031
2,3,Timothy Geithner,New York,1.0,,0.0,20071031
3,4,William Poole,St. Louis,1.0,,0.0,20071031
4,5,Randall Kroszner,Board,1.0,,0.0,20071031
...,...,...,...,...,...,...,...
12,13,Eric Rosengren,Boston,0.0,,0.0,20181219
13,14,Loretta Mester,Cleveland,1.0,,0.0,20181219
14,15,Patrick Harker,Philadelphia,0.0,,0.0,20181219
15,16,Charles Evans,Chicago,0.0,,0.0,20181219


In [4]:
# Load the projections workbook (first sheet, default header row).
proj = pd.read_excel('../working-csvs/20240114 FomcProjections.xlsx')

In [5]:
# Check column dtypes; MTGDATE and ID are the keys used to merge with pkey.
proj.dtypes

MTGDATE      int64
ID           int64
TARGET      object
HORIZON      int64
GDP        float64
UN         float64
PCE        float64
COREPCE    float64
FFD        float64
dtype: object

In [6]:
# MTGDATE was filled from the sheet names; cast it to int so it can be
# merged against the integer MTGDATE column in proj.
pkey['MTGDATE'] = pkey['MTGDATE'].astype(int)

In [7]:
# Confirm the merge keys (MTGDATE, Participant Number) are now integer-typed.
pkey.dtypes

Participant Number           int64
Participant Name            object
Participant Affiliation     object
Vote                       float64
Note                        object
Chair                      float64
MTGDATE                      int64
dtype: object

In [8]:
# Attach participant details to each projection row, matching on meeting date
# and participant number. No `how` is passed, so this is pandas' default inner
# join: projection rows without a participant-key entry are dropped.
proj = proj.merge(pkey, left_on=['MTGDATE', 'ID'], right_on=['MTGDATE', 'Participant Number'])

In [9]:
# Preview the projections with participant details attached.
proj

Unnamed: 0,MTGDATE,ID,TARGET,HORIZON,GDP,UN,PCE,COREPCE,FFD,Participant Number,Participant Name,Participant Affiliation,Vote,Note,Chair
0,20071031,1,2007,1,2.4,4.7,3.0,1.8,,1,Frederic Mishkin,Board,1.0,,0.0
1,20071031,1,2008,5,1.7,4.8,1.8,1.9,,1,Frederic Mishkin,Board,1.0,,0.0
2,20071031,1,2009,9,2.2,4.8,1.9,2.0,,1,Frederic Mishkin,Board,1.0,,0.0
3,20071031,1,2010,13,2.2,4.8,2.0,2.0,,1,Frederic Mishkin,Board,1.0,,0.0
4,20071031,2,2007,1,2.5,4.7,3.0,1.9,,2,Gary Stern,Minneapolis,0.0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2747,20181219,17,2018,1,3.1,3.7,1.8,1.9,2.38,17,Raphael Bostic,Atlanta,1.0,,0.0
2748,20181219,17,2019,5,2.3,3.5,2.0,2.1,2.63,17,Raphael Bostic,Atlanta,1.0,,0.0
2749,20181219,17,2020,9,1.8,3.7,2.0,2.0,2.88,17,Raphael Bostic,Atlanta,1.0,,0.0
2750,20181219,17,2021,13,1.8,3.9,2.0,2.0,2.88,17,Raphael Bostic,Atlanta,1.0,,0.0


In [10]:
# MTGDATE is an integer laid out as YYYYMMDD; peel off the parts with
# integer arithmetic.
proj['year'] = (proj['MTGDATE'] // 10000).astype(int)   # YYYY
proj['md'] = proj['MTGDATE'] % 10000                    # MMDD
proj['m'] = (proj['md'] // 100).astype(int)             # MM

In [11]:
# Merge key: the last whitespace-separated token of the participant's name,
# lower-cased (e.g. "Frederic Mishkin" -> "mishkin").
proj['speaker'] = [
    full_name.split()[-1].lower() for full_name in proj['Participant Name']
]

In [12]:
# The frame is loaded from the pickle rather than from mfdff.csv.
# NOTE(review): unpickling executes arbitrary code -- only load files you trust.
mfdff = pd.read_pickle('../working-csvs/mfdff.pkl')

In [13]:
# Restrict to rows whose `section` equals 2; copy so later column
# assignments do not touch a view of the loaded frame.
mfdff = mfdff.loc[mfdff['section'].eq(2)].copy()

In [14]:
# Parse `date` into datetimes so the .dt accessor can be used below.
mfdff['date'] = pd.to_datetime(mfdff['date'])

In [15]:
# Calendar year and month of each record; together with `lname` these are the
# keys used to join onto proj's `year` / `m` / `speaker`.
mfdff['year'] = mfdff['date'].dt.year
mfdff['m'] = mfdff['date'].dt.month

In [16]:
# Spread the per-row vectors into one column per element:
#   norm_svect -> tprob_0 .. tprob_44
#   svect      -> svect_tprob_0 .. svect_tprob_44
# Assumes every entry of both columns is a sequence of exactly 45 values -- TODO confirm.
mfdff[[f'tprob_{i}' for i in range(45)]] = mfdff['norm_svect'].to_list()
mfdff[[f'svect_tprob_{i}' for i in range(45)]] = mfdff['svect'].to_list()

In [17]:
# Left-join the records in mfdff onto the projections by year, month and
# lower-cased surname. Every projection row is kept; `_merge` records whether
# a match was found.
# NOTE(review): if mfdff has more than one row for a (year, m, lname) key,
# projection rows are duplicated -- consider validate='many_to_one'.
df = proj.merge(mfdff, left_on=['year', 'm', 'speaker'], right_on=['year', 'm', 'lname'], how='left', indicator=True)

In [18]:
# How many projection rows found a match (`both`) versus none (`left_only`).
df['_merge'].value_counts()

_merge
both          2483
left_only      269
right_only       0
Name: count, dtype: int64

Check the `left_only` rows: projections that found no matching row in `mfdff`.

In [357]:
# Speakers of the unmatched (`left_only`) projection rows from meetings before 2018.
# NOTE(review): this cell's execution count (357) is out of sequence with its
# neighbours -- re-run the notebook on a fresh kernel to confirm it still holds.
df[(df['_merge'] == 'left_only') & (df['year'] < 2018)].speaker

221     pianalto
222     pianalto
223     pianalto
224     pianalto
322         duke
          ...   
2387    mullinix
2388    mullinix
2389    mullinix
2390    mullinix
2391    mullinix
Name: speaker, Length: 160, dtype: object

In [19]:
# Frequency of each HORIZON code in the merged frame.
df['HORIZON'].value_counts()

HORIZON
99    548
1     191
5     191
9     191
13    191
4     153
8     153
12    153
2     153
6     153
10    153
3     152
7     152
11    152
14     66
Name: count, dtype: int64

In [20]:
def horiz_mapper(horiz):
    """Collapse a numeric HORIZON code into a coarse bucket label.

    Buckets (upper bounds inclusive):
        horiz <= 3        -> '1Q'
        3 < horiz <= 6    -> '2Q'
        6 < horiz <= 9    -> '3Q'
        9 < horiz <= 12   -> '4Q'
        12 < horiz <= 14  -> '5Q'
        horiz >= 99       -> 'LR'

    Values that fall in none of these ranges (between 14 and 99, or NaN)
    yield None.
    """
    upper_bounds = ((3, '1Q'), (6, '2Q'), (9, '3Q'), (12, '4Q'), (14, '5Q'))
    # Bounds are ascending, so the first one that is not exceeded is the bucket.
    for bound, label in upper_bounds:
        if horiz <= bound:
            return label
    return 'LR' if horiz >= 99 else None



In [21]:
# Bucket every HORIZON code with horiz_mapper ('1Q' .. '5Q', 'LR').
df['HORIZON_condensed'] = df['HORIZON'].map(horiz_mapper)

In [22]:
# Preview the merged frame, including `_merge` and the new HORIZON_condensed column.
df

Unnamed: 0,MTGDATE,ID,TARGET,HORIZON,GDP,UN,PCE,COREPCE,FFD,Participant Number,...,svect_tprob_37,svect_tprob_38,svect_tprob_39,svect_tprob_40,svect_tprob_41,svect_tprob_42,svect_tprob_43,svect_tprob_44,_merge,HORIZON_condensed
0,20071031,1,2007,1,2.4,4.7,3.0,1.8,,1,...,0.0,0.0,0.0,0.0,0.007806,0.0,0.0,-0.007806,both,1Q
1,20071031,1,2008,5,1.7,4.8,1.8,1.9,,1,...,0.0,0.0,0.0,0.0,0.007806,0.0,0.0,-0.007806,both,2Q
2,20071031,1,2009,9,2.2,4.8,1.9,2.0,,1,...,0.0,0.0,0.0,0.0,0.007806,0.0,0.0,-0.007806,both,3Q
3,20071031,1,2010,13,2.2,4.8,2.0,2.0,,1,...,0.0,0.0,0.0,0.0,0.007806,0.0,0.0,-0.007806,both,5Q
4,20071031,2,2007,1,2.5,4.7,3.0,1.9,,2,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,both,1Q
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2747,20181219,17,2018,1,3.1,3.7,1.8,1.9,2.38,17,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.430490,both,1Q
2748,20181219,17,2019,5,2.3,3.5,2.0,2.1,2.63,17,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.430490,both,2Q
2749,20181219,17,2020,9,1.8,3.7,2.0,2.0,2.88,17,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.430490,both,3Q
2750,20181219,17,2021,13,1.8,3.9,2.0,2.0,2.88,17,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.430490,both,5Q


In [23]:
# Keep only the rows bucketed as '1Q' by horiz_mapper.
fore1Q = df.loc[df['HORIZON_condensed'].eq('1Q')]

In [24]:
# Names of the expanded vector columns: tprob_* were filled from `norm_svect`,
# svect_tprob_* from `svect`.
tprob_cols = [f'tprob_{i}' for i in range(45)]
svect_tprob_cols = [f'svect_tprob_{i}' for i in range(45)]

In [25]:
# "tprob_0 + tprob_1 + ..." string of all tprob columns.
# NOTE(review): not referenced by any other visible cell -- presumably meant
# for a statsmodels formula; remove if unused.
tprob_part = ' + '.join(tprob_cols)

In [27]:
# Estimation sample: '1Q' rows with complete regressors and targets.
# The regressions below are fitted on the svect_tprob_* columns, so those are
# checked for missing values as well as the tprob_* columns; otherwise a row
# with a missing svect_tprob_* value would reach the estimators.
panel = fore1Q.dropna(subset=tprob_cols + svect_tprob_cols + ['GDP', 'PCE', 'UN'])

In [28]:
# Inspect the tprob_* columns of the estimation sample.
panel[tprob_cols]

Unnamed: 0,tprob_0,tprob_1,tprob_2,tprob_3,tprob_4,tprob_5,tprob_6,tprob_7,tprob_8,tprob_9,...,tprob_35,tprob_36,tprob_37,tprob_38,tprob_39,tprob_40,tprob_41,tprob_42,tprob_43,tprob_44
0,4.000000,-0.25,0.341268,0.0,0.000000,0.25,-0.072378,0.250000,0.479669,0.000000,...,-1.738918,0.000000,0.25,-0.068207,0.408820,0.0,0.341441,0.0,0.098620,0.165783
4,-0.250000,-0.25,0.341268,0.0,0.000000,0.25,-0.072378,0.250000,0.479669,0.000000,...,0.476600,0.000000,0.25,-0.068207,0.408820,0.0,0.243834,0.0,0.098620,0.179321
8,-0.250000,-0.25,-1.522483,0.0,0.000000,0.25,-0.072378,0.250000,0.479669,0.000000,...,-0.680931,0.000000,0.25,-1.011965,-0.159103,0.0,0.243834,0.0,0.098620,1.913530
12,-0.250000,-0.25,-3.596538,0.0,0.000000,0.25,-2.314796,0.250000,-0.089598,0.000000,...,0.171452,0.000000,0.25,-0.068207,0.038865,0.0,0.243834,0.0,-0.802004,-0.703798
16,-0.250000,4.00,0.341268,0.0,0.000000,0.25,-0.072378,0.250000,0.027119,0.000000,...,-0.477177,0.000000,-4.00,-0.068207,1.085319,0.0,-3.998949,0.0,0.098620,0.179321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2727,0.045921,0.00,-0.152401,0.0,-0.067325,0.00,0.212989,0.267261,-0.866041,-0.267261,...,-3.741657,-0.003864,0.00,-1.912674,-0.887515,0.0,-0.267261,0.0,-0.662326,-0.315790
2732,0.045921,0.00,-0.152401,0.0,-0.067325,0.00,0.212989,-3.741657,0.551168,-0.267261,...,0.267261,-0.003864,0.00,-2.085733,-0.935846,0.0,-0.267261,0.0,-1.131065,-0.315790
2737,0.045921,0.00,-0.152401,0.0,-0.067325,0.00,-0.098324,0.267261,-2.076812,-0.267261,...,0.267261,-0.003864,0.00,1.746328,-0.177299,0.0,-0.267261,0.0,0.302676,-1.216877
2742,0.045921,0.00,-0.152401,0.0,-0.067325,0.00,-0.464429,0.267261,-1.414755,-0.267261,...,0.267261,-0.003864,0.00,-0.806543,-1.570239,0.0,-0.267261,0.0,0.302676,0.722442


In [32]:
import statsmodels.api as sm

# Baseline OLS: PCE projection regressed on all svect_tprob_* columns plus an
# intercept.
y = panel['PCE']
X = sm.add_constant(panel[svect_tprob_cols])
model = sm.OLS(y, X)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,PCE,R-squared:,0.092
Model:,OLS,Adj. R-squared:,-0.003
Method:,Least Squares,F-statistic:,0.9665
Date:,"Thu, 04 Apr 2024",Prob (F-statistic):,0.534
Time:,06:47:30,Log-Likelihood:,-506.14
No. Observations:,446,AIC:,1098.0
Df Residuals:,403,BIC:,1275.0
Df Model:,42,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.7758,0.042,41.922,0.000,1.693,1.859
svect_tprob_0,0.4292,0.215,2.000,0.046,0.007,0.851
svect_tprob_1,-0.4981,0.292,-1.704,0.089,-1.073,0.077
svect_tprob_2,0.1020,0.175,0.584,0.560,-0.241,0.445
svect_tprob_3,-0.7490,1.251,-0.599,0.550,-3.208,1.709
svect_tprob_4,-0.0043,0.076,-0.056,0.955,-0.154,0.145
svect_tprob_5,0.4405,0.327,1.347,0.179,-0.202,1.083
svect_tprob_6,-0.0209,0.059,-0.351,0.726,-0.138,0.096
svect_tprob_7,0.0434,0.243,0.179,0.858,-0.435,0.521

0,1,2,3
Omnibus:,57.262,Durbin-Watson:,0.377
Prob(Omnibus):,0.0,Jarque-Bera (JB):,79.558
Skew:,0.894,Prob(JB):,5.3e-18
Kurtosis:,4.042,Cond. No.,1.31e+16


In [33]:
import numpy as np
from sklearn.model_selection import KFold

# Lasso-based selection of svect_tprob_* columns.
# For each target, LassoCV is refitted N_REPEATS times, each time with a
# differently seeded shuffled 10-fold split. A column is kept for that target
# when its coefficient is non-zero in more than half of the fits. The final
# mask is the union over the three targets.
N_REPEATS = 100
n_features = len(svect_tprob_cols)

keep_cols = np.zeros(n_features, dtype=bool)

for col in ['PCE', 'GDP', 'UN']:
    X = panel[svect_tprob_cols]
    y = panel[col]

    # counts[j] = number of fits in which column j had a non-zero coefficient
    counts = np.zeros(n_features)

    for i in range(N_REPEATS):
        # random_state=i keeps every repetition reproducible
        kfold = KFold(n_splits=10, shuffle=True, random_state=i)
        reg = LassoCV(cv=kfold).fit(X, y)
        counts += reg.coef_ != 0
    cols_for_topic = counts > N_REPEATS / 2
    print(np.flatnonzero(cols_for_topic).tolist())

    keep_cols = np.logical_or(keep_cols, cols_for_topic)

print(np.flatnonzero(keep_cols).tolist())
int(keep_cols.sum())


[6, 18, 23, 32, 35]
[6, 12, 14, 18, 19, 23, 34, 44]
[10, 18, 23, 27, 30]
[6, 10, 12, 14, 18, 19, 23, 27, 30, 32, 34, 35, 44]


13

In [34]:
import numpy as np
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import KFold

# Elastic-net selection of svect_tprob_* columns (l1_ratio fixed at 0.1).
# For each target, ElasticNetCV is refitted N_REPEATS times, each time with a
# differently seeded shuffled 10-fold split. A column is kept for that target
# when its coefficient is non-zero in more than half of the fits. The final
# mask is the union over the three targets.
# This cell reassigns `keep_cols`, which the cells below read.
N_REPEATS = 100
n_features = len(svect_tprob_cols)

keep_cols = np.zeros(n_features, dtype=bool)

for col in ['PCE', 'GDP', 'UN']:
    X = panel[svect_tprob_cols]
    y = panel[col]

    # counts[j] = number of fits in which column j had a non-zero coefficient
    counts = np.zeros(n_features)

    for i in range(N_REPEATS):
        # random_state=i keeps every repetition reproducible
        kfold = KFold(n_splits=10, shuffle=True, random_state=i)
        reg = ElasticNetCV(cv=kfold, l1_ratio=[0.1]).fit(X, y)
        counts += reg.coef_ != 0
    cols_for_topic = counts > N_REPEATS / 2
    print(np.flatnonzero(cols_for_topic).tolist())

    keep_cols = np.logical_or(keep_cols, cols_for_topic)

print(np.flatnonzero(keep_cols).tolist())
int(keep_cols.sum())

[6, 18, 23, 32, 35, 39]
[2, 4, 6, 12, 14, 18, 19, 20, 23, 27, 34, 38, 44]
[8, 10, 12, 18, 20, 22, 23, 26, 27, 30, 33, 38, 43, 44]
[2, 4, 6, 8, 10, 12, 14, 18, 19, 20, 22, 23, 26, 27, 30, 32, 33, 34, 35, 38, 39, 43, 44]


23

In [35]:
# Positions of the columns flagged in the boolean `keep_cols` mask.
keep = [position for position, selected in enumerate(keep_cols) if selected]

In [36]:
def trim_svect(svect, indices=None):
    """Return the elements of ``svect`` at the selected positions as a float array.

    Parameters
    ----------
    svect : indexable
        Vector to trim; must support ``svect[j]`` for every selected position.
    indices : sequence of int, optional
        Positions to keep, in output order. When omitted, the notebook-level
        ``keep`` list is used, so existing ``trim_svect(x)`` calls are unchanged.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(indices)``.
    """
    if indices is None:
        # Fall back to the notebook-level selection.
        indices = keep
    return np.fromiter(
        (svect[j] for j in indices), dtype=float, count=len(indices)
    )

In [37]:
# List mfdff's columns to locate the vector columns to trim.
mfdff.columns

Index(['date', 'section', 'lname', 'svect', 'voter', 'sent', 'region',
       'female', 'chair', 'exp',
       ...
       'svect_tprob_35', 'svect_tprob_36', 'svect_tprob_37', 'svect_tprob_38',
       'svect_tprob_39', 'svect_tprob_40', 'svect_tprob_41', 'svect_tprob_42',
       'svect_tprob_43', 'svect_tprob_44'],
      dtype='object', length=124)

In [38]:
# Reduce each stored vector column to the selected positions (via trim_svect)
# and store the result as plain Python lists under a `final_*` name.
trimmed_sources = {
    'final_norm_svect': 'norm_svect',
    'final_diff_exp_norm': 'diff_avg_norm_expd',
    'final_diff_prior': 'norm_svect_diff_avg_prior',
}
for out_col, src_col in trimmed_sources.items():
    mfdff[out_col] = mfdff[src_col].map(lambda vec: trim_svect(vec).tolist())

In [39]:
# Spot-check one of the trimmed vector columns.
mfdff['final_diff_exp_norm']

19      [-0.3573212656919145, 0.0, -0.4690970937165708...
25      [-0.3573212656919145, 0.0, -0.4690970937165708...
26      [-0.3573212656919145, 0.0, -0.4690970937165708...
20      [2.3239762429896382, 0.0, -0.4690970937165708,...
16      [-0.3573212656919145, 0.0, 3.283679656015995, ...
                              ...                        
5655    [-0.06933086735034355, -0.3202906983772544, 0....
5661    [-0.06933086735034355, -0.3202906983772544, 0....
5658    [-0.06933086735034355, -2.5133198962667267, 0....
5663    [2.7844456449544284, 2.8826162853952892, 0.953...
5660    [-0.06933086735034355, -0.3202906983772544, 0....
Name: final_diff_exp_norm, Length: 2340, dtype: object

In [40]:
# Persist mfdff (section 2 rows only, with the expanded and final_* columns added
# above) for downstream use.
mfdff.to_pickle('../working-csvs/mfdff_selected.pkl')