In [1]:
import pandas as pd
import numpy as np

from nltk.stem import PorterStemmer

from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

import setup_modules
from lib.preprocessing import setup_enhance, ngram_enhance, process, load_stopwords

In [2]:
# Load the sentence-level table; the first CSV column becomes the row index.
# Later cells read its `content`, `sent`, `sent_prob`, `lname`, `date` and
# `section` columns.
df = pd.read_csv('../working-csvs/fomc-sents-w-sentiment.csv', index_col=0)

In [3]:
# Load the bigram/trigram resources and run every sentence through the project
# helper `ngram_enhance`; the result is stored next to the raw text.
bigrams, trigrams = setup_enhance(
    '../ngrams/bigrams',
    '../ngrams/trigrams',
)
df['enhanced'] = df['content'].map(
    lambda sentence: ngram_enhance(sentence, bigrams, trigrams)
)

In [4]:
# Tokenise the n-gram-enhanced sentences with the project helper `process`,
# passing it a Porter stemmer and the stopword list loaded from disk.
stopwords = load_stopwords('../stopwords/stopwords.txt')
stemmer = PorterStemmer()
df['tokens'] = df['enhanced'].map(
    lambda enhanced_text: process(
        enhanced_text,
        stemmer=stemmer,
        stopwords=stopwords,
    )
)

In [5]:
# Load the saved LDA model and the gensim Dictionary stored in the same
# directory (the 02-29 run); the dictionary is used below to convert token
# lists into bag-of-words vectors for the model.
topic_model = LdaModel.load('../models/02-29/02-29lda')
model_dict = Dictionary.load('../models/02-29/02-29dict')
# Alternative 12-10 model/dictionary pair, left disabled:
# topic_model = LdaModel.load('../models/12-10lda')
# model_dict = Dictionary.load('../models/12-10dict')

In [6]:
# Infer a topic distribution for every sentence: tokens -> bag of words ->
# list of (topic_id, probability) pairs returned by indexing the LDA model.
df['tprob_vect'] = df['tokens'].map(
    lambda token_list: topic_model[model_dict.doc2bow(token_list)]
)

In [7]:
def topk_topics(topic_vec, k=3, min_prob=0.03):
    """Select the strongest topics from a document-topic vector.

    Parameters
    ----------
    topic_vec : iterable of (topic_id, probability) pairs
        Topic distribution of one document.
    k : int, default 3
        Rank whose probability is used as the cut-off: a topic is a candidate
        when its probability is at least the k-th largest probability. Ties
        at the cut-off are all kept, so more than ``k`` topics can be
        returned.
    min_prob : float, default 0.03
        Absolute floor; topics below it are dropped even if they rank in the
        top ``k``.

    Returns
    -------
    list of (topic_id, probability)
        Selected topics in their original order, probabilities rounded to
        four decimals. Empty when no topic reaches ``min_prob`` or when
        ``topic_vec`` is empty.
    """
    topic_vec = list(topic_vec)
    if not topic_vec:
        return []

    probs = sorted((item[1] for item in topic_vec), reverse=True)
    # A vector with fewer than k entries used to raise IndexError; in that
    # case every entry is a top-k candidate, so fall back to the smallest one.
    threshold = probs[min(k, len(probs)) - 1]
    cutoff = max(threshold, min_prob)

    return [
        (topic, round(prob, 4))
        for topic, prob in topic_vec
        if prob >= cutoff
    ]

In [8]:
# Reduce each sentence's topic distribution to its strongest topics (k=3).
df['topk'] = df['tprob_vect'].map(
    lambda topic_vec: topk_topics(topic_vec, k=3)
)

In [9]:
# Spot-check: print 20 sampled sentences next to their selected topics,
# skipping sentences for which no topic was retained. The sample is seeded so
# the same sentences are shown after Restart & Run All.
preview = df.sample(20, random_state=0)[['content', 'topk']]
for _idx, row in preview.iterrows():
    if len(row['topk']) > 0:
        print(row['content'])
        print(row['topk'])

Under the policy stance of Tealbook A, the Board staff projects that inflation will return to target only after 2020.
[(6, 0.0337), (10, 0.0302)]
The bad one is the well-known codependency thesis: We are over-consuming and other countries are inappropriately supporting their export industry by intervening to strengthen the dollar.
[(9, 0.0529), (35, 0.0311)]
So I guess it would be an understatement to say that the degree of near-term uncertainty surrounding the outlook is unusually high.
[(4, 0.0329)]
This positioning would, in effect, say that quantitative easing is useful and is having some effect, but we have multiple tools to use tactically, and the question is, what combination is best in the context of evolving economic conditions?
[(19, 0.0522), (23, 0.0331), (27, 0.0378)]
What we do and what we say should be designed to contribute to this end, and I think Governor Kelley made that point extremely well.
[(37, 0.0489)]
Recent revisions to Texas employment data suggest that Texas 

In [10]:
# Length of every stance vector; must exceed the largest topic id that can
# appear in `topk`. Presumably the topic count of the loaded LDA model --
# verify against `topic_model.num_topics`.
k = 45


def partial_stance_vect(topics, sent_label, num_topics=None):
    """Build a signed, weight-normalised stance vector for one sentence.

    Each selected topic receives ``sent_label * weight / sum(weights)``, so
    the absolute values of the entries sum to ``abs(sent_label)``; all other
    positions stay at 0.

    Parameters
    ----------
    topics : iterable of (topic_id, weight) pairs
        Topics selected for the sentence.
    sent_label : number
        Sentiment label applied as the sign/scale of the vector.
    num_topics : int, optional
        Length of the returned vector. Defaults to the module-level ``k``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``num_topics``. All zeros when ``topics`` is
        empty or when the weights sum to zero.

    Raises
    ------
    IndexError
        If a topic id is outside ``[-num_topics, num_topics)``.
    """
    size = k if num_topics is None else num_topics
    # Always a float array, so the empty case no longer yields an int vector.
    svect = np.zeros(size, dtype=float)

    topics = list(topics)
    total = float(sum(weight for _, weight in topics))
    if total == 0:
        # Nothing to distribute (no topics, or only zero weights): avoid 0/0.
        return svect

    for topic, weight in topics:
        svect[topic] = float(sent_label) * float(weight) / total

    return svect

In [11]:
# Presumably the minimum `sent_prob` a sentence needs in order to be kept for
# the stance analysis -- confirm.
min_sent_prob = 0.5

In [12]:
# Keep sentences that (a) retained at least one topic, (b) have `sent_prob`
# above the configured `min_sent_prob`, and (c) carry a non-zero `sent` label
# (presumably 0 marks neutral sentences -- confirm). `.copy()` detaches the
# result so later column assignments do not touch `df`.
dff = df[
    (df['topk'].map(len) > 0)
    & (df['sent_prob'] > min_sent_prob)
    & (df['sent'] != 0)
].copy()

In [13]:
# One signed stance vector per sentence, built from its selected topics and
# its sentiment label.
dff['svect'] = dff.apply(
    lambda row: partial_stance_vect(row['topk'], row['sent']),
    axis=1,
)

In [14]:
# Display the filtered sentence frame, now including the `svect` column.
dff

Unnamed: 0,lname,date,section,content,sent_prob,sent,enhanced,tokens,tprob_vect,topk,svect
0,bernanke,2002-08-13,1,As we search for the signal of an incipient re...,0.819317,-1,As we search for the signal of an incipient re...,"[search, signal, incipi, recoveri, heavi, nois...","[(0, 0.020856535), (1, 0.021945685), (2, 0.020...","[(36, 0.0359)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,The direct effects of the stock market are bei...,0.654100,-1,The direct effects of the stock market are bei...,"[direct, effect, stock, market, partli, offset...","[(0, 0.023129959), (1, 0.02173418), (2, 0.0206...","[(34, 0.0308)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,"Second, regarding the other source of noise, t...",0.840512,-1,"Second, regarding the other source of noise, t...","[sourc, nois, data, revis, longer, blame, shal...","[(0, 0.019851433), (1, 0.020303724), (2, 0.020...","[(13, 0.0327), (18, 0.0312)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,"In particular, the revision has largely left i...",0.827082,1,"In particular, the revision has largely left i...","[revis, larg, left, intact, growth, product, s...","[(0, 0.02214655), (1, 0.030765276), (2, 0.0211...","[(1, 0.0308), (37, 0.0487), (39, 0.0395)]","[0.0, 0.2588235220458918, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,I note the comments that Governor Kohn made ab...,0.803425,-1,I note the comments that Governor Kohn made ab...,"[note, comment, governor, kohn, made, financi,...","[(0, 0.019764122), (1, 0.020307602), (2, 0.020...","[(27, 0.0347), (35, 0.0491)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...
6701,yellen,2018-01-31,2,I have thoroughly enjoyed interacting with all...,0.763187,1,I have thoroughly enjoyed interacting with all...,"[enjoy, interact, terrif, staff, incred, honor...","[(0, 0.019811908), (1, 0.022919705), (2, 0.038...","[(2, 0.039)]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6701,yellen,2018-01-31,2,I have heard considerable support during the g...,0.862407,1,I have heard considerable support during the g...,"[heard, consider, support, round, alt, b, writ...","[(0, 0.019650389), (1, 0.019794215), (2, 0.020...","[(4, 0.056)]","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
6701,yellen,2018-01-31,2,"I suppose, although nothing obviously is set i...",0.907364,1,"I suppose, although nothing obviously is set i...","[suppos, set, stone, s, a, pretti, strong, inc...","[(0, 0.020269258), (1, 0.019947713), (2, 0.021...","[(12, 0.0306), (37, 0.0392)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6701,yellen,2018-01-31,2,"The minutes, if you have listened to the discu...",0.602243,1,"The minutes, if you have listened to the discu...","[minut, listen, discuss, tabl, signal, greater...","[(0, 0.019247059), (1, 0.018644728), (2, 0.033...","[(2, 0.0335), (39, 0.0591)]","[0.0, 0.0, 0.36177107061022307, 0.0, 0.0, 0.0,..."


In [15]:
# Load the member table; the first CSV column becomes the row index. The merge
# output further down shows it contributes `date`, `member`, `voter`,
# `region`, `female`, `chair` and `exp`.
fomc = pd.read_csv('../working-csvs/fomc.csv', index_col=0)

In [20]:
# Parse the date strings once and derive the year/month keys used for merging.
fomc_dates = pd.to_datetime(fomc['date'])
fomc['date_pd'] = fomc_dates
fomc['year'] = fomc_dates.dt.year
fomc['month'] = fomc_dates.dt.month

In [21]:
# Same year/month keys on the sentence frame; the dates are cast to str before
# parsing.
sentence_dates = pd.to_datetime(dff['date'].map(str))
dff['date_pd'] = sentence_dates
dff['year'] = sentence_dates.dt.year
dff['month'] = sentence_dates.dt.month

In [22]:
# Attach member attributes to every sentence by matching on calendar year,
# month and speaker name; the left join keeps sentences without a match.
# NOTE(review): if `fomc` has more than one row for a member within the same
# year/month, the matching sentences are duplicated -- confirm uniqueness.
fdff = pd.merge(
    dff,
    fomc,
    how='left',
    left_on=['year', 'month', 'lname'],
    right_on=['year', 'month', 'member'],
)

In [23]:
# Display the merged sentence/member frame.
fdff

Unnamed: 0,lname,date_x,section,content,sent_prob,sent,enhanced,tokens,tprob_vect,topk,...,year,month,date_y,member,voter,region,female,chair,exp,date_pd_y
0,bernanke,2002-08-13,1,As we search for the signal of an incipient re...,0.819317,-1,As we search for the signal of an incipient re...,"[search, signal, incipi, recoveri, heavi, nois...","[(0, 0.020856535), (1, 0.021945685), (2, 0.020...","[(36, 0.0359)]",...,2002,8,2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
1,bernanke,2002-08-13,1,The direct effects of the stock market are bei...,0.654100,-1,The direct effects of the stock market are bei...,"[direct, effect, stock, market, partli, offset...","[(0, 0.023129959), (1, 0.02173418), (2, 0.0206...","[(34, 0.0308)]",...,2002,8,2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
2,bernanke,2002-08-13,1,"Second, regarding the other source of noise, t...",0.840512,-1,"Second, regarding the other source of noise, t...","[sourc, nois, data, revis, longer, blame, shal...","[(0, 0.019851433), (1, 0.020303724), (2, 0.020...","[(13, 0.0327), (18, 0.0312)]",...,2002,8,2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
3,bernanke,2002-08-13,1,"In particular, the revision has largely left i...",0.827082,1,"In particular, the revision has largely left i...","[revis, larg, left, intact, growth, product, s...","[(0, 0.02214655), (1, 0.030765276), (2, 0.0211...","[(1, 0.0308), (37, 0.0487), (39, 0.0395)]",...,2002,8,2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
4,bernanke,2002-08-13,1,I note the comments that Governor Kohn made ab...,0.803425,-1,I note the comments that Governor Kohn made ab...,"[note, comment, governor, kohn, made, financi,...","[(0, 0.019764122), (1, 0.020307602), (2, 0.020...","[(27, 0.0347), (35, 0.0491)]",...,2002,8,2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79933,yellen,2018-01-31,2,I have thoroughly enjoyed interacting with all...,0.763187,1,I have thoroughly enjoyed interacting with all...,"[enjoy, interact, terrif, staff, incred, honor...","[(0, 0.019811908), (1, 0.022919705), (2, 0.038...","[(2, 0.039)]",...,2018,1,2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31
79934,yellen,2018-01-31,2,I have heard considerable support during the g...,0.862407,1,I have heard considerable support during the g...,"[heard, consider, support, round, alt, b, writ...","[(0, 0.019650389), (1, 0.019794215), (2, 0.020...","[(4, 0.056)]",...,2018,1,2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31
79935,yellen,2018-01-31,2,"I suppose, although nothing obviously is set i...",0.907364,1,"I suppose, although nothing obviously is set i...","[suppos, set, stone, s, a, pretti, strong, inc...","[(0, 0.020269258), (1, 0.019947713), (2, 0.021...","[(12, 0.0306), (37, 0.0392)]",...,2018,1,2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31
79936,yellen,2018-01-31,2,"The minutes, if you have listened to the discu...",0.602243,1,"The minutes, if you have listened to the discu...","[minut, listen, discuss, tabl, signal, greater...","[(0, 0.019247059), (1, 0.018644728), (2, 0.033...","[(2, 0.0335), (39, 0.0591)]",...,2018,1,2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31


In [160]:
# Collapse sentences to one row per (date, section, speaker):
#   svect  -> element-wise sum of the sentence stance vectors
#   sent   -> number of sentences contributing to the row
#   voter  -> max over the group; the remaining attributes take the first value
stance_aggregations = {
    'svect': 'sum',
    'voter': 'max',
    'sent': 'count',
    'region': 'first',
    'female': 'first',
    'chair': 'first',
    'exp': 'first',
}
mfdff = (
    fdff
    .groupby(['date_x', 'section', 'lname'])[list(stance_aggregations)]
    .agg(stance_aggregations)
    .reset_index()
)

In [161]:
# Lower-tertile (1/3 quantile) of `exp` within each (date, section) group,
# broadcast back to every row of the group.
mfdff['exp_tertile'] = (
    mfdff
    .groupby(['date_x', 'section'])['exp']
    .transform(lambda exp_values: np.quantile(exp_values, 1/3))
)

In [162]:
# Flag rows whose `exp` lies strictly above their group's lower tertile.
mfdff['expd'] = mfdff['exp'].gt(mfdff['exp_tertile'])

In [163]:
# Display the per-(date, section, speaker) frame with the tertile columns.
mfdff

Unnamed: 0,date_x,section,lname,svect,voter,sent,region,female,chair,exp,exp_tertile,expd
0,1994-02-04,1,boehne,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,3,Philadelphia,0.0,Greenspan,13.016438,2.296804,True
1,1994-02-04,1,broaddus,"[-0.09559456218574064, 1.3333333333333333, 0.0...",1,12,Richmond,0.0,Greenspan,1.093151,2.296804,False
2,1994-02-04,1,forrestal,"[1.1557382475407931, 0.0, 0.2706530339997656, ...",1,11,Atlanta,0.0,Greenspan,10.169863,2.296804,True
3,1994-02-04,1,hoenig,"[-0.40276034193397064, 0.4730077107282468, 0.0...",0,6,Kansas City,0.0,Greenspan,2.347945,2.296804,True
4,1994-02-04,1,jordan,"[0.0, -2.948044778664286, 0.0, 0.0, 0.0, 0.0, ...",1,10,Cleveland,0.0,Greenspan,1.906849,2.296804,False
...,...,...,...,...,...,...,...,...,...,...,...,...
5880,2018-12-19,2,kashkari,"[-0.3975757616807607, 0.0, 0.0, 0.0, -0.520186...",0,9,Minneapolis,0.0,Powell,2.967123,3.282192,False
5881,2018-12-19,2,mester,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0,...",1,9,Cleveland,1.0,Powell,4.553425,3.282192,True
5882,2018-12-19,2,powell,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,1,governor,0.0,Powell,6.572603,3.282192,True
5883,2018-12-19,2,quarles,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0,...",1,5,governor,0.0,Powell,1.254795,3.282192,False


In [164]:
import numpy as np

In [165]:
def array_group_std(x):
    """Component-wise standard deviation of a group of equal-length vectors.

    `x` is a Series whose values are arrays. The vectors are stacked, the
    population standard deviation is taken down the rows, and that single
    vector is repeated for every member of the group.

    Returns a Series of Python lists carrying the same index as `x`.
    """
    stacked = np.stack(x)
    per_component_std = stacked.std(axis=0)
    repeated = [per_component_std.tolist() for _ in range(len(x))]
    return pd.Series(repeated, index=x.index)

In [166]:
# Standardise each stance vector against the other speakers in the same
# (date, section) group: subtract the group mean vector and divide by the
# group standard-deviation vector, component by component.
svect_by_meeting = mfdff.groupby(['date_x', 'section'])['svect']
mfdff['date_section_mean'] = svect_by_meeting.transform('mean')
# array_group_std yields lists; np.nan_to_num turns them back into arrays.
mfdff['date_section_std'] = (
    svect_by_meeting
    .transform(array_group_std)
    .map(np.nan_to_num)
)

# Components with zero spread produce 0/0 = NaN, which nan_to_num maps to 0.
mfdff['norm_svect'] = (
    mfdff['svect']
    .sub(mfdff['date_section_mean'])
    .div(mfdff['date_section_std'])
    .map(np.nan_to_num)
)

In [176]:
# Average stance vector of the `expd`-flagged speakers in each (date, section)
# group. Multiplying by the boolean flag zeroes the vectors of unflagged
# speakers, so the group sum only counts flagged ones; dividing by the number
# of flagged speakers gives their mean.
mfdff['temp_svect'] = mfdff['svect'] * mfdff['expd']
mfdff['total_expd'] = (
    mfdff.groupby(['date_x', 'section'])['temp_svect'].transform('sum')
)
mfdff['num_expd'] = (
    mfdff.groupby(['date_x', 'section'])['expd'].transform('sum')
)
mfdff['avg_expd'] = mfdff['total_expd'].div(mfdff['num_expd'])
mfdff['avg_expd']

0       [-0.3295970016507576, -0.24363113253095836, -0...
1       [-0.3295970016507576, -0.24363113253095836, -0...
2       [-0.3295970016507576, -0.24363113253095836, -0...
3       [-0.3295970016507576, -0.24363113253095836, -0...
4       [-0.3295970016507576, -0.24363113253095836, -0...
                              ...                        
5880    [0.0, 0.0, 0.03691541666447902, 0.0, 0.0424504...
5881    [0.0, 0.0, 0.03691541666447902, 0.0, 0.0424504...
5882    [0.0, 0.0, 0.03691541666447902, 0.0, 0.0424504...
5883    [0.0, 0.0, 0.03691541666447902, 0.0, 0.0424504...
5884    [0.0, 0.0, 0.03691541666447902, 0.0, 0.0424504...
Name: avg_expd, Length: 5885, dtype: object

In [180]:
# Distance of each speaker's stance vector from the group's flagged-speaker
# average, scaled by the group standard deviation; NaNs are replaced by
# np.nan_to_num.
mfdff['diff_exp_norm'] = (
    mfdff['svect']
    .sub(mfdff['avg_expd'])
    .div(mfdff['date_section_std'])
    .map(np.nan_to_num)
)

In [181]:
# Display the frame with all derived columns.
# NOTE(review): the recorded output lists `temp_total` and `total_exp`, which
# no visible cell creates -- likely left over from deleted cells, so a fresh
# Restart & Run All would not reproduce them.
mfdff

Unnamed: 0,date_x,section,lname,svect,voter,sent,region,female,chair,exp,...,date_section_mean,date_section_std,norm_svect,temp_svect,temp_total,total_exp,total_expd,num_expd,avg_expd,diff_exp_norm
0,1994-02-04,1,boehne,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,3,Philadelphia,0.0,Greenspan,13.016438,...,"[-0.20229081641970367, -0.3151150390251947, -0...","[1.0949212859056963, 0.9475574203939641, 0.180...","[0.1847537526429335, 0.3325550855737903, 0.141...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-3.2959700165075763, -2.4363113253095836, -0....",10,"[-3.2959700165075763, -2.4363113253095836, -0....",10,"[-0.3295970016507576, -0.24363113253095836, -0...","[0.30102346706879646, 0.2571149012053162, 0.21..."
1,1994-02-04,1,broaddus,"[-0.09559456218574064, 1.3333333333333333, 0.0...",1,12,Richmond,0.0,Greenspan,1.093151,...,"[-0.20229081641970367, -0.3151150390251947, -0...","[1.0949212859056963, 0.9475574203939641, 0.180...","[0.09744650652736749, 1.7396817721855375, 0.14...","[-0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0...","[-3.2959700165075763, -2.4363113253095836, -0....",10,"[-3.2959700165075763, -2.4363113253095836, -0....",10,"[-0.3295970016507576, -0.24363113253095836, -0...","[0.21371622095323042, 1.6642415878170636, 0.21..."
2,1994-02-04,1,forrestal,"[1.1557382475407931, 0.0, 0.2706530339997656, ...",1,11,Atlanta,0.0,Greenspan,10.169863,...,"[-0.20229081641970367, -0.3151150390251947, -0...","[1.0949212859056963, 0.9475574203939641, 0.180...","[1.2402983497002373, 0.3325550855737903, 1.638...","[1.1557382475407931, 0.0, 0.2706530339997656, ...","[-3.2959700165075763, -2.4363113253095836, -0....",10,"[-3.2959700165075763, -2.4363113253095836, -0....",10,"[-0.3295970016507576, -0.24363113253095836, -0...","[1.3565680641261004, 0.2571149012053162, 1.708..."
3,1994-02-04,1,hoenig,"[-0.40276034193397064, 0.4730077107282468, 0.0...",0,6,Kansas City,0.0,Greenspan,2.347945,...,"[-0.20229081641970367, -0.3151150390251947, -0...","[1.0949212859056963, 0.9475574203939641, 0.180...","[-0.18309035370377583, 0.8317414151279248, 0.1...","[-0.40276034193397064, 0.4730077107282468, 0.0...","[-3.2959700165075763, -2.4363113253095836, -0....",10,"[-3.2959700165075763, -2.4363113253095836, -0....",10,"[-0.3295970016507576, -0.24363113253095836, -0...","[-0.06682063927791286, 0.7563012307594507, 0.2..."
4,1994-02-04,1,jordan,"[0.0, -2.948044778664286, 0.0, 0.0, 0.0, 0.0, ...",1,10,Cleveland,0.0,Greenspan,1.906849,...,"[-0.20229081641970367, -0.3151150390251947, -0...","[1.0949212859056963, 0.9475574203939641, 0.180...","[0.1847537526429335, -2.778649275464914, 0.141...","[0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[-3.2959700165075763, -2.4363113253095836, -0....",10,"[-3.2959700165075763, -2.4363113253095836, -0....",10,"[-0.3295970016507576, -0.24363113253095836, -0...","[0.30102346706879646, -2.854089459833388, 0.21..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5880,2018-12-19,2,kashkari,"[-0.3975757616807607, 0.0, 0.0, 0.0, -0.520186...",0,9,Minneapolis,0.0,Powell,2.967123,...,"[-0.006774948725900236, 0.0, 0.077411826324159...","[0.13957653573688994, 0.0, 0.19901225947274961...","[-2.7999033712338526, 0.0, -0.3889801891061880...","[-0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.29532333331583216, 0.0, 0.3396039...",8,"[0.0, 0.0, 0.29532333331583216, 0.0, 0.3396039...",8,"[0.0, 0.0, 0.03691541666447902, 0.0, 0.0424504...","[-2.8484426811553383, 0.0, -0.1854931789744027..."
5881,2018-12-19,2,mester,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0,...",1,9,Cleveland,1.0,Powell,4.553425,...,"[-0.006774948725900236, 0.0, 0.077411826324159...","[0.13957653573688994, 0.0, 0.19901225947274961...","[0.04853930992148578, 0.0, -0.3889801891061880...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0,...","[0.0, 0.0, 0.29532333331583216, 0.0, 0.3396039...",8,"[0.0, 0.0, 0.29532333331583216, 0.0, 0.3396039...",8,"[0.0, 0.0, 0.03691541666447902, 0.0, 0.0424504...","[0.0, 0.0, -0.18549317897440273, 0.0, -0.24718..."
5882,2018-12-19,2,powell,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,1,governor,0.0,Powell,6.572603,...,"[-0.006774948725900236, 0.0, 0.077411826324159...","[0.13957653573688994, 0.0, 0.19901225947274961...","[0.04853930992148578, 0.0, -0.3889801891061880...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.29532333331583216, 0.0, 0.3396039...",8,"[0.0, 0.0, 0.29532333331583216, 0.0, 0.3396039...",8,"[0.0, 0.0, 0.03691541666447902, 0.0, 0.0424504...","[0.0, 0.0, -0.18549317897440273, 0.0, -0.24718..."
5883,2018-12-19,2,quarles,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0,...",1,5,governor,0.0,Powell,1.254795,...,"[-0.006774948725900236, 0.0, 0.077411826324159...","[0.13957653573688994, 0.0, 0.19901225947274961...","[0.04853930992148578, 0.0, -0.3889801891061880...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0,...","[0.0, 0.0, 0.29532333331583216, 0.0, 0.3396039...",8,"[0.0, 0.0, 0.29532333331583216, 0.0, 0.3396039...",8,"[0.0, 0.0, 0.03691541666447902, 0.0, 0.0424504...","[0.0, 0.0, -0.18549317897440273, 0.0, -0.24718..."


In [182]:
# Mark rows backed by at least three sentences (`sent` holds the per-row
# sentence count from the aggregation step).
mfdff['use'] = mfdff['sent'].ge(3)

In [183]:
# Persist the per-(date, section, speaker) table.
# NOTE(review): array-valued columns such as `svect` are written as their
# string representation -- confirm downstream readers can parse them.
mfdff.to_csv('../working-csvs/mfdff.csv')