In [2]:
import pandas as pd
import numpy as np

from nltk.stem import PorterStemmer

from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

import setup_modules
from lib.preprocessing import setup_enhance, ngram_enhance, process, load_stopwords

In [3]:
# Load the sentence-level CSV; its first column becomes the index.
df = pd.read_csv(
    '../working-csvs/fomc-sents-w-sentiment.csv',
    index_col=0,
)

In [4]:
# Build the bigram/trigram resources from their on-disk locations, then run
# every row's raw `content` through `ngram_enhance` with those resources.
bigrams, trigrams = setup_enhance('../ngrams/bigrams', '../ngrams/trigrams')
df['enhanced'] = df['content'].map(
    lambda text: ngram_enhance(text, bigrams, trigrams)
)

In [5]:
# Turn each n-gram-enhanced sentence into a token list via `process`,
# passing it a Porter stemmer and the project stopword list.
stemmer = PorterStemmer()
stopwords = load_stopwords('../stopwords/stopwords.txt')
df['tokens'] = df['enhanced'].map(
    lambda text: process(text, stemmer=stemmer, stopwords=stopwords)
)

In [6]:
# Load the persisted gensim dictionary and LDA model (02-29 artifacts).
model_dict = Dictionary.load('../models/02-29/02-29dict')
topic_model = LdaModel.load('../models/02-29/02-29lda')
# Alternative artifacts, kept for reference:
# topic_model = LdaModel.load('../models/12-10lda')
# model_dict = Dictionary.load('../models/12-10dict')

In [7]:
# Convert each token list to a bag-of-words with the model dictionary and
# query the LDA model for that sentence's (topic, probability) pairs.
df['tprob_vect'] = df['tokens'].map(
    lambda tokens: topic_model[model_dict.doc2bow(tokens)]
)

In [8]:
def topk_topics(topic_vec, k=3, min_prob=0.03):
    """Select the strongest topics from one document's topic distribution.

    Parameters
    ----------
    topic_vec : sequence of (topic_id, probability) pairs
        Topic distribution for a single document.
    k : int, default 3
        Number of top-ranked topics to consider. Topics tied with the k-th
        highest probability are all kept, so the result can exceed ``k``.
    min_prob : float, default 0.03
        Absolute floor: topics below this probability are dropped even when
        they rank inside the top ``k``.

    Returns
    -------
    list of (topic_id, probability)
        The retained topics in their original order, with each probability
        rounded to 4 decimal places. Empty when ``topic_vec`` is empty,
        ``k`` is not positive, or nothing reaches ``min_prob``.
    """
    if k <= 0 or len(topic_vec) == 0:
        return []

    ranked = sorted((prob for _, prob in topic_vec), reverse=True)
    # Clamp the rank so a vector with fewer than k topics uses its smallest
    # probability as the rank threshold instead of raising IndexError.
    kth_prob = ranked[min(k, len(ranked)) - 1]
    cutoff = max(kth_prob, min_prob)

    return [(topic, round(prob, 4)) for topic, prob in topic_vec if prob >= cutoff]

In [9]:
# Keep, per sentence, only the top-3 topics that pass `topk_topics`' cutoff.
df['topk'] = df['tprob_vect'].map(
    lambda topic_vec: topk_topics(topic_vec, k=3)
)

In [10]:
# Spot-check 20 randomly chosen sentences next to their retained topics.
# `random_state` pins the draw so the printed sample is the same on re-run.
topic_sample = df.sample(20, random_state=0)
for content, topk in zip(topic_sample['content'], topic_sample['topk']):
    # Sentences with no retained topic are skipped.
    if len(topk) > 0:
        print(content)
        print(topk)

Economic activity in the Eighth District has been improving at a moderate pace during the intermeeting period.
[(21, 0.0483)]
For example, I've maintained my forecast for 2013 and 2014 at about 3 percent, and I believe that looking at the horizon of 18 months to two years is a better horizon for which policy ought to be judged, and ought to be the relevant metric we should look at.
[(13, 0.037), (44, 0.0344)]
You will recall that in 1992 the Swedes tried to curb a run on their exchange rate by raising their overnight interest rate to 500 percent.
[(8, 0.0357), (37, 0.0402), (38, 0.0387)]
I take more signal about the underlying strength of the economy from the improvement in labor market conditions, which has exceeded the expectations of many economists.
[(36, 0.0377)]
Turning to the inflation outlook for the immediate future, I think the most likely outcome is for stable or even slightly lower core inflation this year.
[(34, 0.0502), (39, 0.031)]
For now, I lean toward the view that lo

In [11]:
# Length of every stance vector. Presumably the LDA model's topic count
# (topic ids 0..44 appear in the outputs) — confirm against the loaded model.
k = 45

def partial_stance_vect(topics, sent_label, n_topics=None):
    """Spread one sentence's sentiment label over its topics.

    Each topic's share of the label is its weight divided by the sum of all
    weights in ``topics``, so the absolute values of the returned entries sum
    to ``abs(sent_label)`` (when distinct topic ids are given).

    Parameters
    ----------
    topics : iterable of (topic_id, weight) pairs
        Topics attached to the sentence; ``topic_id`` indexes the result.
    sent_label : number
        Sentiment label of the sentence (converted with ``float``).
    n_topics : int, optional
        Length of the returned vector. Defaults to the module-level ``k``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``n_topics`` that is zero everywhere except at
        the given topic ids. All zeros when ``topics`` is empty or its
        weights sum to 0.
    """
    size = k if n_topics is None else n_topics
    svect = np.zeros(size, dtype=float)

    total = sum(weight for _, weight in topics)
    if total == 0:
        # Nothing to distribute; also avoids dividing by zero.
        return svect

    for topic, weight in topics:
        svect[topic] = float(sent_label) * float(weight) / total

    return svect

In [12]:
# Cutoff on the `sent_prob` column: rows are kept for the stance analysis
# only when their `sent_prob` is above this value.
min_sent_prob = 0.5

In [13]:
# Keep sentences that (a) retained at least one topic, (b) have a sentiment
# probability above `min_sent_prob`, and (c) carry a non-zero sentiment label.
# `.copy()` gives an independent frame so later column assignments do not
# write into a view of `df`.
dff = df[
    (df['topk'].map(len) > 0)
    & (df['sent_prob'] > min_sent_prob)
    & (df['sent'] != 0)
].copy()

In [14]:
# Row-wise: build each sentence's stance vector from its retained topics
# and its sentiment label.
dff['svect'] = dff.apply(
    lambda row: partial_stance_vect(row['topk'], row['sent']),
    axis=1,
)

In [15]:
# Display the filtered frame, now including the `svect` column.
dff

Unnamed: 0,lname,date,section,content,sent_prob,sent,enhanced,tokens,tprob_vect,topk,svect
0,bernanke,2002-08-13,1,As we search for the signal of an incipient re...,0.819317,-1,As we search for the signal of an incipient re...,"[search, signal, incipi, recoveri, heavi, nois...","[(0, 0.020856535), (1, 0.021945685), (2, 0.020...","[(36, 0.0359)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,The direct effects of the stock market are bei...,0.654100,-1,The direct effects of the stock market are bei...,"[direct, effect, stock, market, partli, offset...","[(0, 0.023129959), (1, 0.02173418), (2, 0.0206...","[(34, 0.0308)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,"Second, regarding the other source of noise, t...",0.840512,-1,"Second, regarding the other source of noise, t...","[sourc, nois, data, revis, longer, blame, shal...","[(0, 0.019851433), (1, 0.020303724), (2, 0.020...","[(13, 0.0327), (18, 0.0312)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,"In particular, the revision has largely left i...",0.827082,1,"In particular, the revision has largely left i...","[revis, larg, left, intact, growth, product, s...","[(0, 0.02214655), (1, 0.030765276), (2, 0.0211...","[(1, 0.0308), (37, 0.0487), (39, 0.0395)]","[0.0, 0.2588235220458918, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,I note the comments that Governor Kohn made ab...,0.803425,-1,I note the comments that Governor Kohn made ab...,"[note, comment, governor, kohn, made, financi,...","[(0, 0.019764122), (1, 0.020307602), (2, 0.020...","[(27, 0.0347), (35, 0.0491)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...
6701,yellen,2018-01-31,2,I have thoroughly enjoyed interacting with all...,0.763187,1,I have thoroughly enjoyed interacting with all...,"[enjoy, interact, terrif, staff, incred, honor...","[(0, 0.019811908), (1, 0.022919705), (2, 0.038...","[(2, 0.039)]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6701,yellen,2018-01-31,2,I have heard considerable support during the g...,0.862407,1,I have heard considerable support during the g...,"[heard, consider, support, round, alt, b, writ...","[(0, 0.019650389), (1, 0.019794215), (2, 0.020...","[(4, 0.056)]","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
6701,yellen,2018-01-31,2,"I suppose, although nothing obviously is set i...",0.907364,1,"I suppose, although nothing obviously is set i...","[suppos, set, stone, s, a, pretti, strong, inc...","[(0, 0.020269258), (1, 0.019947713), (2, 0.021...","[(12, 0.0306), (37, 0.0392)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6701,yellen,2018-01-31,2,"The minutes, if you have listened to the discu...",0.602243,1,"The minutes, if you have listened to the discu...","[minut, listen, discuss, tabl, signal, greater...","[(0, 0.019247059), (1, 0.018644728), (2, 0.033...","[(2, 0.0335), (39, 0.0591)]","[0.0, 0.0, 0.36177107061022307, 0.0, 0.0, 0.0,..."


In [16]:
# Load the FOMC metadata CSV; its first column becomes the index.
fomc = pd.read_csv(
    '../working-csvs/fomc.csv',
    index_col=0,
)

In [17]:
# Parse `date` into datetimes and expose year/month as separate columns
# (used as join keys further down).
fomc['date_pd'] = pd.to_datetime(fomc['date'])
fomc_dt = fomc['date_pd'].dt
fomc['year'], fomc['month'] = fomc_dt.year, fomc_dt.month

In [18]:
# Same date handling for the sentence frame: `date` is stringified first,
# parsed to datetimes, and split into year/month columns.
dff['date_pd'] = pd.to_datetime(dff['date'].map(str))
dff_dt = dff['date_pd'].dt
dff['year'], dff['month'] = dff_dt.year, dff_dt.month

In [19]:
# Attach FOMC metadata to every sentence row. Left join, so sentence rows
# without a match are kept; columns present in both frames get _x/_y suffixes.
# NOTE(review): the join matches on year + month + member name rather than the
# exact date. If `fomc` can hold more than one row per member per month,
# sentence rows would be duplicated here — verify.
fdff = dff.merge(
    fomc,
    how='left',
    left_on=['year', 'month', 'lname'],
    right_on=['year', 'month', 'member'],
)

In [20]:
# Display the merged sentence + FOMC metadata frame.
fdff

Unnamed: 0,lname,date_x,section,content,sent_prob,sent,enhanced,tokens,tprob_vect,topk,...,year,month,date_y,member,voter,region,female,chair,exp,date_pd_y
0,bernanke,2002-08-13,1,As we search for the signal of an incipient re...,0.819317,-1,As we search for the signal of an incipient re...,"[search, signal, incipi, recoveri, heavi, nois...","[(0, 0.020856535), (1, 0.021945685), (2, 0.020...","[(36, 0.0359)]",...,2002,8,2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
1,bernanke,2002-08-13,1,The direct effects of the stock market are bei...,0.654100,-1,The direct effects of the stock market are bei...,"[direct, effect, stock, market, partli, offset...","[(0, 0.023129959), (1, 0.02173418), (2, 0.0206...","[(34, 0.0308)]",...,2002,8,2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
2,bernanke,2002-08-13,1,"Second, regarding the other source of noise, t...",0.840512,-1,"Second, regarding the other source of noise, t...","[sourc, nois, data, revis, longer, blame, shal...","[(0, 0.019851433), (1, 0.020303724), (2, 0.020...","[(13, 0.0327), (18, 0.0312)]",...,2002,8,2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
3,bernanke,2002-08-13,1,"In particular, the revision has largely left i...",0.827082,1,"In particular, the revision has largely left i...","[revis, larg, left, intact, growth, product, s...","[(0, 0.02214655), (1, 0.030765276), (2, 0.0211...","[(1, 0.0308), (37, 0.0487), (39, 0.0395)]",...,2002,8,2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
4,bernanke,2002-08-13,1,I note the comments that Governor Kohn made ab...,0.803425,-1,I note the comments that Governor Kohn made ab...,"[note, comment, governor, kohn, made, financi,...","[(0, 0.019764122), (1, 0.020307602), (2, 0.020...","[(27, 0.0347), (35, 0.0491)]",...,2002,8,2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79933,yellen,2018-01-31,2,I have thoroughly enjoyed interacting with all...,0.763187,1,I have thoroughly enjoyed interacting with all...,"[enjoy, interact, terrif, staff, incred, honor...","[(0, 0.019811908), (1, 0.022919705), (2, 0.038...","[(2, 0.039)]",...,2018,1,2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31
79934,yellen,2018-01-31,2,I have heard considerable support during the g...,0.862407,1,I have heard considerable support during the g...,"[heard, consider, support, round, alt, b, writ...","[(0, 0.019650389), (1, 0.019794215), (2, 0.020...","[(4, 0.056)]",...,2018,1,2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31
79935,yellen,2018-01-31,2,"I suppose, although nothing obviously is set i...",0.907364,1,"I suppose, although nothing obviously is set i...","[suppos, set, stone, s, a, pretti, strong, inc...","[(0, 0.020269258), (1, 0.019947713), (2, 0.021...","[(12, 0.0306), (37, 0.0392)]",...,2018,1,2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31
79936,yellen,2018-01-31,2,"The minutes, if you have listened to the discu...",0.602243,1,"The minutes, if you have listened to the discu...","[minut, listen, discuss, tabl, signal, greater...","[(0, 0.019247059), (1, 0.018644728), (2, 0.033...","[(2, 0.0335), (39, 0.0591)]",...,2018,1,2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31


In [21]:
# Collapse sentences to one row per (date_x, section, lname):
#   svect -> element-wise sum of the group's sentence stance vectors
#   voter -> max of the group's `voter` values
#   sent  -> number of non-null sentences in the group (from here on `sent`
#            is a count, no longer a sentiment label)
mfdff = (
    fdff
    .groupby(['date_x', 'section', 'lname'])[['svect', 'voter', 'sent']]
    .agg({'svect': 'sum', 'voter': 'max', 'sent': 'count'})
    .reset_index()
)

In [22]:
def func(ins):
    """Print ``ins`` and return None.

    Debugging helper; no call to it appears in the visible cells.
    """
    print(ins)

In [23]:
import numpy as np

In [163]:
def array_group_std(x):
    """Element-wise standard deviation of a group of equal-length vectors.

    Intended as a ``groupby(...).transform`` callback: ``x`` is a Series whose
    values are array-likes of the same shape. The vectors are stacked row-wise
    and the population standard deviation (NumPy's default, ``ddof=0``) is
    taken down each column.

    Returns a Series with the same index as ``x`` in which every entry is the
    same list of per-dimension standard deviations, one copy per input row.
    """
    n_rows = len(x)
    per_dim_std = np.stack(x).std(axis=0)
    # Repeat the single std vector once per row so the result lines up with x.
    repeated = np.tile(per_dim_std, (n_rows, 1)).tolist()
    return pd.Series(repeated, index=x.index)

In [176]:
# Mean stance vector of each (date_x, section) group, broadcast back onto
# every row of that group by `transform`.
mfdff['date_section_mean'] = mfdff.groupby(['date_x', 'section'])['svect'].transform('mean')
# Element-wise population std of each group's stance vectors (see
# `array_group_std`, which returns plain lists); mapping `np.nan_to_num` over
# the result turns each list back into a NumPy array and replaces any NaN.
mfdff['date_section_std'] = mfdff.groupby(['date_x', 'section'])['svect'].transform(array_group_std).map(np.nan_to_num)

# Standardise each member's vector against its group: (svect - mean) / std.
# Dimensions whose group std is 0 divide by zero; `np.nan_to_num` then maps the
# resulting NaN to 0 (and any +/-inf to a very large finite number).
mfdff['norm_svect'] = ((mfdff['svect'] - mfdff['date_section_mean']) / mfdff['date_section_std']).map(np.nan_to_num)


In [185]:
# Flag rows whose `sent` value is at least 3 as usable. In `mfdff`, `sent` is
# the per-group count produced by the earlier 'count' aggregation.
mfdff['use'] = mfdff['sent'].ge(3)

In [186]:
# Persist the aggregated frame.
# NOTE(review): the vector-valued columns (e.g. `svect`, `norm_svect`) are
# written as their string form in CSV — confirm the downstream reader parses
# them back, or consider a format that preserves arrays.
mfdff.to_csv(path_or_buf='../working-csvs/mfdff.csv')