In [10]:
import pandas as pd
import numpy as np

from nltk.stem import PorterStemmer

from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

import setup_modules
from lib.preprocessing import setup_enhance, ngram_enhance, process, load_stopwords

In [2]:
# Load the sentence-level table; the first CSV column is used as the index.
# Later cells read its `content`, `sent_prob`, `sent`, `lname` and `date` columns.
df = pd.read_csv('../working-csvs/fomc-sents-w-sentiment.csv', index_col=0)

In [3]:
# Load the bigram/trigram resources, then rewrite every sentence in `content`
# with them. (Presumably this joins known multi-word phrases into single
# tokens -- confirm in lib.preprocessing.)
bigrams, trigrams = setup_enhance(
    '../ngrams/bigrams',
    '../ngrams/trigrams',
)
df['enhanced'] = df['content'].map(
    lambda sentence: ngram_enhance(sentence, bigrams, trigrams)
)

In [4]:
# Turn each enhanced sentence into a token list using the project's `process`
# helper, configured with a Porter stemmer and the stop-word list loaded here.
stemmer = PorterStemmer()
stopwords = load_stopwords('../stopwords/stopwords.txt')
df['tokens'] = df['enhanced'].map(
    lambda sentence: process(sentence, stemmer=stemmer, stopwords=stopwords)
)

In [5]:
# Load the trained LDA topic model and the gensim Dictionary stored next to it
# under ../models/02-29/.
# NOTE(review): gensim's load() unpickles the file, so only point this at
# model artifacts from a trusted source.
topic_model = LdaModel.load('../models/02-29/02-29lda')
model_dict = Dictionary.load('../models/02-29/02-29dict')
# Alternative '12-10' artifacts, currently disabled:
# topic_model = LdaModel.load('../models/12-10lda')
# model_dict = Dictionary.load('../models/12-10dict')

In [6]:
# For every token list: convert it to a bag-of-words with the dictionary and
# index the LDA model with it, storing the resulting list of
# (topic_id, probability) pairs.
df['tprob_vect'] = df['tokens'].map(
    lambda toks: topic_model[model_dict.doc2bow(toks)]
)

In [7]:
def topk_topics(topic_vec, k=3, min_prob=0.03):
    """Return the strongest topics of a single topic-probability vector.

    Parameters
    ----------
    topic_vec : list of (topic_id, probability) pairs
        Topic distribution for one document.
    k : int, default 3
        Number of top-ranked topics to keep. Topics tied with the k-th
        probability are all kept, so the result can hold more than ``k``
        entries. A non-positive ``k`` yields an empty list.
    min_prob : float, default 0.03
        Probability floor; topics below it are dropped even when they rank
        in the top ``k``.

    Returns
    -------
    list of (topic_id, probability) pairs
        The retained topics in their original order, with probabilities
        rounded to 4 decimal places. Empty when nothing clears the floor.
    """
    if k <= 0 or not topic_vec:
        return []

    ranked = sorted((prob for _, prob in topic_vec), reverse=True)
    # With fewer than k topics available, rank against the weakest one
    # instead of indexing past the end of the list.
    kth_prob = ranked[min(k, len(ranked)) - 1]
    cutoff = max(kth_prob, min_prob)

    return [(topic, round(prob, 4)) for topic, prob in topic_vec if prob >= cutoff]

In [8]:
# Reduce each full topic vector to its (up to) three strongest topics.
df['topk'] = df['tprob_vect'].map(
    lambda topic_vec: topk_topics(topic_vec, k=3)
)

In [9]:
# Spot-check: print 20 randomly drawn sentences with their retained topics,
# skipping any sentence for which no topic was retained.
for i, row in df[['content', 'topk']].sample(20).iterrows():
    if len(row['topk']) == 0:
        continue
    print(row['content'])
    print(row['topk'])

I expect the unemployment rate to continue its decline and fall below 6 percent to about 5.8 percent by the end of this year and to 5.6 percent by the end of 2015 through 2016--5.6 percent is roughly my guess of the steady-state natural rate.
[(10, 0.0386)]
I had been under the impression that less-developed countries like China, because they are "energy efficiency disadvantaged," would be relatively more hurt by the past rise of oil prices than the United States.
[(33, 0.0837)]
I am fine with "decided to," but put back the strike-out that begins "to support a stronger economic recovery."
[(8, 0.0303), (20, 0.0344), (27, 0.0305)]
That seems likely for real estate commissions in particular.
[(0, 0.0401)]
Housing activity remains strong, and consumer spending continues to be relatively healthy.
[(21, 0.0381)]
For example, our energy industry contacts expect a 30 to 50 percent reduction in drilling activity and a 35 to 60 percent reduction in capital expenditures in '09.
[(17, 0.0377), (2

In [26]:
# Default length of a stance vector (one slot per topic id).
k = 45

def partial_stance_vect(topics, sent_label, num_topics=None):
    """Spread one sentence's sentiment label over its topics.

    Each listed topic receives ``sent_label * weight / sum(weights)``; every
    other slot stays 0. The non-zero entries therefore add up to
    ``sent_label``.

    Parameters
    ----------
    topics : list of (topic_id, weight) pairs
        Topics attached to the sentence. ``topic_id`` is used directly as an
        index into the result.
    sent_label : number
        Sentiment label of the sentence (converted with ``float``).
    num_topics : int, optional
        Length of the returned vector. Defaults to the module-level ``k``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``num_topics``. All zeros when ``topics`` is
        empty or its weights sum to zero.
    """
    size = k if num_topics is None else num_topics
    # Always a float array, so the dtype no longer depends on whether any
    # topic was written.
    svect = np.zeros(size, dtype=float)

    total = sum([weight for _, weight in topics])
    if total == 0:
        # Nothing to distribute; also avoids dividing by zero below.
        return svect

    for topic, weight in topics:
        svect[topic] = float(sent_label) * float(weight) / total

    return svect

In [15]:
# Minimum sentiment probability for a sentence to be kept
# (presumably compared against the `sent_prob` column -- confirm).
min_sent_prob = 0.5

In [27]:
# Keep only sentences that (a) retained at least one topic, (b) have a
# sentiment probability above `min_sent_prob`, and (c) carry a non-zero
# sentiment label. The threshold now comes from the `min_sent_prob` constant
# rather than a repeated literal, so changing the constant actually takes effect.
# `.copy()` gives an independent frame for the column assignments that follow.
dff = df[
    (df['topk'].map(len) > 0)
    & (df['sent_prob'] > min_sent_prob)
    & (df['sent'] != 0)
].copy()

In [108]:
# Build one stance vector per sentence from its retained topics and its
# sentiment label (row-wise apply).
dff['svect'] = dff.apply(
    lambda sent_row: partial_stance_vect(sent_row['topk'], sent_row['sent']),
    axis=1,
)

In [109]:
# Show the filtered frame, now including the per-sentence `svect` column.
dff

Unnamed: 0,lname,date,section,content,sent_prob,sent,enhanced,tokens,tprob_vect,topk,date_pd,year,month,svect
0,bernanke,2002-08-13,1,As we search for the signal of an incipient re...,0.819317,-1,As we search for the signal of an incipient re...,"[search, signal, incipi, recoveri, heavi, nois...","[(0, 0.020856535), (1, 0.021945685), (2, 0.020...","[(36, 0.0359)]",2002-08-13,2002,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,The direct effects of the stock market are bei...,0.654100,-1,The direct effects of the stock market are bei...,"[direct, effect, stock, market, partli, offset...","[(0, 0.023129959), (1, 0.02173418), (2, 0.0206...","[(34, 0.0308)]",2002-08-13,2002,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,"Second, regarding the other source of noise, t...",0.840512,-1,"Second, regarding the other source of noise, t...","[sourc, nois, data, revis, longer, blame, shal...","[(0, 0.019851433), (1, 0.020303724), (2, 0.020...","[(13, 0.0327), (18, 0.0312)]",2002-08-13,2002,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,"In particular, the revision has largely left i...",0.827082,1,"In particular, the revision has largely left i...","[revis, larg, left, intact, growth, product, s...","[(0, 0.02214655), (1, 0.030765276), (2, 0.0211...","[(1, 0.0308), (37, 0.0487), (39, 0.0395)]",2002-08-13,2002,8,"[0.0, 0.2588235220458918, 0.0, 0.0, 0.0, 0.0, ..."
0,bernanke,2002-08-13,1,I note the comments that Governor Kohn made ab...,0.803425,-1,I note the comments that Governor Kohn made ab...,"[note, comment, governor, kohn, made, financi,...","[(0, 0.019764122), (1, 0.020307602), (2, 0.020...","[(27, 0.0347), (35, 0.0491)]",2002-08-13,2002,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6701,yellen,2018-01-31,2,I have thoroughly enjoyed interacting with all...,0.763187,1,I have thoroughly enjoyed interacting with all...,"[enjoy, interact, terrif, staff, incred, honor...","[(0, 0.019811908), (1, 0.022919705), (2, 0.038...","[(2, 0.039)]",2018-01-31,2018,1,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6701,yellen,2018-01-31,2,I have heard considerable support during the g...,0.862407,1,I have heard considerable support during the g...,"[heard, consider, support, round, alt, b, writ...","[(0, 0.019650389), (1, 0.019794215), (2, 0.020...","[(4, 0.056)]",2018-01-31,2018,1,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
6701,yellen,2018-01-31,2,"I suppose, although nothing obviously is set i...",0.907364,1,"I suppose, although nothing obviously is set i...","[suppos, set, stone, s, a, pretti, strong, inc...","[(0, 0.020269258), (1, 0.019947713), (2, 0.021...","[(12, 0.0306), (37, 0.0392)]",2018-01-31,2018,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6701,yellen,2018-01-31,2,"The minutes, if you have listened to the discu...",0.602243,1,"The minutes, if you have listened to the discu...","[minut, listen, discuss, tabl, signal, greater...","[(0, 0.019247059), (1, 0.018644728), (2, 0.033...","[(2, 0.0335), (39, 0.0591)]",2018-01-31,2018,1,"[0.0, 0.0, 0.36177107061022307, 0.0, 0.0, 0.0,..."


In [110]:
# Load the per-member table that is merged onto the sentences below; the first
# CSV column is used as the index. The merge relies on its `date` and `member`
# columns, and the aggregation later uses `voter`.
fomc = pd.read_csv('../working-csvs/fomc.csv', index_col=0)

In [111]:
# Parse `date` into a datetime column, then derive the calendar year and month
# from it; these serve as merge keys further down.
fomc['date_pd'] = pd.to_datetime(fomc['date'])
fomc['year'], fomc['month'] = (
    fomc['date_pd'].dt.year,
    fomc['date_pd'].dt.month,
)

In [112]:
# Same derivation for the sentence frame: stringify `date`, parse it, and split
# out the year and month so both frames share comparable merge keys.
dff['date_pd'] = pd.to_datetime(dff['date'].map(str))
dff['year'], dff['month'] = (
    dff['date_pd'].dt.year,
    dff['date_pd'].dt.month,
)

In [113]:
# Attach member attributes to every sentence. Left join, so sentences without
# a matching member row are kept. Columns present in both frames get pandas'
# `_x` / `_y` suffixes (e.g. `date_x`, `date_y`).
# NOTE(review): the join key is (year, month, member name), not the exact date.
# If `fomc` ever holds more than one row for a member in the same month, the
# sentence rows will be duplicated -- confirm this cannot happen.
fdff = dff.merge(
    fomc,
    how='left',
    left_on=['year', 'month', 'lname'],
    right_on=['year', 'month', 'member'],
)

In [120]:
# Show the merged sentence/member frame.
fdff

Unnamed: 0,lname,date_x,section,content,sent_prob,sent,enhanced,tokens,tprob_vect,topk,...,month,svect,date_y,member,voter,region,female,chair,exp,date_pd_y
0,bernanke,2002-08-13,1,As we search for the signal of an incipient re...,0.819317,-1,As we search for the signal of an incipient re...,"[search, signal, incipi, recoveri, heavi, nois...","[(0, 0.020856535), (1, 0.021945685), (2, 0.020...","[(36, 0.0359)]",...,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
1,bernanke,2002-08-13,1,The direct effects of the stock market are bei...,0.654100,-1,The direct effects of the stock market are bei...,"[direct, effect, stock, market, partli, offset...","[(0, 0.023129959), (1, 0.02173418), (2, 0.0206...","[(34, 0.0308)]",...,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
2,bernanke,2002-08-13,1,"Second, regarding the other source of noise, t...",0.840512,-1,"Second, regarding the other source of noise, t...","[sourc, nois, data, revis, longer, blame, shal...","[(0, 0.019851433), (1, 0.020303724), (2, 0.020...","[(13, 0.0327), (18, 0.0312)]",...,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
3,bernanke,2002-08-13,1,"In particular, the revision has largely left i...",0.827082,1,"In particular, the revision has largely left i...","[revis, larg, left, intact, growth, product, s...","[(0, 0.02214655), (1, 0.030765276), (2, 0.0211...","[(1, 0.0308), (37, 0.0487), (39, 0.0395)]",...,8,"[0.0, 0.2588235220458918, 0.0, 0.0, 0.0, 0.0, ...",2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
4,bernanke,2002-08-13,1,I note the comments that Governor Kohn made ab...,0.803425,-1,I note the comments that Governor Kohn made ab...,"[note, comment, governor, kohn, made, financi,...","[(0, 0.019764122), (1, 0.020307602), (2, 0.020...","[(27, 0.0347), (35, 0.0491)]",...,8,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2002-08-13,bernanke,1,governor,0.0,Greenspan,0.021918,2002-08-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79933,yellen,2018-01-31,2,I have thoroughly enjoyed interacting with all...,0.763187,1,I have thoroughly enjoyed interacting with all...,"[enjoy, interact, terrif, staff, incred, honor...","[(0, 0.019811908), (1, 0.022919705), (2, 0.038...","[(2, 0.039)]",...,1,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31
79934,yellen,2018-01-31,2,I have heard considerable support during the g...,0.862407,1,I have heard considerable support during the g...,"[heard, consider, support, round, alt, b, writ...","[(0, 0.019650389), (1, 0.019794215), (2, 0.020...","[(4, 0.056)]",...,1,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31
79935,yellen,2018-01-31,2,"I suppose, although nothing obviously is set i...",0.907364,1,"I suppose, although nothing obviously is set i...","[suppos, set, stone, s, a, pretti, strong, inc...","[(0, 0.020269258), (1, 0.019947713), (2, 0.021...","[(12, 0.0306), (37, 0.0392)]",...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31
79936,yellen,2018-01-31,2,"The minutes, if you have listened to the discu...",0.602243,1,"The minutes, if you have listened to the discu...","[minut, listen, discuss, tabl, signal, greater...","[(0, 0.019247059), (1, 0.018644728), (2, 0.033...","[(2, 0.0335), (39, 0.0591)]",...,1,"[0.0, 0.0, 0.36177107061022307, 0.0, 0.0, 0.0,...",2018-01-31,yellen,1,governor,1.0,Yellen,16.161644,2018-01-31


In [123]:
# Collapse to one row per (date, section, speaker):
#   svect -> sum of the sentence stance vectors
#   voter -> maximum of the voter flag within the group
#   sent  -> number of sentences contributing (count, not a sum of labels)
mfdff = (
    fdff
    .groupby(['date_x', 'section', 'lname'])[['svect', 'voter', 'sent']]
    .agg({'svect': 'sum', 'voter': 'max', 'sent': 'count'})
    .reset_index()
)

In [132]:
# Persist the aggregated stance table for downstream use.
# NOTE(review): `svect` cells hold array objects, so they are written to the CSV
# as text -- confirm that whatever reads this file parses them back correctly.
mfdff.to_csv('../working-csvs/mfdff.csv')