In [63]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk import sent_tokenize

In [2]:
import setup_modules
from lib.preprocessing import process, ngram_enhance, setup_enhance

/home/bdurham/dev/fed-nlp


In [48]:
# Load the working CSVs; index_col=0 makes the first column of each file the row index.
# NOTE(review): `fomc` is not referenced by any later cell visible here — confirm it is still needed.
fomc = pd.read_csv('../working-csvs/fomc.csv', index_col=0)
dfs = pd.read_csv('../working-csvs/raw_transcripts_sectioned.csv', index_col=0)

In [100]:
# Merge every 'content' string that shares a (date, speaker, section) key into one
# space-separated string, then turn the group keys back into ordinary columns.
turn_keys = ['date', 'speaker', 'section']
turn_text = dfs.groupby(turn_keys)['content'].agg(" ".join)
sg = turn_text.reset_index()

In [101]:
# Split each grouped turn into sentences and emit one row per sentence.
# Built as a single chain with no in-place edits, so `sg` keeps its 'content'
# column and this cell can be re-run without a KeyError.
# The row label of each turn is repeated on all of its sentences (explode keeps it).
sgs = (
    sg.assign(content=sg['content'].map(sent_tokenize))
      .explode('content')
)


In [102]:
# Inspect the sentence-level frame: one row per sentence, index labels repeat within a turn.
sgs

Unnamed: 0,date,speaker,section,content
0,2007-01-31,CHAIRMAN BERNANKE.,1,"Seeing that there are no further questions, I ..."
0,2007-01-31,CHAIRMAN BERNANKE.,1,"Remember, we do have the two-handed option if ..."
0,2007-01-31,CHAIRMAN BERNANKE.,1,President Yellen.
0,2007-01-31,CHAIRMAN BERNANKE.,1,Thank you.
0,2007-01-31,CHAIRMAN BERNANKE.,1,President Moskow.
...,...,...,...,...
3391,2017-12-13,VICE CHAIRMAN DUDLEY.,2,So we do observe that inverted yield curves pr...
3391,2017-12-13,VICE CHAIRMAN DUDLEY.,2,"In this cycle, I expect the yield curve will c..."
3391,2017-12-13,VICE CHAIRMAN DUDLEY.,2,"But, to me, worrying now that we're committing..."
3391,2017-12-13,VICE CHAIRMAN DUDLEY.,2,Our tightening moves have not yet tightened ov...


In [103]:
# setup_enhance returns the two objects that ngram_enhance takes as its
# bigram and trigram arguments; they are loaded once and reused for every row.
bigrams, trigrams = setup_enhance('../ngrams/bigrams', '../ngrams/trigrams')


def _enhance_sentence(sentence):
    """Apply ngram_enhance to one sentence with the loaded bigrams and trigrams."""
    return ngram_enhance(sentence, bigrams, trigrams)


sgs['enhanced'] = sgs['content'].map(_enhance_sentence)

In [104]:
# A single PorterStemmer instance is shared by every call to process.
stemmer = PorterStemmer()


def _to_tokens(text):
    """Run process on one enhanced sentence, passing the shared stemmer."""
    return process(text, stemmer=stemmer)


sgs['tokens'] = sgs['enhanced'].map(_to_tokens)

In [106]:
# Preview the first ten rows with the new 'enhanced' and 'tokens' columns.
sgs.head(10)

Unnamed: 0,date,speaker,section,content,enhanced,tokens
0,2007-01-31,CHAIRMAN BERNANKE.,1,"Seeing that there are no further questions, I ...","Seeing that there are no further questions, I ...","[see, question, propos, start, econom, go, rou..."
0,2007-01-31,CHAIRMAN BERNANKE.,1,"Remember, we do have the two-handed option if ...","Remember, we do have the two-handed option if ...","[rememb, two, hand, option, anyon, care, exercis]"
0,2007-01-31,CHAIRMAN BERNANKE.,1,President Yellen.,President Yellen.,"[presid, yellen]"
0,2007-01-31,CHAIRMAN BERNANKE.,1,Thank you.,Thank you.,[thank]
0,2007-01-31,CHAIRMAN BERNANKE.,1,President Moskow.,President Moskow.,"[presid, moskow]"
0,2007-01-31,CHAIRMAN BERNANKE.,1,I notice that they're not playing in the Seven...,I notice that they're not playing in the Seven...,"[notic, play, seventh, feder, reserv, district..."
0,2007-01-31,CHAIRMAN BERNANKE.,1,Thank you.,Thank you.,[thank]
0,2007-01-31,CHAIRMAN BERNANKE.,1,President Stern.,President Stern.,"[presid, stern]"
0,2007-01-31,CHAIRMAN BERNANKE.,1,Thank you.,Thank you.,[thank]
0,2007-01-31,CHAIRMAN BERNANKE.,1,President Minehan.,President Minehan.,"[presid, minehan]"


In [111]:
# Convert each token list to a bag-of-words with dict.doc2bow and index ldamodel with it;
# the result is stored per row in 'probs'.
# NOTE(review): neither `ldamodel` nor `dict` is defined in the cells visible here, so this
# fails on a fresh kernel unless they are created elsewhere. Presumably a gensim LDA model
# and Dictionary — confirm, and consider renaming `dict`, which shadows the builtin.
sgs['probs'] = sgs['tokens'].map(lambda x : ldamodel[dict.doc2bow(x)])

In [150]:
def topk_topics(topic_vec, k=3, min_prob=0.03):
    """Keep the strongest topics of one document's topic distribution.

    A topic is kept when its probability is at least as large as the k-th
    largest probability in ``topic_vec`` AND at least ``min_prob``.

    Args:
        topic_vec: sequence of ``(topic_id, probability)`` pairs.
        k: maximum rank to keep. Ties at the k-th probability are all kept,
            so more than ``k`` pairs can be returned. ``k <= 0`` keeps nothing.
        min_prob: absolute floor a probability must reach to be kept.

    Returns:
        List of ``(topic_id, probability)`` pairs in the same order as
        ``topic_vec``, with probabilities rounded to 4 decimals. Empty when
        ``topic_vec`` is empty, ``k <= 0``, or nothing reaches the cut-off.
    """
    ranked = sorted((prob for _, prob in topic_vec), reverse=True)
    if k <= 0 or not ranked:
        return []

    # With fewer than k topics every topic is already inside the top k, so the
    # rank cut-off falls back to the smallest probability instead of indexing
    # past the end of the list.
    kth_prob = ranked[min(k, len(ranked)) - 1]
    cutoff = max(kth_prob, min_prob)

    return [(topic, round(prob, 4)) for topic, prob in topic_vec if prob >= cutoff]
# Spot-check the helper on the first row's topic vector.
topk_topics(sgs.iloc[0]['probs'])

[(22, 0.0745)]

In [152]:
def _top3(topic_vec):
    """Reduce one row's topic vector with topk_topics, fixing k at 3."""
    return topk_topics(topic_vec, k=3)


sgs['topk'] = sgs['probs'].map(_top3)

In [163]:
# Print a random sample of sentences next to their selected topics, skipping
# sentences for which no topic was kept. The fixed random_state makes the
# printed sample identical on every re-run.
preview = sgs.sample(20, random_state=0)
for sentence, topics in zip(preview['content'], preview['topk']):
    if topics:
        print(sentence)
        print(topics)

In fact, I believe that we should be working harder to return inflation to 2 percent over a reasonable horizon.
[(25, 0.0344)]
Meanwhile reserve balances have risen by more than $200 billion over the same period.
[(24, 0.0643), (28, 0.0478)]
Unfortunately, the more recent releases have been less encouraging, and financial markets are unsettled.
[(29, 0.0423)]
President Fisher.
[(22, 0.0336)]
I don't think I've looked at what the yield curve looks like in that situation.
[(28, 0.0398), (35, 0.0339)]
In particular, Brian was extraordinarily patient with me as he helped me get up to speed in learning all of the intricacies of Federal Reserve governance and communication, such as when I asked him, "So when we send a memo to the FOMC, what do we actually include in that memo?"
[(23, 0.0503), (24, 0.0471), (30, 0.0362)]
Turning to costs, I think there is a political risk associated with the scenario of ending remittances and booking a deferred asset while the interest on reserves that we pay

In [166]:
# NOTE(review): this import sits mid-notebook; consider moving it to the first import cell.
from transformers import pipeline
# Text-classification pipeline using the ProsusAI/finbert model.
pipe = pipeline("text-classification", model="ProsusAI/finbert")

Downloading config.json: 100%|██████████| 758/758 [00:00<00:00, 1.35MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [01:23<00:00, 5.26MB/s] 
Downloading tokenizer_config.json: 100%|██████████| 252/252 [00:00<00:00, 1.89MB/s]
Downloading vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.19MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 183kB/s]


In [169]:
# Smoke-test the pipeline on two short strings before scoring the full frame.
pipe(["testing", "terrible inflation plaguing the country"])

[{'label': 'neutral', 'score': 0.8503028154373169},
 {'label': 'negative', 'score': 0.9071136713027954}]

In [191]:
# The sentence rows still carry the repeated pre-explode labels; replace them
# with a fresh 0..n-1 index directly instead of materialising an 'index'
# column and dropping it again.
sgs = sgs.reset_index(drop=True)
# pipe returns one {'label': ..., 'score': ...} dict per input string; the
# dicts are stored as-is in the 'sentiment' column.
sgs['sentiment'] = pipe(sgs['content'].tolist())

In [193]:
# Persist the scored sentences to 'sgs.csv' in the current working directory.
# NOTE(review): 'sentiment' holds dicts and 'tokens'/'probs'/'topk' hold lists, which CSV
# stores as their string repr — confirm this is the intended on-disk format.
sgs.to_csv('sgs.csv')