In [1]:
import pandas as pd
import numpy as np

from nltk.stem import PorterStemmer

from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

import setup_modules
from lib.preprocessing import setup_enhance, ngram_enhance, process, load_stopwords

In [2]:
# Load the FOMC sentences-with-sentiment CSV; its first column becomes the index.
df = pd.read_csv(
    '../working-csvs/fomc-sents-w-sentiment.csv',
    index_col=0,
)

In [3]:
# Load the bigram/trigram resources, then run every sentence's text through
# ngram_enhance and store the result in a new 'enhanced' column.
bigrams, trigrams = setup_enhance(
    '../ngrams/bigrams',
    '../ngrams/trigrams',
)
df['enhanced'] = df['content'].map(
    lambda sentence: ngram_enhance(sentence, bigrams, trigrams)
)

In [4]:
# Tokenise each enhanced sentence with the project's `process` helper, passing it
# a Porter stemmer and the stopword list loaded from disk.
stopwords = load_stopwords('../stopwords/stopwords.txt')
stemmer = PorterStemmer()
df['tokens'] = df['enhanced'].map(
    lambda text: process(text, stemmer=stemmer, stopwords=stopwords)
)

In [5]:
# Load the saved LDA topic model and the gensim Dictionary stored next to it.
topic_model = LdaModel.load('../models/02-29/02-29lda')
model_dict = Dictionary.load('../models/02-29/02-29dict')
# Alternative model/dictionary pair, currently disabled:
# topic_model = LdaModel.load('../models/12-10lda')
# model_dict = Dictionary.load('../models/12-10dict')

In [6]:
# Turn each token list into a bag-of-words with model_dict, then index the LDA
# model with it to obtain that sentence's list of (topic_id, probability) pairs.
df['tprob_vect'] = df['tokens'].map(lambda x : topic_model[model_dict.doc2bow(x)])

In [7]:
def topk_topics(topic_vec, k=3):
    """Select the strongest topics from a list of (topic_id, probability) pairs.

    A topic is kept when its probability is at least the k-th largest
    probability in ``topic_vec`` AND at least 0.03. Ties at the cut-off are all
    kept, so more than ``k`` topics can be returned. Kept topics stay in their
    original order and their probabilities are rounded to 4 decimal places.

    Args:
        topic_vec: iterable of (topic_id, probability) pairs.
        k: rank whose probability serves as the cut-off. When ``topic_vec``
            holds fewer than ``k`` entries, the smallest probability present
            is used instead of raising IndexError.

    Returns:
        List of (topic_id, rounded_probability) tuples; empty when
        ``topic_vec`` is empty or no probability reaches 0.03.
    """
    topic_vec = list(topic_vec)
    if not topic_vec:
        # Nothing to rank; the original indexing would have raised IndexError.
        return []

    probs = sorted((prob for _, prob in topic_vec), reverse=True)
    # Clamp the rank so short vectors fall back to their weakest entry.
    cutoff_rank = min(k, len(probs))
    threshold = max(probs[cutoff_rank - 1], 0.03)

    return [(topic, round(prob, 4)) for topic, prob in topic_vec if prob >= threshold]

In [8]:
# Reduce each sentence's topic distribution to its strongest topics
# (selection rule lives in topk_topics, called here with k=3).
df['topk'] = df['tprob_vect'].map(lambda x : topk_topics(x, k=3))

In [9]:
# Spot-check: print 20 randomly sampled sentences next to their selected topics.
for i, row in df[['content', 'topk']].sample(20).iterrows():
    if len(row['topk']) == 0:
        # Skip sentences for which no topic was selected.
        continue
    print(row['content'])
    print(row['topk'])

Shipping rates are down 5 percent, and this is very general in the trucking industry.
[(17, 0.0536)]
On the national scene we have seen substantial improvements in financial markets, in surveys of consumer attitudes, and in a number of business indexes.
[(1, 0.0455), (21, 0.0361)]
I think President Lacker is actually thinking that that is  the definition we are using.
[(27, 0.0373)]
The funds rate will either stay the same, or we'll be shading it down.
[(44, 0.0338)]
One of the reasons I like to go late is that it gives me a chance to respond and it makes it harder for the other person to respond to my response.
[(12, 0.0304)]
As Bill said, we have pushed our financial institutions, the big ones, to build capital through SCAP and through CCAR earlier this year, so they are in substantially better shape because of our concerns about potential downgrades of a few U.S. institutions.
[(8, 0.043), (18, 0.0411), (35, 0.0643)]
Now, regarding more serious matters at home, the Twelfth District 

In [10]:
# Length of every stance vector; topic ids are used as indices into it.
# NOTE(review): presumably the number of topics in the loaded LDA model — confirm.
k = 45

def partial_stance_vect(topics, sent_label, num_topics=None):
    """Build a signed, weight-normalised stance vector for one sentence.

    Each selected topic receives ``sent_label * weight / sum(weights)``, so the
    non-zero entries sum to ``sent_label``. All other entries are 0.

    Args:
        topics: iterable of (topic_id, weight) pairs; every topic_id must be a
            valid index into the returned vector.
        sent_label: sentiment label of the sentence; converted with ``float``.
        num_topics: length of the returned vector. Defaults to the
            module-level ``k`` (looked up at call time).

    Returns:
        1-D float numpy array of length ``num_topics``. An all-zero vector is
        returned when ``topics`` is empty or its weights sum to zero, instead
        of dividing by zero.
    """
    size = k if num_topics is None else num_topics
    # Always a float array, so the dtype does not depend on the input.
    svect = np.zeros(size)

    topics = list(topics)
    total = sum(weight for _, weight in topics)
    if total == 0:
        # No weight mass to distribute; avoid a 0/0 division.
        return svect

    for topic, weight in topics:
        svect[topic] = float(sent_label) * float(weight) / total

    return svect

In [11]:
# Threshold that a sentence's `sent_prob` must strictly exceed to be kept
# when the frame is filtered.
min_sent_prob = 2/3

In [12]:
# Keep only sentences that (a) have at least one selected topic, (b) have a
# sentiment probability above min_sent_prob and (c) carry a non-zero sentiment
# label. Copy so that later column assignments do not touch `df`.
dff = df[
    (df['topk'].map(len) > 0)
    & (df['sent_prob'] > min_sent_prob)
    & (df['sent'] != 0)
].copy()

In [13]:
# Display the filtered sentence frame.
dff

Unnamed: 0,lname,date,section,content,sent_prob,sent,enhanced,tokens,tprob_vect,topk
5,barkin,2018-01-31,1,The Fifth District economy continues to expand...,0.938962,1,The Fifth District economy continues to expand...,"[district, economi, continu, expand, a, good, ...","[(0, 0.02903068), (1, 0.021786518), (2, 0.0217...","[(21, 0.0329)]"
7,barkin,2018-01-31,1,Manufacturers are particularly positive about ...,0.952982,1,Manufacturers are particularly positive about ...,"[manufactur, posit, term, report, strong, grow...","[(0, 0.022793414), (1, 0.02582223), (2, 0.0214...","[(21, 0.0425)]"
9,barkin,2018-01-31,1,I did find quite striking the Tealbook's stron...,0.828219,1,I did find quite striking the Tealbook's stron...,"[find, strike, tealbook, s, stronger, forecast...","[(0, 0.025066629), (1, 0.021126034), (2, 0.021...","[(10, 0.0581), (39, 0.0304)]"
12,barkin,2018-01-31,1,"Specifically, I fear we may see some near-term...",0.927251,-1,"Specifically, I fear we may see some near-term...","[specif, fear, term, distort, inflat, metric]","[(0, 0.020662986), (1, 0.020527827), (2, 0.021...","[(13, 0.0389)]"
16,barkin,2018-01-31,1,Recognizing that assessing the precise effect ...,0.767703,1,Recognizing that assessing the precise effect ...,"[recogn, assess, precis, effect, a, corpor, ta...","[(0, 0.019094575), (1, 0.021136412), (2, 0.020...","[(22, 0.0445), (33, 0.0342)]"
...,...,...,...,...,...,...,...,...,...,...
261907,yellen,2018-01-31,2,"Were you referring, Governor Brainard, to late...",0.852719,-1,"Were you referring, Governor Brainard, to late...","[refer, governor, brainard, late, inflat, comp...","[(0, 0.021121234), (1, 0.020942925), (2, 0.022...","[(4, 0.0495)]"
261914,yellen,2018-01-31,2,"As Thomas said, the largest downward move in 2...",0.930716,-1,"As Thomas said, the largest downward move in 2...","[thoma, largest, downward, move, basi, point, ...","[(0, 0.021952055), (1, 0.022569876), (2, 0.024...","[(44, 0.0313)]"
261945,yellen,2018-01-31,2,I have thoroughly enjoyed interacting with all...,0.763187,1,I have thoroughly enjoyed interacting with all...,"[enjoy, interact, terrif, staff, incred, honor...","[(0, 0.01981191), (1, 0.02292015), (2, 0.03894...","[(2, 0.0389)]"
261953,yellen,2018-01-31,2,I have heard considerable support during the g...,0.862407,1,I have heard considerable support during the g...,"[heard, consider, support, round, alt, b, writ...","[(0, 0.019650487), (1, 0.019794486), (2, 0.020...","[(4, 0.0559)]"


In [14]:
# Row-wise: build one stance vector per sentence from its selected topics
# ('topk') and its sentiment label ('sent').
dff['svect'] = dff.apply(lambda x : partial_stance_vect(x['topk'], x['sent']), axis=1)

In [15]:
# Display the frame again, now including the 'svect' column.
dff

Unnamed: 0,lname,date,section,content,sent_prob,sent,enhanced,tokens,tprob_vect,topk,svect
5,barkin,2018-01-31,1,The Fifth District economy continues to expand...,0.938962,1,The Fifth District economy continues to expand...,"[district, economi, continu, expand, a, good, ...","[(0, 0.02903068), (1, 0.021786518), (2, 0.0217...","[(21, 0.0329)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,barkin,2018-01-31,1,Manufacturers are particularly positive about ...,0.952982,1,Manufacturers are particularly positive about ...,"[manufactur, posit, term, report, strong, grow...","[(0, 0.022793414), (1, 0.02582223), (2, 0.0214...","[(21, 0.0425)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,barkin,2018-01-31,1,I did find quite striking the Tealbook's stron...,0.828219,1,I did find quite striking the Tealbook's stron...,"[find, strike, tealbook, s, stronger, forecast...","[(0, 0.025066629), (1, 0.021126034), (2, 0.021...","[(10, 0.0581), (39, 0.0304)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
12,barkin,2018-01-31,1,"Specifically, I fear we may see some near-term...",0.927251,-1,"Specifically, I fear we may see some near-term...","[specif, fear, term, distort, inflat, metric]","[(0, 0.020662986), (1, 0.020527827), (2, 0.021...","[(13, 0.0389)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
16,barkin,2018-01-31,1,Recognizing that assessing the precise effect ...,0.767703,1,Recognizing that assessing the precise effect ...,"[recogn, assess, precis, effect, a, corpor, ta...","[(0, 0.019094575), (1, 0.021136412), (2, 0.020...","[(22, 0.0445), (33, 0.0342)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...
261907,yellen,2018-01-31,2,"Were you referring, Governor Brainard, to late...",0.852719,-1,"Were you referring, Governor Brainard, to late...","[refer, governor, brainard, late, inflat, comp...","[(0, 0.021121234), (1, 0.020942925), (2, 0.022...","[(4, 0.0495)]","[0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0,..."
261914,yellen,2018-01-31,2,"As Thomas said, the largest downward move in 2...",0.930716,-1,"As Thomas said, the largest downward move in 2...","[thoma, largest, downward, move, basi, point, ...","[(0, 0.021952055), (1, 0.022569876), (2, 0.024...","[(44, 0.0313)]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
261945,yellen,2018-01-31,2,I have thoroughly enjoyed interacting with all...,0.763187,1,I have thoroughly enjoyed interacting with all...,"[enjoy, interact, terrif, staff, incred, honor...","[(0, 0.01981191), (1, 0.02292015), (2, 0.03894...","[(2, 0.0389)]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
261953,yellen,2018-01-31,2,I have heard considerable support during the g...,0.862407,1,I have heard considerable support during the g...,"[heard, consider, support, round, alt, b, writ...","[(0, 0.019650487), (1, 0.019794486), (2, 0.020...","[(4, 0.0559)]","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


In [16]:
# Load the FOMC metadata table (the columns used later include member, voter,
# region, female, chair, exp and Econ_PhD); the first CSV column becomes the index.
fomc = pd.read_csv('../working-csvs/fomc.csv', index_col=0)

In [17]:
# Parse the date column and derive year/month, which serve as merge keys.
fomc['date_pd'] = pd.to_datetime(fomc['date'])
fomc['year'] = fomc['date_pd'].dt.year
fomc['month'] = fomc['date_pd'].dt.month

In [18]:
# Same year/month derivation for the sentence frame; the date is stringified
# before parsing.
dff['date_pd'] = pd.to_datetime(dff['date'].map(str))
dff['year'] = dff['date_pd'].dt.year
dff['month'] = dff['date_pd'].dt.month

In [19]:
# Attach FOMC metadata to every sentence by matching on year, month and speaker
# name. A left join keeps sentences that have no metadata match.
# NOTE(review): if fomc holds more than one row per (year, month, member) this
# join duplicates sentences — confirm the key is unique.
fdff = dff.merge(
    fomc,
    how='left',
    left_on=['year', 'month', 'lname'],
    right_on=['year', 'month', 'member'],
)

In [20]:
# Collapse sentences to one row per (date, section, speaker):
#   - svect is summed across the speaker's sentences,
#   - sent becomes the number of sentences that went into the sum,
#   - the member attributes are carried over with max/first.
mfdff = (
    fdff
    .groupby(['date_x', 'section', 'lname'])[
        ['svect', 'voter', 'sent', 'region', 'female', 'chair', 'exp', 'Econ_PhD']
    ]
    .agg({
        'svect': 'sum',
        'voter': 'max',
        'sent': 'count',
        'region': 'first',
        'female': 'first',
        'chair': 'first',
        'exp': 'first',
        'Econ_PhD': 'max',
    })
    .reset_index()
)

In [21]:
# Inspect the aggregated rows for section 2 of the 1994-02-04 meeting.
mfdff[(mfdff['date_x'] == '1994-02-04') & (mfdff['section'] == 2)]

Unnamed: 0,date_x,section,lname,svect,voter,sent,region,female,chair,exp,Econ_PhD
16,1994-02-04,2,boehne,"[0.0, 0.390893463834136, 0.0, 0.0, 0.0, 0.0, 0...",0.0,2,Philadelphia,0.0,Greenspan,13.016438,1.0
17,1994-02-04,2,broaddus,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,1,Richmond,0.0,Greenspan,1.093151,1.0
18,1994-02-04,2,forrestal,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.35...",1.0,1,Atlanta,0.0,Greenspan,10.169863,0.0
19,1994-02-04,2,greenspan,"[-0.29740134824317155, -0.32435034602443663, 0...",1.0,21,governor,0.0,Greenspan,6.490411,1.0
20,1994-02-04,2,jordan,"[0.0, 0.0, 0.3787425226506579, 0.0, 0.0, 0.0, ...",1.0,1,Cleveland,0.0,Greenspan,1.906849,1.0
21,1994-02-04,2,keehn,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,1,Chicago,0.0,Greenspan,12.605479,0.0
22,1994-02-04,2,kelley,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,1,governor,0.0,Greenspan,6.70137,0.0
23,1994-02-04,2,laware,"[0.0, 0.0, 0.40378289131049094, 0.0, 0.0, 0.0,...",1.0,1,governor,0.0,Greenspan,5.476712,0.0
24,1994-02-04,2,lindsey,"[-1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.0,4,governor,0.0,Greenspan,2.194521,1.0
25,1994-02-04,2,mcdonough,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,1,New York,0.0,Greenspan,0.547945,0.0


In [32]:
# Drop rows without an `exp` value. Take an explicit copy: new columns are
# assigned to this frame afterwards, and assigning to a boolean-mask slice
# triggers pandas' SettingWithCopyWarning.
mfdff = mfdff[mfdff['exp'].notna()].copy()

In [33]:
# Broadcast to every row the 1/3 quantile of `exp` within its (date, section) group.
mfdff['exp_tertile'] = mfdff.groupby(['date_x', 'section'])['exp'].transform(lambda x : np.quantile(x, 1/3))

In [35]:
# Flag rows whose `exp` is strictly above their group's 1/3 quantile.
mfdff['expd'] = mfdff['exp'] > mfdff['exp_tertile']

In [38]:
import numpy as np

In [39]:
def array_group_std(x):
    """Component-wise standard deviation of a group of equal-length vectors.

    Intended for ``groupby(...).transform``: the vectors in ``x`` are stacked,
    the standard deviation is taken down the stack (axis 0), and that single
    result is repeated once per input row.

    Args:
        x: pandas Series whose values are equal-length array-likes.

    Returns:
        pandas Series with the same index as ``x``; every value is the same
        list of per-component standard deviations.
    """
    group_size = len(x)
    componentwise_std = np.stack(x).std(axis=0)
    # One identical row per group member so the result lines up with the input.
    repeated = np.tile(componentwise_std, (group_size, 1)).tolist()
    return pd.Series(repeated, index=x.index)

In [40]:
# Mean stance vector of each (date, section) group, broadcast to every row.
mfdff['date_section_mean'] = mfdff.groupby(['date_x', 'section'])['svect'].transform('mean')
# Per-component std of each group via array_group_std; np.nan_to_num is then
# mapped over every resulting list.
mfdff['date_section_std'] = mfdff.groupby(['date_x', 'section'])['svect'].transform(array_group_std).map(np.nan_to_num)

# Standardise each stance vector against its group's mean and std.
# NOTE(review): components whose group std is 0 divide by zero. np.nan_to_num
# turns the resulting nan into 0 but turns +/-inf into very large finite
# numbers — confirm inf cannot occur here or handle it explicitly.
mfdff['norm_svect'] = ((mfdff['svect'] - mfdff['date_section_mean']) / mfdff['date_section_std']).map(np.nan_to_num)

In [41]:
# Display the aggregated frame with the group statistics and normalised vectors.
mfdff

Unnamed: 0,date_x,section,lname,svect,voter,sent,region,female,chair,exp,Econ_PhD,exp_tertile,expd,date_section_mean,date_section_std,norm_svect
0,1994-02-04,1,boehne,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,2,Philadelphia,0.0,Greenspan,13.016438,1.0,2.347945,True,"[-0.22733416431632344, -0.13440852330556788, -...","[1.0786211275254163, 0.6449931984649045, 0.254...","[0.21076368570479964, 0.20838750489999366, 0.1..."
1,1994-02-04,1,broaddus,"[-0.09660027630591544, 1.3331039275764582, 0.0...",1.0,10,Richmond,0.0,Greenspan,1.093151,1.0,2.347945,False,"[-0.22733416431632344, -0.13440852330556788, -...","[1.0786211275254163, 0.6449931984649045, 0.254...","[0.12120464236625796, 2.2752370945534497, 0.17..."
2,1994-02-04,1,forrestal,"[1.1557700791714343, 0.0, 0.2708661502319228, ...",1.0,9,Atlanta,0.0,Greenspan,10.169863,0.0,2.347945,True,"[-0.22733416431632344, -0.13440852330556788, -...","[1.0786211275254163, 0.6449931984649045, 0.254...","[1.282289219256153, 0.20838750489999366, 1.241..."
3,1994-02-04,1,hoenig,"[0.0, 0.4742930543253638, 0.0, 0.0, 0.0, 0.0, ...",0.0,4,Kansas City,0.0,Greenspan,2.347945,1.0,2.347945,False,"[-0.22733416431632344, -0.13440852330556788, -...","[1.0786211275254163, 0.6449931984649045, 0.254...","[0.21076368570479964, 0.9437333278547005, 0.17..."
4,1994-02-04,1,jordan,"[0.0, -1.949719620815178, 0.0, 0.0, 0.0, 0.0, ...",1.0,5,Cleveland,0.0,Greenspan,1.906849,1.0,2.347945,False,"[-0.22733416431632344, -0.13440852330556788, -...","[1.0786211275254163, 0.6449931984649045, 0.254...","[0.21076368570479964, -2.814465488675048, 0.17..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5686,2018-12-19,2,mester,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0,...",1.0,6,Cleveland,1.0,Powell,4.553425,1.0,3.177169,True,"[-0.005976371107438679, 0.0, 0.022684629293502...","[0.13014479004772822, 0.0, 0.14884793828058998...","[0.045920940094850926, 0.0, -0.152401367163982..."
5687,2018-12-19,2,powell,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,1,governor,0.0,Powell,6.572603,0.0,3.177169,True,"[-0.005976371107438679, 0.0, 0.022684629293502...","[0.13014479004772822, 0.0, 0.14884793828058998...","[0.045920940094850926, 0.0, -0.152401367163982..."
5688,2018-12-19,2,quarles,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,4,governor,0.0,Powell,1.254795,0.0,3.177169,False,"[-0.005976371107438679, 0.0, 0.022684629293502...","[0.13014479004772822, 0.0, 0.14884793828058998...","[0.045920940094850926, 0.0, -0.152401367163982..."
5689,2018-12-19,2,rosengren,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.72...",0.0,5,Boston,0.0,Powell,11.424658,1.0,3.177169,True,"[-0.005976371107438679, 0.0, 0.022684629293502...","[0.13014479004772822, 0.0, 0.14884793828058998...","[0.045920940094850926, 0.0, -0.152401367163982..."


In [42]:
# Number of rows flagged `expd` in each (date, section) group, broadcast to every row.
mfdff['num_expd'] = mfdff.groupby(['date_x', 'section'])['expd'].transform('sum')

In [43]:
# alternative way of comparing expd to non-expd
mfdff['temp_norm_svect'] = mfdff['norm_svect'] * (mfdff['expd'] == True)
mfdff['total_norm_expd'] = mfdff.groupby(['date_x', 'section'])['temp_norm_svect'].transform('sum')
mfdff['avg_norm_expd'] = mfdff['total_norm_expd'] / mfdff['num_expd']
mfdff['diff_avg_norm_expd'] = mfdff['norm_svect'] - mfdff['avg_norm_expd']

In [44]:
# Same expd-only group average, computed on the raw (un-normalised) stance
# vectors: zero out non-expd rows, sum per group, divide by the expd count.
mfdff['temp_svect'] = mfdff['svect'] * mfdff['expd']
mfdff['total_expd'] = (
    mfdff
    .groupby(['date_x', 'section'])['temp_svect']
    .transform('sum')
)
mfdff['avg_expd'] = mfdff['total_expd'] / mfdff['num_expd']

In [45]:
# Deviation of each raw stance vector from its group's expd-only average,
# scaled by the group's per-component std; np.nan_to_num is mapped over the result.
mfdff['diff_exp_norm'] = ((mfdff['svect'] - mfdff['avg_expd']) / mfdff['date_section_std']).map(np.nan_to_num)

In [46]:
# Load the order table (later merged on date, section and lname; it supplies
# the 'order' column used for sorting). The first CSV column becomes the index.
odf = pd.read_csv('../working-csvs/order.csv', index_col=0)

In [47]:
# Display the frame with the expd-based comparison columns.
mfdff

Unnamed: 0,date_x,section,lname,svect,voter,sent,region,female,chair,exp,...,norm_svect,num_expd,temp_norm_svect,total_norm_expd,avg_norm_expd,diff_avg_norm_expd,temp_svect,total_expd,avg_expd,diff_exp_norm
0,1994-02-04,1,boehne,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,2,Philadelphia,0.0,Greenspan,13.016438,...,"[0.21076368570479964, 0.20838750489999366, 0.1...",10,"[0.21076368570479964, 0.20838750489999366, 0.1...","[-1.506960732525174, 0.01794486181385957, -1.0...","[-0.15069607325251738, 0.001794486181385957, -...","[0.361459758957317, 0.20659301871860772, 0.285...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-3.8987813276160646, -1.3325109192383469, -0....","[-0.38987813276160643, -0.1332510919238347, -0...","[0.36145975895731697, 0.20659301871860772, 0.2..."
1,1994-02-04,1,broaddus,"[-0.09660027630591544, 1.3331039275764582, 0.0...",1.0,10,Richmond,0.0,Greenspan,1.093151,...,"[0.12120464236625796, 2.2752370945534497, 0.17...",10,"[0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0,...","[-1.506960732525174, 0.01794486181385957, -1.0...","[-0.15069607325251738, 0.001794486181385957, -...","[0.2719007156187753, 2.2734426083720636, 0.285...","[-0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0...","[-3.8987813276160646, -1.3325109192383469, -0....","[-0.38987813276160643, -0.1332510919238347, -0...","[0.2719007156187753, 2.273442608372064, 0.2859..."
2,1994-02-04,1,forrestal,"[1.1557700791714343, 0.0, 0.2708661502319228, ...",1.0,9,Atlanta,0.0,Greenspan,10.169863,...,"[1.282289219256153, 0.20838750489999366, 1.241...",10,"[1.282289219256153, 0.20838750489999366, 1.241...","[-1.506960732525174, 0.01794486181385957, -1.0...","[-0.15069607325251738, 0.001794486181385957, -...","[1.4329852925086703, 0.20659301871860772, 1.34...","[1.1557700791714343, 0.0, 0.2708661502319228, ...","[-3.8987813276160646, -1.3325109192383469, -0....","[-0.38987813276160643, -0.1332510919238347, -0...","[1.4329852925086706, 0.20659301871860772, 1.34..."
3,1994-02-04,1,hoenig,"[0.0, 0.4742930543253638, 0.0, 0.0, 0.0, 0.0, ...",0.0,4,Kansas City,0.0,Greenspan,2.347945,...,"[0.21076368570479964, 0.9437333278547005, 0.17...",10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.506960732525174, 0.01794486181385957, -1.0...","[-0.15069607325251738, 0.001794486181385957, -...","[0.361459758957317, 0.9419388416733145, 0.2859...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-3.8987813276160646, -1.3325109192383469, -0....","[-0.38987813276160643, -0.1332510919238347, -0...","[0.36145975895731697, 0.9419388416733145, 0.28..."
4,1994-02-04,1,jordan,"[0.0, -1.949719620815178, 0.0, 0.0, 0.0, 0.0, ...",1.0,5,Cleveland,0.0,Greenspan,1.906849,...,"[0.21076368570479964, -2.814465488675048, 0.17...",10,"[0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[-1.506960732525174, 0.01794486181385957, -1.0...","[-0.15069607325251738, 0.001794486181385957, -...","[0.361459758957317, -2.8162599748564343, 0.285...","[0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[-3.8987813276160646, -1.3325109192383469, -0....","[-0.38987813276160643, -0.1332510919238347, -0...","[0.36145975895731697, -2.8162599748564343, 0.2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5686,2018-12-19,2,mester,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0,...",1.0,6,Cleveland,1.0,Powell,4.553425,...,"[0.045920940094850926, 0.0, -0.152401367163982...",10,"[0.045920940094850926, 0.0, -0.152401367163982...","[0.45920940094850926, 0.0, -0.8307049981363934...","[0.045920940094850926, 0.0, -0.083070499813639...","[0.0, 0.0, -0.06933086735034355, 0.0, -0.32029...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0,...","[0.0, 0.0, 0.1031975666430372, 0.0, 0.75972927...","[0.0, 0.0, 0.010319756664303719, 0.0, 0.075972...","[0.0, 0.0, -0.06933086735034362, 0.0, -0.32029..."
5687,2018-12-19,2,powell,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,1,governor,0.0,Powell,6.572603,...,"[0.045920940094850926, 0.0, -0.152401367163982...",10,"[0.045920940094850926, 0.0, -0.152401367163982...","[0.45920940094850926, 0.0, -0.8307049981363934...","[0.045920940094850926, 0.0, -0.083070499813639...","[0.0, 0.0, -0.06933086735034355, 0.0, -0.32029...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.1031975666430372, 0.0, 0.75972927...","[0.0, 0.0, 0.010319756664303719, 0.0, 0.075972...","[0.0, 0.0, -0.06933086735034362, 0.0, -0.32029..."
5688,2018-12-19,2,quarles,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,4,governor,0.0,Powell,1.254795,...,"[0.045920940094850926, 0.0, -0.152401367163982...",10,"[0.0, 0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0...","[0.45920940094850926, 0.0, -0.8307049981363934...","[0.045920940094850926, 0.0, -0.083070499813639...","[0.0, 0.0, -0.06933086735034355, 0.0, -0.32029...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.1031975666430372, 0.0, 0.75972927...","[0.0, 0.0, 0.010319756664303719, 0.0, 0.075972...","[0.0, 0.0, -0.06933086735034362, 0.0, -0.32029..."
5689,2018-12-19,2,rosengren,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.72...",0.0,5,Boston,0.0,Powell,11.424658,...,"[0.045920940094850926, 0.0, -0.152401367163982...",10,"[0.045920940094850926, 0.0, -0.152401367163982...","[0.45920940094850926, 0.0, -0.8307049981363934...","[0.045920940094850926, 0.0, -0.083070499813639...","[0.0, 0.0, -0.06933086735034355, 0.0, -0.32029...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.72...","[0.0, 0.0, 0.1031975666430372, 0.0, 0.75972927...","[0.0, 0.0, 0.010319756664303719, 0.0, 0.075972...","[0.0, 0.0, -0.06933086735034362, 0.0, -0.32029..."


In [48]:
# Rename the merge-suffixed date column back to 'date'. Reassign rather than
# mutate with inplace=True so the cell yields a fresh frame and is safe to re-run.
mfdff = mfdff.rename(columns={'date_x': 'date'})

In [49]:
# Attach each row's 'order' value (default inner join, so rows without a match
# in odf are dropped), then sort rows by that order within each (date, section).
mfdff = (
    mfdff
    .merge(odf, on=['date', 'section', 'lname'])
    .sort_values(['date', 'section', 'order'])
)

In [50]:
# 1-based position of each row within its (date, section) group, following the
# frame's current row order.
mfdff['numspoken'] = mfdff.groupby(['date', 'section']).cumcount() +1

In [51]:
# Running (cumulative) sum of norm_svect within each (date, section) group, in
# the frame's current row order.
# NOTE(review): re-aligning through the 'level_2' column assumes that
# groupby(...).apply returns a three-level index whose last, unnamed level is
# the original row label. This looks pandas-version dependent — confirm on the
# pinned pandas version.
mfdff['norm_svect_sum'] = mfdff.groupby(['date', 'section'])['norm_svect'].apply(np.cumsum).reset_index().set_index('level_2')['norm_svect']
# Running average: cumulative sum divided by the number of rows seen so far in the group.
mfdff['norm_svect_sum_avg'] = mfdff['norm_svect_sum'] / mfdff['numspoken']

In [52]:
# Running average of the rows that come earlier in the same (date, section)
# group: shift the running average down by one row within each group. The
# first row of every group has no predecessor and gets NaN.
mfdff['norm_svect_avg_prior'] = mfdff.groupby(['date', 'section'])['norm_svect_sum_avg'].shift(1)
# Each row's normalised stance vector minus the average of the rows before it.
mfdff['norm_svect_diff_avg_prior'] = mfdff['norm_svect'] - mfdff['norm_svect_avg_prior']

In [53]:
# Display the frame with the ordering and running-average columns.
mfdff

Unnamed: 0,date,section,lname,svect,voter,sent,region,female,chair,exp,...,temp_svect,total_expd,avg_expd,diff_exp_norm,order,numspoken,norm_svect_sum,norm_svect_sum_avg,norm_svect_avg_prior,norm_svect_diff_avg_prior
5,1994-02-04,1,keehn,"[0.0, -0.2916265693009405, 0.0, 0.0, 0.0, 0.0,...",0.0,17,Chicago,0.0,Greenspan,12.605479,...,"[0.0, -0.2916265693009405, 0.0, 0.0, 0.0, 0.0,...","[-3.8987813276160646, -1.3325109192383469, -0....","[-0.38987813276160643, -0.1332510919238347, -0...","[0.36145975895731697, -0.24554596506450346, 0....",2,1,"[0.21076368570479964, -0.24375147888311754, 0....","[0.21076368570479964, -0.24375147888311754, 0....",,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
12,1994-02-04,1,parry,"[-4.004581241919781, -0.59218863516712, -1.0, ...",1.0,18,San Francisco,0.0,Greenspan,8.005479,...,"[-4.004581241919781, -0.59218863516712, -1.0, ...","[-3.8987813276160646, -1.3325109192383469, -0....","[-0.38987813276160643, -0.1332510919238347, -0...","[-3.351225946640841, -0.7115385779812329, -3.6...",3,2,"[-3.291158334188559, -0.9534955706829644, -3.5...","[-1.6455791670942794, -0.4767477853414822, -1....","[0.21076368570479964, -0.24375147888311754, 0....","[-3.712685705598158, -0.46599261291672933, -3...."
10,1994-02-04,1,mcteer,"[0.6570798630865422, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,3,Dallas,0.0,Greenspan,3.010959,...,"[0.6570798630865422, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-3.8987813276160646, -1.3325109192383469, -0....","[-0.38987813276160643, -0.1332510919238347, -0...","[0.9706448067173415, 0.20659301871860772, 0.28...",4,3,"[-2.4712096007237347, -0.7451080657829707, -3....","[-0.8237365335745782, -0.24836935526099024, -1...","[-1.6455791670942794, -0.4767477853414822, -1....","[2.4655279005591035, 0.6851352902414759, 1.961..."
15,1994-02-04,1,syron,"[0.0, -0.49712230701709625, 0.0, 0.0, 0.0, 0.0...",0.0,7,Boston,0.0,Greenspan,5.095890,...,"[0.0, -0.49712230701709625, 0.0, 0.0, 0.0, 0.0...","[-3.8987813276160646, -1.3325109192383469, -0....","[-0.38987813276160643, -0.1332510919238347, -0...","[0.36145975895731697, -0.5641473676920651, 0.2...",5,4,"[-2.260445915018935, -1.3074609472936498, -3.2...","[-0.5651114787547338, -0.32686523682341245, -0...","[-0.8237365335745782, -0.24836935526099024, -1...","[1.0345002192793777, -0.31398352624968884, 1.3..."
0,1994-02-04,1,boehne,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,2,Philadelphia,0.0,Greenspan,13.016438,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-3.8987813276160646, -1.3325109192383469, -0....","[-0.38987813276160643, -0.1332510919238347, -0...","[0.36145975895731697, 0.20659301871860772, 0.2...",6,5,"[-2.0496822293141355, -1.0990734423936561, -3....","[-0.40993644586282707, -0.21981468847873123, -...","[-0.5651114787547338, -0.32686523682341245, -0...","[0.7758751644595334, 0.5352527417234061, 0.980..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5655,2018-12-19,2,george,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,3,Kansas City,1.0,Powell,7.221918,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.1031975666430372, 0.0, 0.75972927...","[0.0, 0.0, 0.010319756664303719, 0.0, 0.075972...","[0.0, 0.0, -0.06933086735034362, 0.0, -0.32029...",12,11,"[2.8805025605971695, 0.0, -2.2441710436488416,...","[0.2618638691451972, 0.0, -0.20401554942262196...","[0.28345816205023183, 0.0, -0.2091769676484858...","[-0.2375372219553809, 0.0, 0.05677560048450297..."
5661,2018-12-19,2,quarles,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,4,governor,0.0,Powell,1.254795,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.1031975666430372, 0.0, 0.75972927...","[0.0, 0.0, 0.010319756664303719, 0.0, 0.075972...","[0.0, 0.0, -0.06933086735034362, 0.0, -0.32029...",13,12,"[2.9264235006920205, 0.0, -2.3965724108128246,...","[0.24386862505766838, 0.0, -0.1997143675677354...","[0.2618638691451972, 0.0, -0.20401554942262196...","[-0.2159429290503463, 0.0, 0.05161418225863906..."
5658,2018-12-19,2,kashkari,"[-0.3987878854106168, 0.0, 0.0, 0.0, -0.520186...",0.0,6,Minneapolis,0.0,Powell,2.967123,...,"[-0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.1031975666430372, 0.0, 0.75972927...","[0.0, 0.0, 0.010319756664303719, 0.0, 0.075972...","[-3.0641863209765727, 0.0, -0.0693308673503436...",14,13,"[-0.09184188018970119, 0.0, -2.548973777976807...","[-0.007064760014592399, 0.0, -0.19607490599821...","[0.24386862505766838, 0.0, -0.1997143675677354...","[-3.26213400593939, 0.0, 0.0473130004037525, 0..."
5663,2018-12-19,2,williams,"[0.0, 0.0, 0.42477875017013805, 0.0, 0.7597292...",1.0,24,New York,0.0,Powell,7.805479,...,"[0.0, 0.0, 0.42477875017013805, 0.0, 0.7597292...","[0.0, 0.0, 0.1031975666430372, 0.0, 0.75972927...","[0.0, 0.0, 0.010319756664303719, 0.0, 0.075972...","[0.0, 0.0, 2.7844456449544284, 0.0, 2.88261628...",16,14,"[-0.04592094009485026, 0.0, 0.1524013671639816...","[-0.0032800671496321615, 0.0, 0.01088581194028...","[-0.007064760014592399, 0.0, -0.19607490599821...","[0.052985700109443326, 0.0, 2.897450051139005,..."


In [54]:
# Flag rows backed by at least 3 sentences ('sent' holds the per-row sentence
# count produced by the earlier aggregation).
mfdff['use'] = mfdff['sent'] >= 3

In [55]:
# Persist the final frame as a pickle. The CSV export on the next line is left disabled.
# mfdff.to_csv('../working-csvs/mfdff.csv')
mfdff.to_pickle('../working-csvs/mfdff.pkl')