In [41]:
import pandas as pd
import numpy as np

from nltk.stem import PorterStemmer

from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

import setup_modules
from lib.preprocessing import setup_enhance, ngram_enhance, process, load_stopwords

In [42]:
# Load the sentence-level table; the first CSV column becomes the index.
# Later cells read its 'content', 'sent', 'sent_prob', 'date', 'section' and 'lname' columns.
df = pd.read_csv('../working-csvs/fomc-sents-w-sentiment.csv', index_col=0)

In [43]:
# Load the bigram/trigram resources once, then rewrite every sentence with them.
bigrams, trigrams = setup_enhance('../ngrams/bigrams', '../ngrams/trigrams')


def _enhance_sentence(text):
    """Run ngram_enhance on one sentence with the loaded n-gram resources."""
    return ngram_enhance(text, bigrams, trigrams)


df['enhanced'] = df['content'].map(_enhance_sentence)

In [44]:
# Stop-word list and stemmer are created once and shared by every call to process().
stopwords = load_stopwords('../stopwords/stopwords.txt')
stemmer = PorterStemmer()


def _tokenize(text):
    """Run process() on one enhanced sentence with the shared stemmer and stop words."""
    return process(text, stemmer=stemmer, stopwords=stopwords)


df['tokens'] = df['enhanced'].map(_tokenize)

In [45]:
# Load the saved LDA model and the dictionary stored next to it ('02-29').
# The dictionary is what converts token lists to bag-of-words input for the model.
topic_model = LdaModel.load('../models/02-29/02-29lda')
model_dict = Dictionary.load('../models/02-29/02-29dict')
# Alternative model/dictionary pair ('12-10'), currently disabled:
# topic_model = LdaModel.load('../models/12-10lda')
# model_dict = Dictionary.load('../models/12-10dict')

In [46]:
def _topic_distribution(tokens):
    """Convert one token list to bag-of-words and query the LDA model with it."""
    return topic_model[model_dict.doc2bow(tokens)]


# One topic vector per sentence, consumed downstream as (topic_id, probability) pairs.
df['tprob_vect'] = df['tokens'].map(_topic_distribution)

In [47]:
def topk_topics(topic_vec, k=3, min_prob=0.03):
    """Select the strongest topics from one topic-probability vector.

    Parameters
    ----------
    topic_vec : iterable of (topic_id, probability) pairs
        May be sparse, i.e. contain fewer than ``k`` entries or none at all.
    k : int, default 3
        Number of top-ranked topics to keep. Entries tied with the k-th
        largest probability are all kept, so more than ``k`` pairs can be
        returned.
    min_prob : float, default 0.03
        Absolute floor: topics below this probability are dropped even when
        they rank inside the top ``k``.

    Returns
    -------
    list of (topic_id, probability)
        Surviving pairs in their original order, probabilities rounded to
        4 decimals. Empty when nothing qualifies or ``k`` is below 1.
    """
    pairs = list(topic_vec)
    if k < 1 or not pairs:
        return []

    ranked = sorted((prob for _, prob in pairs), reverse=True)
    # A sparse vector may hold fewer than k entries; fall back to the smallest
    # probability present instead of indexing past the end of the list.
    kth_largest = ranked[min(k, len(ranked)) - 1]
    cutoff = max(kth_largest, min_prob)

    return [(topic, round(prob, 4)) for topic, prob in pairs if prob >= cutoff]

In [48]:
def _top_three(topic_vec):
    """Keep the three strongest topics of one sentence."""
    return topk_topics(topic_vec, k=3)


df['topk'] = df['tprob_vect'].map(_top_three)

In [49]:
# Spot-check: draw 20 random sentences and print those that kept at least one topic.
preview = df.sample(20)[['content', 'topk']]
for _, sampled in preview.iterrows():
    if len(sampled['topk']) == 0:
        continue
    print(sampled['content'])
    print(sampled['topk'])

Indeed, based on the latest data, I have raised my forecast for Q4-over-Q4 GDP growth this year to 2.4 percent.
[(9, 0.0409)]
And as I said earlier, we could see a substantial rise in inflation if the normal historical relationship between unemployment and inflation were to reassert itself.
[(30, 0.0307)]
Similarly, under appropriate monetary policy, my modal outlook is for the unemployment rate to fall below the natural rate, which I currently gauge to be 5 percent, for some period of time.
[(6, 0.0341), (10, 0.0467), (23, 0.0363)]
I strongly support adding the list of economic conditions to spell out, to some extent, the nature of our reaction function.
[(23, 0.0708)]
I do think we should keep a wary eye on deflation.
[(33, 0.0314)]
The bankers reported an increase in business sentiment among their customers, but a national labor leader on our Cincinnati Branch board noted that the employers they deal with are now less optimistic.
[(1, 0.071), (17, 0.0324)]
And finally, on President 

In [50]:
# Default length of a stance vector (presumably the LDA model's topic count —
# confirm against the loaded model).
k = 45


def partial_stance_vect(topics, sent_label, num_topics=None):
    """Spread one sentence's sentiment label over its topics.

    Parameters
    ----------
    topics : iterable of (topic_id, weight)
        Topics kept for the sentence. Weights are normalised by their sum, so
        the absolute values of the result add up to ``abs(sent_label)``.
    sent_label : number
        Sentiment label of the sentence; converted with ``float()``.
    num_topics : int, optional
        Length of the returned vector. Defaults to the module-level ``k``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``num_topics``; entry ``topic_id`` holds
        ``sent_label * weight / sum(weights)`` and every other entry is 0.
        An empty ``topics`` (or weights summing to 0) yields all zeros.
    """
    size = k if num_topics is None else num_topics
    # Always allocate floats so the dtype does not depend on whether any topic was written.
    svect = np.zeros(size, dtype=float)

    pairs = list(topics)
    total = sum(weight for _, weight in pairs)
    if total == 0:
        # Nothing to distribute; also avoids dividing by zero.
        return svect

    for topic, weight in pairs:
        svect[topic] = float(sent_label) * float(weight) / total

    return svect

In [51]:
# Cut-off for the 'sent_prob' column (presumably the sentiment classifier's confidence — confirm).
min_sent_prob = 0.5

In [52]:
# Keep sentences that (a) retained at least one topic, (b) have 'sent_prob' above
# the configured min_sent_prob, and (c) carry a non-zero sentiment label.
has_topics = df['topk'].map(len) > 0
confident = df['sent_prob'] > min_sent_prob  # use the named threshold, not a repeated literal
polar = df['sent'] != 0
dff = df[has_topics & confident & polar].copy()

In [53]:
# One signed, topic-weighted stance vector per remaining sentence.
dff['svect'] = pd.Series(
    [partial_stance_vect(topics, label) for topics, label in zip(dff['topk'], dff['sent'])],
    index=dff.index,
)

In [54]:
# Spot-check: distinct 'lname' values with rows in section 1 on 1994-02-04.
dff[(dff['date'] == '1994-02-04') & (dff['section'] == 1)]['lname'].unique()

array(['boehne', 'broaddus', 'forrestal', 'hoenig', 'jordan', 'keehn',
       'kelley', 'laware', 'lindsey', 'mcdonough', 'mcteer', 'melzer',
       'parry', 'phillips', 'stern', 'syron'], dtype=object)

In [55]:
# Load the member table; later cells parse its 'date' column and join on its 'member' column.
fomc = pd.read_csv('../working-csvs/fomc.csv', index_col=0)

In [56]:
# Parse the date column once, then derive the calendar keys used for the join.
fomc_dates = pd.to_datetime(fomc['date'])
fomc['date_pd'] = fomc_dates
fomc['year'] = fomc_dates.dt.year
fomc['month'] = fomc_dates.dt.month

In [57]:
# Same calendar keys for the sentence table; dates are stringified before parsing.
sentence_dates = pd.to_datetime(dff['date'].map(str))
dff['date_pd'] = sentence_dates
dff['year'] = sentence_dates.dt.year
dff['month'] = sentence_dates.dt.month

In [58]:
# Attach member attributes by calendar year, month and last name; how='left' keeps every sentence row.
# Both frames carry a 'date' column, so the sentence-side one is exposed as 'date_x' afterwards.
# NOTE(review): if fomc holds more than one row for a (year, month, member) key, the matching
# sentence rows are duplicated by this merge — confirm the key is unique.
fdff = dff.merge(fomc, how='left', left_on=['year', 'month', 'lname'], right_on=['year', 'month', 'member'])

In [59]:
# Collapse sentences to one row per (date_x, section, lname):
# stance vectors are summed and 'sent' becomes the number of sentences.
speaker_keys = ['date_x', 'section', 'lname']
speaker_aggs = {
    'svect': 'sum',
    'voter': 'max',
    'sent': 'count',
    'region': 'first',
    'female': 'first',
    'chair': 'first',
    'exp': 'first',
    'Econ_PhD': 'max',
}
mfdff = (
    fdff.groupby(speaker_keys)[list(speaker_aggs)]
    .agg(speaker_aggs)
    .reset_index()
)

In [60]:
# Spot-check the aggregated rows for section 2 on 1994-02-04.
mfdff[(mfdff['date_x'] == '1994-02-04') & (mfdff['section'] == 2)]

Unnamed: 0,date_x,section,lname,svect,voter,sent,region,female,chair,exp,Econ_PhD
16,1994-02-04,2,boehne,"[0.0, 0.390893463834136, -1.0, 0.0, 0.0, 0.0, ...",0.0,3,Philadelphia,0.0,Greenspan,13.016438,1.0
17,1994-02-04,2,broaddus,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,2,Richmond,0.0,Greenspan,1.093151,1.0
18,1994-02-04,2,forrestal,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.35...",1.0,1,Atlanta,0.0,Greenspan,10.169863,0.0
19,1994-02-04,2,greenspan,"[-0.29740134824317155, -0.32435034602443663, 0...",1.0,29,governor,0.0,Greenspan,6.490411,1.0
20,1994-02-04,2,jordan,"[0.0, 0.0, 0.3787425226506579, 0.0, 0.0, 0.0, ...",1.0,2,Cleveland,0.0,Greenspan,1.906849,1.0
21,1994-02-04,2,keehn,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,2,Chicago,0.0,Greenspan,12.605479,0.0
22,1994-02-04,2,kelley,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,1,governor,0.0,Greenspan,6.70137,0.0
23,1994-02-04,2,laware,"[0.0, 0.0, -0.2089830523077163, 0.0, 0.0, 0.0,...",1.0,3,governor,0.0,Greenspan,5.476712,0.0
24,1994-02-04,2,lindsey,"[-1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.0,4,governor,0.0,Greenspan,2.194521,1.0
25,1994-02-04,2,mcdonough,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,1,New York,0.0,Greenspan,0.547945,0.0


In [61]:
# For each (date_x, section) group, compute the 1/3 quantile of 'exp' and repeat it on every row of the group.
mfdff['exp_tertile'] = mfdff.groupby(['date_x', 'section'])['exp'].transform(lambda x : np.quantile(x, 1/3))

In [62]:
# Flag rows whose 'exp' is strictly above the group's 1/3 quantile; rows equal to the cut-off are False.
mfdff['expd'] = mfdff['exp'] > mfdff['exp_tertile']

In [64]:
# Inspect the frame after adding 'exp_tertile' and 'expd'.
mfdff

Unnamed: 0,date_x,section,lname,svect,voter,sent,region,female,chair,exp,Econ_PhD,exp_tertile,expd
0,1994-02-04,1,boehne,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,3,Philadelphia,0.0,Greenspan,13.016438,1.0,2.347945,True
1,1994-02-04,1,broaddus,"[-0.09660027630591544, 1.3331039275764582, 0.0...",1.0,12,Richmond,0.0,Greenspan,1.093151,1.0,2.347945,False
2,1994-02-04,1,forrestal,"[1.1557700791714343, 0.0, 0.2708661502319228, ...",1.0,11,Atlanta,0.0,Greenspan,10.169863,0.0,2.347945,True
3,1994-02-04,1,hoenig,"[-0.40276034193397064, 0.4742930543253638, 0.0...",0.0,6,Kansas City,0.0,Greenspan,2.347945,1.0,2.347945,False
4,1994-02-04,1,jordan,"[0.0, -2.949719620815178, 0.0, 0.0, 0.0, 0.0, ...",1.0,10,Cleveland,0.0,Greenspan,1.906849,1.0,2.347945,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5916,2018-12-19,2,mester,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0,...",1.0,9,Cleveland,1.0,Powell,4.553425,1.0,3.177169,True
5917,2018-12-19,2,powell,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,1,governor,0.0,Powell,6.572603,0.0,3.177169,True
5918,2018-12-19,2,quarles,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0,...",1.0,5,governor,0.0,Powell,1.254795,0.0,3.177169,False
5919,2018-12-19,2,rosengren,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.72...",0.0,6,Boston,0.0,Powell,11.424658,1.0,3.177169,True


In [65]:
import numpy as np

In [66]:
def array_group_std(x):
    """Repeat a group's component-wise standard deviation on each of its rows.

    Parameters
    ----------
    x : pandas.Series
        Series whose values are equal-length numeric arrays (one group).

    Returns
    -------
    pandas.Series
        Same index as ``x``; every entry is the same list of floats holding
        the per-component standard deviation (``np.std`` defaults) of the group.
    """
    component_std = np.stack(x).std(axis=0)
    repeated = [component_std.tolist() for _ in range(len(x))]
    return pd.Series(repeated, index=x.index)

In [87]:
# NOTE(review): the saved output of this cell lists 'date_section_mean', 'date_section_std' and
# 'norm_svect', which are only assigned in a later cell; on a fresh top-to-bottom run these
# columns would not exist yet at this point.
mfdff[0:2]

Unnamed: 0,date_x,section,lname,svect,voter,sent,region,female,chair,exp,Econ_PhD,exp_tertile,expd,date_section_mean,date_section_std,norm_svect
0,1994-02-04,1,boehne,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,3,Philadelphia,0.0,Greenspan,13.016438,1.0,2.347945,True,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[0.23429682879206745, 0.33351604423676745, 0.1..."
1,1994-02-04,1,broaddus,"[-0.09660027630591544, 1.3331039275764582, 0.0...",1.0,12,Richmond,0.0,Greenspan,1.093151,1.0,2.347945,False,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[0.14466301043467458, 1.7885303967924822, 0.13..."


In [83]:
# Scratch check of both group statistics on the first two rows only.
# Only a cell's last expression is displayed, so the 'mean' result on the first line is discarded.
mfdff[0:2].groupby(['date_x','section'])['svect'].transform('mean')
mfdff[0:2].groupby(['date_x','section'])['svect'].transform(array_group_std)

0    [0.04830013815295772, 0.6665519637882291, 0.0,...
1    [0.04830013815295772, 0.6665519637882291, 0.0,...
Name: svect, dtype: object

In [67]:
# Mean of the 'svect' arrays within each (date_x, section) group, repeated on every row of the group.
mfdff['date_section_mean'] = mfdff.groupby(['date_x', 'section'])['svect'].transform('mean')
# Component-wise std of the group; array_group_std returns lists, and np.nan_to_num is mapped over each entry.
mfdff['date_section_std'] = mfdff.groupby(['date_x', 'section'])['svect'].transform(array_group_std).map(np.nan_to_num)

# Standardise each row's vector against its group. Components with a zero std divide by zero;
# np.nan_to_num replaces the resulting NaN values with 0.
mfdff['norm_svect'] = ((mfdff['svect'] - mfdff['date_section_mean']) / mfdff['date_section_std']).map(np.nan_to_num)

In [88]:
# Inspect the frame with the group mean, group std and normalised vectors attached.
mfdff

Unnamed: 0,date_x,section,lname,svect,voter,sent,region,female,chair,exp,Econ_PhD,exp_tertile,expd,date_section_mean,date_section_std,norm_svect
0,1994-02-04,1,boehne,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,3,Philadelphia,0.0,Greenspan,13.016438,1.0,2.347945,True,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[0.23429682879206745, 0.33351604423676745, 0.1..."
1,1994-02-04,1,broaddus,"[-0.09660027630591544, 1.3331039275764582, 0.0...",1.0,12,Richmond,0.0,Greenspan,1.093151,1.0,2.347945,False,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[0.14466301043467458, 1.7885303967924822, 0.13..."
2,1994-02-04,1,forrestal,"[1.1557700791714343, 0.0, 0.2708661502319228, ...",1.0,11,Atlanta,0.0,Greenspan,10.169863,0.0,2.347945,True,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[1.3067170050169474, 0.33351604423676745, 1.68..."
3,1994-02-04,1,hoenig,"[-0.40276034193397064, 0.4742930543253638, 0.0...",0.0,6,Kansas City,0.0,Greenspan,2.347945,1.0,2.347945,False,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[-0.13941791314247798, 0.8511825120083222, 0.1..."
4,1994-02-04,1,jordan,"[0.0, -2.949719620815178, 0.0, 0.0, 0.0, 0.0, ...",1.0,10,Cleveland,0.0,Greenspan,1.906849,1.0,2.347945,False,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[0.23429682879206745, -2.8859511672234803, 0.1..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5916,2018-12-19,2,mester,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0,...",1.0,9,Cleveland,1.0,Powell,4.553425,1.0,3.177169,True,"[-0.005976371107438679, 0.0, 0.095576302724765...","[0.13014479004772822, 0.0, 0.20652251719858394...","[0.045920940094850926, 0.0, -0.462788774905658..."
5917,2018-12-19,2,powell,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,1,governor,0.0,Powell,6.572603,0.0,3.177169,True,"[-0.005976371107438679, 0.0, 0.095576302724765...","[0.13014479004772822, 0.0, 0.20652251719858394...","[0.045920940094850926, 0.0, -0.462788774905658..."
5918,2018-12-19,2,quarles,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0,...",1.0,5,governor,0.0,Powell,1.254795,0.0,3.177169,False,"[-0.005976371107438679, 0.0, 0.095576302724765...","[0.13014479004772822, 0.0, 0.20652251719858394...","[0.045920940094850926, 0.0, -0.462788774905658..."
5919,2018-12-19,2,rosengren,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.72...",0.0,6,Boston,0.0,Powell,11.424658,1.0,3.177169,True,"[-0.005976371107438679, 0.0, 0.095576302724765...","[0.13014479004772822, 0.0, 0.20652251719858394...","[0.045920940094850926, 0.0, -0.462788774905658..."


In [89]:
# Average stance vector of the rows flagged 'expd' within each (date_x, section) group.
meeting_keys = ['date_x', 'section']
# Multiplying by the boolean flag zeroes the vectors of rows where 'expd' is False.
mfdff['temp_svect'] = mfdff['svect'] * mfdff['expd']
mfdff['total_expd'] = mfdff.groupby(meeting_keys)['temp_svect'].transform('sum')
mfdff['num_expd'] = mfdff.groupby(meeting_keys)['expd'].transform('sum')
mfdff['avg_expd'] = mfdff['total_expd'] / mfdff['num_expd']
mfdff['avg_expd']

0       [-0.38987813276160643, -0.30711254768631074, -...
1       [-0.38987813276160643, -0.30711254768631074, -...
2       [-0.38987813276160643, -0.30711254768631074, -...
3       [-0.38987813276160643, -0.30711254768631074, -...
4       [-0.38987813276160643, -0.30711254768631074, -...
                              ...                        
5916    [0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...
5917    [0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...
5918    [0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...
5919    [0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...
5920    [0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...
Name: avg_expd, Length: 5921, dtype: object

In [90]:
# Gap between each row's vector and the 'expd' average, scaled by the group std;
# np.nan_to_num is mapped over the result to clean up NaNs from the division.
gap_to_experienced = mfdff['svect'] - mfdff['avg_expd']
scaled_gap = gap_to_experienced / mfdff['date_section_std']
mfdff['diff_exp_norm'] = scaled_gap.map(np.nan_to_num)

In [30]:
# Load the order table; it is merged on 'date', 'section' and 'lname' and supplies the 'order' column used for sorting.
# NOTE(review): this cell's execution count is lower than its neighbours', i.e. it was run earlier in the session.
odf = pd.read_csv('../working-csvs/order.csv', index_col=0)

In [91]:
# Inspect the frame with the 'expd'-average columns attached.
mfdff

Unnamed: 0,date_x,section,lname,svect,voter,sent,region,female,chair,exp,...,exp_tertile,expd,date_section_mean,date_section_std,norm_svect,temp_svect,total_expd,num_expd,avg_expd,diff_exp_norm
0,1994-02-04,1,boehne,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,3,Philadelphia,0.0,Greenspan,13.016438,...,2.347945,True,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[0.23429682879206745, 0.33351604423676745, 0.1...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-3.8987813276160646, -3.0711254768631076, -0....",10,"[-0.38987813276160643, -0.30711254768631074, -...","[0.36176155048257724, 0.3351975457351617, 0.21..."
1,1994-02-04,1,broaddus,"[-0.09660027630591544, 1.3331039275764582, 0.0...",1.0,12,Richmond,0.0,Greenspan,1.093151,...,2.347945,False,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[0.14466301043467458, 1.7885303967924822, 0.13...","[-0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0...","[-3.8987813276160646, -3.0711254768631076, -0....",10,"[-0.38987813276160643, -0.30711254768631074, -...","[0.2721277321251843, 1.7902118982908766, 0.218..."
2,1994-02-04,1,forrestal,"[1.1557700791714343, 0.0, 0.2708661502319228, ...",1.0,11,Atlanta,0.0,Greenspan,10.169863,...,2.347945,True,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[1.3067170050169474, 0.33351604423676745, 1.68...","[1.1557700791714343, 0.0, 0.2708661502319228, ...","[-3.8987813276160646, -3.0711254768631076, -0....",10,"[-0.38987813276160643, -0.30711254768631074, -...","[1.4341817267074572, 0.3351975457351617, 1.764..."
3,1994-02-04,1,hoenig,"[-0.40276034193397064, 0.4742930543253638, 0.0...",0.0,6,Kansas City,0.0,Greenspan,2.347945,...,2.347945,False,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[-0.13941791314247798, 0.8511825120083222, 0.1...","[-0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[-3.8987813276160646, -3.0711254768631076, -0....",10,"[-0.38987813276160643, -0.30711254768631074, -...","[-0.011953191451968196, 0.8528640135067164, 0...."
4,1994-02-04,1,jordan,"[0.0, -2.949719620815178, 0.0, 0.0, 0.0, 0.0, ...",1.0,10,Cleveland,0.0,Greenspan,1.906849,...,2.347945,False,"[-0.25250668568719664, -0.30557193315711545, -...","[1.077721311846222, 0.9162135928314917, 0.1751...","[0.23429682879206745, -2.8859511672234803, 0.1...","[0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[-3.8987813276160646, -3.0711254768631076, -0....",10,"[-0.38987813276160643, -0.30711254768631074, -...","[0.36176155048257724, -2.884269665725087, 0.21..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5916,2018-12-19,2,mester,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0,...",1.0,9,Cleveland,1.0,Powell,4.553425,...,3.177169,True,"[-0.005976371107438679, 0.0, 0.095576302724765...","[0.13014479004772822, 0.0, 0.20652251719858394...","[0.045920940094850926, 0.0, -0.462788774905658...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0,...","[0.0, 0.0, 0.7201020834859702, 0.0, 1.09834313...",10,"[0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...","[0.0, 0.0, -0.34867969519931247, 0.0, -0.43871..."
5917,2018-12-19,2,powell,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,1,governor,0.0,Powell,6.572603,...,3.177169,True,"[-0.005976371107438679, 0.0, 0.095576302724765...","[0.13014479004772822, 0.0, 0.20652251719858394...","[0.045920940094850926, 0.0, -0.462788774905658...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.7201020834859702, 0.0, 1.09834313...",10,"[0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...","[0.0, 0.0, -0.34867969519931247, 0.0, -0.43871..."
5918,2018-12-19,2,quarles,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0,...",1.0,5,governor,0.0,Powell,1.254795,...,3.177169,False,"[-0.005976371107438679, 0.0, 0.095576302724765...","[0.13014479004772822, 0.0, 0.20652251719858394...","[0.045920940094850926, 0.0, -0.462788774905658...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0,...","[0.0, 0.0, 0.7201020834859702, 0.0, 1.09834313...",10,"[0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...","[0.0, 0.0, -0.34867969519931247, 0.0, -0.43871..."
5919,2018-12-19,2,rosengren,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.72...",0.0,6,Boston,0.0,Powell,11.424658,...,3.177169,True,"[-0.005976371107438679, 0.0, 0.095576302724765...","[0.13014479004772822, 0.0, 0.20652251719858394...","[0.045920940094850926, 0.0, -0.462788774905658...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.72...","[0.0, 0.0, 0.7201020834859702, 0.0, 1.09834313...",10,"[0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...","[0.0, 0.0, -0.34867969519931247, 0.0, -0.43871..."


In [92]:
# Use the plain 'date' name so the frame can be merged with the order table on 'date'.
mfdff = mfdff.rename(columns={'date_x': 'date'})

In [93]:
# Join the order table (merge's default join type), then sort rows by 'order' within each date and section.
with_order = mfdff.merge(odf, on=['date', 'section', 'lname'])
mfdff = with_order.sort_values(['date', 'section', 'order'])

In [94]:
# 1-based position of each row within its (date, section) group, following the current row order.
mfdff['numspoken'] = mfdff.groupby(['date', 'section']).cumcount() +1

In [96]:
def _running_sum(vectors):
    """Cumulative element-wise sum of a Series of equal-length arrays, keeping its index."""
    stacked = np.stack(vectors.to_numpy())
    return pd.Series(list(np.cumsum(stacked, axis=0)), index=vectors.index)


# Running total of 'norm_svect' down the rows of each (date, section) group.
# group_keys=False keeps the original row labels on the result, so the assignment
# aligns by index instead of depending on an auto-generated 'level_2' column from
# reset_index(), which only exists when groupby.apply prepends the group keys.
mfdff['norm_svect_sum'] = (
    mfdff.groupby(['date', 'section'], group_keys=False)['norm_svect'].apply(_running_sum)
)
# Running average: cumulative sum divided by the 1-based position in the group.
mfdff['norm_svect_sum_avg'] = mfdff['norm_svect_sum'] / mfdff['numspoken']

In [97]:
# NOTE: this bare expression is not the cell's last statement, so nothing is displayed for it.
mfdff[['norm_svect', 'norm_svect_sum', 'norm_svect_sum_avg','numspoken']]
# Running average of the preceding rows in the same (date, section) group: the previous row's
# 'norm_svect_sum_avg'. The first row of each group has no predecessor and gets NaN.
mfdff['norm_svect_avg_prior'] = mfdff.groupby(['date', 'section'])['norm_svect_sum_avg'].shift(1)
# Gap between the current row's vector and that prior average (all-NaN for each group's first row).
mfdff['norm_svect_diff_avg_prior'] = mfdff['norm_svect'] - mfdff['norm_svect_avg_prior']

In [98]:
# Inspect the frame with the order-based running-average columns attached.
mfdff

Unnamed: 0,date,section,lname,svect,voter,sent,region,female,chair,exp,...,total_expd,num_expd,avg_expd,diff_exp_norm,order,numspoken,norm_svect_sum,norm_svect_sum_avg,norm_svect_avg_prior,norm_svect_diff_avg_prior
5,1994-02-04,1,keehn,"[0.0, -1.5944303132803317, 0.0, 0.0, 0.0, 0.0,...",0.0,19,Chicago,0.0,Greenspan,12.605479,...,"[-3.8987813276160646, -3.0711254768631076, -0....",10,"[-0.38987813276160643, -0.30711254768631074, -...","[0.36176155048257724, -1.4050411123192994, 0.2...",2,1,"[0.23429682879206745, -1.4067226138176938, 0.1...","[0.23429682879206745, -1.4067226138176938, 0.1...",,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
12,1994-02-04,1,parry,"[-4.004581241919781, -0.59218863516712, -0.653...",1.0,19,San Francisco,0.0,Greenspan,8.005479,...,"[-3.8987813276160646, -3.0711254768631076, -0....",10,"[-0.38987813276160643, -0.30711254768631074, -...","[-3.354023966516818, -0.3111458831338686, -3.5...",3,2,"[-3.2471918594152602, -1.7195499984499567, -3....","[-1.6235959297076301, -0.8597749992249784, -1....","[0.23429682879206745, -1.4067226138176938, 0.1...","[-3.715785516999395, 1.093895229185431, -3.729..."
10,1994-02-04,1,mcteer,"[0.6570798630865422, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,3,Dallas,0.0,Greenspan,3.010959,...,"[-3.8987813276160646, -3.0711254768631076, -0....",10,"[-0.38987813276160643, -0.30711254768631074, -...","[0.9714552216236929, 0.3351975457351617, 0.218...",4,3,"[-2.403201359482077, -1.3860339542131892, -3.3...","[-0.801067119827359, -0.46201131807106305, -1....","[-1.6235959297076301, -0.8597749992249784, -1....","[2.4675864296408134, 1.1932910434617459, 1.864..."
15,1994-02-04,1,syron,"[0.0, -0.9329331206624658, 0.0, 0.0, 0.0, 0.0,...",0.0,8,Boston,0.0,Greenspan,5.095890,...,"[-3.8987813276160646, -3.0711254768631076, -0....",10,"[-0.38987813276160643, -0.30711254768631074, -...","[0.36176155048257724, -0.6830509587203372, 0.2...",5,4,"[-2.1689045306900097, -2.0707664144319207, -3....","[-0.5422261326725024, -0.5176916036079802, -0....","[-0.801067119827359, -0.46201131807106305, -1....","[1.0353639486194264, -0.22272114214766842, 1.2..."
0,1994-02-04,1,boehne,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,3,Philadelphia,0.0,Greenspan,13.016438,...,"[-3.8987813276160646, -3.0711254768631076, -0....",10,"[-0.38987813276160643, -0.30711254768631074, -...","[0.36176155048257724, 0.3351975457351617, 0.21...",6,5,"[-1.9346077018979422, -1.7372503701951532, -3....","[-0.3869215403795884, -0.3474500740390306, -0....","[-0.5422261326725024, -0.5176916036079802, -0....","[0.7765229614645699, 0.8512076478447477, 0.932..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5885,2018-12-19,2,george,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.41044775...",0.0,5,Kansas City,1.0,Powell,7.221918,...,"[0.0, 0.0, 0.7201020834859702, 0.0, 1.09834313...",10,"[0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...","[0.0, 0.0, -0.34867969519931247, 0.0, -0.43871...",12,11,"[2.8805025605971695, 0.0, -0.2056605732257073,...","[0.2618638691451972, 0.0, -0.01869641574779157...","[0.28345816205023183, 0.0, 0.02571282016799507...","[-0.2375372219553809, 0.0, -0.4885015950736531..."
5891,2018-12-19,2,quarles,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0,...",1.0,5,governor,0.0,Powell,1.254795,...,"[0.0, 0.0, 0.7201020834859702, 0.0, 1.09834313...",10,"[0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...","[0.0, 0.0, -0.34867969519931247, 0.0, -0.43871...",13,12,"[2.9264235006920205, 0.0, -0.6684493481313654,...","[0.24386862505766838, 0.0, -0.0557041123442804...","[0.2618638691451972, 0.0, -0.01869641574779157...","[-0.2159429290503463, 0.0, -0.4440923591578665..."
5888,2018-12-19,2,kashkari,"[-0.3987878854106168, 0.0, 0.0, 0.0, -0.520186...",0.0,8,Minneapolis,0.0,Powell,2.967123,...,"[0.0, 0.0, 0.7201020834859702, 0.0, 1.09834313...",10,"[0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...","[-3.0641863209765727, 0.0, -0.3486796951993124...",14,13,"[-0.09184188018970119, 0.0, -1.131238123037023...","[-0.007064760014592399, 0.0, -0.08701831715669...","[0.24386862505766838, 0.0, -0.0557041123442804...","[-3.26213400593939, 0.0, -0.4070846625613776, ..."
5893,2018-12-19,2,williams,"[0.0, 0.0, 0.42477875017013805, 0.0, 0.7597292...",1.0,26,New York,0.0,Powell,7.805479,...,"[0.0, 0.0, 0.7201020834859702, 0.0, 1.09834313...",10,"[0.0, 0.0, 0.07201020834859702, 0.0, 0.1098343...","[0.0, 0.0, 1.7081359776490264, 0.0, 2.59590331...",16,14,"[-0.04592094009485026, 0.0, 0.4627887749056574...","[-0.0032800671496321615, 0.0, 0.03305634106468...","[-0.007064760014592399, 0.0, -0.08701831715669...","[0.052985700109443326, 0.0, 1.6810452150993749..."


In [99]:
# 'sent' holds a count here (it was aggregated with 'count'), so this flags rows built from at least 3 sentences.
mfdff['use'] = mfdff['sent'] >= 3

In [100]:
# Persist the final frame as a pickle. The CSV export is left disabled
# (presumably because the array-valued columns would not round-trip through CSV — confirm).
# mfdff.to_csv('../working-csvs/mfdff.csv')
mfdff.to_pickle('../working-csvs/mfdff.pkl')