In [24]:
from collections import defaultdict
import string
import re
import pickle

In [28]:
# Load the two pickled inputs used by the rest of the notebook.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# only load pickles produced by this project's own pipeline.
# `with` guarantees each file handle is closed once its object has been read.
with open("speeches.p", "rb") as speeches_file:
    all_speeches = pickle.load(speeches_file)
with open("bigrams.p", "rb") as bigrams_file:
    bigram_dict = pickle.load(bigrams_file)

# Sanity check: both containers should hold the same number of top-level entries.
len(all_speeches), len(bigram_dict)

(9, 9)

In [29]:
# Signed Pearson-style score for every bigram of Congresses 108-114.
#
# For one bigram, with
#   f_plr / f_pld   = times the bigram is counted for party 'R' / 'D'
#   f_nplr / f_npld = counts of all *other* bigrams for party 'R' / 'D'
# the score is
#   (f_plr*f_npld - f_pld*f_nplr)**2
#   / ((f_plr+f_pld) * (f_plr+f_nplr) * (f_pld+f_npld) * (f_nplr+f_npld))
# made positive for Republican-leaning bigrams and negative for
# Democrat-leaning ones.
#
# Result: pearsons[congress][bigram] -> float, also written to "pearsons.p".
pearsons = {}
for i in range(108, 115):
    bigrams = bigram_dict[i]
    speeches = all_speeches[i]

    # One count per speech ID listed under the bigram; speeches whose party is
    # neither 'R' nor 'D' are ignored.
    r_phrase_counts = defaultdict(int)
    d_phrase_counts = defaultdict(int)
    for bigram, speechIDs in bigrams.items():
        for speechID in speechIDs:
            party = speeches[speechID]["party"]
            if party == 'R':
                r_phrase_counts[bigram] += 1
            elif party == 'D':
                d_phrase_counts[bigram] += 1

    r_sum = sum(r_phrase_counts.values())
    d_sum = sum(d_phrase_counts.values())

    pearson = {}
    for bigram in bigrams:
        # .get() reads the count without inserting zero entries into the
        # defaultdicts.
        f_plr = r_phrase_counts.get(bigram, 0)
        f_pld = d_phrase_counts.get(bigram, 0)
        f_nplr = r_sum - f_plr
        f_npld = d_sum - f_pld

        cross = f_plr * f_npld - f_pld * f_nplr
        denom = (f_plr + f_pld) * (f_plr + f_nplr) * (f_pld + f_npld) * (f_nplr + f_npld)

        if denom == 0:
            # Happens when the bigram was never used by an 'R' or 'D' speaker,
            # or when one party has no counts at all. There is no association
            # to measure, so record a neutral score instead of dividing by zero.
            pearson[bigram] = 0.0
            continue

        # The direction of the association is the sign of the cross-product,
        # i.e. whether the bigram's share of Republican counts exceeds its
        # share of Democrat counts. Comparing the raw counts (f_plr > f_pld)
        # mislabels bigrams whenever the two parties' totals differ.
        sign = 1 if cross > 0 else -1  # make democrat negative
        pearson[bigram] = sign * cross ** 2 / denom
    pearsons[i] = pearson

# Context manager so the output file is flushed and closed deterministically.
with open("pearsons.p", "wb") as pearsons_file:
    pickle.dump(pearsons, pearsons_file)

In [30]:
# Keep, for each Congress 108-114, only the scores of bigrams that also appear
# in the matching entry of new_bigrams.p, then persist the filtered scores.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("new_bigrams.p", "rb") as new_bigrams_file:
    new_bigrams = pickle.load(new_bigrams_file)

new_pearsons = {}
for i in range(108, 115):
    # Look the container up once per Congress rather than once per bigram.
    kept_bigrams = new_bigrams[i]
    new_pearsons[i] = {k: v for k, v in pearsons[i].items() if k in kept_bigrams}

# Context manager so the output file is flushed and closed deterministically.
with open("new_pearsons.p", "wb") as new_pearsons_file:
    pickle.dump(new_pearsons, new_pearsons_file)

- 108: {Years: 03-04, House: R, Senate: R}
- 109: {Years: 05-06, House: R, Senate: R}
- 110: {Years: 07-08, House: D, Senate: D}
- 111: {Years: 09-10, House: D, Senate: D}
- 112: {Years: 11-12, House: R, Senate: D}
- 113: {Years: 13-14, House: R, Senate: D}
- 114: {Years: 15-16, House: R, Senate: R}

In [31]:
# Congress 108: (bigram, score) pairs ordered from the largest score downward,
# so the most Republican-leaning bigrams come first.
sorted(
    new_pearsons[108].items(),
    key=lambda pair: pair[1],
    reverse=True,
)

[(('tradit', 'marriag'), 5.73981760510823e-05),
 (('need', 'hate'), 3.748285897633694e-05),
 (('introduc', 'local'), 3.7456447990505974e-05),
 (('act', 'symbol'), 3.729713554075622e-05),
 (('growth', 'packag'), 3.527221890839623e-05),
 (('bill', 'pryor'), 3.066492375850003e-05),
 (('act', '2003'), 2.8687180741593667e-05),
 (('2003', 'senat'), 2.5059848645004806e-05),
 (('general', 'pryor'), 2.2644627471861073e-05),
 (('fsc', 'eti'), 1.955745312862312e-05),
 (('bless', 'troop'), 1.6976975080239792e-05),
 (('oh', 'columbus'), 1.681630437153929e-05),
 (('second', 'victim'), 1.637287518556477e-05),
 (('definit', 'marriag'), 1.5685331231887536e-05),
 (('competit', 'sourc'), 1.4834676814985091e-05),
 (('medic', 'litig'), 1.378977606685124e-05),
 (('laci', 'conner'), 1.3543757893133386e-05),
 (('health', 'save'), 1.3130241927986929e-05),
 (('victim', 'two'), 1.2403178766505191e-05),
 (('bill', 'myer'), 1.2295376249447568e-05),
 (('2004', 'purpos'), 1.2097069019710357e-05),
 (('taxat', 'divide

In [32]:
# Congress 108: (bigram, score) pairs ordered from the smallest score upward,
# so the most Democrat-leaning (most negative) bigrams come first.
sorted(
    new_pearsons[108].items(),
    key=lambda pair: pair[1],
)

[(('000', 'b'), -1.774984860502351e-05),
 (('overtim', 'protect'), -1.3716542503765384e-05),
 (('smart', 'secur'), -1.3244431389826625e-05),
 (('al', 'anbar'), -1.2292514936726345e-05),
 (('pendleton', 'ca'), -1.2119057769374747e-05),
 (('provinc', 'assign'), -1.1880120140110748e-05),
 (('away', 'overtim'), -1.1299700988439937e-05),
 (('anbar', 'provinc'), -1.0750145679750637e-05),
 (('iraq', 'watch'), -1.0425700285193855e-05),
 (('nobid', 'contract'), -1.0239963868331142e-05),
 (('bunker', 'buster'), -9.955798259889173e-06),
 (('veteran', 'tax'), -9.726557483713935e-06),
 (('billion', 'iraq'), -9.682449525277882e-06),
 (('mount', 'hood'), -9.549708716944224e-06),
 (('maynard', 'jackson'), -9.387839295641943e-06),
 (('right', 'overtim'), -9.269920649799534e-06),
 (('mr', 'chalabi'), -9.180518587080196e-06),
 (('deficit', 'matter'), -9.099968615195674e-06),
 (('divis', 'camp'), -8.699663495083537e-06),
 (('ship', 'job'), -8.59463953028449e-06),
 (('serv', 'iraq'), -8.569592402023265e-06

In [33]:
# Congress 110: (bigram, score) pairs ordered from the largest score downward,
# so the most Republican-leaning bigrams come first.
sorted(
    new_pearsons[110].items(),
    key=lambda pair: pair[1],
    reverse=True,
)

[(('z', 'visa'), 2.4754530873147255e-05),
 (('commonsens', 'plan'), 2.068235347035875e-05),
 (('coaltoliquid', 'technolog'), 1.7428207637974285e-05),
 (('shale', 'oil'), 1.7241439174029672e-05),
 (('radic', 'jihadist'), 1.6645846102822985e-05),
 (('execut', 'committeeman'), 1.641878304152442e-05),
 (('hyman', 'rickov'), 1.5213720813262878e-05),
 (('physician', 'workforc'), 1.4773750399936918e-05),
 (('privat', 'ballot'), 1.3870918840842436e-05),
 (('selfevid', 'truth'), 1.3514496512010229e-05),
 (('sunset', 'memori'), 1.3214088645334375e-05),
 (('earmark', 'rule'), 1.165057578727213e-05),
 (('683', 'billion'), 1.0845388736765919e-05),
 (('secur', 'station'), 1.032167660589603e-05),
 (('amt', 'patch'), 1.0219040326622097e-05),
 (('wade', 'first'), 1.00922309650927e-05),
 (('joint', 'secur'), 9.961200542295211e-06),
 (('general', 'petraeuss'), 9.409626974893285e-06),
 (('micromanag', 'war'), 9.144389267921314e-06),
 (('sue', 'opec'), 9.117435037929503e-06),
 (('bring', 'skyrocket'), 8.90

In [34]:
# Congress 110: (bigram, score) pairs ordered from the smallest score upward,
# so the most Democrat-leaning (most negative) bigrams come first.
sorted(
    new_pearsons[110].items(),
    key=lambda pair: pair[1],
)

[(('retroact', 'immun'), -2.4232697961042787e-05),
 (('iowa', 'public'), -2.1336710392715475e-05),
 (('excess', 'specul'), -2.081312839891918e-05),
 (('harkin', 'grant'), -1.9685228461317032e-05),
 (('act', '2008'), -1.8592670282082704e-05),
 (('mr', 'altmir'), -1.417198455750079e-05),
 (('foreclosur', 'crisi'), -1.4079535425141144e-05),
 (('lilli', 'ledbett'), -1.3633355938664752e-05),
 (('room', '253'), -1.165587678571979e-05),
 (('253', 'russel'), -1.156889130635343e-05),
 (('face', 'foreclosur'), -1.1415147793883048e-05),
 (('father', 'drinan'), -1.1256457429902045e-05),
 (('presid', 'iowa'), -1.0999078485701681e-05),
 (('iowa', 'student'), -1.0995580922362976e-05),
 (('121', 'million'), -1.090862294411307e-05),
 (('govern', 'iowa'), -1.090862294411307e-05),
 (('known', 'among'), -1.090862294411307e-05),
 (('presid', 'escal'), -1.0864001482638185e-05),
 (('begun', 'know'), -1.0821665410509611e-05),
 (('nationwid', 'iowa'), -1.0821665410509611e-05),
 (('iowa', 'demonstr'), -1.073470

In [35]:
# Congress 114: (bigram, score) pairs ordered from the largest score downward,
# so the most Republican-leaning bigrams come first.
sorted(
    new_pearsons[114].items(),
    key=lambda pair: pair[1],
    reverse=True,
)

[(('garrison', 'member'), 3.701261170874772e-05),
 (('iran', 'deal'), 2.556782089170952e-05),
 (('res', '11'), 1.9449112462182514e-05),
 (('anytim', 'anywher'), 1.738251601551985e-05),
 (('2015', 'summer'), 1.6846780466087707e-05),
 (('jv', 'team'), 1.6081005417075826e-05),
 (('presid', 'healthcar'), 1.6081005417075826e-05),
 (('wotus', 'rule'), 1.588519307998549e-05),
 (('choic', 'card'), 1.558676294426938e-05),
 (('chant', 'death'), 1.4149465905617546e-05),
 (('multiyear', 'highway'), 1.3801852701320683e-05),
 (('kate', 'steinl'), 1.3641102389166604e-05),
 (('email', 'server'), 1.2272340494543602e-05),
 (('anywher', 'inspect'), 1.202196641319195e-05),
 (('sanctuari', 'jurisdict'), 1.1996891607599909e-05),
 (('polici', 'modern'), 1.1847633076940265e-05),
 (('r', '1735'), 1.1580782184925378e-05),
 (('senat', 'ernst'), 1.128038141586337e-05),
 (('director', 'comey'), 1.097421714419646e-05),
 (('death', 'israel'), 1.07539512510863e-05),
 (('colleagu', 'nevadan'), 1.072061287581951e-05),


In [36]:
# Congress 114: (bigram, score) pairs ordered from the smallest score upward,
# so the most Democrat-leaning (most negative) bigrams come first.
sorted(
    new_pearsons[114].items(),
    key=lambda pair: pair[1],
)

[(('judg', 'garland'), -7.125151434586521e-05),
 (('zika', 'virus'), -4.5786434701703566e-05),
 (('mr', 'zinser'), -4.457152871027837e-05),
 (('loretta', 'lynch'), -4.2087408985388794e-05),
 (('check', 'legisl'), -3.113919910735722e-05),
 (('1217', 'bipartisan'), -3.0817604101172236e-05),
 (('battl', 'flag'), -2.9575938778187647e-05),
 (('peopl', 'flint'), -2.87179062749051e-05),
 (('bipartisan', 'expand'), -2.8015925151610487e-05),
 (('receiv', 'moment'), -2.6487743257666706e-05),
 (('confeder', 'battl'), -2.601435725154602e-05),
 (('silenc', 'hous'), -2.241261439958158e-05),
 (('futur', 'forum'), -2.1139143450148148e-05),
 (('forc', 'arbitr'), -2.0120369028432734e-05),
 (('web', 'denial'), -1.9610982596849952e-05),
 (('gmo', 'label'), -1.8862728011780546e-05),
 (('dark', 'act'), -1.8095876767475715e-05),
 (('clean', 'homeland'), -1.7586864448420422e-05),
 (('garland', 'nomin'), -1.6823386860393195e-05),
 (('judg', 'merrick'), -1.6568906160797844e-05),
 (('clean', 'depart'), -1.641910

In [8]:
# Print the size of every entry of new_bigrams, one per line, in the dict's
# own iteration order.
for kept_bigrams in new_bigrams.values():
    entry_count = len(kept_bigrams)
    print(entry_count)

43554
28420
33063
15105
12640
8249
6143


In [39]:
# after changing set to list
# Same size report as before the change: one line per entry of new_bigrams,
# in the dict's own iteration order.
for kept_bigrams in new_bigrams.values():
    entry_count = len(kept_bigrams)
    print(entry_count)

49184
32925
37492
18147
15074
10186
7933


In [23]:
# Report the number of entries stored in each per-Congress pickle
# (bigrams106.p ... bigrams114.p).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
for i in range(106, 115):
    # `with` closes each file before the next one is opened; the bare
    # open(...) leaked one handle per iteration.
    with open("bigrams" + str(i) + ".p", "rb") as bigrams_file:
        bigrams = pickle.load(bigrams_file)
    print(i, len(bigrams))

106 218742
107 187558
108 209481
109 196782
110 211066
111 156743
112 148637
113 129903
114 107474


In [22]:
# after changing set to list
# Print "<congress> <number of entries>" for every Congress.
#
# This iterates `bigram_dict` (loaded from bigrams.p near the top of the
# notebook) instead of the loose name `bigrams`. Other cells rebind `bigrams`
# to a single Congress's data, so on a fresh top-to-bottom run the old loop
# would not walk the Congress-keyed mapping.
# NOTE(review): assumes `bigrams` originally referred to the same Congress-keyed
# mapping as `bigram_dict` - confirm against the recorded output.
for key, congress_bigrams in bigram_dict.items():
    print(key, len(congress_bigrams))

106 233207
107 200535
108 222713
109 209502
110 224125
111 167227
112 158063
113 138476
114 115679
