In [1]:
import pandas as pd
# import matplotlib.pyplot as plt
import nltk
import time, os, pickle
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, names, opinion_lexicon
from itertools import combinations, permutations
from collections import Counter
from tqdm import tqdm, tqdm_notebook
# plt.style.use('ggplot')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('names')
nltk.download('opinion_lexicon')

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to /home/alex/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/alex/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

In [2]:
datapath = r'/home/alex/Documents/MIDS/w266/final_project/data/convote_v1.1/data_stage_three'

In [3]:
os.listdir(datapath)

['test_set', 'development_set', 'training_set']

# Load data
Combine the three datasets into one: this data will be merged with other datasets later, and the train/test split will be done on the full combined data at that point.

In [4]:
# check how many files we are working with and load them into a list
files = []
party2label = {'D': 'liberal', 'R': 'conservative', 'I': 'neutral'}

# walk the data directory; every directory other than the root is one dataset split
for dirName, subDirList, fileList in os.walk(datapath):
    if dirName != datapath:
        # print the number of files in each dataset, naming the split after the
        # directory actually being visited instead of pairing the walk position
        # with the ordering of a separate os.listdir() call
        print(os.path.basename(dirName), ':', len(fileList))
    for f in fileList:
        # the last underscore-separated field of the filename starts with the
        # party code (e.g. '..._RON.txt' -> 'R'); convert it to an ideological label
        label = party2label[f.split('_')[-1][0]]

        # store the filepath with the label attached
        files.append((label, os.path.join(dirName, f)))
print('Total files :', len(files))

test_set : 860
development_set : 257
training_set : 2740
Total files : 3857


In [5]:
files[0]

('conservative',
 '/home/alex/Documents/MIDS/w266/final_project/data/convote_v1.1/data_stage_three/test_set/414_400343_2897011_RON.txt')

In [6]:
# parse sentences and load into a DataFrame
sentences = []
for label, filename in files:
    # read the whole speech, then release the file handle before tokenizing
    with open(filename, 'r') as f:
        raw = f.read()

    # every sentence of a file inherits that file's label
    sentences.extend((label, sent) for sent in sent_tokenize(raw))

df = pd.DataFrame(sentences, columns=['label', 'text'])
print('Total sentences: ', df.shape[0])

Total sentences:  54830


In [7]:
df.sample(5)

Unnamed: 0,label,text
25332,liberal,"now , suppose , in a second example , a checki..."
21492,conservative,"we would all do well to remember that , especi..."
31280,liberal,where do the republicans find the moral justif...
28061,conservative,they are so expensive and have been denied for...
44391,liberal,we think it is a good trade-off .


In [8]:
df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
conservative,26155
liberal,28439
neutral,236


# Determine most politically charged bigrams

In [9]:
# split data into liberal and conservative
lib = df.loc[df.label == 'liberal']
con = df.loc[df.label == 'conservative']

In [15]:
stops = [w.lower() for w in stopwords.words('english')] + \
        ['would', 'could','really','sensenbrenner',"n't"]
stops[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [16]:
men = [w.lower() for w in names.words('male.txt')]
women = [w.lower() for w in names.words('female.txt')]
stopNames = men + women

In [17]:
len(stops), len(stopNames)

(184, 7944)

In [18]:
stops.extend(stopNames)
len(stops)

8128

In [20]:
# define a helper function to pull the bigrams
def getBigrams(text, stops, ops=None):
    '''Count the adjacent word pairs (bigrams) in `text` that pass the filters.

    The text is tokenized with `word_tokenize` and lower-cased. A bigram is
    counted only when both of its tokens are longer than one character and
    neither token is in `stops`. When `ops` is given and non-empty, at least
    one of the two tokens must also be a member of `ops`.

    Parameters
    ----------
    text : str
        The sentence (or longer text) to extract bigrams from.
    stops : collection of str
        Lower-case tokens to exclude. A non-set collection is converted to a
        set once per call so membership tests do not rescan it for every token.
    ops : collection of str, optional
        If non-empty, only bigrams containing at least one of these tokens
        are counted. Defaults to None (no such restriction).

    Returns
    -------
    collections.Counter
        Maps each kept bigram, written as 'first second', to its count.
        Empty when the text has fewer than two tokens.
    '''
    # hash-based membership: scanning a list of thousands of stop words for
    # every token is what dominates the runtime of this function
    if not isinstance(stops, (set, frozenset)):
        stops = set(stops)
    if ops and not isinstance(ops, (set, frozenset)):
        ops = set(ops)

    # tokenize and lower-case
    words = [w.lower() for w in word_tokenize(text)]

    # decide once per token whether it may take part in a bigram:
    # drops single-character tokens (which covers most punctuation) and stop words
    keep = [len(w) > 1 and w not in stops for w in words]

    bigrams = Counter()
    # range() is empty for texts with fewer than two tokens
    for i in range(len(words) - 1):
        if not (keep[i] and keep[i + 1]):
            continue
        if ops and not (words[i] in ops or words[i + 1] in ops):
            continue
        bigrams[' '.join((words[i], words[i + 1]))] += 1

    return bigrams

In [21]:
libBigrams = Counter()
for text in tqdm(lib.text):
    libBigrams.update(getBigrams(text, stops))

100%|██████████| 28439/28439 [00:48<00:00, 589.77it/s]


In [17]:
libBigrams.most_common()[:10]

[('mr. speaker', 1622),
 ('mr. chairman', 810),
 ('united states', 471),
 ('stem cell', 398),
 ('health care', 391),
 ('american people', 318),
 ('cell research', 288),
 ('social security', 269),
 ('tax cuts', 261),
 ('patriot act', 228)]

In [22]:
conBigrams = Counter()
for text in con.text:
    conBigrams.update(getBigrams(text, stops))

In [19]:
conBigrams.most_common()[:10]

[('mr. speaker', 1741),
 ('mr. chairman', 888),
 ('united states', 530),
 ('stem cell', 391),
 ('stem cells', 361),
 ('embryonic stem', 283),
 ('small businesses', 283),
 ('small business', 236),
 ('cell research', 230),
 ('patriot act', 221)]

Most of these bigrams are neutral. Grab the 1000 most common from each ideology, then filter out any that also appear in the other ideology's top 1000.

In [23]:
# get the 1000 most common liberal and conservative bigrams
commonCon = [b[0] for b in conBigrams.most_common()[:1000]]
commonLib = [b[0] for b in libBigrams.most_common()[:1000]]

In [24]:
# keep only the bigrams for each ideology that appear in the top 1000 of that ideology and not in the top 1000 of the other
libBigrams_filtered = [(w,libBigrams[w]) for w in commonLib if w not in commonCon]
# conservative bigrams also had weird 'amp nbsp' HTML tags so remove those
conBigrams_filtered = [(w,conBigrams[w]) for w in commonCon if w not in commonLib and 'nbsp' not in w]

In [25]:
len(libBigrams_filtered),len(conBigrams_filtered)

(486, 486)

In [26]:
print('top 200 liberal bigrams:')
sorted(libBigrams_filtered,key=lambda x: -x[1])[:200]

top 200 liberal bigrams:


[('republican budget', 116),
 ('republican leadership', 102),
 ('strong opposition', 77),
 ('cbc budget', 67),
 ('child left', 63),
 ('oil companies', 62),
 ('tax breaks', 53),
 ('congressional black', 51),
 ('black caucus', 51),
 ('national debt', 48),
 ('republican party', 45),
 ('public health', 45),
 ('police dept', 45),
 ('alternative budget', 39),
 ('fiscal responsibility', 38),
 ('student loans', 34),
 ('cbc alternative', 34),
 ("majority 's", 33),
 ('law school', 33),
 ('trust fund', 32),
 ('rule 11', 32),
 ('state legislatures', 32),
 ('environmental laws', 31),
 ('special interests', 31),
 ('poor people', 31),
 ('care coverage', 31),
 ('republican colleagues', 30),
 ('democratic alternative', 30),
 ('drinking water', 29),
 ('safety net', 28),
 ('five years', 28),
 ('majority party', 28),
 ('budget cuts', 28),
 ('million per', 27),
 ('card companies', 27),
 ('mr. leader', 27),
 ('fiscally responsible', 27),
 ('20 billion', 27),
 ('university school', 27),
 ('1.5 billion', 27),

In [27]:
print('top 200 conservative bigrams')
sorted(conBigrams_filtered,key=lambda x: -x[1])[:200]

top 200 conservative bigrams


[('house resolution', 77),
 ('new jobs', 55),
 ('immediate consideration', 52),
 ('restaurant association', 42),
 ('class members', 37),
 ('lawsuit abuse', 34),
 ('commission report', 34),
 ('thank chairman', 33),
 ('contractors association', 32),
 ('pension protection', 30),
 ('human embryo', 29),
 ('electrical contractors', 29),
 ('gang members', 29),
 ('oil-for-food program', 29),
 ('identification cards', 29),
 ('legislative days', 28),
 ('personal injury', 28),
 ('national electrical', 28),
 ('property owners', 27),
 ('bankruptcy abuse', 26),
 ('days within', 26),
 ('chinese government', 26),
 ('federal jurisdiction', 26),
 ('spent fuel', 26),
 ('million new', 25),
 ('inner cell', 25),
 ('cell mass', 25),
 ('trial lawyers', 25),
 ('sales tax', 25),
 ('budget request', 25),
 ('start program', 24),
 ('crude oil', 24),
 ('mandatory spending', 23),
 ('important issue', 23),
 ('tax repeal', 23),
 ('urge support', 23),
 ('million jobs', 23),
 ('pluripotent stem', 23),
 ('budget authorit

Looks much more reasonable

In [28]:
# keep just the bigrams and drop the count data
libGrams = [k for k,v in sorted(libBigrams_filtered,key=lambda x: -x[1])[:200]]
conGrams = [k for k,v in sorted(conBigrams_filtered,key=lambda x: -x[1])[:200]]

In [29]:
# savepath = r'../data'
# with open(os.path.join(savepath, 'libGrams.pickle'), 'wb') as f:
#     pickle.dump(libGrams, f, pickle.HIGHEST_PROTOCOL)
# with open(os.path.join(savepath, 'conGrams.pickle'), 'wb') as f:
#     pickle.dump(conGrams, f, pickle.HIGHEST_PROTOCOL)
with open('../data/libGrams4.pickle', 'wb') as f:
    pickle.dump(libGrams, f, pickle.HIGHEST_PROTOCOL)
with open('../data/conGrams4.pickle', 'wb') as f:
    pickle.dump(conGrams, f, pickle.HIGHEST_PROTOCOL)

# Load bigrams

In [10]:
with open('../data/libGrams4.pickle', 'rb') as f:
    libGrams = pickle.load(f)
    print(libGrams[:5])
with open('../data/conGrams4.pickle', 'rb') as f:
    conGrams = pickle.load(f)
    print(conGrams[:5])

['republican budget', 'republican leadership', 'strong opposition', 'cbc budget', 'child left']
['house resolution', 'new jobs', 'immediate consideration', 'restaurant association', 'class members']


In [11]:
len(libGrams), len(conGrams)

(200, 200)

In [14]:
libGrams[:10]

['republican budget',
 'republican leadership',
 'strong opposition',
 'cbc budget',
 'child left',
 'oil companies',
 'tax breaks',
 'congressional black',
 'black caucus',
 'national debt']

In [13]:
conGrams[:10]

['house resolution',
 'new jobs',
 'immediate consideration',
 'restaurant association',
 'class members',
 'lawsuit abuse',
 'commission report',
 'thank chairman',
 'contractors association',
 'pension protection']

# Determine most politically charged lexicons

In [101]:
# define a helper function to pull the lexicons (single words)
def getLexicons(text, stops):
    '''Count the individual words ("lexicons") in `text` that pass the filters.

    The text is tokenized with `word_tokenize` and lower-cased. A token is
    counted only when it is purely alphabetic, longer than one character,
    and not in `stops`.

    Parameters
    ----------
    text : str
        The sentence (or longer text) to extract words from.
    stops : collection of str
        Lower-case tokens to exclude. A non-set collection is converted to a
        set once per call so membership tests do not rescan it for every token.

    Returns
    -------
    collections.Counter
        Maps each kept lower-cased word to its count.
    '''
    # hash-based membership instead of a linear scan per token
    if not isinstance(stops, (set, frozenset)):
        stops = set(stops)

    # tokenize and lower-case
    words = (w.lower() for w in word_tokenize(text))

    # isalpha() drops punctuation and numbers; len > 1 drops single letters
    return Counter(
        w for w in words
        if w.isalpha() and len(w) > 1 and w not in stops
    )

In [102]:
getLexicons(df.text[0], stops)

Counter({'madam': 1, 'speaker': 1, 'yield': 1, 'time': 1, 'consume': 1})

In [103]:
libLexicons = Counter()
for text in lib.text:
    libLexicons.update(getLexicons(text, stops))

In [104]:
libLexicons.most_common()[:10]

[('speaker', 1848),
 ('people', 1676),
 ('time', 1389),
 ('us', 1306),
 ('chairman', 1165),
 ('health', 1120),
 ('budget', 1108),
 ('today', 1058),
 ('tax', 1047),
 ('one', 1042)]

In [105]:
conLexicons = Counter()
for text in con.text:
    conLexicons.update(getLexicons(text, stops))

In [106]:
conLexicons.most_common()[:10]

[('speaker', 2047),
 ('time', 1619),
 ('chairman', 1553),
 ('gentleman', 1358),
 ('people', 1221),
 ('one', 1156),
 ('support', 1135),
 ('act', 1075),
 ('us', 1034),
 ('committee', 972)]

Most of these lexicons are neutral. Grab the 1000 most common from each ideology, then filter out any that also appear in the other ideology's top 1000.

In [107]:
# get the 1000 most common liberal and conservative lexicons
commonCon = [L[0] for L in conLexicons.most_common()[:1000]]
commonLib = [L[0] for L in libLexicons.most_common()[:1000]]

In [108]:
# keep only the lexicons for each ideology that appear in the top 1000 of that ideology and not in the top 1000 of the other
libLexicons_filtered = [(w,libLexicons[w]) for w in commonLib if w not in commonCon]
# the conservative text also had weird 'amp nbsp' tokens left over from HTML entities, so remove those.
# 'amp' is compared as a whole token: a substring test ('amp' not in w) would also
# throw away real words such as 'example' or 'campaign'
conLexicons_filtered = [(w,conLexicons[w]) for w in commonCon if w not in commonLib and 'nbsp' not in w and w != 'amp']

In [109]:
len(libLexicons_filtered),len(conLexicons_filtered)

(151, 150)

In [110]:
print('top 100 liberal lexicons:')
sorted(libLexicons_filtered,key=lambda x: -x[1])[:100]

top 100 liberal lexicons:


[('coverage', 186),
 ('housing', 161),
 ('protections', 160),
 ('poor', 143),
 ('cbc', 143),
 ('trillion', 142),
 ('party', 140),
 ('medicaid', 131),
 ('cutting', 126),
 ('gun', 125),
 ('college', 123),
 ('proposal', 122),
 ('values', 122),
 ('deserve', 117),
 ('bahrain', 116),
 ('deficits', 115),
 ('professor', 115),
 ('fails', 110),
 ('despite', 108),
 ('worker', 105),
 ('basic', 104),
 ('crisis', 104),
 ('victims', 102),
 ('corporate', 101),
 ('whose', 100),
 ('iraqi', 100),
 ('failed', 99),
 ('worse', 93),
 ('run', 89),
 ('show', 87),
 ('ethics', 87),
 ('lead', 86),
 ('black', 86),
 ('cafta', 86),
 ('block', 85),
 ('effective', 85),
 ('reject', 85),
 ('short', 84),
 ('billions', 83),
 ('white', 83),
 ('debtors', 83),
 ('agreements', 83),
 ('alone', 82),
 ('uninsured', 82),
 ('trust', 79),
 ('although', 79),
 ('votes', 79),
 ('wealthy', 79),
 ('wealthiest', 78),
 ('essential', 78),
 ('wildlife', 78),
 ('partisan', 76),
 ('caucus', 76),
 ('corporations', 76),
 ('seeking', 75),
 ('afg

In [111]:
print('top 100 conservative lexicons')
sorted(conLexicons_filtered,key=lambda x: -x[1])[:100]

top 100 conservative lexicons


[('embryo', 231),
 ('gang', 184),
 ('lawyers', 122),
 ('growing', 117),
 ('commend', 114),
 ('identification', 114),
 ('license', 109),
 ('shall', 108),
 ('bringing', 99),
 ('gangs', 94),
 ('bit', 92),
 ('nih', 90),
 ('continuing', 90),
 ('illinois', 90),
 ('immediate', 89),
 ('talked', 89),
 ('products', 87),
 ('judiciary', 87),
 ('chinese', 87),
 ('product', 86),
 ('production', 86),
 ('regard', 83),
 ('rates', 82),
 ('activities', 81),
 ('manufacturers', 81),
 ('mentioned', 80),
 ('grow', 79),
 ('piece', 79),
 ('management', 78),
 ('chamber', 78),
 ('tonight', 77),
 ('attention', 77),
 ('stage', 77),
 ('sources', 77),
 ('toward', 77),
 ('direction', 76),
 ('fence', 76),
 ('destruction', 75),
 ('terror', 74),
 ('extend', 73),
 ('results', 73),
 ('discussion', 72),
 ('marrow', 72),
 ('understanding', 72),
 ('supply', 72),
 ('crimes', 72),
 ('tough', 72),
 ('markets', 72),
 ('focus', 71),
 ('produce', 71),
 ('medicine', 71),
 ('county', 70),
 ('lawsuit', 69),
 ('activity', 69),
 ('purp

In [138]:
# keep just the lexicons and drop the count data
libCons = [k for k,v in sorted(libLexicons_filtered,key=lambda x: -x[1])[:100]]
conCons = [k for k,v in sorted(conLexicons_filtered,key=lambda x: -x[1])[:100]]
# savepath = r'../data'
# with open(os.path.join(savepath, 'libCons.pickle'), 'wb') as f:
#     pickle.dump(libCons, f, pickle.HIGHEST_PROTOCOL)
# with open(os.path.join(savepath, 'conCons.pickle'), 'wb') as f:
#     pickle.dump(conCons, f, pickle.HIGHEST_PROTOCOL)
with open('../data/libCons3.pickle', 'wb') as f:
    pickle.dump(libCons, f, pickle.HIGHEST_PROTOCOL)
with open('../data/conCons3.pickle', 'wb') as f:
    pickle.dump(conCons, f, pickle.HIGHEST_PROTOCOL)

In [140]:
with open('../data/libCons3.pickle', 'rb') as f:
    libCons = pickle.load(f)
    print(libCons[:5])
with open('../data/conCons3.pickle', 'rb') as f:
    conCons = pickle.load(f)
    print(conCons[:5])

['coverage', 'housing', 'protections', 'poor', 'cbc']
['embryo', 'gang', 'lawyers', 'growing', 'commend']


# Filter Data
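Filter the original dataset down to only those sentences that contain politically charged bigrams and lexicons. Note: lexicon matching is currently disabled in `keepSentence`, so only the bigram lists are applied; neutral sentences are kept only when they contain none of those bigrams.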

In [31]:
def keepSentence(label, text):
    '''Decide whether a labeled sentence belongs in the filtered dataset.

    Sentences with fewer than six whitespace-separated tokens are always
    dropped. Otherwise the decision depends on the label:

    * 'liberal'      -- kept if the sentence contains a bigram from `libGrams`
    * 'conservative' -- kept if the sentence contains a bigram from `conGrams`
    * 'neutral'      -- kept only if it contains no bigram from either list
    * anything else  -- dropped

    Only bigrams are consulted; lexicon matching (`getLexicons` with the
    `libCons` / `conCons` lists) is not applied here.

    Reads the notebook-level names `stops`, `libGrams` and `conGrams`.

    Parameters
    ----------
    label : str
        Ideology label of the sentence.
    text : str
        The sentence itself.

    Returns
    -------
    bool
        True if the sentence should be kept.
    '''
    # too short to be a useful sample
    if len(text.split()) < 6:
        return False

    # which charged bigrams, if any, occur in this sentence
    found = set(getBigrams(text, stops))
    hasLib = not found.isdisjoint(libGrams)
    hasCon = not found.isdisjoint(conGrams)

    if label == 'liberal':
        return hasLib
    if label == 'conservative':
        return hasCon
    if label == 'neutral':
        return not (hasLib or hasCon)

    # unrecognized label: answer explicitly rather than falling through to None
    return False

In [32]:
def filterText(df):
    '''Return the rows of `df` that `keepSentence` accepts.

    Rows are visited in order and paired positionally, so the frame does not
    need a default 0..n-1 index (a filtered or re-indexed frame works too).

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'label' column and a 'text' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ['label', 'text'] and a fresh default index,
        holding only the kept rows in their original order.
    '''
    # zip() has no length, so pass total= to keep the progress bar informative
    rows = tqdm(zip(df['label'], df['text']), total=len(df))
    filteredText = [
        (label, text) for label, text in rows if keepSentence(label, text)
    ]

    return pd.DataFrame(filteredText, columns=['label', 'text'])

In [33]:
start = time.time()
df_filtered = filterText(df)
print('runtime (min):', (time.time() - start) / 60)

100%|██████████| 54830/54830 [01:47<00:00, 509.27it/s]

runtime (min): 1.7945236682891845





In [34]:
df.shape, df_filtered.shape

((54830, 2), (6693, 2))

In [35]:
df_filtered.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
conservative,2831
liberal,3678
neutral,184


In [33]:
n = 20
sample = df_filtered.sample(n)
for i in range(n):
    print(i, sample.iloc[i].values)

NameError: name 'df_filtered' is not defined

In [37]:
# savepath = r'../data'
# with open(os.path.join(savepath, 'filteredConvote.pickle'), 'wb') as f:
#     pickle.dump(df_filtered, f, pickle.HIGHEST_PROTOCOL)
with open('../data/filteredConvote4.pickle', 'wb') as f:
    pickle.dump(df_filtered, f, pickle.HIGHEST_PROTOCOL)

In [7]:
with open('../data/filteredConvote4.pickle', 'rb') as f:
    test = pickle.load(f)
test.sample(5)

Unnamed: 0,label,text
5052,conservative,the most important step we can take to curb ob...
6183,liberal,because section 1502 would apply to all such c...
3914,conservative,gang activity has been directly linked to the ...
374,liberal,"810 , with the informed consent of the donor ,..."
4021,conservative,the science -- the committee on science shall ...


In [34]:
n = 20
sample = test.sample(n)
for i in range(n):
    print(i, sample.iloc[i].values)

0 ['conservative'
 'small business , 99.7 percent of all business is small business ; and 75 percent of all new jobs are hired in the small business sector .']
1 ['liberal'
 'in short , in the considered judgment of each of these 18 state legislatures , laws have been enacted that best serve their states .']
2 ['conservative'
 'mr. chairman , this has been a fine debate today , and i believe that both sides have handled it very responsibly , but let us take a look at what the real issue is .']
3 ['conservative'
 '418 is critical to the continued construction of the southwest border fence in san diego .']
4 ['liberal' 'i can not believe the republican budget .']
5 ['conservative'
 '3824 will allow the secretary of interior to compensate private property owners for the fair market value of the loss of use of their property when the secretary concludes that the use of the property would be a taking .']
6 ['conservative'
 'as they start to interconnect , we need laws that can address this 

In [37]:
for v in test[test.label == 'liberal'].sample(5).values:
    print(v)

['liberal'
 'unfortunately , the republican leadership was not willing to follow that more reasonable approach , and is insisting on sticking with their own recipe .']
['liberal' 'that is why the judicial conference looks at rules changes .']
['liberal'
 "the committee has seen fit to provide $ 20 million over the president 's request for operations , an increase i support , but our national parks should be safe places , where parents and children can roam and relax , where they can picnic and hike and raft ."]
['liberal'
 'this adjustment would restore an estimated $ 36.3 billion in fy 2006 , including nearly $ 4 billion for deficit reduction .']
['liberal'
 "mr. speaker , it is one of the more amazing aspects of this debate that the party that claims to be for states ' rights and tries to take political advantage of saying , listen , states , we stand for you and what you decide to do on a policy level , is so quick to jettison states ' rights when it becomes politically inconvenient

In [38]:
for v in test[test.label == 'conservative'].sample(5).values:
    print(v)

['conservative'
 'i note there are some small increases included in the bill for invasive species efforts by the fish and wildlife service also .']
['conservative' 'mr. chairman , i yield myself 30 seconds .']
['conservative'
 "it 's a tremendous step in the right direction for this nation to achieve energy independence ."]
['conservative'
 '`` and , madam speaker , 60 percent said that democrats want to use this issue for political advantage rather than trying to get at what went wrong .']
['conservative'
 'thus , there is no reason to believe that the mass action provision would affect any vioxx-related cases whatsoever .']


In [32]:
for v in test[test.label == 'neutral'].sample(5).values:
    print(v)

['neutral'
 'we should be very proud that , on this issue , such diverse groups as the aclu , the american conservative union , the gun owners of america , the u.s. chamber of commerce , the american library association and the american book sellers association have come together to say to congress , please support the senate version .']
['neutral'
 "according to a recent report from the department of labor 's bureau of labor statistics , over the next decade , seven out of the 10 fastest-growing occupations will be low-paying , low-skilled jobs that do not require a college education ."]
['neutral'
 'president george bush has not created one new job in the private sector since he has been in office ; he has lost jobs .']
['neutral'
 'mr. speaker , can i inquire again as to how much time remains ?']
['neutral'
 'i am not going to suggest that trade alone is the only reason for the decline of the middle class .']
