In [1]:
import pandas as pd
# import matplotlib.pyplot as plt
import nltk
import time, os, re, pickle
from nltk.tokenize import word_tokenize, sent_tokenize
# from itertools import combinations, permutations
from collections import Counter
# plt.style.use('ggplot')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
datapath = r'/home/alex/Documents/MIDS/w266/final_project/data/convote_v1.1/data_stage_three'

In [3]:
os.listdir(datapath)

['test_set', 'development_set', 'training_set']

# Load data
combine the three datasets into one since we are adding it to other datasets later and will do a train test split on the full data at that point

In [4]:
# check how many files we are working with and load them into a list
files = []
party2label = {'D':'liberal','R':'conservative','I':'neutral'}

# iterate through directory
for i, (dirName, subDirList, fileList) in enumerate(os.walk(datapath)):
    if i > 0:
        # print the number of files in each dataset
        print(os.listdir(datapath)[i-1], ':', len(fileList))
    for i,f in enumerate(fileList):
        # convert from party label to ideological label
        label = party2label[f.split('_')[-1][0]]
        
        # store the filepath with the label attached
        filepath = os.path.join(dirName,f)
        files.append((label, filepath))
print('Total files :', len(files))

test_set : 860
development_set : 257
training_set : 2740
Total files : 3857


In [5]:
files[0]

('conservative',
 '/home/alex/Documents/MIDS/w266/final_project/data/convote_v1.1/data_stage_three/test_set/414_400343_2897011_RON.txt')

In [6]:
# parse sentences and load into a DataFrame
sentences = []
for label, filename in files:
    with open(filename, 'r') as f:
        # split the sentences
        sents = sent_tokenize(f.read())
        
        # collect (label, sentence) tuples
        for sent in sents:
            sentences.append((label, sent))

df = pd.DataFrame(sentences, columns=['label', 'text'])
print('Total sentences: ', df.shape[0])

Total sentences:  54830


In [7]:
df.sample(5)

Unnamed: 0,label,text
53615,conservative,"mr. chairman , reserving the right to object ,..."
17417,liberal,it does not really speak to trying to make sur...
19638,liberal,"now , you should know where i am coming from ,..."
35567,liberal,"because the needs of victims are so crucial , ..."
39609,liberal,"if released into the water table , a small por..."


In [8]:
df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
conservative,26155
liberal,28439
neutral,236


# Determine most politically charged bigrams

In [9]:
# split data into liberal and conservative
lib = df.loc[df.label == 'liberal']
con = df.loc[df.label == 'conservative']

In [10]:
# define a helper function to pull the bigrams
def getBigrams(text):
    '''return all bigrams in a Counter'''
    # intialize counter
    bigrams = Counter()
    
    # tokenize text
    words = word_tokenize(text)
    
    # filter punctuation and single letter words
    words = [w.lower() for w in words if w.isalpha() and len(w) > 1]
    
    # throw out sentences with less than two words
    if len(words) < 2:
        return bigrams
    
    # add bigrams to Counter
    for i in range(len(words) - 1):
        b = ' '.join((words[i], words[i+1]))
        bigrams[b] += 1
        
    return bigrams

In [11]:
libBigrams = Counter()
for text in lib.text:
    libBigrams.update(getBigrams(text))

In [12]:
libBigrams.most_common()[:10]

[('of the', 4107),
 ('in the', 2570),
 ('to the', 1824),
 ('it is', 1682),
 ('this bill', 1544),
 ('for the', 1260),
 ('on the', 1259),
 ('and the', 1240),
 ('we are', 1149),
 ('that is', 1136)]

In [13]:
conBigrams = Counter()
for text in con.text:
    conBigrams.update(getBigrams(text))

In [14]:
conBigrams.most_common()[:10]

[('of the', 4135),
 ('in the', 2510),
 ('it is', 1759),
 ('to the', 1649),
 ('the gentleman', 1331),
 ('we have', 1276),
 ('and the', 1262),
 ('that is', 1176),
 ('on the', 1167),
 ('that we', 1119)]

most of these bigrams are neutral. Grab the 1000 most common from each ideology, then filter out any that appear in the other ideology

In [15]:
# get the 1000 most common liberal and conservative bigrams
commonCon = [b[0] for b in conBigrams.most_common()[:2000]]
commonLib = [b[0] for b in libBigrams.most_common()[:2000]]

In [16]:
# keep only the bigrams for each ideology that appear in the top 1000 of that ideology and not in the top 1000 of the other
libBigrams_filtered = [(w,libBigrams[w]) for w in commonLib[:1000] if w not in commonCon]
# conservative bigrams also had weird 'amp nbsp' HTML tags so remove those
conBigrams_filtered = [(w,conBigrams[w]) for w in commonCon[:1000] if w not in commonLib and 'nbsp' not in w]

In [17]:
len(libBigrams_filtered),len(conBigrams_filtered)

(85, 63)

In [19]:
print('top 10 liberal bigrams:')
sorted(libBigrams_filtered,key=lambda x: -x[1])

top 10 liberal bigrams:


[('the republicans', 172),
 ('nothing to', 138),
 ('the cbc', 129),
 ('republican budget', 117),
 ('the estate', 103),
 ('civil rights', 102),
 ('republican leadership', 102),
 ('fails to', 99),
 ('the bush', 95),
 ('this administration', 94),
 ('professor of', 93),
 ('bush administration', 91),
 ('to oppose', 91),
 ('republican majority', 85),
 ('by million', 82),
 ('oppose this', 81),
 ('middle class', 80),
 ('strong opposition', 77),
 ('cuts to', 77),
 ('cuts for', 77),
 ('no on', 74),
 ('left behind', 74),
 ('to cut', 72),
 ('working families', 70),
 ('vote no', 70),
 ('tax cut', 70),
 ('cbc budget', 69),
 ('by billion', 68),
 ('the oil', 68),
 ('the social', 67),
 ('no child', 67),
 ('the wealthiest', 66),
 ('does nothing', 65),
 ('the alternative', 64),
 ('child left', 63),
 ('the pentagon', 63),
 ('oil companies', 62),
 ('school of', 62),
 ('war in', 61),
 ('appropriations bill', 60),
 ('budget that', 58),
 ('the wrong', 57),
 ('million americans', 57),
 ('alternative minimum', 

In [20]:
print('top 10 conservative bigrams')
sorted(conBigrams_filtered,key=lambda x: -x[1])

top 10 conservative bigrams


[('death tax', 175),
 ('economic growth', 82),
 ('driver license', 82),
 ('protection act', 81),
 ('human life', 78),
 ('chairman for', 77),
 ('house resolution', 77),
 ('of human', 77),
 ('hard work', 75),
 ('world trade', 74),
 ('forward to', 73),
 ('his leadership', 73),
 ('jurisdiction over', 71),
 ('ask for', 70),
 ('and ask', 69),
 ('class actions', 67),
 ('the corps', 65),
 ('the embryo', 64),
 ('the things', 63),
 ('human embryos', 63),
 ('may not', 62),
 ('on rules', 61),
 ('resolution and', 61),
 ('little bit', 61),
 ('bone marrow', 61),
 ('an embryo', 59),
 ('economy and', 58),
 ('look forward', 58),
 ('is something', 58),
 ('the chinese', 58),
 ('new jobs', 57),
 ('destruction of', 56),
 ('trade organization', 56),
 ('call up', 55),
 ('frivolous lawsuits', 54),
 ('ask unanimous', 54),
 ('its immediate', 54),
 ('passed the', 54),
 ('on terror', 53),
 ('form of', 53),
 ('and thank', 53),
 ('very very', 53),
 ('all members', 52),
 ('immediate consideration', 52),
 ('the border

Looks much more reasonable

In [21]:
# keep just the bigrams and drop the count data
libGrams = [k for k,v in libBigrams_filtered]
conGrams = [k for k,v in conBigrams_filtered]
savepath = r'../data'
# with open(os.path.join(savepath, 'libGrams.pickle'), 'wb') as f:
#     pickle.dump(libGrams, f, pickle.HIGHEST_PROTOCOL)
# with open(os.path.join(savepath, 'conGrams.pickle'), 'wb') as f:
#     pickle.dump(conGrams, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(savepath, 'libGrams2.pickle'), 'wb') as f:
    pickle.dump(libGrams, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(savepath, 'conGrams2.pickle'), 'wb') as f:
    pickle.dump(conGrams, f, pickle.HIGHEST_PROTOCOL)

In [22]:
with open(os.path.join(savepath, 'libGrams2.pickle'), 'rb') as f:
    test = pickle.load(f)
    print(test[:5])
with open(os.path.join(savepath, 'conGrams2.pickle'), 'rb') as f:
    test = pickle.load(f)
    print(test[:5])

['the republicans', 'nothing to', 'the cbc', 'republican budget', 'the estate']
['death tax', 'economic growth', 'driver license', 'protection act', 'human life']


# Determine most politically charged lexicons

In [23]:
# define a helper function to pull the bigrams
def getLexicons(text):
    '''return all bigrams in a Counter'''
    # intialize counter
    lexicons = Counter()
    
    # tokenize text
    words = word_tokenize(text)
    
    # filter punctuation and single letter words
    words = [w.lower() for w in words if w.isalpha() and len(w) > 1]
    
    # add lexicons to Counter
    for w in words:
        lexicons[w] += 1
        
    return lexicons

In [24]:
getLexicons(df.text[0])

Counter({'madam': 1,
         'speaker': 1,
         'yield': 1,
         'myself': 1,
         'such': 1,
         'time': 1,
         'as': 1,
         'may': 1,
         'consume': 1})

In [25]:
libLexicons = Counter()
for text in lib.text:
    libLexicons.update(getLexicons(text))

In [26]:
libLexicons.most_common()[:10]

[('the', 35641),
 ('to', 19094),
 ('of', 16658),
 ('and', 15338),
 ('that', 11591),
 ('in', 10829),
 ('is', 8746),
 ('this', 7928),
 ('for', 6904),
 ('we', 6716)]

In [27]:
conLexicons = Counter()
for text in con.text:
    conLexicons.update(getLexicons(text))

In [28]:
conLexicons.most_common()[:10]

[('the', 32341),
 ('to', 17535),
 ('of', 16200),
 ('and', 15279),
 ('that', 11445),
 ('in', 9823),
 ('is', 8510),
 ('we', 7102),
 ('this', 6905),
 ('for', 6033)]

most of these bigrams are neutral. Grab the 1000 most common from each ideology, then filter out any that appear in the other ideology

In [29]:
# get the 1000 most common liberal and conservative lexicons
commonCon = [L[0] for L in conLexicons.most_common()[:2000]]
commonLib = [L[0] for L in libLexicons.most_common()[:2000]]

In [30]:
# keep only the bigrams for each ideology that appear in the top 1000 of that ideology and not in the top 1000 of the other
libLexicons_filtered = [(w,libLexicons[w]) for w in commonLib[:1000] if w not in commonCon]
# conservative bigrams also had weird 'amp nbsp' HTML tags so remove those
conLexicons_filtered = [(w,conLexicons[w]) for w in commonCon[:1000] if w not in commonLib and 'nbsp' not in w and 'amp' not in w]

In [31]:
len(libLexicons_filtered),len(conLexicons_filtered)

(23, 9)

In [32]:
print('top 20 liberal lexicons:')
sorted(libLexicons_filtered,key=lambda x: -x[1])[:20]

top 20 liberal lexicons:


[('cbc', 143),
 ('gun', 125),
 ('values', 122),
 ('deficits', 115),
 ('fails', 110),
 ('theresa', 103),
 ('failed', 99),
 ('worse', 93),
 ('black', 86),
 ('reject', 85),
 ('michael', 80),
 ('wealthy', 79),
 ('wealthiest', 78),
 ('wildlife', 78),
 ('partisan', 76),
 ('caucus', 76),
 ('profits', 73),
 ('estates', 71),
 ('irresponsible', 71),
 ('student', 71)]

In [33]:
print('top 20 conservative lexicons')
sorted(conLexicons_filtered,key=lambda x: -x[1])[:20]

top 20 conservative lexicons


[('embryo', 231),
 ('shall', 108),
 ('gangs', 94),
 ('stage', 77),
 ('sensenbrenner', 76),
 ('fence', 76),
 ('activity', 69),
 ('sales', 62),
 ('addresses', 62)]

In [34]:
# keep just the bigrams and drop the count data
libCons = [k for k,v in libLexicons_filtered]
conCons = [k for k,v in conLexicons_filtered]
savepath = r'../data'
# with open(os.path.join(savepath, 'libCons.pickle'), 'wb') as f:
#     pickle.dump(libCons, f, pickle.HIGHEST_PROTOCOL)
# with open(os.path.join(savepath, 'conCons.pickle'), 'wb') as f:
#     pickle.dump(conCons, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(savepath, 'libCons2.pickle'), 'wb') as f:
    pickle.dump(libCons, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(savepath, 'conCons2.pickle'), 'wb') as f:
    pickle.dump(conCons, f, pickle.HIGHEST_PROTOCOL)

In [35]:
with open(os.path.join(savepath, 'libCons2.pickle'), 'rb') as f:
    test = pickle.load(f)
    print(test[:5])
with open(os.path.join(savepath, 'conCons2.pickle'), 'rb') as f:
    test = pickle.load(f)
    print(test[:5])

['cbc', 'gun', 'values', 'deficits', 'fails']
['embryo', 'shall', 'gangs', 'stage', 'sensenbrenner']


# Filter Data
filter the original dataset for only those sentences that contain politically charged bigrams and lexicons

In [36]:
def keepSentence(label, text):
    # get bigrams and lexicons
    bigrams = getBigrams(text).keys()
    lexicons = getLexicons(text).keys()
    
    # get the bigrams and lexicons that appear in the ideology lists
    libBigramSet = set(bigrams).intersection(libGrams)
    libLexiconSet = set(lexicons).intersection(libCons)
    conBigramSet = set(bigrams).intersection(conGrams)
    conLexiconSet = set(lexicons).intersection(conCons)
    
    # determine whether to keep the sentence
    if label == 'liberal':
        if libBigramSet or libLexiconSet:
            return True
        else:
            return False
    elif label == 'conservative':
        if conBigramSet or conLexiconSet:
            return True
        else:
            return False
    elif label == 'neutral':
        if libBigramSet or libLexiconSet or conBigramSet or conLexiconSet:
            return False
        else:
            return True

In [37]:
def filterText(df):
    filteredText = []
    for i in range(df.shape[0]):
        label,text = df.label[i], df.text[i]
        if keepSentence(label,text):
            filteredText.append((label, text))
            
    return pd.DataFrame(filteredText, columns=['label','text'])

In [38]:
df_filtered = filterText(df)
df_filtered.sample(5)

Unnamed: 0,label,text
54,liberal,the estate tax level attaches at $ 1.5 million...
2901,liberal,mr. pearse 's investigation concluded that the...
8401,liberal,"but i think the gentleman is correct , it woul..."
171,liberal,the resolution correctly says there is a need ...
7106,neutral,industry after industry has been decimated in ...


In [39]:
df.shape, df_filtered.shape

((54830, 2), (8459, 2))

In [40]:
df_filtered.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
conservative,3399
liberal,4885
neutral,175


In [45]:
sample = df_filtered.sample(10)
for i in range(sample.shape[0]):
    print(i, sample.label.iloc[i], sample.text.iloc[i])

0 liberal it is unfortunate that republicans in congress are choosing to strip away essential safeguards for families in order to implement tax cuts benefiting the wealthiest americans .
1 conservative in the closing days of the 108th congress , some in the other body objected to commonsense provisions that deal with our national security and our border security , to wit , the notion that when you apply for a driver 's license or another legal document , you should be who you say you are , and you should enjoy legal status in this country .
2 liberal before , the invasion , the pentagon planned to reduce our troop levels to 20 , 000-30 , 000 within a few weeks of overthrowing saddam hussein .
3 conservative it should start by requesting funding for all of the border enforcement positions that congress authorized last year .
4 liberal and i think when we see that five out of eight of the top appointments at fema are political hacks , i think when we look at passing a drug bill that does

In [46]:
savepath = r'../data'
# with open(os.path.join(savepath, 'filteredConvote.pickle'), 'wb') as f:
#     pickle.dump(df_filtered, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(savepath, 'filteredConvote2.pickle'), 'wb') as f:
    pickle.dump(df_filtered, f, pickle.HIGHEST_PROTOCOL)

In [47]:
with open(os.path.join(savepath, 'filteredConvote2.pickle'), 'rb') as f:
    test = pickle.load(f)
test.sample(5)

Unnamed: 0,label,text
7150,conservative,"after months of negotiation , i have been told..."
1158,conservative,"and in spite of all this , the death tax does ..."
4981,conservative,this provision is intended to respond to conce...
660,liberal,we must work to meet our existing obligations ...
1486,conservative,you asked if cells from the inner cell mass co...


In [48]:
for i,s in test.sample(10).values:
    print(i,s)

conservative so i am in strong support of this resolution , and i hope all members of the house of representatives will support it .
conservative the construction of that fence is critical to our national security and has been delayed for far too long and i think it is imperative that it be constructed as soon as possible .
liberal it was not on these provisions as you know because a change was made , not in a partisan sense , according to the gentleman from maryland ( mr. cardin )  .
conservative and who can forget on september 11 , 2001 , when firefighters in new york pulled our flag out of the rubble of the world trade center and hoisted it in defiance of terror ?
conservative its provisions should be read broadly , with a strong preference that interstate class actions should be heard in a federal court if removed by any defendant .
conservative `` all of the research you have there stops short of the creation of an embryo for experimental purposes , and short of destruction of an 