In [1]:
import pandas as pd
import nltk
import time, os, pickle
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, names, opinion_lexicon
from collections import Counter
from tqdm import tqdm, tqdm_notebook
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('names')
nltk.download('opinion_lexicon')

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to /home/alex/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/alex/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

In [2]:
datapath = r'/home/alex/Documents/MIDS/w266/final_project/data/convote_v1.1/data_stage_three'

In [3]:
os.listdir(datapath)

['test_set', 'development_set', 'training_set']

# Load data
Combine the three datasets into one: this data will be merged with other datasets later, and the train/test split will be done on the full combined data at that point.

In [4]:
# Walk the corpus directory and collect a (label, filepath) pair for every file.
files = []
# Party letter -> numeric ideology label:
# 'D' -> 1 (liberal), 'R' -> -1 (conservative), 'I' -> 0 (neutral)
party2label = {'D':1,'R':-1,'I':0}

# iterate through directory
for dirName, subDirList, fileList in os.walk(datapath):
    if dirName != datapath:
        # Report the directory that is actually being walked. Looking the name
        # up by position in os.listdir(datapath) only works if both calls
        # enumerate in the same order and the tree has no nested folders.
        print(os.path.basename(dirName), ':', len(fileList))
    for f in fileList:
        # the party letter is the first character of the last
        # '_'-separated field of the filename
        label = party2label[f.split('_')[-1][0]]

        # store the filepath with the label attached
        filepath = os.path.join(dirName,f)
        files.append((label, filepath))
print('Total files :', len(files))

test_set : 860
development_set : 257
training_set : 2740
Total files : 3857


In [5]:
# Read each file, split it into sentences, and build one (label, text) row
# per sentence.
sentences = []
for label, filename in files:
    with open(filename, 'r') as f:
        raw_text = f.read()
    # every sentence inherits the label of the file it came from
    sentences.extend((label, sent) for sent in sent_tokenize(raw_text))

df = pd.DataFrame(sentences, columns=['label', 'text'])
print('Total sentences: ', df.shape[0])

Total sentences:  54830


In [6]:
df.sample(5)

Unnamed: 0,label,text
29305,1,when congress passed h. con .
50317,1,the one aspect of this bill that seems directe...
38126,-1,"the states , the courts and the american peopl..."
54484,1,"mr. speaker , the gentleman brags about the ad..."
22354,-1,"mr. speaker , i thank the gentleman for yieldi..."


In [7]:
df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
-1,26155
0,236
1,28439


# Determine most politically charged bigrams and trigrams

In [8]:
# partition the sentences by ideology label: 1 = liberal, -1 = conservative
lib = df[df['label'] == 1]
con = df[df['label'] == -1]

In [9]:
# NLTK's English stop words (lower-cased) followed by hand-picked extra tokens
extraStops = ['``', "'s", 'sensenbrenner', 'chairman', "n't", 'support','extraneous','even', 'thank']
stops = [w.lower() for w in stopwords.words('english')]
stops += extraStops

stops[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [10]:
# lower-cased first names from the NLTK names corpus (male list, then female list)
men, women = (
    [w.lower() for w in names.words(fileid)]
    for fileid in ('male.txt', 'female.txt')
)
stopNames = men + women

In [11]:
ops = opinion_lexicon.words()

In [12]:
len(stops), len(stopNames), len(ops)

(188, 7944, 6789)

In [13]:
# Merge the first names into the stop words. Building a new set instead of
# extending the list in place keeps this cell safe to re-run: a second run
# neither appends the names again nor fails once `stops` is already a set.
stops = set(stops).union(stopNames)

In [14]:
stops = set(stops)
ops = set(ops)

In [15]:
len(stops), len(ops)

(7761, 6786)

In [16]:
# define a helper function to pull the n-grams
def getNgrams(text, stops=None, ops=None, n=2):
    '''Count the word n-grams of ``text``.

    The text is tokenized with ``word_tokenize`` and lower-cased. Each run of
    ``n`` consecutive tokens is joined with single spaces and counted,
    subject to two optional filters:

    * ``stops`` -- when given and non-empty, an n-gram is kept only if every
      token is longer than one character, is not numeric, and is not in
      ``stops``. When ``stops`` is None or empty no token filtering is done.
    * ``ops`` -- when given and non-empty, an n-gram is kept only if at
      least one of its tokens is in ``ops``.

    Args:
        text: the string to extract n-grams from.
        stops: optional collection of tokens to exclude.
        ops: optional collection of tokens, one of which must be present.
        n: number of tokens per n-gram (default 2); must be at least 1.

    Returns:
        A Counter mapping each surviving n-gram string to its count. It is
        empty when the text has fewer than ``n`` tokens.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    '''
    if n < 1:
        raise ValueError('n must be a positive integer')

    # intialize counter
    ngrams = Counter()

    # tokenize and lower-case the text
    words = [w.lower() for w in word_tokenize(text)]

    # a text with fewer than n tokens contains no n-grams
    if len(words) < n:
        return ngrams

    # Evaluate the token filter once per token rather than once per window
    # the token appears in.
    if stops:
        keep = [len(w) > 1 and not w.isnumeric() and w not in stops
                for w in words]
    else:
        keep = [True] * len(words)

    # None means "no opinion-word requirement"
    charged = [w in ops for w in words] if ops else None

    # slide a window of n tokens over the text
    for i in range(len(words) - n + 1):
        if not all(keep[i:i+n]):
            continue
        if charged is not None and not any(charged[i:i+n]):
            continue
        ngrams[' '.join(words[i:i+n])] += 1

    return ngrams

In [17]:
# tally the liberal trigrams whose tokens all pass the `stops` filter
libTrigrams = Counter()
for sentence in tqdm(lib.text):
    sentenceTrigrams = getNgrams(sentence, n=3, stops=stops)
    libTrigrams.update(sentenceTrigrams)

100%|██████████| 28439/28439 [00:06<00:00, 4846.82it/s]


In [18]:
# tally the liberal bigrams that pass the `stops` filter and contain at
# least one token from `ops`
libBigrams = Counter()
for sentence in tqdm(lib.text):
    sentenceBigrams = getNgrams(sentence, n=2, stops=stops, ops=ops)
    libBigrams.update(sentenceBigrams)

100%|██████████| 28439/28439 [00:06<00:00, 4709.07it/s]


In [19]:
# tally the conservative trigrams whose tokens all pass the `stops` filter
conTrigrams = Counter()
for sentence in tqdm(con.text):
    sentenceTrigrams = getNgrams(sentence, n=3, stops=stops)
    conTrigrams.update(sentenceTrigrams)

100%|██████████| 26155/26155 [00:05<00:00, 4547.01it/s]


In [20]:
# tally the conservative bigrams that pass the `stops` filter and contain at
# least one token from `ops`
conBigrams = Counter()
for sentence in tqdm(con.text):
    sentenceBigrams = getNgrams(sentence, n=2, stops=stops, ops=ops)
    conBigrams.update(sentenceBigrams)

100%|██████████| 26155/26155 [00:05<00:00, 4634.27it/s]


Most of these n-grams are neutral. Grab the 1000 most common bigrams and trigrams from each ideology, then filter out any that also appear in the other ideology's top 1000.

In [21]:
# the 1000 most frequent trigrams and bigrams of each ideology, counts dropped
commonConTrigrams = [gram for gram, count in conTrigrams.most_common(1000)]
commonLibTrigrams = [gram for gram, count in libTrigrams.most_common(1000)]
commonConBigrams = [gram for gram, count in conBigrams.most_common(1000)]
commonLibBigrams = [gram for gram, count in libBigrams.most_common(1000)]

In [22]:
# Keep, for each ideology, only the n-grams that are in its own top 1000 and
# not in the other ideology's top 1000, paired with their counts.
# Membership is tested against sets so each lookup is constant-time instead
# of a scan over a 1000-element list; the order of the results still follows
# the original top-1000 lists.
# NOTE: nothing in this cell strips 'amp nbsp' HTML artifacts from the
# conservative n-grams; if they show up they must be removed separately.
conTrigramSet = set(commonConTrigrams)
libTrigramSet = set(commonLibTrigrams)
conBigramSet = set(commonConBigrams)
libBigramSet = set(commonLibBigrams)

libTrigrams_filtered = [(w, libTrigrams[w]) for w in commonLibTrigrams if w not in conTrigramSet]
libBigrams_filtered = [(w, libBigrams[w]) for w in commonLibBigrams if w not in conBigramSet]
conTrigrams_filtered = [(w, conTrigrams[w]) for w in commonConTrigrams if w not in libTrigramSet]
conBigrams_filtered = [(w, conBigrams[w]) for w in commonConBigrams if w not in libBigramSet]

In [23]:
len(libTrigrams_filtered),len(conTrigrams_filtered),len(libBigrams_filtered),len(conBigrams_filtered)

(751, 751, 665, 665)

In [24]:
print('top 10 liberal trigrams:')
# highest count first; ties keep their original relative order
rankedLibTrigrams = sorted(libTrigrams_filtered, key=lambda pair: pair[1], reverse=True)
for k, v in rankedLibTrigrams[:10]:
    print(k)

top 10 liberal trigrams:
social security trust
security trust fund
cbc alternative budget
black caucus budget
estate tax relief
privatize social security
u.s. trade deficit
republican budget resolution
national wildlife refuge
guardian ad litem


In [25]:
print('top 10 liberal bigrams:')
for k,v in sorted(libBigrams_filtered,key=lambda x: -x[1])[:10]:
    print(k)

top 10 liberal bigrams:
tax breaks
security trust
bad policy
would lose
reduce crime
budget reconciliation
ethical standard
fiscally irresponsible
working poor
subpoena power


In [26]:
print('top 10 conservative trigrams')
for k,v in sorted(conTrigrams_filtered,key=lambda x: -x[1])[:10]:
    print(k)

top 10 conservative trigrams
national electrical contractors
electrical contractors association
legislative days within
inner cell mass
head start program
community protection act
million new jobs
death tax repeal
9/11 commission report
stem cells without


In [28]:
print('top 10 conservative bigrams')
for k,v in sorted(conBigrams_filtered,key=lambda x: -x[1])[:10]:
    print(k)

top 10 conservative bigrams
community protection
free market
organized crime
bankruptcy relief
good news
relief extension
delayed notification
soft money
illegal aliens
invasive species


In [34]:
# For each ideology: the 100 highest-count filtered trigrams followed by the
# 100 highest-count filtered bigrams, with the counts dropped.
libGrams = [gram for gram, count in sorted(libTrigrams_filtered, key=lambda pair: pair[1], reverse=True)[:100]]
libGrams += [gram for gram, count in sorted(libBigrams_filtered, key=lambda pair: pair[1], reverse=True)[:100]]
conGrams = [gram for gram, count in sorted(conTrigrams_filtered, key=lambda pair: pair[1], reverse=True)[:100]]
conGrams += [gram for gram, count in sorted(conBigrams_filtered, key=lambda pair: pair[1], reverse=True)[:100]]

In [35]:
# write both n-gram lists to pickle files under ../data
for grams, picklePath in ((libGrams, '../data/libGrams5.pickle'),
                          (conGrams, '../data/conGrams5.pickle')):
    with open(picklePath, 'wb') as f:
        pickle.dump(grams, f, pickle.HIGHEST_PROTOCOL)

# Load bigrams

In [29]:
# reload each saved n-gram list and preview its first five entries
with open('../data/libGrams5.pickle', 'rb') as f:
    libGrams = pickle.load(f)
print(libGrams[:5])

with open('../data/conGrams5.pickle', 'rb') as f:
    conGrams = pickle.load(f)
print(conGrams[:5])

['social security trust', 'security trust fund', 'cbc alternative budget', 'black caucus budget', 'estate tax relief']
['national electrical contractors', 'electrical contractors association', 'legislative days within', 'inner cell mass', 'head start program']


In [30]:
len(libGrams), len(conGrams)

(200, 200)

In [31]:
libGrams[-10:]

['hard look',
 'corporate interests',
 'critical funding',
 'broken promises',
 'safe drinking',
 'religious persecution',
 'work within',
 'reconciliation package',
 'new debt',
 'budget supports']

In [32]:
conGrams[-10:]

['necessary reforms',
 'another attack',
 'wrong message',
 'particularly pleased',
 'great strides',
 'best possible',
 'federal criminal',
 'adversarial relationship',
 'top rate',
 'environmental concerns']

# Filter Data
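Filter the original dataset: keep a liberal or conservative sentence only if it contains at least one of its ideology's politically charged bigrams or trigrams, and keep a neutral sentence only if it contains none of them.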

In [40]:
def keepSentence(label, text):
    '''Decide whether a sentence stays in the filtered dataset.

    Returns False for any sentence with fewer than six whitespace-separated
    tokens. Otherwise the sentence's bigrams and trigrams are compared with
    the module-level ``libGrams`` and ``conGrams`` lists:

    * label 1  -> the set of its n-grams found in ``libGrams``
    * label -1 -> the set of its n-grams found in ``conGrams``
    * any other label -> True when it matches neither list, else False

    The result is meant to be used for its truthiness; for labels 1 and -1
    a non-empty set also tells which n-grams matched.
    '''
    if len(text.split()) < 6:
        return False

    # every bigram and trigram of the sentence, with no stop-word filtering
    candidates = set(getNgrams(text, n=2)) | set(getNgrams(text, n=3))

    libHits = candidates.intersection(libGrams)
    conHits = candidates.intersection(conGrams)

    if label == 1:
        return libHits
    if label == -1:
        return conHits
    # remaining labels are kept only when free of charged n-grams
    return not (libHits or conHits)

In [41]:
def filterText(df):
    '''Return the rows of ``df`` that ``keepSentence`` accepts.

    Args:
        df: DataFrame with a ``label`` column and a ``text`` column. Any
            index is accepted; rows are visited in positional order.

    Returns:
        A new DataFrame with columns ``label``, ``text`` and ``ngrams``,
        where ``ngrams`` holds the truthy value ``keepSentence`` returned
        for that row. The result has a fresh default index.
    '''
    filteredText = []
    # Iterate over the column values directly. Looking rows up with
    # df.label[i] is label-based and fails (or picks the wrong row) whenever
    # the frame does not carry a default 0..n-1 index.
    rows = zip(df['label'], df['text'])
    # zip has no length, so pass the total for the progress bar explicitly
    for label, text in tqdm(rows, total=len(df)):
        ngrams = keepSentence(label, text)
        if ngrams:
            filteredText.append((label, text, ngrams))

    return pd.DataFrame(filteredText, columns=['label','text','ngrams'])

In [42]:
df_filtered = filterText(df)

100%|██████████| 54830/54830 [00:25<00:00, 2127.24it/s]


In [43]:
df.shape, df_filtered.shape

((54830, 2), (2922, 3))

In [44]:
df_filtered.groupby('label').count()

Unnamed: 0_level_0,text,ngrams
label,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,1326,1326
0,213,213
1,1383,1383


In [45]:
# print a random handful of kept rows (label, text, matched n-grams)
n = 20
sample = df_filtered.sample(n)
sampledRows = (sample.iloc[pos].values for pos in range(n))
for i, rowValues in enumerate(sampledRows):
    print(i, rowValues)

0 [1
 'the second question we will be asking will be : would you rather spend $ 7.9 billion on a ballistic missile defense program which has been tested time after time after time and has failed all of those tests , or would you rather spend that $ 7.8 billion on providing more security to our troops , body armor , personnel support equipment , and other protective gear for our troops , and providing more benefits to our veterans in this country ?'
 {'ballistic missile defense'}]
1 [1
 'they have been especially resistant to providing the adequate funding along the borders , especially the canadian border .'
 {'adequate funding'}]
2 [-1
 'the sponsors intend that such a conclusion would favor allowing the state court in which the action was originally filed to handle the litigation .'
 {'would favor'}]
3 [1
 "those of us on this side of the aisle believe we have a better approach , one that is fair to millions of americans and their families who get up every morning , put in a hard day

In [49]:
# persist only the label and text columns; the ngrams column is not saved
labelledText = df_filtered[['label', 'text']]
with open('../data/filteredConvote5.pickle', 'wb') as f:
    pickle.dump(labelledText, f, pickle.HIGHEST_PROTOCOL)

In [33]:
with open('../data/filteredConvote5.pickle', 'rb') as f:
    test = pickle.load(f)
test.sample(5)

Unnamed: 0,label,text
1273,-1,what it merely says is that in an instance whe...
1337,-1,"in years past , when those of us on the subcom..."
1333,-1,i have here an april 26 story from the associa...
2761,1,why close the doors to those who are injured b...
1983,-1,"finally , i must oppose this bill because it e..."


In [43]:
for s in test[test.label == 1].sample(5).text.values:
    print(s,'\n')

it is clear that there would be plenty of money to deal with the social security trust fund if the president were not using the social security trust fund as a slush fund to give tax cuts to the wealthiest people in america . 

this bill is another missed opportunity to take america into the future , to take america into the leadership around the world in energy production , energy innovation , and energy technology ; to create a new generation of important products , and a new generation of jobs . 

instead , these changes will make it harder for people legitimately fleeing persecution to prove their asylum claims and gain protection here . 

in it , congress provides the yearly resources needed to keep our families healthy , our children educated , our workers employed , and our most vulnerable citizens a productive part of our society . 

under the business records provision , section 215 of the patriot act , the bill provides that the government may seek a court order for `` any ta

In [41]:
for s in test[test.label == -1].sample(5).text.values:
    print(s,'\n')

what do you do about private property rights ? 

on both the business records and delayed notification sections of the patriot act ( among others ) , the stance of the american civil liberties union and like-minded critics seems to have an ulterior motive . 

it will create a comprehensive national system for sex offender registration , improve information exchange between states when sex offenders move from state to state , and increase penalties for failing to comply with the registration law . 

i ask members to support the osha reform and in particular h.r. 

that legislation helped to streamline the intelligence community and tightened some asylum rules that allowed potential terrorists to remain in our country . 



In [42]:
for s in test[test.label == 0].sample(5).text.values:
    print(s,'\n')

let us look at what is going on in america today . 

andy grove , the founder of intel , predicts that the united states will lose the bulk of its information technology to jobs to china and india within the next decade . 

just yesterday , we learned that general motors is now going to cut back on another 25 , 000 good-paying jobs for american workers . 

let us pass this resolution . 

mr. speaker , parliamentary inquiry . 

