In [1]:
import pandas as pd
# import matplotlib.pyplot as plt
import nltk
import time, os, re, pickle
from nltk.tokenize import word_tokenize, sent_tokenize
# from itertools import combinations, permutations
from collections import Counter
# plt.style.use('ggplot')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
datapath = r'/home/alex/Documents/MIDS/w266/final_project/data/convote_v1.1/data_stage_three'

In [3]:
os.listdir(datapath)

['test_set', 'development_set', 'training_set']

# Load data
combine the three datasets into one since we are adding it to other datasets later and will do a train test split on the full data at that point

In [4]:
# check how many files we are working with and load them into a list
files = []
party2label = {'D':'liberal','R':'conservative','I':'neutral'}

# iterate through directory
for i, (dirName, subDirList, fileList) in enumerate(os.walk(datapath)):
    if i > 0:
        # print the number of files in each dataset
        print(os.listdir(datapath)[i-1], ':', len(fileList))
    for i,f in enumerate(fileList):
        # convert from party label to ideological label
        label = party2label[f.split('_')[-1][0]]
        
        # store the filepath with the label attached
        filepath = os.path.join(dirName,f)
        files.append((label, filepath))
print('Total files :', len(files))

test_set : 860
development_set : 257
training_set : 2740
Total files : 3857


In [5]:
files[0]

('conservative',
 '/home/alex/Documents/MIDS/w266/final_project/data/convote_v1.1/data_stage_three/test_set/414_400343_2897011_RON.txt')

In [6]:
# parse sentences and load into a DataFrame
sentences = []
for label, filename in files:
    with open(filename, 'r') as f:
        # split the sentences
        sents = sent_tokenize(f.read())
        
        # collect (label, sentence) tuples
        for sent in sents:
            sentences.append((label, sent))

df = pd.DataFrame(sentences, columns=['label', 'text'])
print('Total sentences: ', df.shape[0])

Total sentences:  54830


In [7]:
df.sample(5)

Unnamed: 0,label,text
13657,liberal,i do not want you to make it any better .
14326,conservative,i have no doubt that the people of the states ...
36979,conservative,its members represent over 95 percent of world...
2432,conservative,"well , i put the majority of the blame in prac..."
14174,conservative,making the system less costly will increase jo...


In [8]:
df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
conservative,26155
liberal,28439
neutral,236


# Determine most politically charged bigrams

In [9]:
# split data into liberal and conservative
lib = df.loc[df.label == 'liberal']
con = df.loc[df.label == 'conservative']

In [10]:
# define a helper function to pull the bigrams
def getBigrams(text):
    '''return all bigrams in a Counter'''
    # intialize counter
    bigrams = Counter()
    
    # tokenize text
    words = word_tokenize(text)
    
    # filter punctuation and single letter words
    words = [w.lower() for w in words if w.isalpha() and len(w) > 1]
    
    # throw out sentences with less than two words
    if len(words) < 2:
        return bigrams
    
    # add bigrams to Counter
    for i in range(len(words) - 1):
        b = ' '.join((words[i], words[i+1]))
        bigrams[b] += 1
        
    return bigrams

In [11]:
libBigrams = Counter()
for text in lib.text:
    libBigrams.update(getBigrams(text))

In [12]:
libBigrams.most_common()[:10]

[('of the', 4107),
 ('in the', 2570),
 ('to the', 1824),
 ('it is', 1682),
 ('this bill', 1544),
 ('for the', 1260),
 ('on the', 1259),
 ('and the', 1240),
 ('we are', 1149),
 ('that is', 1136)]

In [13]:
conBigrams = Counter()
for text in con.text:
    conBigrams.update(getBigrams(text))

In [14]:
conBigrams.most_common()[:10]

[('of the', 4135),
 ('in the', 2510),
 ('it is', 1759),
 ('to the', 1649),
 ('the gentleman', 1331),
 ('we have', 1276),
 ('and the', 1262),
 ('that is', 1176),
 ('on the', 1167),
 ('that we', 1119)]

most of these bigrams are neutral. Grab the 1000 most common from each ideology, then filter out any that appear in the other ideology

In [15]:
# get the 1000 most common liberal and conservative bigrams
commonCon = [b[0] for b in conBigrams.most_common()[:1000]]
commonLib = [b[0] for b in libBigrams.most_common()[:1000]]

In [16]:
# keep only the bigrams for each ideology that appear in the top 1000 of that ideology and not in the top 1000 of the other
libBigrams_filtered = [(w,libBigrams[w]) for w in commonLib if w not in commonCon]
# conservative bigrams also had weird 'amp nbsp' HTML tags so remove those
conBigrams_filtered = [(w,conBigrams[w]) for w in commonCon if w not in commonLib and 'nbsp' not in w]

In [17]:
len(libBigrams_filtered),len(conBigrams_filtered)

(228, 226)

In [18]:
print('top 10 liberal bigrams:')
sorted(libBigrams_filtered,key=lambda x: -x[1])[:20]

top 10 liberal bigrams:


[('the republican', 429),
 ('social security', 269),
 ('tax cuts', 261),
 ('the republicans', 172),
 ('estate tax', 171),
 ('nothing to', 138),
 ('the cbc', 129),
 ('republican budget', 117),
 ('the public', 106),
 ('pay for', 104),
 ('iraq and', 104),
 ('the estate', 103),
 ('civil rights', 102),
 ('republican leadership', 102),
 ('vote against', 100),
 ('fails to', 99),
 ('programs that', 98),
 ('health and', 97),
 ('the bush', 95),
 ('this administration', 94)]

In [19]:
print('top 10 conservative bigrams')
sorted(conBigrams_filtered,key=lambda x: -x[1])[:20]

top 10 conservative bigrams


[('death tax', 175),
 ('the death', 170),
 ('our economy', 158),
 ('strong support', 128),
 ('cord blood', 110),
 ('rule and', 100),
 ('will help', 94),
 ('head start', 93),
 ('chairman reserve', 92),
 ('natural gas', 90),
 ('war on', 89),
 ('driver licenses', 88),
 ('work on', 87),
 ('important that', 86),
 ('economic growth', 82),
 ('driver license', 82),
 ('protection act', 81),
 ('the class', 81),
 ('human life', 78),
 ('chairman for', 77)]

Looks much more reasonable

In [20]:
# keep just the bigrams and drop the count data
libGrams = [k for k,v in libBigrams_filtered]
conGrams = [k for k,v in conBigrams_filtered]
savepath = r'../data'
with open(os.path.join(savepath, 'libGrams.pickle'), 'wb') as f:
    pickle.dump(libGrams, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(savepath, 'conGrams.pickle'), 'wb') as f:
    pickle.dump(conGrams, f, pickle.HIGHEST_PROTOCOL)

In [21]:
with open(os.path.join(savepath, 'libGrams.pickle'), 'rb') as f:
    test = pickle.load(f)
    print(test[:5])
with open(os.path.join(savepath, 'conGrams.pickle'), 'rb') as f:
    test = pickle.load(f)
    print(test[:5])

['the republican', 'social security', 'tax cuts', 'the republicans', 'estate tax']
['death tax', 'the death', 'our economy', 'strong support', 'cord blood']


# Determine most politically charged lexicons

In [22]:
# define a helper function to pull the bigrams
def getLexicons(text):
    '''return all bigrams in a Counter'''
    # intialize counter
    lexicons = Counter()
    
    # tokenize text
    words = word_tokenize(text)
    
    # filter punctuation and single letter words
    words = [w.lower() for w in words if w.isalpha() and len(w) > 1]
    
    # add lexicons to Counter
    for w in words:
        lexicons[w] += 1
        
    return lexicons

In [23]:
getLexicons(df.text[0])

Counter({'madam': 1,
         'speaker': 1,
         'yield': 1,
         'myself': 1,
         'such': 1,
         'time': 1,
         'as': 1,
         'may': 1,
         'consume': 1})

In [24]:
libLexicons = Counter()
for text in lib.text:
    libLexicons.update(getLexicons(text))

In [25]:
libLexicons.most_common()[:10]

[('the', 35641),
 ('to', 19094),
 ('of', 16658),
 ('and', 15338),
 ('that', 11591),
 ('in', 10829),
 ('is', 8746),
 ('this', 7928),
 ('for', 6904),
 ('we', 6716)]

In [26]:
conLexicons = Counter()
for text in con.text:
    conLexicons.update(getLexicons(text))

In [27]:
conLexicons.most_common()[:10]

[('the', 32341),
 ('to', 17535),
 ('of', 16200),
 ('and', 15279),
 ('that', 11445),
 ('in', 9823),
 ('is', 8510),
 ('we', 7102),
 ('this', 6905),
 ('for', 6033)]

most of these bigrams are neutral. Grab the 1000 most common from each ideology, then filter out any that appear in the other ideology

In [28]:
# get the 1000 most common liberal and conservative lexicons
commonCon = [L[0] for L in conLexicons.most_common()[:1000]]
commonLib = [L[0] for L in libLexicons.most_common()[:1000]]

In [29]:
# keep only the bigrams for each ideology that appear in the top 1000 of that ideology and not in the top 1000 of the other
libLexicons_filtered = [(w,libLexicons[w]) for w in commonLib if w not in commonCon]
# conservative bigrams also had weird 'amp nbsp' HTML tags so remove those
conLexicons_filtered = [(w,conLexicons[w]) for w in commonCon if w not in commonLib and 'nbsp' not in w and 'amp' not in w]

In [30]:
len(libLexicons_filtered),len(conLexicons_filtered)

(151, 149)

In [31]:
print('top 20 liberal lexicons:')
sorted(libLexicons_filtered,key=lambda x: -x[1])[:20]

top 20 liberal lexicons:


[('cuts', 600),
 ('debt', 269),
 ('estate', 207),
 ('coverage', 186),
 ('housing', 161),
 ('protections', 160),
 ('behind', 154),
 ('leader', 143),
 ('poor', 143),
 ('cbc', 143),
 ('trillion', 142),
 ('party', 140),
 ('minimum', 134),
 ('medicaid', 131),
 ('interests', 127),
 ('cutting', 126),
 ('independent', 126),
 ('gun', 125),
 ('college', 123),
 ('proposal', 122)]

In [32]:
print('top 20 conservative lexicons')
sorted(conLexicons_filtered,key=lambda x: -x[1])[:20]

top 20 conservative lexicons


[('embryo', 231),
 ('growth', 230),
 ('gang', 184),
 ('terri', 162),
 ('freedom', 141),
 ('lawyers', 122),
 ('growing', 117),
 ('licenses', 117),
 ('criminal', 116),
 ('commend', 114),
 ('identification', 114),
 ('license', 109),
 ('shall', 108),
 ('early', 106),
 ('patients', 105),
 ('bringing', 99),
 ('commerce', 98),
 ('probably', 96),
 ('gangs', 94),
 ('bit', 92)]

In [33]:
# keep just the bigrams and drop the count data
libCons = [k for k,v in libLexicons_filtered]
conCons = [k for k,v in conLexicons_filtered]
savepath = r'../data'
with open(os.path.join(savepath, 'libCons.pickle'), 'wb') as f:
    pickle.dump(libCons, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(savepath, 'conCons.pickle'), 'wb') as f:
    pickle.dump(conCons, f, pickle.HIGHEST_PROTOCOL)

In [34]:
with open(os.path.join(savepath, 'libGrams.pickle'), 'rb') as f:
    test = pickle.load(f)
    print(test[:5])
with open(os.path.join(savepath, 'conGrams.pickle'), 'rb') as f:
    test = pickle.load(f)
    print(test[:5])

['the republican', 'social security', 'tax cuts', 'the republicans', 'estate tax']
['death tax', 'the death', 'our economy', 'strong support', 'cord blood']


# Filter Data
filter the original dataset for only those sentences that contain politically charged bigrams and lexicons

In [35]:
def keepSentence(label, text):
    # get bigrams and lexicons
    bigrams = getBigrams(text).keys()
    lexicons = getLexicons(text).keys()
    
    # get the bigrams and lexicons that appear in the ideology lists
    libBigramSet = set(bigrams).intersection(libGrams)
    libLexiconSet = set(lexicons).intersection(libCons)
    conBigramSet = set(bigrams).intersection(conGrams)
    conLexiconSet = set(lexicons).intersection(conCons)
    
    # determine whether to keep the sentence
    if label == 'liberal':
        if libBigramSet or libLexiconSet:
            return True
        else:
            return False
    elif label == 'conservative':
        if conBigramSet or conLexiconSet:
            return True
        else:
            return False
    elif label == 'neutral':
        if libBigramSet or libLexiconSet or conBigramSet or conLexiconSet:
            return False
        else:
            return True

In [36]:
def filterText(df):
    filteredText = []
    for i in range(df.shape[0]):
        label,text = df.label[i], df.text[i]
        if keepSentence(label,text):
            filteredText.append((label, text))
            
    return pd.DataFrame(filteredText, columns=['label','text'])

In [37]:
df_filtered = filterText(df)
df_filtered.sample(5)

Unnamed: 0,label,text
1927,conservative,"to date , the program has facilitated almost 2..."
13856,conservative,for me and the millions of other small busines...
15902,liberal,congress is obligated to use appropriations an...
17415,liberal,then when not all of the money was spent in ne...
21480,conservative,i looked forward to having some `` daddy time ...


In [38]:
df.shape, df_filtered.shape

((54830, 2), (27562, 2))

In [39]:
df_filtered.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
conservative,12976
liberal,14499
neutral,87


In [40]:
savepath = r'../data'
with open(os.path.join(savepath, 'filteredConvote.pickle'), 'wb') as f:
    pickle.dump(df_filtered, f, pickle.HIGHEST_PROTOCOL)

In [43]:
with open(os.path.join(savepath, 'filteredConvote.pickle'), 'rb') as f:
    test = pickle.load(f)
test.sample(5)

Unnamed: 0,label,text
20170,liberal,"but again , it comes down to where the victims..."
14688,liberal,a preliminary chemical safety board investigat...
11620,conservative,what it merely says is that in an instance whe...
19053,liberal,this congress is about to overturn the separat...
19731,liberal,people are thinking what to give their children .
