In [1]:
import pandas as pd
# import matplotlib.pyplot as plt
import nltk
import time, os, pickle
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, names, opinion_lexicon
from itertools import combinations, permutations
from collections import Counter
from tqdm import tqdm, tqdm_notebook
# plt.style.use('ggplot')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('names')
nltk.download('opinion_lexicon')

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to /home/alex/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/alex/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

# maybe later

In [2]:
def parseContent(text):
    sents = sent_tokenize(text)
    words = []
    for sent in sents:
        words.append([w.lower() for w in word_tokenize(sent)])
    return sents, words

In [6]:
def parse_and_pickle(openpath, savepath):
    df = pd.read_csv(openpath)
    df['content'] = df.content + ' ' + df.title
    df = df.loc[:,['publication','content']]
    sentences = []
    wordList = []
    start = time.time()
    for text in df.content:
        sents, words = parseContent(text)
        sentences.append(sents)
        wordList.append(words)
    print('runtime (min):', (time.time() - start) / 60)
    df['sentences'] = sentences
    df['words'] = wordList
    df = df.loc[:,['publication','sentences','words']]
    with open(savepath, 'wb') as f:
        pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# df1 = pd.read_csv(r'../all-the-news/articles1.csv')
# df2 = pd.read_csv(r'../all-the-news/articles2.csv')
# df3 = pd.read_csv(r'../all-the-news/articles3.csv')
opens = [
    r'../all-the-news/articles1.csv',
    r'../all-the-news/articles2.csv',
    r'../all-the-news/articles3.csv'
]
saves = [
    r'../data/news1_parsed.pickle',
    r'../data/news2_parsed.pickle',
    r'../data/news3_parsed.pickle'
]
for o,s in zip(opens, saves):
    parse_and_pickle(o, s)

# current import

In [3]:
df1 = pd.read_csv(r'../all-the-news/articles1.csv')
df2 = pd.read_csv(r'../all-the-news/articles2.csv')
df3 = pd.read_csv(r'../all-the-news/articles3.csv')
df = df1.append(df2).append(df3)
df.index = range(df.shape[0])
df['content'] = df.content + ' ' + df.title
df = df.loc[:,['publication','content']]
del df1, df2, df3
df.shape

(142570, 2)

In [4]:
df.publication.unique()

array(['New York Times', 'Breitbart', 'CNN', 'Business Insider',
       'Atlantic', 'Fox News', 'Talking Points Memo', 'Buzzfeed News',
       'National Review', 'New York Post', 'Guardian', 'NPR', 'Reuters',
       'Vox', 'Washington Post'], dtype=object)

In [4]:
df = df.sample(20000)

# Generating new lexicons and bigrams for news set

In [5]:
pubs = ['Breitbart', 'CNN', 'Fox News', 'Talking Points Memo','National Review','Reuters','Vox']
idx = [i for i in range(df.shape[0]) if df.publication.iloc[i] in pubs]
df = df.iloc[idx]
df.publication.unique()

array(['Breitbart', 'CNN', 'Fox News', 'Talking Points Memo',
       'National Review', 'Reuters', 'Vox'], dtype=object)

In [6]:
df.shape

(66697, 2)

In [7]:
# actual labels from MBFC
bias_dict = {
    'New York Times': 'left-center',
    'Breitbart': 'extreme-right',
    'CNN': 'left',
    'Business Insider': 'left-center',
    'Atlantic': 'left-center',
    'Fox News': 'right',
    'Talking Points Memo': 'left',
    'Buzzfeed News': 'left-center',
    'National Review': 'right',
    'New York Post': 'right-center',
    'Guardian': 'left-center',
    'NPR': 'left-center',
    'Reuters': 'neutral',
    'Vox': 'left',
    'Washington Post': 'left-center'
}

In [8]:
# simplified labels
for k,v in bias_dict.items():
    if 'left' in v:
        bias_dict[k] = 'liberal'
    elif 'right' in v:
        bias_dict[k] = 'conservative'
    else:
        bias_dict[k] = 'neutral'
for k,v in bias_dict.items():
    print(k,v)

New York Times liberal
Breitbart conservative
CNN liberal
Business Insider liberal
Atlantic liberal
Fox News conservative
Talking Points Memo liberal
Buzzfeed News liberal
National Review conservative
New York Post conservative
Guardian liberal
NPR liberal
Reuters neutral
Vox liberal
Washington Post liberal


In [9]:
df.columns

Index(['publication', 'content'], dtype='object')

In [10]:
df['label'] = [bias_dict[p] for p in df.publication.values]

In [11]:
df.columns

Index(['publication', 'content', 'label'], dtype='object')

In [12]:
df.groupby('publication').count()

Unnamed: 0_level_0,content,label
publication,Unnamed: 1_level_1,Unnamed: 2_level_1
Breitbart,23781,23781
CNN,11488,11488
Fox News,4354,4354
National Review,6203,6203
Reuters,10709,10710
Talking Points Memo,5213,5214
Vox,4947,4947


In [13]:
df.groupby('label').count()

Unnamed: 0_level_0,publication,content
label,Unnamed: 1_level_1,Unnamed: 2_level_1
conservative,34338,34338
liberal,21649,21648
neutral,10710,10709


In [14]:
# split data into liberal and conservative
lib = df.loc[df.label == 'liberal']
con = df.loc[df.label == 'conservative']

In [15]:
# define a helper function to pull the bigrams
def getBigrams(text, stops, ops=None):
    '''return all bigrams in a Counter'''
    def criteria(word):
        if len(word) > 1 and word not in stops:
            return True
        else:
            return False
            
    # intialize counter
    bigrams = Counter()
    
    # tokenize text
    words = word_tokenize(text)
    
    # filter punctuation and single letter words
    words = [w.lower() for w in words]
    
    # throw out sentences with less than two words
    if len(words) < 2:
        return bigrams
    
    # add bigrams to Counter
    for i in range(len(words) - 1):
        b = ' '.join((words[i], words[i+1]))
        if criteria(words[i]) and criteria(words[i+1]):
            if ops:
                if words[i] in ops or words[i+1] in ops:
                    bigrams[b] += 1
            else:
                bigrams[b] += 1
        
    return bigrams

In [50]:
stops = [w.lower() for w in stopwords.words('english')] + \
['trump', 'good','great','bad','pretty','covering','writer',
 'author','like','news','follow','tv','said','could','would',
 'really','best','journalist','journalists','commentator']
stops[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [51]:
ops = opinion_lexicon.words()

In [52]:
men = [w.lower() for w in names.words('male.txt')]
women = [w.lower() for w in names.words('female.txt')]
stopNames = men + women

In [53]:
# # stops2 = [
# #     'breitbart','twitter','facebook','follow','hudson','sirius',
# #     'author','nussbaum','hanchett','foxnews','share article','hawkins',
# #     'cnn','join','said','told','like','tells','latest',':','.','share',
# #     'klein','could','really','would'
# # ]
# stops2 = [
#     'twitter','facebook','follow','sirius','author',
#     'share','cnn','join','said','told','like','tells',
#     'latest',':','.','could','really','would','good','bad',
#     'editor'
# ]
stops3 = []
for p in df.publication.unique():
    stops3.extend(p.lower().split())
stops3

['breitbart',
 'cnn',
 'fox',
 'news',
 'talking',
 'points',
 'memo',
 'national',
 'review',
 'reuters',
 'vox']

In [54]:
# len(stops), len(stops2), len(stopNames)

In [55]:
# stops.extend(stops2)
stops.extend(stops3)
stops.extend(stopNames)
len(stops)

8154

In [56]:
stops = set(stops)
ops = set(ops)

In [57]:
len(stops), len(ops)

(7781, 6786)

In [58]:
start = time.time()
# libBigrams = Counter()
libBigrams3 = Counter()
for pub,content,_ in tqdm(lib.values):
    try:
        sents = sent_tokenize(content)
    except:
        continue
    for text in sents:
        libBigrams3.update(getBigrams(text, stops, ops=ops))
print((time.time() - start) / 60)

100%|██████████| 21649/21649 [03:04<00:00, 59.87it/s]

3.0722961107889812





In [64]:
libBigrams.most_common()[:10]

[('white house', 1360),
 ('united states', 1256),
 ('new york', 1040),
 ('health care', 655),
 ('supreme court', 444),
 ('barack obama', 435),
 ('north korea', 399),
 ('obama administration', 381),
 ('last year', 369),
 ('last week', 364)]

In [44]:
libBigrams2.most_common()[:10]

[('supreme court', 3410),
 ('vice president', 1622),
 ('affordable care', 1244),
 ('sexual assault', 1033),
 ('look like', 807),
 ('intelligence committee', 801),
 ('looks like', 791),
 ('freedom caucus', 740),
 ('feel like', 715),
 ('criminal justice', 675)]

In [32]:
libBigrams3.most_common()[:10]

[('white house', 10173),
 ('united states', 8914),
 ('new york', 7696),
 ('u. s.', 7549),
 ('health care', 4613),
 ('supreme court', 3410),
 ('north korea', 3175),
 ('barack obama', 3038),
 ('last year', 2856),
 ('last week', 2854)]

In [66]:
len(libBigrams2)

276255

In [24]:
# with open('../data/libGrams_news2_unfiltered.pickle', 'wb') as f:
#     pickle.dump(libBigrams, f, pickle.HIGHEST_PROTOCOL)

In [20]:
# with open('../data/libGrams_news2_unfiltered.pickle', 'rb') as f:
#     libBigrams = pickle.load(f)

In [None]:
len(libBigrams)

In [59]:
start = time.time()
# conBigrams = Counter()
conBigrams3 = Counter()
for content in tqdm(con.content.values):
    try:
        sents = sent_tokenize(content)
    except:
        continue
    for text in sents:
        conBigrams3.update(getBigrams(text, stops,ops=ops))
print((time.time() - start) / 60)

100%|██████████| 34338/34338 [03:47<00:00, 151.03it/s]

3.789326004187266





In [None]:
conBigrams.most_common()[:10]

In [48]:
conBigrams2.most_common()[:10]

[('supreme court', 4400),
 ('vice president', 1795),
 ('free speech', 1669),
 ('illegal immigrants', 1524),
 ('illegal aliens', 1138),
 ('fake news', 1115),
 ('illegal immigration', 922),
 ('america great', 823),
 ('siriusxm patriot', 808),
 ('would like', 778)]

In [34]:
conBigrams3.most_common()[:10]

[('u. s.', 22067),
 ('united states', 11775),
 ('new york', 10648),
 ('white house', 7892),
 ('president obama', 4971),
 ('supreme court', 4400),
 ('barack obama', 4054),
 ('obama administration', 3875),
 ('last year', 3605),
 ('last week', 3514)]

In [24]:
# with open('../data/conGrams_news2_unfiltered.pickle', 'wb') as f:
#     pickle.dump(conBigrams, f, pickle.HIGHEST_PROTOCOL)

In [25]:
# with open('../data/conGrams_news2_unfiltered.pickle', 'rb') as f:
#     conBigrams = pickle.load(f)

In [68]:
len(conBigrams2)

323391

In [60]:
# get the 1000 most common liberal and conservative bigrams
# commonCon = [b[0] for b in conBigrams.most_common()[:1000]]
# commonLib = [b[0] for b in libBigrams.most_common()[:1000]]
commonCon3 = [b[0] for b in conBigrams3.most_common()[:1000]]
commonLib3 = [b[0] for b in libBigrams3.most_common()[:1000]]

In [72]:
remove_bigrams = ['doctor strange','fury road','gold medals','mad men',
                  'stranger things','little lies','lose weight',
                  'world champion','premier league','walking dead',
                  'weight loss','grand slam','science fiction','right now.']

In [73]:
# keep only the bigrams for each ideology that appear in the top 1000 of that ideology and not in the top 1000 of the other
# libBigrams_filtered = [(w,libBigrams[w]) for w in commonLib if w not in commonCon]
libBigrams_filtered3 = [(w,libBigrams3[w]) for w in commonLib3 if w not in commonCon3 and w not in remove_bigrams]

In [74]:
len(libBigrams_filtered3)

402

In [75]:
print('top 100 liberal bigrams:')
sorted(libBigrams_filtered3,key=lambda x: -x[1])[:100]

top 100 liberal bigrams:


[('us intelligence', 369),
 ('fast facts', 299),
 ('opioid epidemic', 186),
 ('top democrat', 178),
 ('health reform', 157),
 ('lethal injection', 134),
 ('budget reconciliation', 116),
 ('chronic pain', 111),
 ('us supreme', 110),
 ('lead poisoning', 109),
 ('intelligence committees', 107),
 ('prison sentences', 101),
 ('rights advocates', 99),
 ('poverty line', 97),
 ('provocative narrative', 95),
 ('undocumented immigrant', 88),
 ('reconciliation process', 86),
 ('patriot act', 86),
 ('lose coverage', 84),
 ('excessive force', 81),
 ('lead exposure', 80),
 ('crude oil', 79),
 ('top republicans', 78),
 ('fatal shooting', 77),
 ('car bomb', 74),
 ('stars hollow', 74),
 ('increased risk', 73),
 ('sick people', 71),
 ('gas attack', 71),
 ('top white', 70),
 ('bigger problem', 70),
 ('plane crash', 69),
 ('american crime', 68),
 ('corruption scandal', 67),
 ('yet clear', 67),
 ('extreme poverty', 67),
 ('assault allegations', 65),
 ('still unclear', 65),
 ('possible collusion', 65),
 ('b

In [76]:
# conservative bigrams also had weird 'amp nbsp' HTML tags so remove those
# conBigrams_filtered = [(w,conBigrams[w]) for w in commonCon if w not in commonLib]
conBigrams_filtered3 = [(w,conBigrams3[w]) for w in commonCon3 if w not in commonLib3 and w not in remove_bigrams]

In [77]:
len(conBigrams_filtered3)

416

In [78]:
print('top 100 conservative bigrams:')
sorted(conBigrams_filtered3,key=lambda x: -x[1])[:100]

top 100 conservative bigrams:


[('illegal aliens', 1138),
 ('siriusxm patriot', 808),
 ('illegal alien', 624),
 ('patriot 125', 598),
 ('illegal immigrant', 482),
 ('criminal aliens', 375),
 ('migrant crisis', 366),
 ('popular weekend', 327),
 ('patriot channel', 316),
 ('hard truths', 298),
 ('19 hard', 283),
 ('limited government', 206),
 ('islamic terror', 203),
 ('twin falls', 199),
 ('suppression cost', 175),
 ('liberal elite', 169),
 ('snarky opinions', 159),
 ('dangerous faggot', 156),
 ('free beacon', 155),
 ('conservative principles', 151),
 ('violent campus', 139),
 ('award winning', 138),
 ('free expression', 137),
 ('free markets', 134),
 ('million illegal', 126),
 ('politically incorrect', 126),
 ('real clear', 124),
 ('shocking speed', 124),
 ('clear politics', 121),
 ('certain death', 116),
 ('criminal illegal', 114),
 ('sex attacks', 111),
 ('work permits', 110),
 ('cheap labor', 109),
 ('free movement', 106),
 ('derangement syndrome', 104),
 ('gross negligence', 103),
 ('die welt', 102),
 ('media bi

In [79]:
# keep just the bigrams and drop the count data
libGrams = [k for k,v in sorted(libBigrams_filtered3,key=lambda x: -x[1])]
conGrams = [k for k,v in sorted(conBigrams_filtered3,key=lambda x: -x[1])]

In [80]:
# keep just the bigrams and drop the count data
# libGrams = [k for k,v in libBigrams_filtered]
# conGrams = [k for k,v in conBigrams_filtered if 'national review' not in k and 'awr hawkins' not in k] # getting rid of 'national review'

with open('../data/libGrams_news_withOpinion.pickle', 'wb') as f:
    pickle.dump(libGrams, f, pickle.HIGHEST_PROTOCOL)
with open('../data/conGrams_news_withOpinion.pickle', 'wb') as f:
    pickle.dump(conGrams, f, pickle.HIGHEST_PROTOCOL)

# Load liberal and conservative bigrams

In [81]:
# savepath = r'../data'
# with open(os.path.join(savepath, 'libGrams_news.pickle'), 'rb') as f:
#     test = pickle.load(f)
#     print(test[:5])
# with open(os.path.join(savepath, 'conGrams_news.pickle'), 'rb') as f:
#     test = pickle.load(f)
#     print(test[:5])
with open('../data/libGrams_news_withOpinion.pickle', 'rb') as f:
    libGrams = pickle.load(f)
    print(libGrams[:5])
with open('../data/conGrams_news_withOpinion.pickle', 'rb') as f:
    conGrams = pickle.load(f)
    print(conGrams[:5])

['us intelligence', 'fast facts', 'opioid epidemic', 'top democrat', 'health reform']
['illegal aliens', 'siriusxm patriot', 'illegal alien', 'patriot 125', 'illegal immigrant']


# Determine most common lexicons

In [92]:
# define a helper function to pull the bigrams
def getLexicons(text, stops):
    '''return all bigrams in a Counter'''
    # intialize counter
    lexicons = Counter()
    
    # tokenize text
    words = word_tokenize(text)
    
    # filter punctuation and single letter words
    words = [w.lower() for w in words]
    
    # add lexicons to Counter
    for w in words:
        if w not in stops and len(w) > 1 and not w.isnumeric():
            lexicons[w] += 1
        
    return lexicons

In [102]:
len(stops), len(stops2)

(179, 22)

In [103]:
stops.extend(stops2)
len(stops)

201

In [104]:
lib.columns

Index(['Unnamed: 0', 'id', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content', 'label'],
      dtype='object')

In [106]:
start = time.time()
libLexicons = Counter()
for text in lib.content:
    libLexicons.update(getLexicons(text, stops))
print('runtime (min):', (time.time() - start) / 60)

runtime (min): 2.987015656630198


In [107]:
libLexicons.most_common()[:10]

[('trump', 82297),
 ('people', 45281),
 ('would', 42706),
 ('one', 42477),
 ('president', 30646),
 ('also', 30267),
 ('new', 30187),
 ('us', 29867),
 ('could', 24097),
 ('time', 23036)]

In [108]:
start = time.time()
conLexicons = Counter()
for text in con.content:
    conLexicons.update(getLexicons(text, stops))
print('runtime (min):', (time.time() - start) / 60)

runtime (min): 3.5280917326609296


In [109]:
conLexicons.most_common()[:10]

[('trump', 109740),
 ('clinton', 50981),
 ('would', 49631),
 ('people', 47887),
 ('one', 46192),
 ('president', 40427),
 ('new', 35565),
 ('also', 32915),
 ('state', 31321),
 ('news', 30307)]

most of these bigrams are neutral. Grab the 1000 most common from each ideology, then filter out any that appear in the other ideology

In [110]:
# get the 1000 most common liberal and conservative lexicons
commonCon = [L[0] for L in conLexicons.most_common()[:1000]]
commonLib = [L[0] for L in libLexicons.most_common()[:1000]]

In [120]:
# keep only the bigrams for each ideology that appear in the top 1000 of that ideology and not in the top 1000 of the other
libLexicons_filtered = [(w,libLexicons[w]) for w in commonLib[:1000] if w not in commonCon and w.isalpha()]
# conservative bigrams also had weird 'amp nbsp' HTML tags so remove those
conLexicons_filtered = [(w,conLexicons[w]) for w in commonCon[:1000] if w not in commonLib and w.isalpha()]

In [121]:
len(libLexicons_filtered), len(conLexicons_filtered)

(129, 124)

In [124]:
# keep just the bigrams and drop the count data
libCons = [k for k,v in libLexicons_filtered]
conCons = [k for k,v in conLexicons_filtered]
savepath = r'../data'

with open(os.path.join(savepath, 'libCons_news.pickle'), 'wb') as f:
    pickle.dump(libCons, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(savepath, 'conCons_news.pickle'), 'wb') as f:
    pickle.dump(conCons, f, pickle.HIGHEST_PROTOCOL)

In [24]:
savepath = r'../data'
# with open(os.path.join(savepath, 'libCons_news.pickle'), 'rb') as f:
#     test = pickle.load(f)
#     print(test[:5])
# with open(os.path.join(savepath, 'conCons_news.pickle'), 'rb') as f:
#     test = pickle.load(f)
#     print(test[:5])
with open(os.path.join(savepath, 'libCons_news.pickle'), 'rb') as f:
    libCons = pickle.load(f)
    print(libCons[:5])
with open(os.path.join(savepath, 'conCons_news.pickle'), 'rb') as f:
    conCons = pickle.load(f)
    print(conCons[:5])

['season', 'flynn', 'pretty', 'food', 'science']
['islamic', 'pic', 'illegal', 'article', 'com']


# Reload Data and test sentence filtering

In [82]:
df1 = pd.read_csv(r'../all-the-news/articles1.csv')
df2 = pd.read_csv(r'../all-the-news/articles2.csv')
df3 = pd.read_csv(r'../all-the-news/articles3.csv')
df = df1.append(df2).append(df3)
df.index = range(df.shape[0])
df['content'] = df.content + ' ' + df.title
df = df.loc[:,['publication','content']]
del df1, df2, df3
df.shape

(142570, 2)

In [40]:
df = df.sample(20000)

# testing

In [83]:
df['label'] = [bias_dict[p] for p in df.publication.values]

In [84]:
df.columns

Index(['publication', 'content', 'label'], dtype='object')

In [85]:
def extractContent(row, libGrams, conGrams):
    sentList = []
    try:
        sentences = sent_tokenize(row.content)
    except:
        return sentList
#     sentences.append(row.title)
    for s in sentences:
        if len(s) < 5:
            continue
        try:
            words = word_tokenize(s)
            words = [w.lower() for w in words]
            bigrams = []
            for i in range(len(words) - 1):
                bigrams.append(' '.join((words[i], words[i+1])))
        except:
            continue
#         libConsPresent = set(words).intersection(libCons)
        libGramsPresent = set(bigrams).intersection(libGrams)
#         conConsPresent = set(words).intersection(conCons)
        conGramsPresent = set(bigrams).intersection(conGrams)
#         libNum = len(libConsPresent) + len(libGramsPresent)
#         conNum = len(conConsPresent) + len(conGramsPresent)
        libNum = len(libGramsPresent)
        conNum = len(conGramsPresent)
        if libNum == conNum:
            label = 'neutral'
        elif libNum > conNum:
            label = 'liberal'
        else:
            label = 'conservative'
        if libGramsPresent:
            grams = libGramsPresent
        elif conGramsPresent:
            grams = conGramsPresent
        else:
            grams = set()
        sentList.append((label, s, grams))
    return sentList

In [86]:
# testing
sample = df.sample(100)
sents = []
for i in tqdm(range(100)):
    sentList = extractContent(sample.iloc[i], libGrams[:100], conGrams[:100])
    sents.extend(sentList)
# df2 = pd.DataFrame(sents, columns=['label','text'])
df2 = pd.DataFrame(sents, columns=['label','text','grams'])
df2[df2.label != 'neutral'].shape

100%|██████████| 100/100 [00:00<00:00, 109.34it/s]


(30, 3)

In [87]:
df2.groupby('label').count()

Unnamed: 0_level_0,text,grams
label,Unnamed: 1_level_1,Unnamed: 2_level_1
conservative,16,16
liberal,14,14
neutral,3912,3912


In [88]:
for i,t in enumerate(df2[df2.label != 'neutral'].values):
    print(i,t)

0 ['conservative' 'That’s not keeping us safe.' {'us safe'}]
1 ['conservative'
 'A jury in Teesside Crown Court in North Yorkshire, England, found    Mohammed Zaman guilty of manslaughter and gross negligence because he switched out almond powder for a cheaper ground nut mix containing peanuts,  .'
 {'gross negligence'}]
2 ['liberal'
 ' Paul Ryan, the top Republican in the U. S. Congress, says he talks to Donald Trump often, but the   has tapped a   lawmaker from western New York to be his eyes and ears among congressional Republicans.'
 {'top republican'}]
3 ['liberal' '9 [  ] event is getting less clear with time.' {'less clear'}]
4 ['liberal'
 'Remember: 83 percent of city schools had water fixtures with lead levels above federal safety limits.'
 {'lead levels'}]
5 ['liberal'
 'While the Obama administration had clear guidance prioritizing deportation of   criminals, an executive order signed by Trump in his first week set up enforcement priorities that could include virtually any u

In [89]:
df.publication.unique()

array(['New York Times', 'Breitbart', 'CNN', 'Business Insider',
       'Atlantic', 'Fox News', 'Talking Points Memo', 'Buzzfeed News',
       'National Review', 'New York Post', 'Guardian', 'NPR', 'Reuters',
       'Vox', 'Washington Post'], dtype=object)

In [90]:
def extractSentences(df, libGrams, conGrams):
    output = []
    for i in tqdm(range(df.shape[0])):
        sentList = extractContent(df.iloc[i], libGrams, conGrams)
        output.extend(sentList)
    df = pd.DataFrame(output, columns=['label','text','grams'])
    return df

In [None]:
# start = time.time()
# df1 = extractSentences(df,50)
# print('runtime (min): ', (time.time() - start) / 60)
# df1.groupby('label').count()

In [91]:
# start = time.time()
df2 = extractSentences(df, libGrams[:100], conGrams[:100])
# print('runtime (min): ', (time.time() - start) / 60)
df2.groupby('label').count()

100%|██████████| 142570/142570 [22:17<00:00, 106.62it/s]


Unnamed: 0_level_0,text,grams
label,Unnamed: 1_level_1,Unnamed: 2_level_1
conservative,19462,19462
liberal,23106,23106
neutral,5058369,5058369


In [106]:
df2.columns

Index(['label', 'text', 'grams'], dtype='object')

In [92]:
sample = df2[df2.label != 'neutral'].sample(10)
for i,t in enumerate(sample.values):
    print(i,t)

0 ['liberal'
 'Mateen “made a pledge of allegiance to ISIS,” California Rep. Adam Schiff, the top Democrat on the House Permanent Select Committee on Intelligence, told CNN.'
 {'top democrat'}]
1 ['liberal' 'Gay rights advocates have been trying to get approval for a .'
 {'rights advocates'}]
2 ['liberal'
 'And how can a state with the highest number of poor people in the nation  —   23 percent of Californians are below the accepted poverty line, according to the Census Bureau  —   ensure that its gas prices will be the highest in the nation?'
 {'poverty line'}]
3 ['liberal'
 'But if there’s abundant video of you demanding respect for law, it becomes a bigger problem if there’s also abundant video of you breaking laws to take financial advantage of ordinary people.'
 {'bigger problem'}]
4 ['liberal' 'The sign reads, “Undocumented Immigrant?'
 {'undocumented immigrant'}]
5 ['conservative'
 'That’s probably not what the president of the United States had in mind when this week he set up 

In [93]:
del df

In [100]:
n = round(min(df2[df2.label == 'conservative'].shape[0], df2[df2.label == 'liberal'].shape[0]),-3)
n

19000

In [101]:
s1 = df2.loc[df2.label == 'conservative'].sample(n)
s2 = df2.loc[df2.label == 'liberal'].sample(n)
s3 = df2.loc[df2.label == 'neutral'].sample(n)
type(s1), type(s2), type(s3)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [107]:
s1.append(s2).append(s3).groupby('label').count()

Unnamed: 0_level_0,text,grams
label,Unnamed: 1_level_1,Unnamed: 2_level_1
conservative,19000,19000
liberal,19000,19000
neutral,19000,19000


In [108]:
df2 = s1.append(s2).append(s3)
df2.shape, n*3

((57000, 3), 57000)

In [109]:
# with open(r'../data/filteredNews_top50grams.pickle', 'wb') as f:
#     pickle.dump(df1, f, pickle.HIGHEST_PROTOCOL)
with open(r'../data/filteredNews_bigrams2.pickle', 'wb') as f:
    pickle.dump(df2, f, pickle.HIGHEST_PROTOCOL)

In [110]:
# with open(r'../data/filteredNews_top50grams.pickle', 'rb') as f:
#     df1 = pickle.load(f)
with open(r'../data/filteredNews_bigrams2.pickle', 'rb') as f:
    df2 = pickle.load(f)

In [111]:
df2.shape

(57000, 3)

In [112]:
df2.groupby('label').count()

Unnamed: 0_level_0,text,grams
label,Unnamed: 1_level_1,Unnamed: 2_level_1
conservative,19000,19000
liberal,19000,19000
neutral,19000,19000


In [72]:
df2.columns

Index(['label', 'text'], dtype='object')

In [5]:
sample = df2[df2.label != 'neutral'].sample(20)
for i,t in enumerate(sample.values):
    print(i,t)

0 ['conservative'
 'Today  …     and until people I know and love can feel safe again     —   BeTheChange (@honoraye)  In the wake of Trump’s victory on Tuesday,   vandalism, hate speech and violence have been reported across the country.'
 {'hate speech'}]
1 ['conservative'
 'It’s gotta be the other way around.”  The Netherlands’ relationship with the European Union has been another hot topic on the campaign trail.'
 {'got ta'}]
2 ['conservative'
 'May payrolls were revised sharply down to show them rising 11, 000 rather than the previously reported 38, 000.'
 {'previously reported'}]
3 ['conservative'
 'Ginsberg, Senior Political Editor, The Washington Post   Samuels, National Political Reporter, The Washington Post   Phillip, National Political Reporter The Washington Post Moderated by Dan Balz, Chief Correspondent, The Washington Post         July 25, 1:30  —   2:30 p. m. | Party Platform: Criminal Justice Policymakers and experts examine the Democrats’ party platform on policing a

In [12]:
!pwd

/home/alex/Documents/MIDS/w266/final_project/w266_final_project


In [17]:
df.to_csv('filtered_sentences.csv')

In [18]:
import pickle

In [26]:
start = time.time()
with open('filtered_sentences.pickle', 'wb') as f:
    pickle.dump(df, f, pickle.HIGHEST_PROTOCOL)
with open('filtered_sentences.pickle', 'rb') as f:
    df2 = pickle.load(f)
time.time() - start

0.7255961894989014

In [27]:
start = time.time()
df.to_csv('filtered_sentences.csv')
df3 = pd.read_csv('filtered_sentences.csv')
time.time() - start

4.2353599071502686

In [24]:
with open('filtered_sentences.pickle', 'rb') as f:
    df2 = pickle.load(f)

In [25]:
df2.head()

Unnamed: 0,label,text
0,liberal,WASHINGTON — Congressional Republicans have...
1,liberal,The incoming Trump administration could choose...
2,liberal,"In another twist, Donald J. Trump’s administra..."
3,liberal,“Given that this pending litigation involves t...
4,liberal,"“Upon taking office, the Trump administration ..."
