In [1]:
import pandas as pd
import nltk
import time, os, pickle, re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, names, opinion_lexicon
from collections import Counter
from tqdm import tqdm, tqdm_notebook
# fetch the NLTK resources this notebook relies on; a call only reports
# "already up-to-date" when the package is present locally
# tokenizer models — presumably what word_tokenize / sent_tokenize load; confirm against the NLTK docs
nltk.download('punkt')
# corpora behind the nltk.corpus imports above: stopwords, names, opinion_lexicon
nltk.download('stopwords')
nltk.download('names')
nltk.download('opinion_lexicon')

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to /home/alex/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/alex/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

# Import Data

In [2]:
# import data from three separate csv files
df1 = pd.read_csv(r'../all-the-news/articles1.csv')
df2 = pd.read_csv(r'../all-the-news/articles2.csv')
df3 = pd.read_csv(r'../all-the-news/articles3.csv')

# stack the three frames and renumber the rows 0..n-1 in one step;
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
df = pd.concat([df1, df2, df3], ignore_index=True)

# add the title sentence to the content body
# NOTE(review): a missing title presumably turns the whole sum into NaN, so the
# later dropna would discard that article — confirm this is intended
df['content'] = df.content + ' ' + df.title

# keep only the publication and content columns and delete the other DFs for memory conservation
df = df.loc[:,['publication','content']]
del df1, df2, df3
df.shape

(142570, 2)

In [3]:
# discard every row that has a missing value in any of the remaining columns
df = df.dropna(axis=0, how='any')
df.shape

(142568, 2)

In [4]:
# split every article body into a list of sentences
# (the wall-clock time for the whole column is reported in minutes)
start = time.time()
df['sentences'] = df['content'].apply(sent_tokenize)
elapsed_minutes = (time.time() - start) / 60
print('runtime (min):', elapsed_minutes)

runtime (min): 3.253500250975291


## Apply bias labels, sourced from mediabiasfactcheck.com based on publisher

In [4]:
# actual labels from MBFC, one (publisher, label) pair per line;
# the pair order is the order in which the dict is later printed
bias_dict = dict([
    ('New York Times', 'left-center'),
    ('Breitbart', 'extreme-right'),
    ('CNN', 'left'),
    ('Business Insider', 'left-center'),
    ('Atlantic', 'left-center'),
    ('Fox News', 'right'),
    ('Talking Points Memo', 'left'),
    ('Buzzfeed News', 'left-center'),
    ('National Review', 'right'),
    ('New York Post', 'right-center'),
    ('Guardian', 'left-center'),
    ('NPR', 'left-center'),
    ('Reuters', 'neutral'),
    ('Vox', 'left'),
    ('Washington Post', 'left-center'),
])

In [5]:
# simplified labels: collapse each MBFC label to 1 (liberal), -1 (conservative) or 0 (neutral)
# iterate over a snapshot so the dict is not rewritten while it is being walked
for k,v in list(bias_dict.items()):
    if not isinstance(v, str):
        # already simplified by an earlier run of this cell; `'left' in v`
        # would raise TypeError on an int, so leave the entry untouched
        continue
    if 'left' in v:
        # 1 for liberal
        bias_dict[k] = 1
    elif 'right' in v:
        # -1 for conservative
        bias_dict[k] = -1
    else:
        # 0 for neutral
        bias_dict[k] = 0
for k,v in bias_dict.items():
    print(k,v)

New York Times 1
Breitbart -1
CNN 1
Business Insider 1
Atlantic 1
Fox News -1
Talking Points Memo 1
Buzzfeed News 1
National Review -1
New York Post -1
Guardian 1
NPR 1
Reuters 0
Vox 1
Washington Post 1


In [6]:
# add a label column by looking up each row's publisher in bias_dict
# (a publisher that is not a key of bias_dict raises KeyError)
publisher_names = df['publication'].values
df['label'] = [bias_dict[name] for name in publisher_names]
df.columns

Index(['publication', 'content', 'label'], dtype='object')

In [10]:
# save partial dataframe for document classification task later
# (needs the 'sentences' column produced by the sentence-parsing cell)
docs_for_classification = df[['label', 'sentences']]
with open(r'../data/newsFullDocs.pickle', 'wb') as out_file:
    pickle.dump(docs_for_classification, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# the per-article sentence lists are no longer needed in memory once pickled
df = df.drop(columns=['sentences'])

# Gather Bias Detector n-grams

Using a subset of publishers that have the most extreme MBFC labels, collect the most common trigrams and bigrams for both the conservative and liberal labels. Filter out stopwords on all n-grams and require that bigrams contain at least one "opinion" word from the NLTK `opinion_lexicon` corpus. 

In [7]:
# lowercase every publisher name and break it into single words;
# these unigrams are later appended to the stopword list
pubs = [token
        for publisher in df.publication.unique()
        for token in publisher.lower().split()]
pubs

['new',
 'york',
 'times',
 'breitbart',
 'cnn',
 'business',
 'insider',
 'atlantic',
 'fox',
 'news',
 'talking',
 'points',
 'memo',
 'buzzfeed',
 'news',
 'national',
 'review',
 'new',
 'york',
 'post',
 'guardian',
 'npr',
 'reuters',
 'vox',
 'washington',
 'post']

In [8]:
# select subset of publishers with most extreme MBFC labels for n-gram selection
publishers = ['Breitbart', 'CNN', 'Fox News', 'Talking Points Memo','Vox']

# vectorised membership test: keeps the same rows, in the same order, as
# checking df.publication.iloc[i] row by row in a Python loop
df = df.loc[df.publication.isin(publishers)]
df.publication.unique()

array(['Breitbart', 'CNN', 'Fox News', 'Talking Points Memo', 'Vox'],
      dtype=object)

In [9]:
# check the size of the subset: (rows, columns) of the frame after publisher selection
df.shape

(49783, 3)

In [10]:
# check the balance of content per publisher: non-null counts of every other column, grouped by publication
df.groupby('publication').count()

Unnamed: 0_level_0,content,label
publication,Unnamed: 1_level_1,Unnamed: 2_level_1
Breitbart,23781,23781
CNN,11488,11488
Fox News,4354,4354
Talking Points Memo,5213,5213
Vox,4947,4947


In [11]:
# check how balanced the labels are: non-null counts per label (1 = liberal, -1 = conservative)
df.groupby('label').count()

Unnamed: 0_level_0,publication,content
label,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,28135,28135
1,21648,21648


# Find n-grams

In [12]:
# partition the articles by simplified label: 1 is liberal, -1 is conservative
is_liberal = df['label'] == 1
is_conservative = df['label'] == -1
lib = df[is_liberal]
con = df[is_conservative]

In [13]:
# stopword list = lowercased NLTK English stopwords followed by extra
# stops that were found empirically from the data
custom_stops = [
    'trump', 'good', 'great', 'bad', 'pretty', 'covering', 'writer',
    'author', 'like', 'news', 'follow', 'tv', 'said', 'could', 'would',
    'really', 'best', 'journalist', 'journalists', 'commentator',
    'affiliate', 'told', 'reporter', 'reporters', 'siriusxm', 'radio',
    'columnist', 'contributed', 'bestselling', 'battleground', 'weekend',
    'seven',
]
stops = [word.lower() for word in stopwords.words('english')]
stops = stops + custom_stops

In [14]:
# lowercase the male and female first names from the NLTK names corpus
# so they can be treated as stopwords
men = list(map(str.lower, names.words('male.txt')))
women = list(map(str.lower, names.words('female.txt')))
stopNames = [*men, *women]

In [15]:
# append the publisher unigrams, then the first names, to the stopword list in place
for extra_words in (pubs, stopNames):
    stops.extend(extra_words)

In [16]:
# get the list of opinion words from the NLTK opinion_lexicon corpus;
# it is passed to getNgrams as `ops` when collecting bigrams
ops = opinion_lexicon.words()

In [17]:
# convert both word lists to sets (duplicates collapse, membership tests become hash lookups)
stops, ops = set(stops), set(ops)

In [18]:
# number of distinct stopwords and distinct opinion words
len(stops), len(ops)

(7803, 6786)

In [19]:
# define function to select n-grams
def getNgrams(text, stops=None, ops=None, n=2):
    '''Count the n-grams of one sentence that pass the stopword and opinion filters.

    Args:
        text: sentence to tokenize with ``word_tokenize``.
        stops: optional collection of lowercase words to reject. When it is
            given (and non-empty), a token is acceptable only if it is longer
            than two characters, is not numeric and is not in ``stops``.
            When it is omitted or empty, every token is acceptable.
        ops: optional collection of opinion words. When it is given (and
            non-empty), an n-gram is counted only if at least one of its
            tokens is in ``ops``.
        n: number of consecutive tokens per n-gram.

    Returns:
        A ``Counter`` mapping each accepted n-gram (lowercased tokens joined
        by single spaces) to the number of times it occurs in ``text``.
    '''
    # initialize counter
    ngrams = Counter()

    # tokenize and lowercase; no token is removed here, so every n-gram is
    # built from tokens that are adjacent in the original sentence
    words = [w.lower() for w in word_tokenize(text)]

    # throw out sentences with fewer than n tokens
    if len(words) < n:
        return ngrams

    # judge each token once instead of once per window it appears in
    if stops:
        usable = [len(w) > 2 and not w.isnumeric() and w not in stops
                  for w in words]
    else:
        usable = [True] * len(words)

    # slide a window of n tokens over the sentence
    for i in range(len(words) - (n - 1)):
        if not all(usable[i:i + n]):
            continue
        window = words[i:i + n]
        # generator expression lets any() stop at the first opinion word
        if ops and not any(w in ops for w in window):
            continue
        ngrams[' '.join(window)] += 1

    return ngrams

In [20]:
# tally stopword-filtered trigrams over every sentence of every liberal article
libTrigrams = Counter()
for _publication, article, _label in tqdm(lib.values):
    for sentence in sent_tokenize(article):
        sentence_counts = getNgrams(sentence, stops=stops, n=3)
        libTrigrams.update(sentence_counts)

100%|██████████| 21648/21648 [03:16<00:00, 110.17it/s]


In [21]:
# tally bigrams over every sentence of every liberal article; a bigram must
# pass the stopword filter and contain at least one opinion word
libBigrams = Counter()
for _publication, article, _label in tqdm(lib.values):
    for sentence in sent_tokenize(article):
        sentence_counts = getNgrams(sentence, stops=stops, ops=ops, n=2)
        libBigrams.update(sentence_counts)

100%|██████████| 21648/21648 [03:24<00:00, 105.99it/s]


In [22]:
# tally stopword-filtered trigrams over every sentence of every conservative article
conTrigrams = Counter()
for _publication, article, _label in tqdm(con.values):
    for sentence in sent_tokenize(article):
        sentence_counts = getNgrams(sentence, stops=stops, n=3)
        conTrigrams.update(sentence_counts)

100%|██████████| 28135/28135 [02:54<00:00, 161.29it/s]


In [23]:
# tally bigrams over every sentence of every conservative article; a bigram
# must pass the stopword filter and contain at least one opinion word
conBigrams = Counter()
for _publication, article, _label in tqdm(con.values):
    for sentence in sent_tokenize(article):
        sentence_counts = getNgrams(sentence, stops=stops, ops=ops, n=2)
        conBigrams.update(sentence_counts)

100%|██████████| 28135/28135 [02:49<00:00, 166.17it/s]


In [32]:
# get the 1000 most common liberal and conservative n-grams
# most_common(1000) returns the same leading (ngram, count) pairs as
# most_common()[:1000] without slicing a fully sorted copy of the counter
commonConTrigrams = [gram for gram, _count in conTrigrams.most_common(1000)]
commonLibTrigrams = [gram for gram, _count in libTrigrams.most_common(1000)]
commonConBigrams = [gram for gram, _count in conBigrams.most_common(1000)]
commonLibBigrams = [gram for gram, _count in libBigrams.most_common(1000)]

In [33]:
# bigrams that satisfy the selection criteria but were judged not to indicate
# bias; they are excluded from the detector lists
remove_bigrams = [
    'doctor strange',
    'fury road',
    'gold medals',
    'gold medal',
    'mad men',
    'stranger things',
    'little lies',
    'lose weight',
    'world champion',
    'premier league',
    'walking dead',
    'weight loss',
    'grand slam',
    'science fiction',
    'right now.',
    'open champion',
    'associated press',
    'world heavyweight',
    'wonder woman',
    'last month',
    'top stories',
    'fast facts',
]

In [34]:
# take the set difference for liberal and conservative n-grams and remove any n-grams containing bigrams from remove_bigrams
# membership is tested against sets, so each lookup is a hash probe instead of a scan of a 1000-item list
conTrigramSet, libTrigramSet = set(commonConTrigrams), set(commonLibTrigrams)
conBigramSet, libBigramSet = set(commonConBigrams), set(commonLibBigrams)
removeBigramSet = set(remove_bigrams)

def containsRemovedBigram(gram):
    '''Return True when any entry of remove_bigrams occurs as a substring of gram.'''
    return any(b in gram for b in remove_bigrams)

# each result is a list of (ngram, count) pairs in most-common order
libTrigrams_filtered = [(w, libTrigrams[w]) for w in commonLibTrigrams
                        if w not in conTrigramSet and not containsRemovedBigram(w)]
libBigrams_filtered = [(w, libBigrams[w]) for w in commonLibBigrams
                       if w not in conBigramSet and w not in removeBigramSet]
conTrigrams_filtered = [(w, conTrigrams[w]) for w in commonConTrigrams
                        if w not in libTrigramSet and not containsRemovedBigram(w)]
conBigrams_filtered = [(w, conBigrams[w]) for w in commonConBigrams
                       if w not in libBigramSet and w not in removeBigramSet]

In [46]:
# check size of each filtered list, in order: liberal trigrams, conservative trigrams, liberal bigrams, conservative bigrams
len(libTrigrams_filtered),len(conTrigrams_filtered),len(libBigrams_filtered),len(conBigrams_filtered)

(495, 501, 426, 441)

In [41]:
# show the ten liberal-only trigrams with the highest counts (ties keep list order)
print('TOP 10 LIBERAL TRIGRAMS:\n')
ranked = sorted(libTrigrams_filtered, key=lambda pair: pair[1], reverse=True)
for gram, _count in ranked[:10]:
    print(gram)

TOP 10 LIBERAL TRIGRAMS:

senior administration official
greenhouse gas emissions
north korean leader
federal civil rights
provocative narrative essays
clean air act
health care policy
republican health care
gop health care
civil rights laws


In [42]:
# show the ten liberal-only bigrams with the highest counts (ties keep list order)
print('TOP 10 LIBERAL BIGRAMS:\n')
ranked = sorted(libBigrams_filtered, key=lambda pair: pair[1], reverse=True)
for gram, _count in ranked[:10]:
    print(gram)

TOP 10 LIBERAL BIGRAMS:

opioid epidemic
health reform
lethal injection
healthy people
racial bias
budget reconciliation
chronic pain
lead poisoning
intelligence committees
rights advocates


In [43]:
# show the ten conservative-only trigrams with the highest counts (ties keep list order)
print('TOP 10 CONSERVATIVE TRIGRAMS:\n')
ranked = sorted(conTrigrams_filtered, key=lambda pair: pair[1], reverse=True)
for gram, _count in ranked[:10]:
    print(gram)

TOP 10 CONSERVATIVE TRIGRAMS:

jerusalem bureau chief
border patrol agent
social justice warriors
black panther party
cartel chronicles project
refugee resettlement program
prison sentence commuted
god less america
real clear politics
face certain death


In [44]:
# show the ten conservative-only bigrams with the highest counts (ties keep list order)
print('TOP 10 CONSERVATIVE BIGRAMS:\n')
ranked = sorted(conBigrams_filtered, key=lambda pair: pair[1], reverse=True)
for gram, _count in ranked[:10]:
    print(gram)

TOP 10 CONSERVATIVE BIGRAMS:

illegal aliens
illegal alien
illegal immigrant
migrant crisis
patriot channel
hard truths
twin falls
islamic terror
snarky opinions
dangerous faggot


In [45]:
# build the detector lists: the 100 highest-count trigrams followed by the
# 100 highest-count bigrams for each side, with the counts discarded
def _top100(pairs):
    '''Return the n-grams of the 100 highest-count (ngram, count) pairs.'''
    ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)
    return [gram for gram, _count in ranked[:100]]

libGrams = _top100(libTrigrams_filtered) + _top100(libBigrams_filtered)
conGrams = _top100(conTrigrams_filtered) + _top100(conBigrams_filtered)

In [34]:
# write each detector list to its own pickle file so it can be loaded later
gram_files = (
    ('../data/libGrams_news6.pickle', libGrams),
    ('../data/conGrams_news6.pickle', conGrams),
)
for gram_path, gram_list in gram_files:
    with open(gram_path, 'wb') as f:
        pickle.dump(gram_list, f, protocol=pickle.HIGHEST_PROTOCOL)

# Filter sentences for Bias

## Load liberal and conservative n-grams for sentence filtering

In [35]:
# read both detector lists back from disk, then preview the first five of each
with open('../data/libGrams_news6.pickle', 'rb') as f:
    libGrams = pickle.load(f)
with open('../data/conGrams_news6.pickle', 'rb') as f:
    conGrams = pickle.load(f)
for preview in (libGrams[:5], conGrams[:5]):
    print(preview)

['senior administration official', 'greenhouse gas emissions', 'north korean leader', 'federal civil rights', 'provocative narrative essays']
['jerusalem bureau chief', 'border patrol agent', 'social justice warriors', 'black panther party', 'cartel chronicles project']


## Reload Data and test sentence filtering

In [36]:
# reload the three csv files and rebuild the publication/content frame
df1 = pd.read_csv(r'../all-the-news/articles1.csv')
df2 = pd.read_csv(r'../all-the-news/articles2.csv')
df3 = pd.read_csv(r'../all-the-news/articles3.csv')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index=True renumbers the rows 0..n-1 in the same step
df = pd.concat([df1, df2, df3], ignore_index=True)

# add the title sentence to the content body
df['content'] = df.content + ' ' + df.title
df = df.loc[:,['publication','content']]
del df1, df2, df3
df.shape

(142570, 2)

In [37]:
# MBFC label for each publisher, defined again after the data reload;
# the pair order is the order in which the dict is later printed
bias_dict = dict([
    ('New York Times', 'left-center'),
    ('Breitbart', 'extreme-right'),
    ('CNN', 'left'),
    ('Business Insider', 'left-center'),
    ('Atlantic', 'left-center'),
    ('Fox News', 'right'),
    ('Talking Points Memo', 'left'),
    ('Buzzfeed News', 'left-center'),
    ('National Review', 'right'),
    ('New York Post', 'right-center'),
    ('Guardian', 'left-center'),
    ('NPR', 'left-center'),
    ('Reuters', 'neutral'),
    ('Vox', 'left'),
    ('Washington Post', 'left-center'),
])

In [38]:
# simplified labels: 1 for liberal, -1 for conservative, 0 for neutral
# iterate over a snapshot so the dict is not rewritten while it is being walked
for k,v in list(bias_dict.items()):
    if not isinstance(v, str):
        # already simplified by an earlier run of this cell; `'left' in v`
        # would raise TypeError on an int, so leave the entry untouched
        continue
    if 'left' in v:
        bias_dict[k] = 1
    elif 'right' in v:
        bias_dict[k] = -1
    else:
        bias_dict[k] = 0
for k,v in bias_dict.items():
    print(k,v)

New York Times 1
Breitbart -1
CNN 1
Business Insider 1
Atlantic 1
Fox News -1
Talking Points Memo 1
Buzzfeed News 1
National Review -1
New York Post -1
Guardian 1
NPR 1
Reuters 0
Vox 1
Washington Post 1


In [39]:
# label every article with the simplified bias value of its publisher
# (a publisher that is not a key of bias_dict raises KeyError)
publisher_names = df['publication'].values
df['label'] = [bias_dict[name] for name in publisher_names]

In [41]:
# define function to determine if sentence should be kept or thrown out
def keepSentence(label, text):
    '''Decide whether a sentence is kept for the given article label.

    Returns False for any sentence with fewer than 6 whitespace-separated
    words. Otherwise the sentence's unfiltered bigrams and trigrams are
    compared with the detector lists libGrams and conGrams:

    * label 1  -> the set of n-grams also found in libGrams (may be empty)
    * label -1 -> the set of n-grams also found in conGrams (may be empty)
    * any other label -> True if no detector n-gram occurs, else False
    '''
    # too short to be worth keeping
    if len(text.split()) < 6:
        return False

    # every bigram and trigram of the sentence, with no stopword/opinion filtering
    found = set(getNgrams(text, n=2)) | set(getNgrams(text, n=3))

    if label == 1:
        return found.intersection(libGrams)
    if label == -1:
        return found.intersection(conGrams)

    # neutral: keep only sentences that trigger neither detector list
    triggered = found.intersection(libGrams) or found.intersection(conGrams)
    return not triggered

In [42]:
# function to filter the dataframe
def filterText(df):
    '''Collect the sentences of every article that keepSentence accepts.

    Args:
        df: DataFrame with a 'label' column and a 'content' column holding
            the article text.

    Returns:
        DataFrame with columns ['label', 'text', 'ngrams'], one row per kept
        sentence. 'ngrams' holds whatever truthy value keepSentence returned:
        the set of matched detector n-grams for labels 1 and -1, and True for
        any other label.
    '''
    filteredText = []

    # walk the two columns in lockstep instead of doing a positional
    # .iloc lookup on each column for every row
    rows = zip(df['label'], df['content'])
    for label, content in tqdm(rows, total=df.shape[0]):
        # content is NaN (a float) when the text is missing; sent_tokenize
        # cannot handle that, so such rows contribute no sentences
        if not isinstance(content, str):
            continue

        # tokenize each sentence in the doc and determine whether to keep it
        for text in sent_tokenize(content):
            ngrams = keepSentence(label, text)

            # if the returned value is truthy (non-empty set or True), keep the sentence
            if ngrams:
                filteredText.append((label, text, ngrams))

    return pd.DataFrame(filteredText, columns=['label','text','ngrams'])

In [11]:
# smoke-test the filter on 100 randomly drawn articles before the full run
sample = df.sample(n=100)
df2 = filterText(sample)

100%|██████████| 100/100 [00:01<00:00, 59.77it/s]


In [12]:
# count the kept sentences per label in the filtered sample
df2.groupby('label').count()

Unnamed: 0_level_0,text,ngrams
label,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,11,11
0,274,274
1,24,24


In [43]:
# filter the full dataset, then count the kept sentences per label
df2 = filterText(df)
df2.groupby('label').count()

100%|██████████| 142570/142570 [38:55<00:00, 47.08it/s]


Unnamed: 0_level_0,text,ngrams
label,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,16745,16745
0,277751,277751
1,28230,28230


In [44]:
# delete original set for memory conservation; the filtered sentences live in df2
del df

In [49]:
# per-label sample size used for balancing: the smallest label count rounded
# DOWN to a whole thousand. round(count, -3) can round up past the smallest
# group (16745 -> 17000), which makes .sample(n) fail, so floor division is
# used; for the label counts shown above this gives 16000.
# NOTE: a label with fewer than 1000 sentences would make n == 0.
n = int(df2.label.value_counts().min()) // 1000 * 1000

In [None]:
# grab a sample of n sentences from each label for label balancing;
# random_state pins the draw so a re-run selects the same sentences
# and the saved dataset is reproducible
s1 = df2.loc[df2.label == -1].sample(n, random_state=42)
s2 = df2.loc[df2.label == 1].sample(n, random_state=42)
s3 = df2.loc[df2.label == 0].sample(n, random_state=42)

In [52]:
# create new df with balanced labels; pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0, and keeps the original row index like append did
df2 = pd.concat([s1, s2, s3])
df2.shape, n*3

((48000, 3), 48000)

In [54]:
# persist only the label and sentence text of the balanced dataset
balanced_sentences = df2[['label', 'text']]
with open(r'../data/filteredNews6.pickle', 'wb') as out_file:
    pickle.dump(balanced_sentences, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## Reload saved data and remove escape characters
We found that escape characters (such as carriage returns) were causing trouble with model predictions, so we removed them.

In [15]:
# reload the filtered sentence dataset from disk
# NOTE(review): pickle.load can execute arbitrary code — only load pickle files this notebook wrote
with open(r'../data/filteredNews6.pickle', 'rb') as f:
    df = pickle.load(f)

In [5]:
# one sentence whose text contains an escape character (looked up by row position)
s = df['text'].iloc[30794]
s

'@CookPolitical\r  —   Dave Wasserman (@Redistrict) March 18, 2016       Dave Wasserman is the US House editor of the Cook Political Report, the gold standard for granular analysis of congressional races.'

In [6]:
# showing the break in the sentence from the carriage return:
# the printed text is missing the leading '@CookPolitical' that the repr of s contains
print(s)

  —   Dave Wasserman (@Redistrict) March 18, 2016       Dave Wasserman is the US House editor of the Cook Political Report, the gold standard for granular analysis of congressional races.


In [18]:
# delete every tab, carriage return, newline, form feed and vertical tab from
# each sentence and store the result in a new column; the characters are
# removed outright, with nothing inserted in their place
pattern = re.compile('[\t\r\n\f\v]')
df['newText'] = [pattern.sub('', sentence) for sentence in df.text]

In [19]:
# discard the unfiltered sentences and let the cleaned column take over the name 'text'
df = df.drop(columns=['text']).rename(columns={'newText': 'text'})

In [20]:
# overwrite the saved dataset with the cleaned sentences, then read it straight back
cleaned_path = r'../data/filteredNews6.pickle'
with open(cleaned_path, 'wb') as f:
    pickle.dump(df, f, protocol=pickle.HIGHEST_PROTOCOL)
with open(cleaned_path, 'rb') as f:
    df = pickle.load(f)

In [21]:
# print every cleaned sentence (with its row position) that mentions the example name
pattern = 'Dave Wasserman'
for position, sentence in enumerate(df['text']):
    if pattern not in sentence:
        continue
    print(position, sentence, '\n')

17651 Here is Dave Wasserman, who writes for the Cook Political Report and FiveThirtyEight, making the argument:   Jill Stein is now officially the Ralph Nader of 2016. 

22942 And Bernie Sanders’s path to victory depends on winning the voters Hillary Clinton won in 2008,” says Dave Wasserman, a political analyst at the nonpartisan Cook Political Report. 

28648 One of the top guys I know to follow for this kind of analysis on Twitter is Dave Wasserman of the Cook Political Report. 

30794 @CookPolitical  —   Dave Wasserman (@Redistrict) March 18, 2016       Dave Wasserman is the US House editor of the Cook Political Report, the gold standard for granular analysis of congressional races. 



## Check sample sentences for each label

In [22]:
# print five randomly chosen sentences labelled 1 (liberal)
liberal_text = df.loc[df.label == 1, 'text']
for sentence in liberal_text.sample(5).values:
    print(sentence, '\n')

On the prevalence of substance abuse in the United States, An estimated 20. 

The episode was just one of several examples that highlighted the relationship Pruitt and his top aides maintained with Devon Energy and the oil and gas industry during his time as Oklahoma attorney general. 

Schleicher says there’s also great promise in the Common Core State Standards, adopted by over 40 states, and which   Donald Trump has called “a total disaster. 

“I continue to have real concerns about the Medicaid policies in this bill, especially those that impact drug treatment at a time when Ohio is facing an opioid epidemic,” Portman said this week, citing the issue that became his personal rallying cry last year. 

It called on President Salva Kiir and his rival Vice President Riek Machar to control their respective forces, prevent the spread of violence and genuinely commit themselves to the implementation of a ceasefire and peace agreement. 



In [23]:
# print five randomly chosen sentences labelled -1 (conservative)
conservative_text = df.loc[df.label == -1, 'text']
for sentence in conservative_text.sample(5).values:
    print(sentence, '\n')

[Judd’s comments came during an interview with Fox Business Channel’s Stuart Varney where he addressed the dramatic impact the Trump Administration has already had on reducing illegal border crossings. 

A second picture from the same parade showed a man dressed in the rags of a homeless man carrying a sign reading “if only I was a migrant” reports   Russia’s Federal Security Service (FSB) announced Monday that it had arrested seven members of an Islamic State terror cell who were allegedly planning major jihadist attacks on Moscow and St. Petersburg. 

Herridge joined FNC in 1996 as a   correspondent. 

Reports note that the gang rape might never have come to light if one of Burrows’ fellow gang members hadn’t bragged about the rape to his   and showed him the “funny” video recording of the rape, which he had saved on his cellphone. 

” “Expect our civil disobedience to continue,” he added. 



In [24]:
# print five randomly chosen sentences labelled 0 (neutral)
neutral_text = df.loc[df.label == 0, 'text']
for sentence in neutral_text.sample(5).values:
    print(sentence, '\n')

”I say it and I’m going to keep saying it and some people say: ’Wow that makes sense’ and some people say: ’That’s not very nice,’” Trump said. 

Historically, those taxes have been kept low by the government’s aversion to raising rates. 

”It couldn’t be any worse, could it?” A spokeswoman for Clinton said Wednesday’s report was ”disturbing.” ”These reports suggest that he lied on the debate stage and that the disgusting behavior he bragged about in the tape is more than just words,” said Jennifer Palmieri, a spokeswoman for the Clinton campaign. 

Last year, China signed an agreement with Pakistan for the sale of eight submarines. 

English, who was the sole contender for the job, announced Social Housing Minister Paula Bennett as the deputy leader, following a special caucus meeting of the ruling   National Party. 

