In [1]:
import pandas as pd
import nltk
import time, os, pickle
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, names, opinion_lexicon
from collections import Counter
from tqdm import tqdm, tqdm_notebook
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('names')
nltk.download('opinion_lexicon')

[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to /home/alex/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/alex/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

# import data

In [2]:
# Read the three CSV shards of the all-the-news dataset and stack them into a
# single frame. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in 2.0; ignore_index=True renumbers the rows 0..n-1.
df1 = pd.read_csv(r'../all-the-news/articles1.csv')
df2 = pd.read_csv(r'../all-the-news/articles2.csv')
df3 = pd.read_csv(r'../all-the-news/articles3.csv')
df = pd.concat([df1, df2, df3], ignore_index=True)
# Append the headline to the body so both feed the n-gram counts. If either
# part is missing (NaN) the combined content is NaN as well.
df['content'] = df.content + ' ' + df.title
df = df.loc[:,['publication','content']]
del df1, df2, df3
df.shape

(142570, 2)

In [3]:
# Every lower-cased word that occurs in a publication name, in order of first
# appearance of the publication; duplicates such as 'new' or 'post' are kept.
pubs = [word
        for name in df.publication.unique()
        for word in name.lower().split()]
pubs

['new',
 'york',
 'times',
 'breitbart',
 'cnn',
 'business',
 'insider',
 'atlantic',
 'fox',
 'news',
 'talking',
 'points',
 'memo',
 'buzzfeed',
 'news',
 'national',
 'review',
 'new',
 'york',
 'post',
 'guardian',
 'npr',
 'reuters',
 'vox',
 'washington',
 'post']

# Keep only the publications with the strongest bias ratings, plus Reuters as the neutral baseline

In [4]:
publishers = ['Breitbart', 'CNN', 'Fox News', 'Talking Points Memo','National Review','Reuters','Vox']
# Vectorised membership test instead of one .iloc lookup per row in a Python
# loop. The explicit .copy() makes the result an independent frame, so adding
# columns to it later is not a write into a slice of the full dataset.
df = df.loc[df.publication.isin(publishers)].copy()
df.publication.unique()

array(['Breitbart', 'CNN', 'Fox News', 'Talking Points Memo',
       'National Review', 'Reuters', 'Vox'], dtype=object)

In [5]:
df.shape

(66697, 2)

In [6]:
# actual labels from MBFC
# Maps publication name -> bias rating string. Only the substrings 'left' and
# 'right' matter downstream, where the ratings are collapsed to 1 / -1 / 0.
# NOTE(review): MBFC presumably refers to Media Bias/Fact Check — confirm the
# source and the date the ratings were taken.
bias_dict = {
    'New York Times': 'left-center',
    'Breitbart': 'extreme-right',
    'CNN': 'left',
    'Business Insider': 'left-center',
    'Atlantic': 'left-center',
    'Fox News': 'right',
    'Talking Points Memo': 'left',
    'Buzzfeed News': 'left-center',
    'National Review': 'right',
    'New York Post': 'right-center',
    'Guardian': 'left-center',
    'NPR': 'left-center',
    'Reuters': 'neutral',
    'Vox': 'left',
    'Washington Post': 'left-center'
}

In [7]:
# simplified labels: 1 = any 'left' rating, -1 = any 'right' rating, 0 = otherwise
def _simplify_bias(rating):
    """Collapse a rating string to 1 (contains 'left'), -1 (contains 'right') or 0."""
    # Values that were already collapsed pass through unchanged, so re-running
    # this cell no longer fails with TypeError on `'left' in 1`.
    if not isinstance(rating, str):
        return rating
    if 'left' in rating:
        return 1
    if 'right' in rating:
        return -1
    return 0

bias_dict = {pub: _simplify_bias(rating) for pub, rating in bias_dict.items()}
for k,v in bias_dict.items():
    print(k,v)

New York Times 1
Breitbart -1
CNN 1
Business Insider 1
Atlantic 1
Fox News -1
Talking Points Memo 1
Buzzfeed News 1
National Review -1
New York Post -1
Guardian 1
NPR 1
Reuters 0
Vox 1
Washington Post 1


In [8]:
df.columns

Index(['publication', 'content'], dtype='object')

In [9]:
# Look up every row's publication in bias_dict; an unknown publication raises KeyError.
df['label'] = list(map(bias_dict.__getitem__, df.publication.values))

In [10]:
df.columns

Index(['publication', 'content', 'label'], dtype='object')

In [11]:
df.groupby('publication').count()

Unnamed: 0_level_0,content,label
publication,Unnamed: 1_level_1,Unnamed: 2_level_1
Breitbart,23781,23781
CNN,11488,11488
Fox News,4354,4354
National Review,6203,6203
Reuters,10709,10710
Talking Points Memo,5213,5214
Vox,4947,4947


In [12]:
df.groupby('label').count()

Unnamed: 0_level_0,publication,content
label,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,34338,34338
0,10710,10709
1,21649,21648


# Find n-grams

split data by label

In [13]:
# Partition the articles by ideology label: 1 -> liberal, -1 -> conservative.
# Rows labelled 0 belong to neither subset.
lib = df[df.label.eq(1)]
con = df[df.label.eq(-1)]

create stopwords list

In [14]:
# Start from NLTK's English stop words (lower-cased) and add hand-picked words
# that should never be part of a counted n-gram.
stops = [w.lower() for w in stopwords.words('english')]
stops.extend([
    'trump', 'good', 'great', 'bad', 'pretty', 'covering', 'writer',
    'author', 'like', 'news', 'follow', 'tv', 'said', 'could', 'would',
    'really', 'best', 'journalist', 'journalists', 'commentator',
    'affiliate', 'told', 'reporter', 'reporters', 'siriusxm', 'radio',
    'columnist', 'contributed', 'bestselling',
])

In [15]:
# Lower-cased first names from the NLTK names corpus, used as extra stop words.
men = list(map(str.lower, names.words('male.txt')))
women = list(map(str.lower, names.words('female.txt')))
stopNames = men + women

In [16]:
# Add the publication-name words and the first names to the stop list.
for extra in (pubs, stopNames):
    stops.extend(extra)

Get opinion words list

In [17]:
ops = opinion_lexicon.words()

In [18]:
# Sets give constant-time membership tests and drop duplicate entries.
stops, ops = set(stops), set(ops)

In [19]:
len(stops), len(ops)

(7800, 6786)

In [2]:
# define a helper function to pull the n-grams
def getNgrams(text, stops=None, ops=None, n=2):
    '''Count the n-grams of consecutive lower-cased tokens in ``text``.

    Parameters
    ----------
    text : str
        Text to split with ``word_tokenize``.
    stops : collection of str, optional
        When given and non-empty, an n-gram is kept only if every one of its
        tokens is longer than two characters, is not numeric and is not in
        ``stops``. When omitted or empty, no token is filtered at all
        (punctuation tokens included).
    ops : collection of str, optional
        When given and non-empty, an n-gram is kept only if at least one of
        its tokens is in ``ops``.
    n : int
        Number of tokens per n-gram (2 for bigrams, 3 for trigrams).

    Returns
    -------
    collections.Counter
        Each kept n-gram (tokens joined by single spaces) mapped to the
        number of times it occurs. Empty when ``text`` has fewer than two
        tokens or fewer than ``n`` tokens.
    '''
    def criteria(word):
        # Without a stop list every token is acceptable.
        if not stops:
            return True
        return len(word) > 2 and not word.isnumeric() and word not in stops

    ngrams = Counter()

    # Tokenize and lower-case. Nothing is removed here; short tokens are only
    # rejected by criteria(), and only when a stop list was supplied.
    words = [w.lower() for w in word_tokenize(text)]

    # Texts with fewer than two tokens are never counted, whatever n is.
    if len(words) < 2:
        return ngrams

    for i in range(len(words) - (n - 1)):
        # Slice the window once and reuse it for the filters and the key.
        window = words[i:i+n]
        # Generator expressions let all()/any() stop at the first decisive token.
        if not all(criteria(w) for w in window):
            continue
        if ops and not any(w in ops for w in window):
            continue
        ngrams[' '.join(window)] += 1

    return ngrams

In [21]:
# Count trigrams over every sentence of the liberal articles.
libTrigrams = Counter()
for content in tqdm(lib.content):
    # Articles without text (NaN content) contribute nothing. Checking the
    # type explicitly replaces a bare `except:`, which would also have hidden
    # a KeyboardInterrupt or a missing-tokenizer LookupError during this
    # multi-minute loop.
    if not isinstance(content, str):
        continue
    for text in sent_tokenize(content):
        libTrigrams.update(getNgrams(text, stops=stops, n=3))

100%|██████████| 21649/21649 [03:33<00:00, 101.18it/s]


In [22]:
# Count bigrams that contain at least one opinion-lexicon word over every
# sentence of the liberal articles.
libBigrams = Counter()
for content in tqdm(lib.content):
    # Articles without text (NaN content) contribute nothing. Checking the
    # type explicitly replaces a bare `except:`, which would also have hidden
    # a KeyboardInterrupt or a missing-tokenizer LookupError during this
    # multi-minute loop.
    if not isinstance(content, str):
        continue
    for text in sent_tokenize(content):
        libBigrams.update(getNgrams(text, stops=stops, ops=ops, n=2))

100%|██████████| 21649/21649 [03:44<00:00, 96.26it/s]


In [23]:
# Count trigrams over every sentence of the conservative articles.
conTrigrams = Counter()
for content in tqdm(con.content):
    # Articles without text (NaN content) contribute nothing. Checking the
    # type explicitly replaces a bare `except:`, which would also have hidden
    # a KeyboardInterrupt or a missing-tokenizer LookupError during this
    # multi-minute loop.
    if not isinstance(content, str):
        continue
    for text in sent_tokenize(content):
        conTrigrams.update(getNgrams(text, stops=stops, n=3))

100%|██████████| 34338/34338 [04:28<00:00, 127.88it/s]


In [24]:
# Count bigrams that contain at least one opinion-lexicon word over every
# sentence of the conservative articles.
conBigrams = Counter()
for content in tqdm(con.content):
    # Articles without text (NaN content) contribute nothing. Checking the
    # type explicitly replaces a bare `except:`, which would also have hidden
    # a KeyboardInterrupt or a missing-tokenizer LookupError during this
    # multi-minute loop.
    if not isinstance(content, str):
        continue
    for text in sent_tokenize(content):
        conBigrams.update(getNgrams(text, stops=stops, ops=ops, n=2))

100%|██████████| 34338/34338 [04:18<00:00, 132.96it/s]


In [25]:
# get the 1000 most common liberal and conservative trigrams and bigrams
# most_common(1000) selects the top entries directly instead of sorting the
# whole counter and slicing; the resulting order is the same.
commonConTrigrams = [gram for gram, _ in conTrigrams.most_common(1000)]
commonLibTrigrams = [gram for gram, _ in libTrigrams.most_common(1000)]
commonConBigrams = [gram for gram, _ in conBigrams.most_common(1000)]
commonLibBigrams = [gram for gram, _ in libBigrams.most_common(1000)]

In [26]:
# Hand-picked bigrams to exclude from the ideology lists.
# NOTE(review): these look like entertainment/sports phrases and site
# boilerplate rather than political language — confirm the selection criteria.
remove_bigrams = ['doctor strange','fury road','gold medals','gold medal',
                  'mad men','stranger things','little lies','lose weight',
                  'world champion','premier league','walking dead',
                  'weight loss','grand slam','science fiction','right now.',
                  'open champion','associated press','world heavyweight',
                  'wonder woman','last month','top stories']

In [27]:
# Keep, for each ideology, the n-grams that are in its own top 1000 but not in
# the other ideology's top 1000, as (ngram, count) pairs in top-1000 order.
# Trigrams are dropped when any blacklisted bigram occurs inside them as a
# substring; bigrams are dropped only on an exact match with the blacklist.
# NOTE: nothing here strips 'amp nbsp'-style HTML residue; an earlier comment
# said so, but no such removal is performed in this cell.

# Sets make each membership test constant-time instead of a scan of a
# 1000-element list.
_conTrigramSet, _libTrigramSet = set(commonConTrigrams), set(commonLibTrigrams)
_conBigramSet, _libBigramSet = set(commonConBigrams), set(commonLibBigrams)
_removeSet = set(remove_bigrams)

def _has_removed_bigram(gram):
    """Return True when any blacklisted bigram is a substring of ``gram``."""
    return any(b in gram for b in remove_bigrams)

libTrigrams_filtered = [(w, libTrigrams[w]) for w in commonLibTrigrams
                        if w not in _conTrigramSet and not _has_removed_bigram(w)]
libBigrams_filtered = [(w, libBigrams[w]) for w in commonLibBigrams
                       if w not in _conBigramSet and w not in _removeSet]
conTrigrams_filtered = [(w, conTrigrams[w]) for w in commonConTrigrams
                        if w not in _libTrigramSet and not _has_removed_bigram(w)]
conBigrams_filtered = [(w, conBigrams[w]) for w in commonConBigrams
                       if w not in _libBigramSet and w not in _removeSet]

In [28]:
len(libTrigrams_filtered),len(conTrigrams_filtered),len(libBigrams_filtered),len(conBigrams_filtered)

(485, 491, 393, 409)

In [38]:
print('top 10 liberal trigrams:')
# Highest count first; ties keep their existing relative order.
for gram, _count in sorted(libTrigrams_filtered, key=lambda pair: pair[1], reverse=True)[:10]:
    print(gram)

top 10 liberal trigrams:
senior administration official
green card holders
greenhouse gas emissions
north korean leader
federal civil rights
provocative narrative essays
health care policy
republican health care
gop health care
civil rights laws


In [39]:
print('top 10 liberal bigrams:')
# Highest count first; ties keep their existing relative order.
for gram, _count in sorted(libBigrams_filtered, key=lambda pair: pair[1], reverse=True)[:10]:
    print(gram)

top 10 liberal bigrams:
fast facts
opioid epidemic
health reform
lethal injection
budget reconciliation
chronic pain
lead poisoning
intelligence committees
prison sentences
rights advocates


In [40]:
print('top 10 conservative trigrams')
# Highest count first; ties keep their existing relative order.
for gram, _count in sorted(conTrigrams_filtered, key=lambda pair: pair[1], reverse=True)[:10]:
    print(gram)

top 10 conservative trigrams
battleground prediction map
jerusalem bureau chief
popular weekend talk
tweetsa question needing
voter suppression cost
says voter suppression
border patrol agent
electionhillary blames america
blames america firsthillary
america firsthillary says


In [41]:
print('top 10 conservative bigrams')
# Highest count first; ties keep their existing relative order.
for gram, _count in sorted(conBigrams_filtered, key=lambda pair: pair[1], reverse=True)[:10]:
    print(gram)

top 10 conservative bigrams
illegal aliens
illegal alien
illegal immigrant
migrant crisis
popular weekend
patriot channel
hard truths
limited government
islamic terror
twin falls


In [42]:
# For each ideology keep the 100 most frequent filtered trigrams followed by
# the 100 most frequent filtered bigrams, as plain strings without counts.
by_count_desc = lambda pair: -pair[1]
libGrams = [gram for gram, _ in sorted(libTrigrams_filtered, key=by_count_desc)[:100]]
libGrams += [gram for gram, _ in sorted(libBigrams_filtered, key=by_count_desc)[:100]]
conGrams = [gram for gram, _ in sorted(conTrigrams_filtered, key=by_count_desc)[:100]]
conGrams += [gram for gram, _ in sorted(conBigrams_filtered, key=by_count_desc)[:100]]

In [43]:
# Persist the two n-gram lists (plain lists of strings) so they can be
# reloaded later without repeating the counting loops.
with open('../data/libGrams_news5.pickle', 'wb') as f:
    pickle.dump(libGrams, f, pickle.HIGHEST_PROTOCOL)
with open('../data/conGrams_news5.pickle', 'wb') as f:
    pickle.dump(conGrams, f, pickle.HIGHEST_PROTOCOL)

# Load liberal and conservative n-grams (trigrams and bigrams)

In [3]:
# Reload the n-gram lists written by this notebook and preview five of each.
# pickle.load can execute arbitrary code, so only load files you created.
with open('../data/libGrams_news5.pickle', 'rb') as f:
    libGrams = pickle.load(f)
print(libGrams[:5])
with open('../data/conGrams_news5.pickle', 'rb') as f:
    conGrams = pickle.load(f)
print(conGrams[:5])

['senior administration official', 'green card holders', 'greenhouse gas emissions', 'north korean leader', 'federal civil rights']
['battleground prediction map', 'jerusalem bureau chief', 'popular weekend talk', 'tweetsa question needing', 'voter suppression cost']


# Reload Data and test sentence filtering

In [4]:
# Read the three CSV shards again and stack them into a single frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; ignore_index=True renumbers the rows 0..n-1.
df1 = pd.read_csv(r'../all-the-news/articles1.csv')
df2 = pd.read_csv(r'../all-the-news/articles2.csv')
df3 = pd.read_csv(r'../all-the-news/articles3.csv')
df = pd.concat([df1, df2, df3], ignore_index=True)
# Append the headline to the body. If either part is missing (NaN) the
# combined content is NaN as well.
df['content'] = df.content + ' ' + df.title
df = df.loc[:,['publication','content']]
del df1, df2, df3
df.shape

(142570, 2)

In [52]:
# NOTE(review): this cell carries execution count 52 while its neighbours are
# numbered 4 and 5, and the later full run processes 142,570 rows, so it looks
# like a leftover debugging subsample that was not applied to the saved
# results. On Restart & Run All it WOULD shrink df to 10,000 random rows
# (unseeded) — confirm whether it should be deleted.
df = df.sample(10000)

# Label the articles and test the sentence filter

In [5]:
# actual labels from MBFC
# Maps publication name -> bias rating string. Only the substrings 'left' and
# 'right' matter downstream, where the ratings are collapsed to 1 / -1 / 0.
# NOTE(review): MBFC presumably refers to Media Bias/Fact Check — confirm the
# source and the date the ratings were taken.
bias_dict = {
    'New York Times': 'left-center',
    'Breitbart': 'extreme-right',
    'CNN': 'left',
    'Business Insider': 'left-center',
    'Atlantic': 'left-center',
    'Fox News': 'right',
    'Talking Points Memo': 'left',
    'Buzzfeed News': 'left-center',
    'National Review': 'right',
    'New York Post': 'right-center',
    'Guardian': 'left-center',
    'NPR': 'left-center',
    'Reuters': 'neutral',
    'Vox': 'left',
    'Washington Post': 'left-center'
}

In [6]:
# simplified labels: 1 = any 'left' rating, -1 = any 'right' rating, 0 = otherwise
def _simplify_bias(rating):
    """Collapse a rating string to 1 (contains 'left'), -1 (contains 'right') or 0."""
    # Values that were already collapsed pass through unchanged, so re-running
    # this cell no longer fails with TypeError on `'left' in 1`.
    if not isinstance(rating, str):
        return rating
    if 'left' in rating:
        return 1
    if 'right' in rating:
        return -1
    return 0

bias_dict = {pub: _simplify_bias(rating) for pub, rating in bias_dict.items()}
for k,v in bias_dict.items():
    print(k,v)

New York Times 1
Breitbart -1
CNN 1
Business Insider 1
Atlantic 1
Fox News -1
Talking Points Memo 1
Buzzfeed News 1
National Review -1
New York Post -1
Guardian 1
NPR 1
Reuters 0
Vox 1
Washington Post 1


In [7]:
# Look up every row's publication in bias_dict; an unknown publication raises KeyError.
df['label'] = list(map(bias_dict.__getitem__, df.publication.values))

In [8]:
df.columns

Index(['publication', 'content', 'label'], dtype='object')

In [9]:
def keepSentence(label, text):
    '''Decide whether a sentence is kept for an article with the given label.

    Sentences with fewer than six whitespace-separated words are rejected
    (returns False). Otherwise the sentence's bigrams and trigrams are matched
    against the ``libGrams`` and ``conGrams`` lists:

    * label 1  -> returns the set of matching liberal n-grams (may be empty)
    * label -1 -> returns the set of matching conservative n-grams (may be empty)
    * any other label -> returns True only when neither list matches

    The result is meant to be used for its truthiness; for labels 1 and -1 it
    additionally carries the matched n-grams.
    '''
    if len(text.split()) < 6:
        return False

    # All distinct bigrams and trigrams of the sentence, unfiltered.
    found = set(getNgrams(text, n=2)) | set(getNgrams(text, n=3))

    libNgramSet = found.intersection(libGrams)
    conNgramSet = found.intersection(conGrams)

    if label == 1:
        return libNgramSet
    if label == -1:
        return conNgramSet
    # Other labels: keep only sentences free of both ideologies' n-grams.
    return not (libNgramSet or conNgramSet)

In [10]:
def filterText(df):
    '''Split every article into sentences and keep those accepted by keepSentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``label`` column and a ``content`` column. Rows whose content
        is not a string (for example NaN) are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per kept sentence with columns ``label`` (the article's
        label), ``text`` (the sentence) and ``ngrams`` (the truthy value
        returned by ``keepSentence`` for that sentence).
    '''
    filteredText = []
    # Iterate the two columns directly instead of one .iloc lookup per row;
    # total= keeps the progress bar's row count.
    for label, content in tqdm(zip(df.label, df.content), total=df.shape[0]):
        # Missing content has no sentences. The explicit type check replaces a
        # bare `except:` that also swallowed KeyboardInterrupt and real
        # tokenizer failures (e.g. missing punkt data) during this long loop.
        if not isinstance(content, str):
            continue
        for text in sent_tokenize(content):
            ngrams = keepSentence(label, text)
            if ngrams:
                filteredText.append((label, text, ngrams))

    return pd.DataFrame(filteredText, columns=['label','text','ngrams'])

In [11]:
# Try the filter on 100 randomly drawn articles before the full run. The draw
# is unseeded, so the selected articles differ between runs.
sample = df.sample(100)
df2 = filterText(sample)

100%|██████████| 100/100 [00:01<00:00, 59.77it/s]


In [12]:
df2.groupby('label').count()

Unnamed: 0_level_0,text,ngrams
label,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,11,11
0,274,274
1,24,24


In [15]:
# Print five randomly chosen kept sentences from label 1 articles.
for sentence in df2.loc[df2.label == 1, 'text'].sample(5):
    print(sentence,'\n')

Here’s what you need to know: • American divisions are rapidly widening over President Trump’s order to close the U. S. to refugees and people from seven predominantly Muslim countries. 

Sure enough, she had skin cancer. 

In a video posted on her campaign’s Facebook page shortly after Mr. Sanders departed the White House grounds to visit the Capitol, Mr. Obama described Mrs. Clinton as the most qualified candidate to seek the White House, and implored Democrats to come together to elect her after a divisive party primary. 

Doctors realized that excess cholesterol in our blood predicts a higher risk of heart disease. 

Price, as head of the House Budget Committee and a member of the   Ways and Means Committee, has influence over health care legislation, such as the Affordable Care Act and programs that include Medicare and Medicaid. 



In [16]:
# Print five randomly chosen kept sentences from label -1 articles.
for sentence in df2.loc[df2.label == -1, 'text'].sample(5):
    print(sentence,'\n')

On Friday’s broadcast of HBO’s “Real Time,” filmmaker Michael Moore  reacted to the FBI announcing it was taking a second look into Democratic presidential nominee former Secretary of State Hillary Clinton’s emails by stating, “She has been attacked and abused for 30 years. 

” Follow Breitbart News investigative reporter and Citizen Journalism School founder Lee Stranahan on Twitter at @Stranahan. 

And the costs of illegal alien crime continued to mount and a lethal opioid epidemic raged. 

Obama’s claim of civic peace is also at odds with the televised evidence dramatic race riots,   cop killings, rapes, murders, illegal alien crimes, and chaos that rippled across the country during the second term of his presidency. 

See the Fox News 2016 battleground prediction map and make your own election projections. 



In [14]:
# Print five randomly chosen kept sentences from label 0 articles.
for sentence in df2.loc[df2.label == 0, 'text'].sample(5):
    print(sentence,'\n')

Trump’s national security team is reviewing a wide range of options to counter the missile threat. 

Rousseff’s survival hinges on winning over a dwindling number of undecided lawmakers who are also being courted by the man poised to take over if she is ousted, Vice President Michel Temer. 

”The manufacturing malaise that plagued the U. S. is not  . 

She has branded him a traitor. 

Showcasing their attempts to unite with other groups for the election, Islamists campaigned with Awdeh Qawwas, a prominent priest, in the affluent Abdoun district of the capital Amman. 



In [18]:
# Run the sentence filter over the whole frame and count kept sentences per
# label. The recorded run over 142,570 articles took about 39 minutes.
df2 = filterText(df)
df2.groupby('label').count()

100%|██████████| 142570/142570 [39:11<00:00, 60.62it/s]


Unnamed: 0_level_0,text,ngrams
label,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,17969,17969
0,278353,278353
1,25901,25901


In [23]:
del df

In [24]:
# Per-class sample size: the smallest class count rounded DOWN to a whole
# thousand, so every class is guaranteed to hold at least n sentences.
# (round(x, -3) rounds to the NEAREST thousand and can overshoot the smallest
# class, e.g. 17969 -> 18000, which is why it was not usable here.)
n = int(df2.label.value_counts().min()) // 1000 * 1000

In [25]:
# Draw n sentences from each class so the three labels are balanced. A fixed
# random_state makes the draw, and therefore the saved dataset, reproducible.
s1 = df2.loc[df2.label == -1].sample(n, random_state=0)
s2 = df2.loc[df2.label == 1].sample(n, random_state=0)
s3 = df2.loc[df2.label == 0].sample(n, random_state=0)
type(s1), type(s2), type(s3)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [26]:
# Verify the three samples are balanced. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
pd.concat([s1, s2, s3]).groupby('label').count()

Unnamed: 0_level_0,text,ngrams
label,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,17000,17000
0,17000,17000
1,17000,17000


In [27]:
# Stack the three balanced samples, keeping each row's original index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df2 = pd.concat([s1, s2, s3])
df2.shape, n*3

((51000, 3), 51000)

In [28]:
df2.columns

Index(['label', 'text', 'ngrams'], dtype='object')

In [29]:
# Save only the label and sentence text; the matched n-grams column is not
# written to the file.
with open(r'../data/filteredNews5.pickle', 'wb') as f:
    pickle.dump(df2.loc[:,['label','text']], f, pickle.HIGHEST_PROTOCOL)

In [30]:
# Read the saved dataset back to check the round trip. pickle.load can execute
# arbitrary code, so only load files you created yourself.
with open(r'../data/filteredNews5.pickle', 'rb') as f:
    df2 = pickle.load(f)

In [31]:
df2.shape

(51000, 2)

In [32]:
df2.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
-1,17000
0,17000
1,17000
