In [1]:
import numpy as np
import pandas as pd
import os

os.chdir("../")
from lib.scraper import scrape_body, scrape_header

from collections import Counter

In [2]:
df = pd.read_csv('data/Interactive Media Bias Chart - Ad Fontes Media.csv')
scraped_df = pd.read_csv('data/scraped_data.csv')

In [3]:
df.head()

Unnamed: 0,Source,Url,Bias,Quality
0,ABC,https://abcnews.go.com/Politics/us-disrupted-a...,-5.33,52.33
1,ABC,https://abcnews.go.com/Politics/appeals-court-...,0.67,51.67
2,ABC,https://abcnews.go.com/Politics/electoral-coll...,-10.0,32.0
3,ABC,https://abcnews.go.com/Politics/facebook-agree...,-2.33,52.33
4,ABC,https://abcnews.go.com/Politics/donald-trump-t...,-4.33,52.67


In [4]:
scraped_df.head()

Unnamed: 0.1,Unnamed: 0,Source,Url,Bias,Quality,Header,Body
0,0,ABC,https://abcnews.go.com/Politics/us-disrupted-a...,-5.33,52.33,US disrupted alleged Russian trolls' internet ...,U.S. cyber operators disrupted internet access...
1,1,ABC,https://abcnews.go.com/Politics/appeals-court-...,0.67,51.67,Appeals court says special counsel Robert Muel...,A federal appeals court rejected the most dire...
2,2,ABC,https://abcnews.go.com/Politics/electoral-coll...,-10.0,32.0,The Electoral College limits the campaign play...,"U.S Senator Elizabeth Warren, who is competing..."
3,3,ABC,https://abcnews.go.com/Politics/facebook-agree...,-2.33,52.33,Facebook agrees to hide demographics from land...,Facebook announced Tuesday that it will block ...
4,4,ABC,https://abcnews.go.com/Politics/donald-trump-t...,-4.33,52.67,"Donald Trump and 'the Trump of the Tropics,' B...","President Donald Trump and ""the Trump of the T..."


## Checking number of successful scrapes

In [5]:
clean_scraped = scraped_df.drop(scraped_df[scraped_df['Header'].isna()].index, axis=0)

In [6]:
len(clean_scraped)

1311

In [7]:
len(scraped_df)

1916

## Checking content for scraped data

In [8]:
clean_scraped['header_len'] = clean_scraped['Header'].apply(lambda x: len(str(x)))
clean_scraped['body_len'] = clean_scraped['Body'].apply(lambda x: len(str(x)))

In [9]:
clean_scraped['header_len'].describe()

count    1311.000000
mean       67.473684
std        27.776052
min         2.000000
25%        55.000000
50%        68.000000
75%        84.000000
max       188.000000
Name: header_len, dtype: float64

In [10]:
clean_scraped['body_len'].describe()

count      1311.000000
mean       7185.812357
std       17974.291546
min           3.000000
25%        2607.500000
50%        4588.000000
75%        7159.500000
max      380001.000000
Name: body_len, dtype: float64

## Checking sources where scraper did not work

In [11]:
failed_sources = scraped_df[scraped_df['Header'].isna()]['Source']

In [12]:
failed_sources.unique()

array(['Alternet', 'American Spectator, The', 'Axios',
       'Bipartisan Report', 'CBS', 'CNN', 'Conservative Tribune',
       'Counterpunch', 'Daily Beast', 'Daily Caller', 'Daily Kos',
       'Daily Signal', 'Fortune', 'Fox News', 'FreeSpeech TV',
       'Guacamoley', 'Huffington Post', 'InfoWars', 'Intercept',
       'LA Times', 'Life News', 'MSNBC', 'NewsPunch', 'Occupy Democrats',
       'One America News Network', 'Palmer Report', 'PJ Media',
       'ProPublica', 'RedState', 'Reuters', 'Second Nexus', 'Spoutable',
       'The Advocate', 'The American Conservative', 'The Economist',
       'The Federalist', 'The Gateway Pundit', 'The Skimm', 'The Week',
       'Time', 'Truthout', 'Twitchy', 'UrNews24', 'Washington Monthly',
       'Washington Times', 'Weather.com', 'World Truth TV',
       'WorldNetDaily'], dtype=object)

In [13]:
len(failed_sources)

605

In [14]:
len(scraped_df['Source'].unique())

108

In [15]:
sorted_source = {k: v for k, v in sorted(Counter(scraped_df['Source'].values).items(), key=lambda item: item[1])}
sorted_source

{'CNSNews': 1,
 'EPI': 1,
 'Spoutable': 1,
 'Forward': 4,
 'UrNews24': 5,
 'Weather.com': 5,
 'Occupy Democrats': 7,
 'American Spectator, The': 9,
 'Counterpunch': 9,
 'Conservative Review': 10,
 'Crooks and Liars': 10,
 'FreeSpeech TV': 10,
 'IJR': 10,
 'Life News': 10,
 'Newsy': 10,
 'One America News Network': 10,
 'Progressive, The': 10,
 'Patribotics': 11,
 'Daily Kos': 12,
 'CBS': 13,
 'Conservative Tribune': 13,
 'Daily Beast': 13,
 'Financial Times': 13,
 'The Advocate': 13,
 'The Nation': 13,
 'Vanity Fair': 13,
 'Wonkette': 13,
 'World Truth TV': 13,
 'Axios': 14,
 'Breitbart': 14,
 'Daily Mail': 14,
 'Forbes': 14,
 'Fortune': 14,
 'Marketwatch': 14,
 'National Review': 14,
 'New York Post': 14,
 'NewsMax': 14,
 'RedState': 14,
 'Slate': 14,
 'The American Conservative': 14,
 'The Atlantic': 14,
 'The Economist': 14,
 'The Federalist': 14,
 'The Skimm': 14,
 'Truthout': 14,
 'Vox': 14,
 'Al Jazeera': 15,
 'BBC': 15,
 'Bipartisan Report': 15,
 'Business Insider': 15,
 'BuzzFe

In [16]:
sorted_failed = {k: v for k, v in sorted(Counter(failed_sources).items(), key=lambda item: item[1])}
sorted_failed

{'CBS': 1,
 'LA Times': 1,
 'Reuters': 1,
 'Second Nexus': 1,
 'Spoutable': 1,
 'Time': 3,
 'The Economist': 4,
 'The Skimm': 4,
 'UrNews24': 5,
 'Weather.com': 5,
 'Occupy Democrats': 7,
 'American Spectator, The': 9,
 'Counterpunch': 9,
 'FreeSpeech TV': 10,
 'Life News': 10,
 'One America News Network': 10,
 'Daily Kos': 12,
 'Conservative Tribune': 13,
 'Daily Beast': 13,
 'NewsPunch': 13,
 'The Advocate': 13,
 'World Truth TV': 13,
 'Axios': 14,
 'Fortune': 14,
 'RedState': 14,
 'The American Conservative': 14,
 'The Federalist': 14,
 'Truthout': 14,
 'Bipartisan Report': 15,
 'Daily Caller': 15,
 'Daily Signal': 15,
 'Guacamoley': 15,
 'Huffington Post': 15,
 'InfoWars': 15,
 'Intercept': 15,
 'PJ Media': 15,
 'ProPublica': 15,
 'Washington Monthly': 15,
 'Washington Times': 15,
 'WorldNetDaily': 15,
 'Alternet': 16,
 'Palmer Report': 16,
 'The Gateway Pundit': 16,
 'Twitchy': 16,
 'The Week': 20,
 'CNN': 25,
 'MSNBC': 33,
 'Fox News': 41}

In [17]:
source_df = pd.DataFrame(sorted_source.items(), columns=['Source', 'Count'])
failed_df = pd.DataFrame(sorted_failed.items(), columns=['Source', 'Failed'])

In [18]:
failed_df.head()

Unnamed: 0,Source,Failed
0,CBS,1
1,LA Times,1
2,Reuters,1
3,Second Nexus,1
4,Spoutable,1


In [19]:
compare_df = source_df.merge(failed_df, on='Source', how='left')
compare_df.head()

Unnamed: 0,Source,Count,Failed
0,CNSNews,1,
1,EPI,1,
2,Spoutable,1,1.0
3,Forward,4,
4,UrNews24,5,5.0


In [20]:
compare_df[compare_df['Failed'].notnull()]

Unnamed: 0,Source,Count,Failed
2,Spoutable,1,1.0
4,UrNews24,5,5.0
5,Weather.com,5,5.0
6,Occupy Democrats,7,7.0
7,"American Spectator, The",9,9.0
8,Counterpunch,9,9.0
11,FreeSpeech TV,10,10.0
13,Life News,10,10.0
15,One America News Network,10,10.0
18,Daily Kos,12,12.0


In [21]:
df[df['Source'] == 'Alternet']['Url']

35    https://www.alternet.org/2019/02/top-gop-leade...
36    https://www.alternet.org/2019/03/heres-what-re...
37    https://www.alternet.org/2019/03/white-nationa...
38    https://www.alternet.org/2019/04/house-republi...
39    https://www.alternet.org/2019/02/conservative-...
40    https://www.alternet.org/2019/02/ex-lawyer-for...
41    https://www.alternet.org/2018/09/michael-moore...
42    https://www.alternet.org/2019/01/evangelical-c...
43    https://www.alternet.org/2019/02/heres-how-muc...
44    https://www.alternet.org/2019/03/trump-has-sol...
45    https://www.alternet.org/2019/03/mueller-just-...
46    https://www.alternet.org/2019/03/this-needs-to...
47    https://www.alternet.org/2019/04/now-trump-thi...
48    https://www.alternet.org/2019/04/he-is-going-t...
49    https://www.alternet.org/2019/04/a-cognitive-s...
50    https://www.alternet.org/2019/04/trump-lashes-...
Name: Url, dtype: object

In [22]:
clean_scraped[clean_scraped['header_len'] == clean_scraped['header_len'].min()]

Unnamed: 0.1,Unnamed: 0,Source,Url,Bias,Quality,Header,Body,header_len,body_len
1024,1024,NewsMax,https://www.newsmax.com/world/globaltalk/jared...,10.67,52.33,\n\n,"Tuesday, 26 February 2019 06:38 AM In an inte...",2,4112
1026,1026,NewsMax,https://www.newsmax.com/world/globaltalk/trump...,5.5,50.5,\n\n,"JONATHAN LEMIRE\nTuesday, 26 February 2019 05...",2,8118
1033,1033,NewsMax,https://www.newsmax.com/finance/peter-morici/t...,13.75,26.75,\n\n,.authImage{float:left;margin-right:10px;heigh...,2,5740


In [23]:
clean_scraped[clean_scraped['Source'] == 'NewsMax']

Unnamed: 0.1,Unnamed: 0,Source,Url,Bias,Quality,Header,Body,header_len,body_len
1024,1024,NewsMax,https://www.newsmax.com/world/globaltalk/jared...,10.67,52.33,\n\n,"Tuesday, 26 February 2019 06:38 AM In an inte...",2,4112
1025,1025,NewsMax,https://www.newsmax.com/headline/border-wall-e...,9.0,45.5,Republicans Expect Few Defections as Congress ...,"Susan Cornwell and Richard Cowan\nTuesday, 26...",79,5797
1026,1026,NewsMax,https://www.newsmax.com/world/globaltalk/trump...,5.5,50.5,\n\n,"JONATHAN LEMIRE\nTuesday, 26 February 2019 05...",2,8118
1027,1027,NewsMax,https://www.newsmax.com/john-gizzi/trump-kim-v...,-5.75,25.0,"Trump, Kim Meet in Human Rights Hell",.authImage{float:left;margin-right:10px;heigh...,36,4394
1028,1028,NewsMax,https://www.newsmax.com/george-j-marlin/new-yo...,30.75,18.75,NY's Progressives Against Progress — AOC and t...,.authImage{float:left;margin-right:10px;heigh...,64,5730
1029,1029,NewsMax,https://www.newsmax.com/headline/trump-social-...,4.75,43.25,Trump Sees 'Collusion' Against Republicans on ...,"Tuesday, 19 March 2019 03:15 PM President Don...",68,1004
1030,1030,NewsMax,https://www.newsmax.com/newsfront/donaldtrumpj...,21.75,36.0,Donald Trump Jr: 'Brexit and My Father's Elect...,"By Theodore Bunker | \nTuesday, 19 March ...",71,1856
1031,1031,NewsMax,https://www.newsmax.com/streettalk/murdoch-fox...,4.5,51.0,"Murdoch's New Fox Debuts on Nasdaq, Names Ex-S...","Tuesday, 19 March 2019 11:33 AM Fox Corp. deb...",71,1825
1032,1032,NewsMax,https://www.newsmax.com/us/ralph-peters-john-m...,4.75,23.75,"Ralph Peters Trashes Trump: 'Draft Dodger, Fis...","By Jason Devaney | \nTuesday, 19 March 20...",57,1874
1033,1033,NewsMax,https://www.newsmax.com/finance/peter-morici/t...,13.75,26.75,\n\n,.authImage{float:left;margin-right:10px;heigh...,2,5740


In [24]:
clean_scraped[clean_scraped['Source'] == 'ABC']

Unnamed: 0.1,Unnamed: 0,Source,Url,Bias,Quality,Header,Body,header_len,body_len
0,0,ABC,https://abcnews.go.com/Politics/us-disrupted-a...,-5.33,52.33,US disrupted alleged Russian trolls' internet ...,U.S. cyber operators disrupted internet access...,76,3967
1,1,ABC,https://abcnews.go.com/Politics/appeals-court-...,0.67,51.67,Appeals court says special counsel Robert Muel...,A federal appeals court rejected the most dire...,104,3777
2,2,ABC,https://abcnews.go.com/Politics/electoral-coll...,-10.0,32.0,The Electoral College limits the campaign play...,"U.S Senator Elizabeth Warren, who is competing...",92,5349
3,3,ABC,https://abcnews.go.com/Politics/facebook-agree...,-2.33,52.33,Facebook agrees to hide demographics from land...,Facebook announced Tuesday that it will block ...,83,3187
4,4,ABC,https://abcnews.go.com/Politics/donald-trump-t...,-4.33,52.67,"Donald Trump and 'the Trump of the Tropics,' B...","President Donald Trump and ""the Trump of the T...",89,7120
5,5,ABC,https://abcnews.go.com/US/body-cam-video-shows...,0.67,51.0,New body cam video shows deadly North Carolina...,Newly-released body camera footage reveals the...,77,3670
6,6,ABC,https://abcnews.go.com/US/wireStory/michigan-a...,0.0,49.33,Michigan AG to review 2015 fatal shooting by I...,DETROIT -- \nMichigan's attorney general will ...,54,843
7,7,ABC,https://abcnews.go.com/Politics/mueller-report...,-2.33,54.33,"As Mueller report release nears, White House p...",While President Donald Trump and the White Hou...,78,3069
8,8,ABC,https://abcnews.go.com/Politics/abortion-right...,1.67,49.0,Abortion rights group asks Supreme Court to st...,Abortion rights advocates have asked the U.S. ...,85,3131
9,9,ABC,https://abcnews.go.com/Politics/trump-attacks-...,-6.0,52.0,"As Trump attacks Omar, progressive Democrats u...",Even as President Donald Trump continues his f...,76,6799


In [25]:
unscraped_url = scraped_df[scraped_df['Header'].isna()]['Url']

### Checking Errors

In [26]:
body_errors = []
head_errors = []

for url in unscraped_url:
    body_errors.append(scrape_body(url))
    head_errors.append(scrape_header(url))

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [27]:
{k: v for k, v in sorted(Counter(body_errors).items(), key=lambda item: item[1])}

{'/*<![CDATA[*/Insticator.ad.loadAd("div-insticator-ad-1");Insticator.ad.loadAd("div-insticator-ad-2");Insticator.load("em",{id : "6cf39429-6912-4a91-b1e2-3e9365a5e9c6"});/*]]>*/  “The biggest college scandal in our history!” exclaims a news reader, referring to the indictment of William Singer, who organized a scheme so the wealthy could bypass the college admissions process and get their children into prestigious schools through a combination of fraud, deceit, and bribery. The scandal might not have received the attention it generated had it not been for actresses Lori Loughlin and Felicity Huffman getting caught in the FBI’s net. If there are celebrities involved, the story must be important. As offensive as this scandal is, it is nowhere near the biggest college scandal in our history. Our colleges and universities have long been mired in ongoing scandals, but they are so common that we hardly consider them newsworthy. College tuition and textbook costs have climbed faster than any

In [28]:
scrape_body()

'ValueError'

In [30]:
df[df['Source'] == 'CNN']

Unnamed: 0,Source,Url,Bias,Quality
253,CNN,https://www.cnn.com/2019/02/26/opinions/americ...,-18.0,19.33
254,CNN,https://www.cnn.com/2019/02/26/politics/suprem...,-3.67,45.33
255,CNN,https://www.cnn.com/2019/03/19/politics/elizab...,-12.67,31.67
256,CNN,https://www.cnn.com/2019/03/19/us/washington-t...,-1.0,49.67
257,CNN,https://www.cnn.com/2019/03/19/opinions/elizab...,-13.33,28.0
258,CNN,https://www.cnn.com/2019/03/19/entertainment/s...,-3.33,47.67
259,CNN,https://edition.cnn.com/2019/04/17/africa/bash...,-0.33,48.67
260,CNN,https://us.cnn.com/2019/04/16/us/diaper-bag-gu...,-4.0,46.0
261,CNN,Cuomo Prime Time,-11.67,32.33
262,CNN,CNN NewsRoom with Poppy Harlow,-10.0,41.67
