# Analysis of Washington Post headlines during the COVID-19 pandemic

This notebook analyzes how Washington Post front-page headlines have evolved since January 21, 2020, the day the first COVID-19 case appeared in the U.S.

Import libraries.

In [250]:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
import datetime
import pandas as pd
import time

Set the date range we are interested in.

In [242]:
# Analysis window: starts on the day of the first U.S. case and runs up to,
# but does not include, the day the notebook is executed.
start_day = datetime.date(2020, 1, 21)
end_day = datetime.date.today()
span = end_day - start_day
days = span.days
# One midnight Timestamp per day; `periods=days` makes the last entry the day before `end_day`.
desired_range = pd.date_range(start=start_day, periods=days, freq='D').tolist()
print('Range: {} to {}'.format(start_day, end_day))
print('Days: {}'.format(days))
desired_range

Range: 2020-01-21 to 2020-04-09
Days: 79


[Timestamp('2020-01-21 00:00:00', freq='D'),
 Timestamp('2020-01-22 00:00:00', freq='D'),
 Timestamp('2020-01-23 00:00:00', freq='D'),
 Timestamp('2020-01-24 00:00:00', freq='D'),
 Timestamp('2020-01-25 00:00:00', freq='D'),
 Timestamp('2020-01-26 00:00:00', freq='D'),
 Timestamp('2020-01-27 00:00:00', freq='D'),
 Timestamp('2020-01-28 00:00:00', freq='D'),
 Timestamp('2020-01-29 00:00:00', freq='D'),
 Timestamp('2020-01-30 00:00:00', freq='D'),
 Timestamp('2020-01-31 00:00:00', freq='D'),
 Timestamp('2020-02-01 00:00:00', freq='D'),
 Timestamp('2020-02-02 00:00:00', freq='D'),
 Timestamp('2020-02-03 00:00:00', freq='D'),
 Timestamp('2020-02-04 00:00:00', freq='D'),
 Timestamp('2020-02-05 00:00:00', freq='D'),
 Timestamp('2020-02-06 00:00:00', freq='D'),
 Timestamp('2020-02-07 00:00:00', freq='D'),
 Timestamp('2020-02-08 00:00:00', freq='D'),
 Timestamp('2020-02-09 00:00:00', freq='D'),
 Timestamp('2020-02-10 00:00:00', freq='D'),
 Timestamp('2020-02-11 00:00:00', freq='D'),
 Timestamp

Open the Washington Post capture calendar in the Internet Archive's Wayback Machine.

In [256]:
# Wayback Machine calendar page listing the captures of washingtonpost.com
url = 'https://web.archive.org/web/*/washingtonpost.com'
# Firefox is driven through geckodriver (not chromedriver): point executable_path at
# wherever the geckodriver binary lives on this machine (e.g. the output of `which geckodriver`).
browser = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver')
browser.get(url)
# Wait up to 20 seconds for the calendar grid to become clickable before touching the page
calendar_ready = EC.element_to_be_clickable((By.CLASS_NAME, 'calendar-grid'))
WebDriverWait(browser, 20).until(calendar_ready)

<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="20be8f11-6315-7c47-a760-46c1d5f8343d", element="e2521cf8-956f-5f4c-9d05-1d1c0ebb45ef")>

In [257]:
# Every day cell currently shown on the calendar. The By-based locator is used because the
# find_elements_by_* helpers were removed in Selenium 4.3; this form works on Selenium 3 as well.
dates = browser.find_elements(By.CSS_SELECTOR, '.calendar-day')
# Filled in by the cells below: maps each day (datetime) to
# {'link': snapshot URL, 'time': capture time, 'headlines': list of headline strings}
data = {}

For each day in our range, record the link and capture time of the last front-page snapshot of that day. The headlines themselves are scraped from these links in a later cell.

In [258]:
# Walk every day cell on the calendar and, for each day inside `desired_range`, store the
# link and capture time of the last snapshot listed in that day's hover popup.
# Results go into `data[day] = {'link': ..., 'time': ..., 'headlines': []}`.
max_attempts = 2  # tries per day when the popup's elements go stale while being read
for day_cell in dates:
    # Hover over the date, let popup appear, wait for loader to disappear, select scroll area
    ActionChains(browser).move_to_element(day_cell).perform()
    popup = WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.popup-of-day-content')))
    WebDriverWait(browser, 20).until(EC.invisibility_of_element_located((By.TAG_NAME, 'svg')))
    scroll_area = WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.popup-of-day-content > ul > div')))

    # Get date and check that it is in our range
    # (By-based locators: the find_element_by_* helpers were removed in Selenium 4.3)
    date = popup.find_element(By.CLASS_NAME, 'day-tooltip-title')
    date_formatted = datetime.datetime.strptime(date.text, '%B %d, %Y')
    print('Date: ' + str(date_formatted))
    if date_formatted not in desired_range:
        continue  # skip if it is not

    for _ in range(max_attempts):
        try:
            # Scroll the snapshot list to its bottom before reading the links
            browser.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scroll_area)
            snapshots = popup.find_elements(By.TAG_NAME, 'a')
            if not snapshots:
                # Nothing listed for this day: report it instead of failing with an IndexError
                print('Warning: no snapshots listed for ' + str(date_formatted))
                break
            last_snapshot = snapshots[-1]
            # Read both values before printing so a stale element cannot leave a half-reported day
            snapshot_time = last_snapshot.text
            snapshot_link = last_snapshot.get_attribute('href')
        except StaleElementReferenceException:
            continue  # popup content changed under us; try again

        print('Last snapshot taken at ' + snapshot_time)
        data[date_formatted] = {'link': snapshot_link,
                                'time': snapshot_time,
                                'headlines': []}
        break
    else:
        # Every attempt hit a stale element: make the gap visible rather than dropping the day silently
        print('Warning: gave up on ' + str(date_formatted) + ' after ' + str(max_attempts) + ' stale attempts')

Date: 2020-01-01 00:00:00
Date: 2020-01-02 00:00:00
Date: 2020-01-03 00:00:00
Date: 2020-01-04 00:00:00
Date: 2020-01-05 00:00:00
Date: 2020-01-06 00:00:00
Date: 2020-01-07 00:00:00
Date: 2020-01-08 00:00:00
Date: 2020-01-09 00:00:00
Date: 2020-01-10 00:00:00
Date: 2020-01-11 00:00:00
Date: 2020-01-12 00:00:00
Date: 2020-01-13 00:00:00
Date: 2020-01-14 00:00:00
Date: 2020-01-15 00:00:00
Date: 2020-01-16 00:00:00
Date: 2020-01-17 00:00:00
Date: 2020-01-18 00:00:00
Date: 2020-01-19 00:00:00
Date: 2020-01-20 00:00:00
Date: 2020-01-21 00:00:00
Last snapshot taken at 22:49:50
Date: 2020-01-22 00:00:00
Last snapshot taken at 23:44:50
Date: 2020-01-23 00:00:00
Last snapshot taken at 23:28:13
Date: 2020-01-24 00:00:00
Last snapshot taken at 23:50:25
Date: 2020-01-25 00:00:00
Last snapshot taken at 23:58:33
Date: 2020-01-26 00:00:00
Last snapshot taken at 23:55:10
Date: 2020-01-27 00:00:00
Last snapshot taken at 23:57:41
Date: 2020-01-28 00:00:00
Last snapshot taken at 22:43:53
Date: 2020-01-29

In [259]:
# Inspect the collected snapshot links; every 'headlines' list is still empty at this point
data

{datetime.datetime(2020, 1, 21, 0, 0): {'link': 'https://web.archive.org/web/20200121224950/washingtonpost.com',
  'time': '22:49:50',
  'headlines': []},
 datetime.datetime(2020, 1, 22, 0, 0): {'link': 'https://web.archive.org/web/20200122234450/washingtonpost.com',
  'time': '23:44:50',
  'headlines': []},
 datetime.datetime(2020, 1, 23, 0, 0): {'link': 'https://web.archive.org/web/20200123232813/washingtonpost.com',
  'time': '23:28:13',
  'headlines': []},
 datetime.datetime(2020, 1, 24, 0, 0): {'link': 'https://web.archive.org/web/20200124235025/washingtonpost.com',
  'time': '23:50:25',
  'headlines': []},
 datetime.datetime(2020, 1, 25, 0, 0): {'link': 'https://web.archive.org/web/20200125235833/washingtonpost.com',
  'time': '23:58:33',
  'headlines': []},
 datetime.datetime(2020, 1, 26, 0, 0): {'link': 'https://web.archive.org/web/20200126235510/washingtonpost.com',
  'time': '23:55:10',
  'headlines': []},
 datetime.datetime(2020, 1, 27, 0, 0): {'link': 'https://web.archive.o

In [None]:
# Scrape headlines from each page in the links
for snapshot in data.values():
    # Go to the archived front page recorded for this day
    browser.get(snapshot['link'])

    # Get headlines: the text of every element carrying the 'headline' class
    # (By-based locator: find_elements_by_class_name was removed in Selenium 4.3)
    snapshot['headlines'] = [x.text for x in browser.find_elements(By.CLASS_NAME, 'headline')]

    # Wait 5 seconds between each request
    time.sleep(5)

In [247]:
# Headlines scraped for January 21, 2020, the first day of the range
data[datetime.datetime(2020, 1, 21, 0, 0)]['headlines']

['House managers, Trump’s legal team spar over rules to guide trial',
 'Senate rejects Democratic effort to subpoena White House for Ukraine documents',
 'How McConnell’s proposed trial will work',
 'How much power does Chief Justice Roberts have in the impeachment trial?',
 'Analysis: Trump’s legal team begins its impeachment trial defense in very Trumpy form',
 'Mike Bloomberg shifts presidential ad campaign to focus on impeachment',
 'He’s a vulnerable Republican in a trending-blue state. So why won’t he defy Trump on impeachment?',
 'Analysis: Senate Majority Leader Mitch McConnell has a new nickname: #MidnightMitch',
 'First U.S. case of potentially deadly coronavirus confirmed in Washington state',
 'More U.S. troops leave Iraq for medical treatment after Iranian missile attack, Pentagon says',
 'Hillary Clinton savages Bernie Sanders: ‘Nobody likes him’',
 'Impeachment is a race to the bottom. And no one really wins.',
 'Democrats already have these four victories in the impeach

In [248]:
# Show the full dictionary again, now with the scraped headlines filled in
data

{datetime.datetime(2020, 1, 21, 0, 0): {'link': 'https://web.archive.org/web/20200121224950/washingtonpost.com',
  'time': '22:49:50',
  'headlines': ['House managers, Trump’s legal team spar over rules to guide trial',
   'Senate rejects Democratic effort to subpoena White House for Ukraine documents',
   'How McConnell’s proposed trial will work',
   'How much power does Chief Justice Roberts have in the impeachment trial?',
   'Analysis: Trump’s legal team begins its impeachment trial defense in very Trumpy form',
   'Mike Bloomberg shifts presidential ad campaign to focus on impeachment',
   'He’s a vulnerable Republican in a trending-blue state. So why won’t he defy Trump on impeachment?',
   'Analysis: Senate Majority Leader Mitch McConnell has a new nickname: #MidnightMitch',
   'First U.S. case of potentially deadly coronavirus confirmed in Washington state',
   'More U.S. troops leave Iraq for medical treatment after Iranian missile attack, Pentagon says',
   'Hillary Clinton 

8