# Scrape Trump Timeline Wikipedia Pages
* Author: Angelina Li
* Last run: Dec 30, 2019
* Purpose: Scrape Trump presidency timeline data from Wikipedia

In [1]:
import datetime as dt
import os
import re
import requests
import time

from bs4 import BeautifulSoup

In [2]:
# These are relative paths, resolved against the current working directory -
# running the notebook from a different location may produce surprising results.
DATA_DIR = os.path.join("..", "..", "frontend", "data")
S_PAUSE = 2 # default number of seconds get_soup sleeps after a successful page load
REQ_SUCCESS = 200 # HTTP status code that get_soup treats as a successful load
# One timeline page per quarter; fill in with .format(year=..., quarter=...)
URL_TEMPLATE = "https://en.wikipedia.org/wiki/Timeline_of_the_Donald_Trump_presidency_({year}_Q{quarter})"

In [3]:
def get_soup(url_address, pause_secs=S_PAUSE, timeout_secs=30):
    """Fetch a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url_address : str
        URL to request.
    pause_secs : float
        Seconds to sleep after a successful load, so consecutive calls do
        not hit the server back-to-back.
    timeout_secs : float
        Seconds to wait for the server before `requests` raises a timeout
        error. Without a timeout a stalled connection would block forever.

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or None when the response status code is not
        REQ_SUCCESS (a message is printed in that case).
    """
    page = requests.get(url_address, timeout=timeout_secs)
    if page.status_code != REQ_SUCCESS:
        print("Couldn't load content on this page:", url_address)
        return None
    soup = BeautifulSoup(page.content, "html.parser")
    time.sleep(pause_secs)
    print("Loaded page:", url_address)
    return soup

In [4]:
# Smoke test: fetch the 2017 Q1 page and preview the first 400 characters of its prettified HTML.
get_soup(URL_TEMPLATE.format(year=2017, quarter=1)).prettify()[:400]

Loaded page: https://en.wikipedia.org/wiki/Timeline_of_the_Donald_Trump_presidency_(2017_Q1)


'<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   Timeline of the Donald Trump presidency (2017 Q1) - Wikipedia\n  </title>\n  <script>\n   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","Feb'

In [11]:
# Weekday name, month name and day of month, with no year - e.g. "Friday January 27".
# The day does not need to be zero-padded, and try_get_dt strips commas from
# the scraped text before matching it against this format.
DATE_FORMAT = "%A %B %d"

def get_page_rows(year, soup):
    """Extract the dated event rows from every wikitable on a timeline page.

    A row is kept when it has at least two <td> cells and the cleaned text
    of its first cell parses as a date via try_get_dt. Any other row that
    has <td> cells (e.g. the "Week N[edit]" heading rows) is reported with
    an "ERROR W/ ROW" message and skipped. Rows with no <td> cells are
    skipped silently.

    Parameters
    ----------
    year : int
        Year to assign to the parsed dates.
    soup : BeautifulSoup
        Parsed page, as returned by get_soup.

    Returns
    -------
    dict
        Maps "YYYY-MM-DD" strings to the list of cleaned <li> texts found in
        the row's second cell. If the same date occurs in more than one
        row, the last row wins.
    """
    row_data = {}

    trs = [
        tr
        for table in soup.find_all("table", {"class": "wikitable"})
        for tr in table.find_all("tr")
    ]

    for row in trs:
        cols = row.find_all("td")
        if not cols:
            continue
        date_string = clean_wiki_string(cols[0].getText())

        # Parse the date once and reuse the result for both the validity
        # check and the formatted key (it used to be parsed twice per row).
        parsed = try_get_dt(date_string, year) if len(cols) >= 2 else None
        if not parsed:
            print("ERROR W/ ROW:", date_string)
            continue

        events = [clean_wiki_string(li.getText()) for li in cols[1].find_all("li")]
        row_data[parsed.strftime("%Y-%m-%d")] = events
    return row_data

def clean_wiki_string(string):
    """Strip numeric footnote markers and tidy the spacing of scraped text.

    Removes markers such as "[12]", collapses each run of spaces into a
    single space, and strips leading/trailing whitespace. Non-numeric
    bracketed text such as "[edit]" is left untouched.
    """
    # Raw strings keep "\[" and "\d" from being read as (invalid) string
    # escapes, which newer Python versions warn about.
    without_footnotes = re.sub(r"\[\d+\]", "", string)
    return re.sub(r" +", " ", without_footnotes).strip()

def try_get_dt(dt_string, year):
    """Parse a year-less date such as "Friday, January 27" into a datetime.

    Parameters
    ----------
    dt_string : str
        Date text expected to match DATE_FORMAT once commas are removed.
    year : int
        Year to assign to the parsed date.

    Returns
    -------
    datetime.datetime or None
        The parsed date in `year`, or None when the string is empty, does
        not match DATE_FORMAT, or names a day that does not exist in `year`
        (e.g. February 29 in a non-leap year).
    """
    if not dt_string:
        return None
    # remove commas to accommodate 2017 Q4 formatting
    dt_string = re.sub(",", "", dt_string)
    try:
        # Without a year strptime assumes 1900, which is not a leap year, so
        # "February 29" would always be rejected. Parse against a known leap
        # year instead and let replace() validate the day for the real year.
        parsed = dt.datetime.strptime(dt_string + " 2000", DATE_FORMAT + " %Y")
        return parsed.replace(year=year)
    except ValueError:
        return None

# Try the parser on a single page (2018 Q1) before scraping every quarter.
url_address = URL_TEMPLATE.format(year=2018, quarter=1)
soup = get_soup(url_address)
get_page_rows(2018, soup)

Loaded page: https://en.wikipedia.org/wiki/Timeline_of_the_Donald_Trump_presidency_(2018_Q1)
ERROR W/ ROW: Week 50[edit]
ERROR W/ ROW: Week 51[edit]
ERROR W/ ROW: Week 52[edit]
ERROR W/ ROW: Week 53[edit]
ERROR W/ ROW: Week 54[edit]
ERROR W/ ROW: Week 54[edit]
ERROR W/ ROW: Week 55[edit]
ERROR W/ ROW: Week 56[edit]
ERROR W/ ROW: Week 57[edit]
ERROR W/ ROW: Week 58[edit]
ERROR W/ ROW: Week 58[edit]
ERROR W/ ROW: Week 59[edit]
ERROR W/ ROW: Week 60[edit]
ERROR W/ ROW: Week 61[edit]
ERROR W/ ROW: Week 62[edit]
ERROR W/ ROW: Preceded byTrump presidency (2017 Q4)


{'2018-01-01': ['The Trump administration announces it will withhold the scheduled millions of military aid to Pakistan with President Trump declaring it a terrorist "safe haven".',
  "Nick Ayers announces that Vice President Mike Pence's chief lawyer, Mark Paoletta, and domestic policy director, Daris Meeks, are to resign."],
 '2018-01-02': ['President Trump tweets that his "nuclear button" is larger and more powerful than that of Kim Jong-un.',
  "On Twitter, President Trump refers to the Department of Justice as the 'deep state' and then calls on it to investigate former FBI director James Comey and Hillary Clinton's top aide, Huma Abedin.",
  'Via Twitter, President Trump threatens to cut off US aid to the Palestinian Authority, claiming the Palestinians were no longer willing to negotiate on a peace process with the Israelis - seemingly after his December 2017 decision to recognize Jerusalem as the capital of Israel.'],
 '2018-01-03': ['President Trump disbands his Presidential Ad

In [12]:
def get_all_rows(start_year=2017, start_quarter=1):
    """Scrape consecutive quarterly timeline pages until one fails to load.

    Starting at the given year and quarter, each page is fetched with
    get_soup and parsed with get_page_rows. The loop ends at the first page
    for which get_soup returns nothing.

    Parameters
    ----------
    start_year : int
        Year of the first page to fetch.
    start_quarter : int
        Quarter (1-4) of the first page to fetch.

    Returns
    -------
    dict
        All pages' rows merged into one "YYYY-MM-DD" -> list of events
        mapping. A date seen again on a later page replaces the earlier
        entry.
    """
    data = {}
    year, quarter = start_year, start_quarter
    while True:
        url_address = URL_TEMPLATE.format(year=year, quarter=quarter)
        soup = get_soup(url_address)
        if not soup:
            print("STOPPED AT Y: {}, Q: {}".format(year, quarter))
            break
        print("PROCESSING Y: {}, Q: {}".format(year, quarter))
        pg_data = get_page_rows(year, soup)
        print("=> FOUND {} ROWS".format(len(pg_data)))
        data.update(pg_data)

        # advance to the next quarter, rolling over into the next year after Q4
        if quarter == 4:
            year, quarter = year + 1, 1
        else:
            quarter += 1
    return data

# Scrape every quarterly page, starting at 2017 Q1 and stopping at the first page that fails to load.
all_rows = get_all_rows()

Loaded page: https://en.wikipedia.org/wiki/Timeline_of_the_Donald_Trump_presidency_(2017_Q1)
PROCESSING Y: 2017, Q: 1
ERROR W/ ROW: Week 1[edit]
ERROR W/ ROW: Week 2[edit]
ERROR W/ ROW: Week 2[edit]
ERROR W/ ROW: Week 3[edit]
ERROR W/ ROW: Week 4[edit]
ERROR W/ ROW: Week 5[edit]
ERROR W/ ROW: Week 6[edit]
ERROR W/ ROW: Week 6[edit]
ERROR W/ ROW: Week 7[edit]
ERROR W/ ROW: Week 8[edit]
ERROR W/ ROW: Week 9[edit]
ERROR W/ ROW: Week 10[edit]
ERROR W/ ROW: Preceded byObama presidency (2017)
=> FOUND 71 ROWS
Loaded page: https://en.wikipedia.org/wiki/Timeline_of_the_Donald_Trump_presidency_(2017_Q2)
PROCESSING Y: 2017, Q: 2
ERROR W/ ROW: Week 10[edit]
ERROR W/ ROW: Week 11[edit]
ERROR W/ ROW: Week 12[edit]
ERROR W/ ROW: Week 13[edit]
ERROR W/ ROW: Week 14[edit]
ERROR W/ ROW: Week 15[edit]
ERROR W/ ROW: Week 16[edit]
ERROR W/ ROW: Week 17[edit]
ERROR W/ ROW: Week 18[edit]
ERROR W/ ROW: Week 19[edit]
ERROR W/ ROW: Week 19[edit]
ERROR W/ ROW: Week 20[edit]
ERROR W/ ROW: Week 21[edit]
ERROR W/ 

In [13]:
# Spot check: show the first key stored in the scraped data, then its events.
first_key = list(all_rows)[0]
print(first_key)
all_rows[first_key]

2017-01-20


['45th President Donald Trump and 48th Vice President Mike Pence take the Oath of Office.',
 'President Trump proclaims a National Day of Patriotic Devotion.',
 "According to denied reports in December 2017, while seated at Trump's inauguration speech, forthcoming National Security Advisor Michael Flynn texts a former business partner that Russian sanctions blocking a private Russian-backed plan to build nuclear plants in the Middle East will now be 'ripped up'.",
 'President Trump issues Executive Order 13765 to scale back parts of the Affordable Care Act.',
 'The Trump administration suspends an Obama administration cut to Federal Housing Authority mortgage insurance premiums.',
 'President Trump signs a bill waiving a rule that requires military personnel to wait seven years after retiring before serving in a civilian post, to allow retired Marine general James Mattis to become U.S. Secretary of Defense. The Senate confirms Mattis as the 26th U.S. Secretary of Defense in a vote of 9

In [15]:
import json

# Despite the .json extension the output is a JavaScript snippet: the JSON
# payload is written after a "var wiki = " prefix, so the file is not strict JSON.
# NOTE(review): presumably the frontend loads this file as a script - confirm.
DATA_FN = os.path.join(DATA_DIR, "wiki.json")
with open(DATA_FN, "w") as outfile:
    outfile.write("var wiki = ")
    json.dump(all_rows, outfile)