# Notebook 1: Get Page Source for Official Speech Transcripts

### Introduction

This notebook scrapes the websites of Mayor de Blasio and Governor Cuomo to retrieve transcripts of their official speeches dating back to the beginning of 2020, before COVID-19 hit New York City. The original data were scraped in May 2020, yielding around five months of transcripts. *Because the websites have since been updated with more recent speeches, running this code as-is will yield a different set of speeches (or may not work at all).*

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time, os
import re
import pickle
import sys

# Raise the interpreter's recursion limit; the original note says this is
# needed for pickling (the commented-out save cells in this notebook pickle
# lists of parsed pages).
sys.setrecursionlimit(10000) #to allow pickling
# Path to the local ChromeDriver executable; the scraping functions pass it
# to webdriver.Chrome().
chromedriver = "/Applications/chromedriver" 
# Also expose the driver path through an environment variable.
os.environ["webdriver.chrome.driver"] = chromedriver

### Scrape Mayor de Blasio's website

I scraped 40 pages of Mayor de Blasio's speech-transcript listings, which yielded speeches dating back to November 2019; the speeches from before 2020 are excluded later. I started by scraping the URLs from the main speech transcripts page and then scraped the content of each URL.

In [None]:
# Build the URLs of listing pages 1 through 40; each page is addressed by a
# '#page-N' fragment appended to the news landing page.
base_url = 'https://www1.nyc.gov/office-of-the-mayor/news.page'
list_urls = [base_url + '#page-' + str(page_number) for page_number in range(1, 41)]

In [None]:
def get_source(urls):
    """Load each URL in Chrome and return the parsed page sources.

    Args:
        urls: Iterable of URL strings to visit, in order.

    Returns:
        A list with one BeautifulSoup document per URL, in the same order
        as ``urls``.
    """
    driver = webdriver.Chrome(chromedriver)
    soup_list = []
    try:
        for page_url in urls:
            driver.get(page_url)
            # Force a reload after navigating -- presumably needed because
            # the listing URLs differ only in their '#page-N' fragment.
            driver.refresh()
            # Fixed pause before reading the source, presumably to let the
            # page finish rendering.
            time.sleep(5)
            soup_list.append(BeautifulSoup(driver.page_source, 'html.parser'))
    finally:
        # Always shut the browser down, even if a page load raises, so no
        # Chrome/chromedriver process is left behind. quit() ends the whole
        # session, whereas close() only closes the current window.
        driver.quit()
    return soup_list

In [None]:
# Scrape the source of each page of the main speech-transcript listing.
# `soups` holds one parsed page per URL in `list_urls`, in the same order.
soups = get_source(list_urls)

In [None]:
def get_links(soup_object):
    """Collect absolute transcript URLs from parsed listing pages.

    Args:
        soup_object: Iterable of parsed pages; each must provide
            ``find_all``.

    Returns:
        A list of 'https://www1.nyc.gov' + relative-link strings, in page
        order. Duplicates are kept. Anchors whose markup does not match the
        extraction pattern are skipped.
    """
    href_filter = re.compile(r'transcript')
    links_list = []
    for page in soup_object:
        for anchor in page.find_all('a', {'href': href_filter}):
            # Work on the tag's markup with the double quotes stripped out.
            markup = str(anchor).replace('"', '')
            # Capture what lies between the first '=' and the last '>T',
            # i.e. the tag must be followed by text starting with 'T'.
            found = re.search('=(.+)>T', markup)
            if found is None:
                continue
            links_list.append('https://www1.nyc.gov' + found.group(1))
    return links_list

In [None]:
# Extract the link to each individual speech transcript from the scraped
# listing pages in `soups`.
link_list = get_links(soups)

In [None]:
# Save links to each page
# with open('data/bdblinks_519.pickle', 'wb') as to_write:
#     pickle.dump(link_list, to_write)

In [None]:
def get_transcript_source(urls):
    """Load each transcript URL in Chrome and return the parsed page sources.

    Args:
        urls: Iterable of transcript URL strings to visit, in order.

    Returns:
        A list with one BeautifulSoup document per URL, in the same order
        as ``urls``.
    """
    driver = webdriver.Chrome(chromedriver)
    doc_source = []
    try:
        for transcript_url in urls:
            driver.get(transcript_url)
            # Fixed pause before reading the source, presumably to let the
            # page finish rendering.
            time.sleep(5)
            doc_source.append(BeautifulSoup(driver.page_source, 'html.parser'))
    finally:
        # Always shut the browser down, even if a page load raises, so no
        # Chrome/chromedriver process is left behind. quit() ends the whole
        # session, whereas close() only closes the current window.
        driver.quit()
    return doc_source

In [None]:
# Get the page source of each speech transcript: one parsed page per URL in
# `link_list`, in the same order.
transcript_sources = get_transcript_source(link_list)

In [None]:
# Save source for each transcript
# with open('data/bdbsource_519.pickle', 'wb') as to_write:
#     pickle.dump(transcript_sources, to_write)

### Scrape Governor Cuomo's website

I scraped ~20 pages of Governor Cuomo's speech transcripts. As with Mayor de Blasio's speeches, I started by scraping the URLs from the main speech transcripts page and then scraped the content of each URL.

In [None]:
# Landing page of the Governor's media listing; the listing is paged through
# by clicking its pagination links rather than by changing the URL.
url = 'https://www.governor.ny.gov/keywords/media'

In [None]:
def get_source(urls):
    """Page through the Governor's media listing and return each page's source.

    Opens the listing, then clicks through its pagination links, capturing
    the parsed page source after every step.

    Args:
        urls: URL string of the listing's first page. Despite the plural
            name (kept for compatibility), a single URL is expected.

    Returns:
        A list of BeautifulSoup documents: the first page followed by one
        document per pagination click (20 in total).
    """
    driver = webdriver.Chrome(chromedriver)
    soup_list = []
    try:
        # Use the argument rather than the notebook-level `url` global, so
        # the function actually honours what the caller passes in.
        driver.get(urls)
        driver.refresh()
        # Fixed pause before reading the source, presumably to let the
        # listing finish rendering.
        time.sleep(5)
        soup_list.append(BeautifulSoup(driver.page_source, 'html.parser'))
        # Positions of the pagination link to click at each step: 2..8 first,
        # then position 4 twelve more times, because the xpath stops
        # corresponding with page number after page 8.
        pages = list(range(2, 9)) + ([4] * 12)
        for position in pages:
            path = '//*[@id="DataTables_Table_0_paginate"]/span/a[' + str(position) + ']'
            driver.find_element_by_xpath(path).click()
            time.sleep(5)
            soup_list.append(BeautifulSoup(driver.page_source, 'html.parser'))
    finally:
        # Always shut the browser down, even if a load or click raises, so
        # no Chrome/chromedriver process is left behind. quit() ends the
        # whole session, whereas close() only closes the current window.
        driver.quit()
    return soup_list

In [None]:
# Scrape the source of each page of the main speech-transcript listing.
# Note: this calls the Governor-specific get_source defined directly above,
# which replaces the earlier definition of the same name.
sources = get_source(url)

In [None]:
def get_links(soup_object):
    """Collect unique absolute transcript URLs from parsed listing pages.

    Args:
        soup_object: Iterable of parsed pages; each must provide
            ``find_all``.

    Returns:
        A list of 'https://www.governor.ny.gov' + relative-link strings with
        duplicates removed, in first-seen order. Anchors whose markup does
        not match the extraction pattern are skipped.
    """
    href_filter = re.compile(r'transcript')
    links_list = []
    for page in soup_object:
        for anchor in page.find_all('a', {'href': href_filter}):
            # Work on the tag's markup with the double quotes stripped out.
            markup = str(anchor).replace('"', '')
            # Capture what lies between the first '=' and a '>' that is
            # immediately followed by a newline.
            found = re.search('=(.+)>\n', markup)
            if found is None:
                continue
            links_list.append('https://www.governor.ny.gov' + found.group(1))
    # De-duplicate while keeping first-seen order. list(set(...)) returned
    # the links in an arbitrary, run-dependent order, which made the scrape
    # order non-reproducible.
    return list(dict.fromkeys(links_list))

In [None]:
# Extract the link to each individual speech transcript from the scraped
# listing pages in `sources`. This rebinds `link_list`, replacing the value
# assigned earlier in the notebook.
link_list = get_links(sources)

In [None]:
# Save links to each page
# with open('data/cuomolinks_519.pickle', 'wb') as to_write:
#     pickle.dump(link_list, to_write)

In [None]:
def get_transcript_source(urls):
    """Load each transcript URL in Chrome and return the parsed page sources.

    Args:
        urls: Iterable of transcript URL strings to visit, in order.

    Returns:
        A list with one BeautifulSoup document per URL, in the same order
        as ``urls``.
    """
    driver = webdriver.Chrome(chromedriver)
    doc_source = []
    try:
        for transcript_url in urls:
            driver.get(transcript_url)
            # Fixed pause before reading the source, presumably to let the
            # page finish rendering.
            time.sleep(5)
            doc_source.append(BeautifulSoup(driver.page_source, 'html.parser'))
    finally:
        # Always shut the browser down, even if a page load raises, so no
        # Chrome/chromedriver process is left behind. quit() ends the whole
        # session, whereas close() only closes the current window.
        driver.quit()
    return doc_source

In [None]:
# Get the page source of each speech transcript: one parsed page per URL in
# `link_list`, in the same order. This rebinds `transcript_sources`,
# replacing the value assigned earlier in the notebook.
transcript_sources = get_transcript_source(link_list)

In [None]:
# Save source for each transcript
# with open('data/cuomosource_519.pickle', 'wb') as to_write:
#     pickle.dump(transcript_sources, to_write)