# Scraping US Federal Bank Speeches from BIS Website

In [6]:
from bs4 import BeautifulSoup
import urllib.request
import time
import random
import os
import requests

First, I try to scrape the page containing the links to speeches using static scraping methods. 

In [25]:
# Fetch the speech listing page as static HTML. The context manager closes
# the HTTP response even if reading or decoding fails.
with urllib.request.urlopen('https://www.bis.org/doclist/cbspeeches.htm') as fp:
    page = fp.read().decode('utf8')
page

I use the Firefox developer tools to find that speeches are stored in a `table` tag, with each `tr` representing a speech. Further, I see that the link to the individual speech is the first `a` tag in the `tr`. So I try to extract all `tr`s and grab the first `a` from within each one. 

In [49]:
# `page` is already a decoded str, so it is handed to BeautifulSoup directly
# (a str has no `.text` attribute). The parser is named explicitly so the
# result does not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(page, 'html.parser')
doc_list = soup.find_all('tr')
for doc in doc_list:
    # Rows without a link (or with an anchor lacking href) are skipped
    # instead of raising IndexError/KeyError.
    link = doc.find('a')
    if link is not None and link.has_attr('href'):
        print(link['href'])

Seeing that my search isn't returning any items, I look in the network tab of mozilla devtools and confirm my suspicions that the speech rows are populated dynamically. I find an AJAX request to https://www.bis.org/doclist/cbspeeches.htm, which seems like a plausible request to get speeches. Then I use this [firefox extension](https://addons.mozilla.org/en-US/firefox/addon/copy-as-python-requests/) to extract all of the request parameters needed to call out to the AJAX request url.

I then create this function to return a given page by using the request data and changing the "page" parameter.

In [95]:
s = requests.session()

def retrieve_page(page_num):
    """Return the speech links found on one page of the BIS speech listing.

    Posts the same form data as the listing page's AJAX request, changing
    only the "page" field, and collects the href of the first anchor in
    every table row of the response.

    Args:
        page_num: Page number to request; it is sent as a string.

    Returns:
        A list of href strings, one per table row that contains a link.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # NOTE(review): these cookie values were copied from a captured browser
    # request and presumably expire - confirm they are still accepted.
    response = s.post(
        "https://www.bis.org/doclist/cbspeeches.htm",
        data={
            "from": None,
            "till": None,
            "countries": "231",
            "objid": "cbspeeches",
            "page": str(page_num),
            "paging_length": "10",
            "sort_list": "date_desc",
            "theme": "cbspeeches",
            "ml": "false",
            "mlurl": None,
            "emptylisttext": None,
        },
        headers={"X-Requested-With": "XMLHttpRequest"},
        cookies={
            "bisUsrID": "1693932373405160405970",
            "_pk_ref.1.6290": "[\"\",\"\",1693931557,\"https://www.google.com/\"]",
            "_pk_id.1.6290": "299803d0f5123d4e.1693420817.",
            "bisSession": "1693932373405",
            "_pk_ses.1.6290": "1",
        },
        # Without a timeout a stalled connection would block forever.
        timeout=60,
    )
    # Fail loudly rather than silently returning no links for an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    result = []
    for speech in soup.find_all('tr'):
        # Rows without a usable link are skipped instead of raising.
        link = speech.find('a')
        if link is not None and link.has_attr('href'):
            result.append(link['href'])
    return result

Next, I apply this function to retrieve the links from each of these pages, manually specifying that this should go through 134 pages (pages 1 to 134). I write these links to a file.

In [None]:
# Walk listing pages 1 through 134 and append every speech link to urls.txt,
# one per line.
with open('urls.txt', 'a') as f:
    for page_num in range(1, 135):
        # Progress indicator: the page currently being requested.
        print(page_num)
        f.writelines(link + '\n' for link in retrieve_page(page_num))

        # Wait a randomised 10-18 seconds before the next request.
        pause = 10 + random.random() * 8
        time.sleep(pause)

I had to restart the cell above a few times, which appended some URLs to the file more than once, so I used the following method to remove duplicates and write a new file containing only unique URLs.

In [121]:
# Read every scraped URL and drop repeats. dict.fromkeys keeps the first
# occurrence of each URL in its original order, unlike a set. Lines are
# stripped so a final line without a trailing newline still matches its
# duplicates, and blank lines are ignored.
with open('urls.txt', 'r') as f:
    final = list(dict.fromkeys(line.strip() for line in f if line.strip()))

# Open with 'w' rather than 'a': re-running this cell must replace the
# de-duplicated file, not append a second copy of every URL to it.
with open('urls_dedup.txt', 'w') as f_new:
    f_new.writelines(f'{url}\n' for url in final)


## Splitting PDF Retrieval Into Multiple Steps

Next, I retrieve the individual pdfs for the speeches by working through the de-duplicated list of speeches.

In [4]:
base_url = 'https://www.bis.org'

def download_pdf(url):
    """Download one speech PDF from the BIS site into the local ``pdfs`` folder.

    Args:
        url: Site-relative path of the PDF (of the form
            ``/<section>/<name>.pdf``). Surrounding whitespace is ignored.

    Returns:
        None. When the server answers with an error status, a message is
        printed and no file is written, so an error page is never saved
        under a ``.pdf`` name.
    """
    url = url.strip()
    # Use the last path segment as the file name instead of a fixed index,
    # so paths of any depth work.
    f_name = url.rsplit('/', 1)[-1]

    # Without a timeout a stalled connection would block the whole run.
    response = requests.get(f'{base_url}{url}', timeout=60)
    if not response.ok:
        print(f'Skipping {url}: HTTP {response.status_code}')
        return

    # Create the output folder on first use instead of assuming it exists.
    os.makedirs('pdfs', exist_ok=True)
    with open(os.path.join('pdfs', f_name), 'wb') as pdf:
        pdf.write(response.content)
    

I now read in the de-duplicated list of speech urls and change the file extension to pdf for the ones that end in `.htm` since it appears speeches and their corresponding pages use the same naming scheme, just with different file extensions.

In [1]:
# Build the list of PDF paths from the de-duplicated speech URLs.
final = []
with open('urls_dedup.txt', 'r') as f:
    for line in f:
        url = line.strip()
        # Skip blank lines; they would otherwise turn into a bare '.pdf'.
        if not url:
            continue
        # splitext replaces only the trailing extension, so a dot earlier in
        # the path does not truncate the URL.
        final.append(f'{os.path.splitext(url)[0]}.pdf')

In [14]:
# Number of PDF paths queued for download.
len(final)

1333

I now apply this to retrieve the pdfs in chunks so execution time isn't too long.

In [134]:
# Download entries 0-4, waiting a randomised 10-20 seconds before each one.
for idx in range(0, 5):
    pause = 10 + random.random() * 10
    time.sleep(pause)
    download_pdf(final[idx])

In [135]:
# Download entries 5-49, waiting a randomised 10-20 seconds before each one.
for idx in range(5, 50):
    pause = 10 + random.random() * 10
    time.sleep(pause)
    download_pdf(final[idx])

In [136]:
# Download entries 50-149, waiting a randomised 10-20 seconds before each one.
for idx in range(50, 150):
    pause = 10 + random.random() * 10
    time.sleep(pause)
    download_pdf(final[idx])

In [137]:
# Download entries 150-349, waiting a randomised 10-20 seconds before each one.
for idx in range(150, 350):
    pause = 10 + random.random() * 10
    time.sleep(pause)
    download_pdf(final[idx])

In [138]:
# Download entries 350-399, waiting a randomised 10-20 seconds before each one.
for idx in range(350, 400):
    pause = 10 + random.random() * 10
    time.sleep(pause)
    download_pdf(final[idx])

In [9]:

# Download entries 400-499, waiting a randomised 10-20 seconds before each one.
for idx in range(400, 500):
    pause = 10 + random.random() * 10
    time.sleep(pause)
    download_pdf(final[idx])

In [10]:
# Download entries 500-599, waiting a randomised 10-20 seconds before each one.
for idx in range(500, 600):
    pause = 10 + random.random() * 10
    time.sleep(pause)
    download_pdf(final[idx])

In [11]:
# Download entries 600-899, waiting a randomised 10-20 seconds before each one.
for idx in range(600, 900):
    pause = 10 + random.random() * 10
    time.sleep(pause)
    download_pdf(final[idx])

In [None]:
# Download the remaining entries, from index 900 to the end of the list.
# The upper bound comes from len(final) rather than a hard-coded 1334: the
# list holds 1333 items (valid indices 0-1332), so index 1333 would raise
# IndexError on the last iteration.
for i in range(900, len(final)):
    # Wait a randomised 10-20 seconds before each download.
    time.sleep(10 + random.random() * 10)
    download_pdf(final[i])