# Purpose

Extract all textual information from official and government statements on the Responsibility to Protect (R2P) as found on https://www.globalr2p.org/resources/?s&filter%5B0%5D=official-statement&filter%5B1%5D=government-statement&tax=resource_type

In [1]:
import urllib.parse
import re
import shutil
import os
from collections import defaultdict, Counter

# Scraping
from requests import get
from bs4 import BeautifulSoup

# Data sheets 
import pandas as pd

# Collect direct download links for every available statement

Go through every page of results for https://www.globalr2p.org/resources/?s&filter%5B0%5D=official-statement&filter%5B1%5D=government-statement&tax=resource_type.

 - For every result, follow the link in the "Title section"
 - On the page we land on, look for a download link.
 - Store that download link and the cells in the table row in a list

In [2]:
# Year cut-off for the scrape: only speeches dated strictly before this year are kept
# (entries from `max_year` itself or any later year are skipped).
# Set to `None` if you want to include all speeches.
max_year = 2020

In [3]:
%%time

# Fake user agent, otherwise we get a 403
request_headers = {'User-Agent': 'Mozilla/5.0'}

# One list per kept statement: the table cells, then the download link,
# then (added at the very end of this cell) a numeric id.
data = []
# Column names, read from the header row of the first results page.
headers = None

# Walk through the result pages one by one. We stop at the first page that
# does not answer with 200, has no results table, or lists no results.
page = 0
while True:
    page += 1
    print(f"---\tPage {page}\t---")

    url = f"https://www.globalr2p.org/resources/page/{page}/?s&filter[0]=official-statement&filter[1]=government-statement&tax=resource_type"
    resp = get(url, headers=request_headers)

    if resp.status_code != 200:
        print(f"Got status code {resp.status_code}. Stopping.")
        break

    # Parse HTML in response
    soup = BeautifulSoup(resp.content, 'html.parser')

    # `find` returns None when the page has no <table>; guard against that
    # instead of crashing on `None.find_all`.
    table = soup.find("table")
    table_rows = table.find_all("tr") if table is not None else []
    if not table_rows:
        print(f"No results table found on page {page}. Stopping.")
        break

    if headers is None:
        headers = [h.text for h in table_rows[0].find_all("th")]
        headers.append("link")

    # A page that only carries the header row means the listing is over;
    # without this check a site answering 200 forever would loop endlessly.
    result_rows = table_rows[1:]
    if not result_rows:
        print(f"No results listed on page {page}. Stopping.")
        break

    for table_row in result_rows:
        data_row = [cell.text.strip() for cell in table_row.find_all("td")]

        # Apply the year filter first, so no request is spent on an entry
        # that is discarded anyway. The year is the last four characters of
        # the third cell (the "Date" column).
        try:
            year = int(data_row[2][-4:])
        except (IndexError, ValueError):
            print(f"Could not read a year from {data_row} on page {page}. Skipping.")
            continue

        if max_year and year >= max_year:
            print(f"Skipping since entry is from year {year}.")
            continue

        # Get the link to the page with downloads
        link_to_download_page = table_row.find("a").attrs["href"]

        # Go to download page, get link to English version if available
        resp_down = get(link_to_download_page, headers=request_headers)

        if resp_down.status_code != 200:
            print(f"Failed to fetch download page for {data_row} on page {page}")
            continue

        # Parse response and get all links
        dp_soup = BeautifulSoup(resp_down.content, 'html.parser')
        download_links = dp_soup.find_all(rel="download")

        if not download_links:
            print(f"Could not find download links at {link_to_download_page}")
            continue

        # We want a link to an English version, if available. If only one
        # link is present, use that. If several links mention "english",
        # the last one wins.
        correct_link = None
        if len(download_links) == 1:
            correct_link = download_links[0].attrs["href"]
        else:
            for link in download_links:
                if "english" in link.text.lower():
                    correct_link = link.attrs["href"]

        if correct_link is None:
            print(f"Could not find an English link at {link_to_download_page}. Will use first link available: "
                  f"{download_links[0].text.strip()}")
            correct_link = download_links[0].attrs["href"]

        data_row.append(correct_link)
        data.append(data_row)

# Fail with a clear message (rather than an AttributeError on `None`) when
# not even the first results page could be read.
if headers is None:
    raise RuntimeError("No results page could be parsed; nothing was scraped.")

# Add unique IDs to every data row
headers.append("id")
for i, row in enumerate(data):
    row.append(i)

---	Page 1	---
Skipping since entry is from year 2020.
---	Page 2	---
---	Page 3	---
---	Page 4	---
Could not find an English link at https://www.globalr2p.org/resources/statement-by-the-un-special-adviser-on-the-prevention-of-genocide-on-his-visit-to-the-central-african-republic-october-2017/. Will use first link available: Statement by the UN Special Adviser on the Prevention of Genocide on his visit to the Central African Republic, October 2017 [EN]
---	Page 5	---
---	Page 6	---
---	Page 7	---
---	Page 8	---
---	Page 9	---
---	Page 10	---
---	Page 11	---
---	Page 12	---
Could not find an English link at https://www.globalr2p.org/resources/statement-by-thailand-at-the-2017-un-general-assembly-informal-interactive-dialogue-on-the-responsibility-to-protect/. Will use first link available: Download PDF Version
---	Page 13	---
Could not find download links at https://www.globalr2p.org/resources/opening-ceremony-statement-by-un-assistant-secretary-general-fabrizio-hochschild-at-the-7th-an

# Download and save documents

Once we have collected all the links, we download each document in turn. Documents are stored in files called `<document-id>.pdf`, where `document-id` is the numerical identifier assigned to each document when collecting the download links in the step above.

In [6]:
def save_pdf(url, filepath, **request_kwargs):
    """Download `url` and write the response body to `filepath`.

    Args:
        url: Address of the document to download.
        filepath: Destination path; an existing file is overwritten.
        **request_kwargs: Passed through unchanged to `requests.get`
            (for example `headers=`).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Fetch before opening the file, so a failed request cannot leave an
    # empty file behind.
    response = get(url, **request_kwargs)
    # Without this check an HTML error page would be stored as a ".pdf".
    response.raise_for_status()
    with open(filepath, 'wb') as fp:
        fp.write(response.content)

In [7]:
# Directory layout used by this notebook:
#   <MAIN_DIR>/data            -> DATA_DIR
#   <MAIN_DIR>/data/gr2p       -> GR2P_DIR
#   <MAIN_DIR>/data/gr2p/pdf   -> RAW_DIR (downloaded documents)
MAIN_DIR = "."

DATA_DIR = os.path.join(MAIN_DIR, "data")
GR2P_DIR = os.path.join(MAIN_DIR, "data", "gr2p")
RAW_DIR = os.path.join(MAIN_DIR, "data", "gr2p", "pdf")

# Always start from an empty download directory: wipe whatever a previous
# run left behind, then recreate the folder (including missing parents).
if os.path.exists(RAW_DIR):
    shutil.rmtree(RAW_DIR)
os.makedirs(RAW_DIR)

In [8]:
%%time
# Fetch every collected document and store it as <id>.pdf inside RAW_DIR.
# A failure for one document is reported and does not stop the others.
request_headers = {'User-Agent': 'Mozilla/5.0'}
total_documents = len(data)
for position, entry in enumerate(data, start=1):
    # The last two items of each row are the download link and the id.
    doc_id, pdf_url = entry[-1], entry[-2]
    print(f"Downloading {position} out of {total_documents}")

    target_path = os.path.join(RAW_DIR, f"{doc_id}.pdf")
    try:
        save_pdf(pdf_url, target_path, headers=request_headers)
    except Exception as err:
        print(f"Could not download and save for ID {doc_id}, {pdf_url} : {err}")

Downloading 1 out of 461
Downloading 2 out of 461
Downloading 3 out of 461
Downloading 4 out of 461
Downloading 5 out of 461
Downloading 6 out of 461
Downloading 7 out of 461
Downloading 8 out of 461
Downloading 9 out of 461
Downloading 10 out of 461
Downloading 11 out of 461
Downloading 12 out of 461
Downloading 13 out of 461
Downloading 14 out of 461
Downloading 15 out of 461
Downloading 16 out of 461
Downloading 17 out of 461
Downloading 18 out of 461
Downloading 19 out of 461
Downloading 20 out of 461
Downloading 21 out of 461
Downloading 22 out of 461
Downloading 23 out of 461
Downloading 24 out of 461
Downloading 25 out of 461
Downloading 26 out of 461
Downloading 27 out of 461
Downloading 28 out of 461
Downloading 29 out of 461
Downloading 30 out of 461
Downloading 31 out of 461
Downloading 32 out of 461
Downloading 33 out of 461
Downloading 34 out of 461
Downloading 35 out of 461
Downloading 36 out of 461
Downloading 37 out of 461
Downloading 38 out of 461
Downloading 39 out of

Downloading 301 out of 461
Downloading 302 out of 461
Downloading 303 out of 461
Downloading 304 out of 461
Downloading 305 out of 461
Downloading 306 out of 461
Downloading 307 out of 461
Downloading 308 out of 461
Downloading 309 out of 461
Downloading 310 out of 461
Downloading 311 out of 461
Downloading 312 out of 461
Downloading 313 out of 461
Downloading 314 out of 461
Downloading 315 out of 461
Downloading 316 out of 461
Downloading 317 out of 461
Downloading 318 out of 461
Downloading 319 out of 461
Downloading 320 out of 461
Downloading 321 out of 461
Downloading 322 out of 461
Downloading 323 out of 461
Downloading 324 out of 461
Downloading 325 out of 461
Downloading 326 out of 461
Downloading 327 out of 461
Downloading 328 out of 461
Downloading 329 out of 461
Downloading 330 out of 461
Downloading 331 out of 461
Downloading 332 out of 461
Downloading 333 out of 461
Downloading 334 out of 461
Downloading 335 out of 461
Downloading 336 out of 461
Downloading 337 out of 461
D

# Create dataframe with document metadata

For each document, store the metadata found in every result row at https://www.globalr2p.org/resources/?s&filter%5B0%5D=official-statement&filter%5B1%5D=government-statement&tax=resource_type plus the `id` assigned to every document when scraping the download links.

The `id` is important since for every document the corresponding PDF is saved in a file under that name.

In [9]:
# Build the metadata table: one row per scraped statement, with the column
# names collected in `headers`.
df_speeches = pd.DataFrame(data, columns=headers)
df_speeches["Date"] = pd.to_datetime(df_speeches["Date"])

# The id doubles as the PDF file name (<id>.pdf), so use it as the index.
df_speeches.set_index("id", inplace=True)
print("------\nInfo:\n\n")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# display() only added a stray "None" to the output.
df_speeches.info()
print("\n\n------\nSample:")
# Cap the sample size so a table with fewer than two rows does not raise.
display(df_speeches.sample(min(2, len(df_speeches))))

metadata_file = os.path.join(GR2P_DIR, "globalr2p_docs_data.csv")
print(f"\n\nStoring metadata in {metadata_file}")
df_speeches.to_csv(metadata_file)

------
Info:


<class 'pandas.core.frame.DataFrame'>
Int64Index: 461 entries, 0 to 460
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Title   461 non-null    object        
 1   Type    461 non-null    object        
 2   Date    461 non-null    datetime64[ns]
 3   Source  461 non-null    object        
 4   link    461 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 21.6+ KB


None



------
Sample:


Unnamed: 0_level_0,Title,Type,Date,Source,link
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
117,Statement by Switzerland at the 2017 UN Genera...,Government Statement,2017-09-06,Switzerland,https://www.globalr2p.org/wp-content/uploads/2...
374,Statement by Estonia on behalf of Latvia and L...,Government Statement,2014-09-08,Estonia,https://www.globalr2p.org/wp-content/uploads/2...




Storing metadata in ./gr2p/globalr2p_docs_data.csv
