# Purpose

Extract separate speakers from several UN documents:

 - https://undocs.org/A/73/PV.93
 - https://undocs.org/A/73/PV.94
 - https://undocs.org/A/73/PV.95
 - https://undocs.org/A/73/PV.96
 - https://undocs.org/A/72/PV.99
 - https://undocs.org/A/72/PV.100
 - https://undocs.org/A/72/PV.105
 
These are all formatted in the same way and have been downloaded to a local directory.

This notebook does the following for each document:
 - Converts it to text using the `pdftotext` utility
 - Finds the beginning of each statement (They all start with _Mr/Ms/Mrs Somebody (Country):_)
 - Splits the texts into speeches
 - Saves the new speeches as separate files

In [22]:
import os
import re
import shutil
import subprocess

from collections import defaultdict

In [23]:
# Directory layout: everything lives under <MAIN_DIR>/data/general_assembly_docs
MAIN_DIR = "."
DATA_DIR = os.path.join(MAIN_DIR, "data")
GA_DOCS_DIR = os.path.join(DATA_DIR, "general_assembly_docs")
PDF_DIR = os.path.join(GA_DOCS_DIR, "pdf")                # input: downloaded PDFs
TXT_DIR = os.path.join(GA_DOCS_DIR, "txt")                # output: one text file per PDF
STATEMENTS_DIR = os.path.join(GA_DOCS_DIR, "statements")  # output: one file per statement

# Wipe and recreate both output directories so that nothing from a
# previous run survives into this one.
for output_dir in (TXT_DIR, STATEMENTS_DIR):
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

## Convert PDFs to text

In [24]:
# Convert every PDF in PDF_DIR to a text file of the same base name in TXT_DIR.
for pdf_filename in os.listdir(PDF_DIR):
    # Ignore anything in the directory that is not a PDF (hidden files, etc.)
    base_name, extension = os.path.splitext(pdf_filename)
    if extension.lower() != ".pdf":
        continue
    text_filename = base_name + ".txt"

    pdf_filepath = os.path.join(PDF_DIR, pdf_filename)
    text_filepath = os.path.join(TXT_DIR, text_filename)

    print(f"Converting {pdf_filepath} to {text_filepath}")
    # Arguments are passed as a list (no shell), so paths containing spaces or
    # shell metacharacters are handled correctly.
    # NOTE(review): the -x/-y/-W/-H crop box presumably excludes the page
    # header and footer of these documents - confirm against the PDFs.
    conversion = subprocess.run(
        [
            "pdftotext", pdf_filepath,
            "-x", "0", "-W", "1000", "-y", "90", "-H", "635",
            "-nopgbrk", text_filepath,
        ],
        capture_output=True,
        text=True,
    )
    if conversion.returncode != 0:
        # Surface the failure instead of silently leaving the text file missing
        print(
            f"WARNING: pdftotext exited with status {conversion.returncode} "
            f"for {pdf_filepath}: {conversion.stderr.strip()}"
        )

Converting ./data/general_assembly_docs/pdf/A_73_PV.93_E.pdf to ./data/general_assembly_docs/txt/A_73_PV.93_E.txt
Converting ./data/general_assembly_docs/pdf/A_72_PV.99_E.pdf to ./data/general_assembly_docs/txt/A_72_PV.99_E.txt
Converting ./data/general_assembly_docs/pdf/A_72_PV.105_E.pdf to ./data/general_assembly_docs/txt/A_72_PV.105_E.txt
Converting ./data/general_assembly_docs/pdf/A_73_PV.96_E.pdf to ./data/general_assembly_docs/txt/A_73_PV.96_E.txt
Converting ./data/general_assembly_docs/pdf/A_72_PV.100_E.pdf to ./data/general_assembly_docs/txt/A_72_PV.100_E.txt
Converting ./data/general_assembly_docs/pdf/A_73_PV.94_E.pdf to ./data/general_assembly_docs/txt/A_73_PV.94_E.txt
Converting ./data/general_assembly_docs/pdf/A_73_PV.95_E.pdf to ./data/general_assembly_docs/txt/A_73_PV.95_E.txt


## Extract dates from texts

In [25]:
# Map text file names to slugified dates of documents,
# e.g. "Monday, 2 July 2018, 10 a.m." -> "monday-2-july-2018-10-am"

# All headers are formed the same: the date follows this phrase
DATE_MARKER = "plenary meeting"

file_to_date = dict()
for filename in os.listdir(TXT_DIR):
    filepath = os.path.join(TXT_DIR, filename)
    with open(filepath, 'r') as fp:
        # Only the first six lines are inspected; readline() yields "" past EOF
        header = "".join(fp.readline() for _ in range(6))

    marker_ix = header.find(DATE_MARKER)
    if marker_ix == -1:
        # Without this check the slice below would silently return garbage
        raise ValueError(f"Could not find '{DATE_MARKER}' in the header of {filepath}")

    # The date is the first non-blank line of text after the marker
    date_str = header[marker_ix + len(DATE_MARKER):].strip().split("\n")[0].strip()

    # Slug: lower-case, no dots or commas, whitespace runs collapsed to "-"
    date_slug = "-".join(date_str.lower().replace(".", "").replace(",", "").split())
    file_to_date[filename] = date_slug

file_to_date

{'A_72_PV.105_E.txt': 'monday-2-july-2018-10-am',
 'A_73_PV.94_E.txt': 'thursday-27-june-2019-3-pm',
 'A_72_PV.99_E.txt': 'monday-25-june-2018-10-am',
 'A_73_PV.93_E.txt': 'thursday-27-june-2019-10-am',
 'A_73_PV.95_E.txt': 'friday-28-june-2019-10-am',
 'A_73_PV.96_E.txt': 'friday-28-june-2019-3-pm',
 'A_72_PV.100_E.txt': 'monday-25-june-2018-3-pm'}

## Extract speakers and speeches from texts

### Find the starting line for every speech

In [26]:
# For each text file, record (line index, country) for every line that opens a
# statement, i.e. starts with "Mr..."/"Ms..." followed by a parenthesised country.
SPEAKER_LINE = re.compile(r"(?:Mr.+?|Ms.+?)\((.*?)\)")

file_to_matches = defaultdict(list)

for filename in os.listdir(TXT_DIR):
    with open(os.path.join(TXT_DIR, filename), 'r') as fp:
        candidates = (
            (line_no, SPEAKER_LINE.match(text)) for line_no, text in enumerate(fp)
        )
        hits = [
            (line_no, found.group(1))
            for line_no, found in candidates
            if found is not None
        ]

    # The first hit in a document is ignored (this is the chair); files with no
    # further hits get no entry at all.
    if len(hits) > 1:
        file_to_matches[filename].extend(hits[1:])

### Split text into statements 

In [27]:
# This will hold a dict from country to speech for every file
files_to_statements = dict()

for filename in os.listdir(TXT_DIR):
    with open(os.path.join(TXT_DIR, filename), "r") as fp:
        lines = fp.readlines()

    # A country that speaks more than once in a meeting gets its statements
    # concatenated, each one preceded by a newline.
    statements = defaultdict(str)

    matches = file_to_matches[filename]
    # Every statement runs from its speaker line up to the next speaker line;
    # the last one runs to the end of the document. With no matches at all the
    # loop below simply does not execute (previously this raised IndexError).
    end_lines = [next_start for next_start, _ in matches[1:]] + [len(lines)]
    final_ix = len(matches) - 1

    for match_ix, ((first_line, country), last_line) in enumerate(zip(matches, end_lines)):
        statement = "".join(lines[first_line:last_line])
        # Remove speaker and country from statement: drop everything up to the
        # first ")" plus the character that follows it
        statement = statement[statement.find(")") + 2:]

        if match_ix == final_ix:
            # There is usually a closing statement from the acting president at
            # the end of the last speech: scrap that. Only cut when the phrase
            # is really there -- find() returns -1 otherwise, and slicing with
            # -1 would drop the last character of the statement.
            closing_ix = statement.find("The Acting President")
            if closing_ix != -1:
                statement = statement[:closing_ix]

        statements[country] += "\n" + statement.strip()

    files_to_statements[filename] = statements

## Save statements

In [28]:
# Write one file per (meeting, country), named "<date slug>_<Country-Name>.txt"
for txt_name in os.listdir(TXT_DIR):
    meeting_slug = file_to_date[txt_name]
    per_country = files_to_statements[txt_name]

    for country_name, speech in per_country.items():
        # Whitespace inside the country name becomes "-" for the file name
        country_slug = "-".join(country_name.strip().split())
        target_path = os.path.join(STATEMENTS_DIR, f"{meeting_slug}_{country_slug}.txt")

        word_count = len(speech.split())
        print(f"Writing {word_count} words to {target_path}")

        with open(target_path, "w") as out_file:
            out_file.write(speech)

Writing 387 words to ./data/general_assembly_docs/statements/monday-2-july-2018-10-am_Chile.txt
Writing 850 words to ./data/general_assembly_docs/statements/monday-2-july-2018-10-am_Luxembourg.txt
Writing 788 words to ./data/general_assembly_docs/statements/monday-2-july-2018-10-am_Armenia.txt
Writing 1045 words to ./data/general_assembly_docs/statements/monday-2-july-2018-10-am_Russian-Federation.txt
Writing 430 words to ./data/general_assembly_docs/statements/monday-2-july-2018-10-am_Libya.txt
Writing 746 words to ./data/general_assembly_docs/statements/monday-2-july-2018-10-am_Ecuador.txt
Writing 538 words to ./data/general_assembly_docs/statements/monday-2-july-2018-10-am_Indonesia.txt
Writing 1606 words to ./data/general_assembly_docs/statements/monday-2-july-2018-10-am_Islamic-Republic-of-Iran.txt
Writing 777 words to ./data/general_assembly_docs/statements/monday-2-july-2018-10-am_Myanmar.txt
Writing 747 words to ./data/general_assembly_docs/statements/monday-2-july-2018-10-am_R