In [1]:
import os
import shutil
import re

import pandas as pd

# Setup

In [2]:
# Project root and the top-level data folder underneath it
MAIN_DIR = "."
DATA_DIR = os.path.join(MAIN_DIR, "data")

# Data from all sources ends up here once it has been combined
ALL_DATA_DIR = os.path.join(DATA_DIR, "combined")
# Text versions of the PDF documents
TEXT_FILES_DIR = os.path.join(ALL_DATA_DIR, "text_files")
# The same texts, split into paragraphs and re-joined with a special delimiter
SPLIT_TEXT_FILES_DIR = os.path.join(ALL_DATA_DIR, "text_files_split")

# Recreate the output folder from scratch: anything left in it from an
# earlier run is deleted before the (now empty) folder is created again.
if os.path.exists(SPLIT_TEXT_FILES_DIR):
    shutil.rmtree(SPLIT_TEXT_FILES_DIR)
os.makedirs(SPLIT_TEXT_FILES_DIR)

# Read in metadata

In [3]:
# Load the per-document metadata table; the "id" column becomes the index.
metadata_path = os.path.join(ALL_DATA_DIR, "document_data.csv")
df_speeches = pd.read_csv(metadata_path, index_col="id")

# Convert the "Date" column from whatever read_csv produced into datetimes.
df_speeches["Date"] = pd.to_datetime(df_speeches["Date"])

# Show the first row as a quick sanity check.
df_speeches.head(1)

Unnamed: 0_level_0,Title,Type,Date,Source,link,scanned
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Accountability for Perpetrators: UN Officials ...,Official Statement,2019-11-07,UN Special Representative of the Secretary-Gen...,https://www.globalr2p.org/wp-content/uploads/2...,False


# Split and write new files

In [4]:
# Pieces shorter than this many characters are dropped after splitting.
MIN_PAR_LENGTH_CHARS = 30

# Separator written between consecutive paragraphs in the output files.
out_paragraph_delimiter = "\n\n" + "=" * 20 + "\n\n"

# A paragraph break is either
#   * a full stop, optionally followed by whitespace, a straight and/or curly
#     closing quote and more whitespace, that ends with a newline, or
#   * a semicolon immediately followed by a blank line.
# The pattern is a raw string so that `\.` and `\s` reach the regex engine
# without triggering Python's invalid-escape-sequence warning, and the groups
# are non-capturing so that re.split() returns only the text between breaks
# (capturing groups would also return the delimiters and None placeholders).
PARAGRAPH_BREAK_RE = re.compile(r'(?:\.\s*"?”?\s*\n)|(?:;\n\n)')


def split_into_paragraphs(text, min_length=MIN_PAR_LENGTH_CHARS):
    """Clean up parsing artefacts in ``text`` and split it into paragraphs.

    Args:
        text: Full text of one document.
        min_length: Minimum number of characters a piece must have to be kept.

    Returns:
        A tuple with the pieces of ``text`` found between paragraph breaks, in
        document order. The break itself (e.g. the closing full stop and the
        newline after it) is not part of any returned piece.
    """
    # Replace parsing artefacts.
    # NOTE(review): non-breaking spaces are deleted rather than turned into
    # regular spaces -- confirm this cannot glue neighbouring words together.
    text = text.replace("\xa0", "").replace("\t\n", "")
    # Collapse runs of spaces/tabs into a single space (newlines are kept).
    text = re.sub(r"[ \t]+", " ", text)

    return tuple(
        p for p in PARAGRAPH_BREAK_RE.split(text) if p and len(p) >= min_length
    )


for filename in os.listdir(TEXT_FILES_DIR):
    # File names must start with an integer before the first "."; int() raises
    # ValueError for anything else found in the directory.
    # Presumably this is the document id used as index in the metadata table
    # -- TODO confirm.
    doc_id = int(filename.split(".")[0])

    # NOTE(review): files are read and written with the platform's default
    # encoding -- confirm it matches the encoding used when they were created.
    with open(os.path.join(TEXT_FILES_DIR, filename), "r") as fp:
        text = fp.read()

    # Split text
    paragraphs = split_into_paragraphs(text)

    # Join back up with special delimiter and write under the same file name
    out_text = out_paragraph_delimiter.join(paragraphs)
    with open(os.path.join(SPLIT_TEXT_FILES_DIR, filename), "w") as fp:
        fp.write(out_text)