In [1]:
import os
import shutil

import pandas as pd

## Setup directories

In [2]:
# All paths below are built relative to the current working directory
MAIN_DIR = "."
DATA_DIR = os.path.join(MAIN_DIR, "data")

# Data from all sources is stored here after it has been combined
ALL_DATA_DIR = os.path.join(DATA_DIR, "combined")
# Converted versions of the PDF documents
SPLIT_TEXT_FILES_DIR = os.path.join(ALL_DATA_DIR, "text_files_split")
# Everything related to translating text, including the translation results
TRANSLATIONS_DIR = os.path.join(ALL_DATA_DIR, "translations")
# Sub-directory of TRANSLATIONS_DIR that the text files are read from
TRANSLATIONS_TEXT_DIR = os.path.join(TRANSLATIONS_DIR, "texts")

# The output directory is wiped and recreated on every run, so it never
# contains files left over from a previous execution
OUTPUT_DIR = os.path.join(ALL_DATA_DIR, "output")
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR)

## Paragraph delimiter

This is the string that separates paragraphs in our text files.

In [3]:
# A line of 20 "=" characters with two newlines before and after it
paragraph_delimiter = "\n\n{}\n\n".format("=" * 20)

## Read in metadata

In [4]:
# Load the document metadata; the "id" column becomes the index
df_speeches = pd.read_csv(
    os.path.join(ALL_DATA_DIR, "document_data.csv"),
    index_col="id",
)
# Parse the Date column into datetimes so the .dt accessor can be used on it
df_speeches["Date"] = pd.to_datetime(df_speeches["Date"])
df_speeches.head(1)

Unnamed: 0_level_0,Title,Type,Date,Source,link
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Accountability for Perpetrators: UN Officials ...,Official Statement,2019-11-07,UN Special Representative of the Secretary-Gen...,https://www.globalr2p.org/wp-content/uploads/2...


## Read in paragraphs

In [5]:
# Collect one record per (document id, paragraph position) from every text
# file in TRANSLATIONS_TEXT_DIR.
par_list = []
# Sorted so the read order does not depend on the filesystem
for filename in sorted(os.listdir(TRANSLATIONS_TEXT_DIR)):
    file_path = os.path.join(TRANSLATIONS_TEXT_DIR, filename)
    # Skip directories and hidden entries (e.g. ".ipynb_checkpoints",
    # ".DS_Store"): they are not documents and would otherwise crash the
    # open() call or the id parsing below.
    if filename.startswith(".") or not os.path.isfile(file_path):
        continue

    # The part of the file name before the first "." is the document id.
    # Any other file whose stem is not an integer still raises ValueError,
    # so unexpected files are noticed rather than silently ignored.
    doc_id = int(filename.split(".")[0])

    # NOTE(review): no explicit encoding is given, so the platform default is
    # used — confirm it matches how these files were written.
    with open(file_path, "r") as fp:
        text = fp.read()

    for position, paragraph in enumerate(text.split(paragraph_delimiter)):
        par_list.append({
            "id": doc_id,
            "paragraph": position,
            "text": paragraph
        })

# One row per document id, one column per paragraph position; cells are NaN
# where a document has fewer paragraphs than the longest one.
df_paragraphs = pd.DataFrame(par_list).pivot(index="id", columns="paragraph", values="text")

# Per-paragraph character and whitespace-separated word counts (NaN stays NaN).
# The .str accessor is used instead of DataFrame.applymap, which is deprecated
# in recent pandas releases.
df_char_lens = df_paragraphs.apply(lambda col: col.str.len())
df_word_lens = df_paragraphs.apply(lambda col: col.str.split().str.len())

# Number of paragraphs actually present for each document
par_counts = df_paragraphs.notna().sum(axis=1)

## Join paragraphs

In [6]:
# A paragraph is kept only if it meets both of these minimum sizes
MIN_CHAR_COUNT = 50
MIN_WORD_COUNT = 10

# Separator placed between paragraphs in the output, and the string that
# replaces any occurrence of that separator inside a paragraph
out_separator = ".\n\n"
out_sep_replacement = "\n"

def join_paragraphs(paragraphs):
    """Join the sufficiently long paragraphs into a single string.

    Paragraphs with fewer than MIN_CHAR_COUNT characters or fewer than
    MIN_WORD_COUNT whitespace-separated words are dropped. In the remaining
    ones every occurrence of ``out_separator`` is replaced by
    ``out_sep_replacement``, and the results are joined with ``out_separator``.

    Args:
        paragraphs: iterable of paragraph strings.

    Returns:
        The joined text; an empty string if no paragraph is kept.
    """
    kept = []
    for par in paragraphs:
        if len(par) < MIN_CHAR_COUNT:
            continue
        if len(par.split()) < MIN_WORD_COUNT:
            continue
        kept.append(par.replace(out_separator, out_sep_replacement))
    return out_separator.join(kept)

# Build one text per document (row) from its non-missing paragraphs; the
# result is aligned with df_speeches on the index when it is assigned
df_speeches["text"] = df_paragraphs.apply(
    lambda row: join_paragraphs(row.dropna()),
    axis=1,
)

## Merge texts

In [7]:
# Merge the texts of all documents that share the same (year, source) pair.
# Missing (NaN) and empty texts are skipped: an empty text (a document whose
# paragraphs were all filtered out) would otherwise leave stray separators
# in the merged output.
final_docs = df_speeches.groupby(
    [df_speeches["Date"].dt.year, df_speeches["Source"]]
)["text"].apply(
    lambda texts: out_separator.join(
        [t for t in texts if isinstance(t, str) and t]
    )
)

## Save texts

In [8]:
# Write one file per (year, source) pair: <OUTPUT_DIR>/<year>/<source>.txt
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for (year, actor), text in final_docs.items():
    year_dir = os.path.join(OUTPUT_DIR, str(year))
    os.makedirs(year_dir, exist_ok=True)

    # A path separator inside the source name would make the file path point
    # into a sub-directory that does not exist, so replace it.
    safe_actor = str(actor)
    for sep in (os.sep, os.altsep):
        if sep:
            safe_actor = safe_actor.replace(sep, "_")
    out_file = os.path.join(year_dir, f"{safe_actor}.txt")

    # NOTE(review): written with the platform default encoding — confirm this
    # is what the consumers of these files expect.
    with open(out_file, "w") as fp:
        fp.write(text)