In [187]:
import os
import shutil
import pickle
import subprocess

from collections import defaultdict, Counter
from time import sleep, time

# Data sheets 
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

# Language
from googletrans import Translator
# Language detection
import langdetect
langdetect.DetectorFactory.seed = 0

## Setup directory structure

In [2]:
# All paths are built relative to the current working directory
MAIN_DIR = "."
DATA_DIR = os.path.join(MAIN_DIR, "data")

# Where we will store data from all sources after combining it
ALL_DATA_DIR = os.path.join(DATA_DIR, "combined")
# This will hold the converted versions of the PDF documents
SPLIT_TEXT_FILES_DIR = os.path.join(ALL_DATA_DIR, "text_files_split")
# This will hold everything related to translating text, including the results of the translations
TRANSLATIONS_DIR = os.path.join(ALL_DATA_DIR, "translations")

# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race
os.makedirs(TRANSLATIONS_DIR, exist_ok=True)

## Paragraph delimiters

In [3]:
# Marker used to split a text file into paragraphs (and to join them again):
# a rule of 20 "=" characters with a blank line on either side
paragraph_delimiter = "\n\n{}\n\n".format("=" * 20)

## Load metadata

In [4]:
# Document metadata, indexed by the "id" column of the CSV
metadata_csv_path = os.path.join(ALL_DATA_DIR, "document_data.csv")
df_speeches = pd.read_csv(metadata_csv_path, index_col="id")
# Parse the "Date" column into datetimes
df_speeches["Date"] = pd.to_datetime(df_speeches["Date"])
df_speeches.head(1)

Unnamed: 0_level_0,Title,Type,Date,Source,link
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Accountability for Perpetrators: UN Officials ...,Official Statement,2019-11-07,UN Special Representative of the Secretary-Gen...,https://www.globalr2p.org/wp-content/uploads/2...


# Translate / detect languages in paragraphs

Not all texts are in English (or not entirely in English). We can try to automatically detect which ones contain text in another language.

In [8]:
# When True, the translation cell loads the previously pickled results instead of
# running googletrans / langdetect over every paragraph again
load_existing_translations = True

In [9]:
%%time

# Translation takes a while, so if we have done it before, load the cached result
if load_existing_translations:
    # NOTE(security): unpickling executes whatever the file contains, so this must
    # only ever read a cache that this notebook wrote itself
    with open(os.path.join(TRANSLATIONS_DIR, "lang_translations_df.pkl"), "rb") as fp:
        df_translations = pickle.load(fp)
    translations = df_translations.to_dict("records")

else:
    from json import JSONDecodeError
    from langdetect.lang_detect_exception import LangDetectException

    # Instantiate googletrans translator object
    translator = Translator(timeout=5)

    # One record per (document, paragraph) with the results of both tools
    translations = []

    # List the directory once, sorted: the progress total then always matches the
    # files actually visited, and re-runs process them in the same order
    split_filenames = sorted(os.listdir(SPLIT_TEXT_FILES_DIR))
    nfiles = len(split_filenames)

    start = time()

    for i, filename in enumerate(split_filenames):
        with open(os.path.join(SPLIT_TEXT_FILES_DIR, filename), "r") as fp:
            text = fp.read()

        paragraphs = text.split(paragraph_delimiter)
        # The part of the file name before the first "." is used as the document id
        doc_id = int(filename.split(".")[0])

        for j, paragraph in enumerate(paragraphs):

            # Both stay None when the corresponding tool fails on this paragraph
            gt_trans = None
            ld_lang = None

            # googletrans: need retry logic since it fails sometimes
            n_tries = 3
            for i_try in range(n_tries):
                is_last_try = i_try == n_tries - 1
                try:
                    gt_trans = translator.translate(paragraph)
                    break
                except JSONDecodeError:
                    if not is_last_try:
                        err_text = "Will re-connect and retry"
                        print(f"\tFailed to translate for {filename}, paragraph {j+1}. {err_text}")

                        # Presumably Google has blocked the current IP address: get a new
                        # one by disconnecting and reconnecting the VPN. check=True means a
                        # failing nordvpn call raises and stops the run.
                        subprocess.run(["nordvpn", "d"], check=True)
                        subprocess.run(["nordvpn", "c"], check=True)
                        print(f"\tConnected successfully, attempt {i_try+1} | {n_tries}")
                        # Sometimes it takes a while for a connection to be actually established, hang for a bit
                        sleep(1)

                        # Important: need to re-instantiate the translator object after changing IP
                        translator = Translator(timeout=5)
                    else:
                        print(f"\tgoogletrans could not detect languages in {filename}, paragraph {j+1}")
                except Exception as e:
                    # Only announce a retry when another attempt is actually coming
                    outcome = "Giving up on this paragraph." if is_last_try else "Will retry."
                    print(f"\tSomething went wrong {e}. {outcome}")

            # Langdetect
            try:
                ld_lang = langdetect.detect_langs(paragraph)
            except LangDetectException:
                print(f"\tlangdetect could not detect languages in {filename}, paragraph {j+1}")

            translations.append({
                "id": doc_id,
                "paragraph": j,
                "gt_trans": gt_trans,
                "ld_lang": ld_lang,
            })

        # Progress report every 20 files
        if (i+1) % 20 == 0:
            print(f"Done with {i+1} out of {nfiles} in {time() - start:.1f} seconds")

CPU times: user 470 ms, sys: 26.7 ms, total: 497 ms
Wall time: 496 ms


## Save translations

In [12]:
# Gather the per-paragraph records into a single table
df_translations = pd.DataFrame(translations)

# Pickle the table so that a later run can reload it instead of translating again
translations_cache_path = os.path.join(TRANSLATIONS_DIR, "lang_translations_df.pkl")
with open(translations_cache_path, "wb") as cache_file:
    pickle.dump(df_translations, cache_file)

## Read in paragraphs

In [61]:
# Collect the original text of every paragraph, one record per (document, paragraph)
par_list = []
for filename in os.listdir(SPLIT_TEXT_FILES_DIR):
    with open(os.path.join(SPLIT_TEXT_FILES_DIR, filename), "r") as fp:
        doc_paragraphs = fp.read().split(paragraph_delimiter)

    # The part of the file name before the first "." is used as the document id
    doc_id = int(filename.split(".")[0])
    par_list.extend(
        {"id": doc_id, "paragraph": par_no, "text": par_text}
        for par_no, par_text in enumerate(doc_paragraphs)
    )

# Wide layout: one row per document id, one column per paragraph number
df_paragraphs = pd.DataFrame(par_list).pivot(index="id", columns="paragraph", values="text")

## Separate googletrans translations and langdetect detections

Produce dataframes with documents in rows and paragraphs in columns

In [13]:
# Both tables share the same layout: document ids as rows, paragraph numbers as columns
wide_layout = {"index": "id", "columns": "paragraph"}

# googletrans results per paragraph
df_tr_google = df_translations.pivot(values="gt_trans", **wide_layout)
# langdetect results per paragraph
df_det_langdet = df_translations.pivot(values="ld_lang", **wide_layout)

## Get googletrans translations

In [218]:
# See where googletrans failed
failed_translation_map = df_tr_google.applymap(lambda x: x is None)

df_translations = df_tr_google.applymap(lambda t: t.text if not pd.isna(t) else None)

# Where translations failed, fill with originals
df_trans_filled = df_translations.fillna(df_paragraphs)

## What languages were the paragraphs originally in?

In [178]:
def _gt_source_lang(translation):
    """Return the `src` attribute of a googletrans result, or None for a missing cell."""
    return None if pd.isna(translation) else translation.src


def _ld_top_lang(detections):
    """Return the `lang` of the first langdetect entry, or None when the cell is not a list."""
    return detections[0].lang if isinstance(detections, list) else None


def _ld_n_candidates(detections):
    """Return how many entries langdetect produced (0 when the cell is not a list)."""
    return len(detections) if isinstance(detections, list) else 0


# Source language of every paragraph according to googletrans
gt_src_langs = df_tr_google.applymap(_gt_source_lang)

# First language listed by langdetect for every paragraph, and the number of listed languages
ld_src_langs = df_det_langdet.applymap(_ld_top_lang)
df_ld_lens = df_det_langdet.applymap(_ld_n_candidates)

In [213]:
def _non_english_langs(doc_row):
    """Return the set of values in one document's row, minus "en" and None."""
    return set(doc_row).difference({"en", None})


# Per document: which languages other than English each tool reported
gt_langs_per_doc = gt_src_langs.apply(_non_english_langs, axis=1)
ld_langs_per_doc = ld_src_langs.apply(_non_english_langs, axis=1)

# What non-English languages were there?
# gt_langs_per_doc[~(gt_langs_per_doc == set(["en"]))]
# set(ld_langs_per_doc[~(ld_langs_per_doc == set(["en"]))].apply(tuple))

### What languages were the paragraphs we could not translate in?

In [193]:
# Keep the langdetect language only where googletrans failed, then count over all cells
langs_where_failed = ld_src_langs.where(failed_translation_map)
flat_langs = np.ravel(langs_where_failed.values)
missed_paragraph_langs = pd.Series(flat_langs).value_counts()
display(missed_paragraph_langs)

en    160
fr     12
es      4
ar      1
dtype: int64

## Non-English languages and texts in those languages:

## (Not necessarily all) texts with (possibly) more than one language:

# Store translations

In [238]:
TRANSLATIONS_TEXT_DIR = os.path.join(TRANSLATIONS_DIR, "texts")

# Always start from an empty output directory: remove whatever is there, then recreate it
if os.path.exists(TRANSLATIONS_TEXT_DIR):
    shutil.rmtree(TRANSLATIONS_TEXT_DIR)
os.makedirs(TRANSLATIONS_TEXT_DIR)

# One output file per document, named after its id
for document_id, paragraph_row in df_trans_filled.iterrows():
    # Drop missing cells, then re-join the remaining paragraphs with the delimiter
    present_paragraphs = paragraph_row.dropna()
    document_text = paragraph_delimiter.join(present_paragraphs)

    target_path = os.path.join(TRANSLATIONS_TEXT_DIR, f"{document_id}.txt")
    with open(target_path, "w") as out_file:
        out_file.write(document_text)

# Store metadata

In [239]:
# Write the document metadata into the translations directory
metadata_out_path = os.path.join(TRANSLATIONS_DIR, "document_data.csv")
df_speeches.to_csv(metadata_out_path)