In [1]:
import os
import shutil
import pickle
import subprocess

from collections import defaultdict, Counter
from time import sleep, time

import requests

# Data sheets 
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

# Language
from googletrans import Translator
# Language detection
import langdetect
langdetect.DetectorFactory.seed = 0

## Setup directory structure

In [2]:
# Root of the project; every other path is derived from it
MAIN_DIR = "."
DATA_DIR = os.path.join(MAIN_DIR, "data")

# Where we will store data from all sources after combining it
ALL_DATA_DIR = os.path.join(DATA_DIR, "combined")
# This will hold the converted versions of the PDF documents
SPLIT_TEXT_FILES_DIR = os.path.join(ALL_DATA_DIR, "text_files_split")
# This will hold everything related to translating text, including the results of the translations
TRANSLATIONS_DIR = os.path.join(ALL_DATA_DIR, "translations")

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(TRANSLATIONS_DIR, exist_ok=True)

## Paragraph delimiters

In [3]:
# Separator between paragraphs in the split text files: a rule of 20 "=" signs
# surrounded by blank lines
paragraph_delimiter = "\n\n{}\n\n".format("=" * 20)

## Load metadata

In [4]:
# Read the per-document metadata (indexed by document id) and parse the Date
# column into datetimes in one chained expression
df_speeches = (
    pd.read_csv(os.path.join(ALL_DATA_DIR, "document_data.csv"), index_col="id")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
)
df_speeches.head(1)

Unnamed: 0_level_0,Title,Type,Date,Source,link,scanned
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Accountability for Perpetrators: UN Officials ...,Official Statement,2019-11-07,UN Special Representative of the Secretary-Gen...,https://www.globalr2p.org/wp-content/uploads/2...,False


# Translate / detect languages in paragraphs

Not all texts are in English (or at least not entirely). We can try to automatically detect which ones contain text in another language.

In [5]:
# Set to True to load the pickled results of an earlier run from TRANSLATIONS_DIR
# instead of running language detection and translation again
load_existing_translations = False

In [8]:
%%time

# Translation takes a while, so if we have don it before, load
if load_existing_translations:
    with open(os.path.join(TRANSLATIONS_DIR, "lang_translations_df.pkl"), "rb") as fp:
        df_translations = pickle.load(fp)
        translations = df_translations.to_dict("records")

else:
    # Instantiate googletrans translator object
    translator = Translator(timeout=5)
    from json import JSONDecodeError
    from langdetect.lang_detect_exception import LangDetectException


    translations = []
    nfiles = len(os.listdir(SPLIT_TEXT_FILES_DIR))

    start = time()

    for i, filename in enumerate(os.listdir(SPLIT_TEXT_FILES_DIR)):
        with open(os.path.join(SPLIT_TEXT_FILES_DIR, filename), "r") as fp:
            text = fp.read()

        paragraphs = list(text.split(paragraph_delimiter))

        for j, paragraph in enumerate(paragraphs):

            gt_trans = None
            ld_lang = None
            
            # Langdetect
            try:
                ld_lang = langdetect.detect_langs(paragraph)
            except LangDetectException:
                print(f"\tlangdetect could not detect languages in {filename}, paragraph {j+1}")
            
            # Only translate texts for which we are not sure they are in English
            must_translate = True
            if ld_lang is not None and len(ld_lang) == 1 and ld_lang[0].lang == "en":
                 must_translate = False
            
            if must_translate:
                # googletrans: need retry logic since it fails sometimes
                n_tries = 3
                for i_try in range(n_tries):

                    connected = False
                    # Make sure we have internet. Do not continue unless we do
                    while not connected:
                        try:
                            _ = requests.get("https://www.google.com", timeout=1)
                            connected = True
                        except:
                            print("\tNo internet connection!")
                            sleep(1)

                    # Attempt to translate
                    try:
                        gt_trans = translator.translate(paragraph)
                        break
                    except JSONDecodeError:
                        if i_try < n_tries - 1:
                            err_text = "Will re-connect and retry"
                            print(f"\tFailed to translate for {filename}, paragraph {j+1}. {err_text}")

                            # Google has blocked the current IP address, switch that
                            subprocess.run(["nordvpn", "d"], check=True)
                            subprocess.run(["nordvpn", "c", "be"], check=True)
                            print(f"\tConnected successfully, attempt {i_try+1} | {n_tries}")
                            # Sometimes it takes a while for a connection to be actually established, hang for a bit
                            sleep(1)

                            # Important: need to re-instantiate the translator object after changing IP
                            translator = Translator(timeout=5)
                        else:
                            print(f"\tgoogletrans could not detect languages in {filename}, paragraph {j+1}")
                    except Exception as e:
                        print(f"\tSomething went wrong {e}. Will retry.")


            row = {
                "id": int(filename.split(".")[0]),
                "paragraph": j,
                "gt_trans": gt_trans,
                "ld_lang": ld_lang,
                "translated": must_translate,
            }

            translations.append(row)


        if (i+1) % 50 == 0:
            print(f"Done with {i+1} out of {nfiles} in {time() - start:.1f} seconds")

Done with 50 out of 947 in 23.4 seconds
Done with 100 out of 947 in 38.0 seconds
	Failed to translate for 540.txt, paragraph 7. Will re-connect and retry
	Connected successfully, attempt 1 | 3
	No internet connection!
	No internet connection!
Done with 150 out of 947 in 64.9 seconds
Done with 200 out of 947 in 93.0 seconds
	Failed to translate for 320.txt, paragraph 11. Will re-connect and retry
	Connected successfully, attempt 1 | 3
	Failed to translate for 320.txt, paragraph 11. Will re-connect and retry
	Connected successfully, attempt 2 | 3
	googletrans could not detect languages in 320.txt, paragraph 11
Done with 250 out of 947 in 129.1 seconds
Done with 300 out of 947 in 149.2 seconds
	langdetect could not detect languages in 526.txt, paragraph 22
Done with 350 out of 947 in 159.7 seconds
	langdetect could not detect languages in 532.txt, paragraph 1
	Failed to translate for 532.txt, paragraph 1. Will re-connect and retry
	Connected successfully, attempt 1 | 3
	Failed to translat

	Connected successfully, attempt 1 | 3
	Failed to translate for 575.txt, paragraph 3. Will re-connect and retry
	Connected successfully, attempt 2 | 3
	googletrans could not detect languages in 575.txt, paragraph 3
	Failed to translate for 575.txt, paragraph 4. Will re-connect and retry
	Connected successfully, attempt 1 | 3
	Failed to translate for 575.txt, paragraph 4. Will re-connect and retry
	Connected successfully, attempt 2 | 3
	googletrans could not detect languages in 575.txt, paragraph 4
	Failed to translate for 379.txt, paragraph 1. Will re-connect and retry
	Connected successfully, attempt 1 | 3
	Failed to translate for 379.txt, paragraph 1. Will re-connect and retry
	Connected successfully, attempt 2 | 3
	googletrans could not detect languages in 379.txt, paragraph 1
	Failed to translate for 379.txt, paragraph 2. Will re-connect and retry
	Connected successfully, attempt 1 | 3
	Failed to translate for 379.txt, paragraph 2. Will re-connect and retry
	Connected successfully,

## Save translations

In [9]:
# Turn the per-paragraph records into a table and cache it on disk
translations_pickle_path = os.path.join(TRANSLATIONS_DIR, "lang_translations_df.pkl")
df_translations = pd.DataFrame(translations)

with open(translations_pickle_path, "wb") as pickle_file:
    pickle.dump(df_translations, pickle_file)

## Read in paragraphs

In [100]:
# Collect one record per (document, paragraph) from the split text files
par_list = []
for i, filename in enumerate(os.listdir(SPLIT_TEXT_FILES_DIR)):
    doc_path = os.path.join(SPLIT_TEXT_FILES_DIR, filename)
    with open(doc_path, "r") as fp:
        text = fp.read()

    # The document id is the part of the file name before the first dot
    file_doc_id = int(filename.split(".")[0])
    paragraphs = list(text.split(paragraph_delimiter))
    for j, paragraph in enumerate(paragraphs):
        par_list.append(dict(id=file_doc_id, paragraph=j, text=paragraph))

df_paragraphs_raw = pd.DataFrame(par_list)
# Attach the detection / translation results to the paragraph texts
# (default inner join on document id and paragraph number)
df_paragraphs = pd.merge(df_paragraphs_raw, df_translations, on=["id", "paragraph"])

## Fix missing translations

Translation sometimes fails for some paragraphs, perhaps because they are too long. Redo the translation for those paragraphs, splitting long ones into smaller chunks.

In [102]:
# See where googletrans failed
df_paragraphs["failed"] = df_paragraphs["gt_trans"].isna() & df_paragraphs["translated"]

df_translations_fixed = df_translations.copy(deep=True)

translator = Translator(timeout=5)

for row_id, row in df_paragraphs[df_paragraphs["failed"]].iterrows():
    text = row["text"]
    
    text_ix = 0
    
    translated_text = ""
    
    while text_ix < len(text):
        chunk = text[text_ix:text_ix + 2000]
        
        gt_trans = translator.translate(paragraph)
        
        translated_text += gt_trans.text
        
        text_ix += 2000
    
    gt_trans.text = translated_text
    df_translations_fixed.loc[row_id, "gt_trans"] = gt_trans

## Save translations (with additions)

In [107]:
# Persist the repaired table (df_translations_fixed), not the first-pass one,
# so that a later run loading this cache also gets the re-translated paragraphs
with open(os.path.join(TRANSLATIONS_DIR, "lang_translations_df.pkl"), "wb") as fp:
    pickle.dump(df_translations_fixed, fp)

### Make new paragraphs dataframe

In [108]:
# Rebuild the paragraph table, this time joined with the repaired translations
df_paragraphs = pd.merge(
    df_paragraphs_raw,
    df_translations_fixed,
    on=["id", "paragraph"],
)

## What languages were the paragraphs originally in?

In [109]:
def _gt_source_language(gt_result):
    """Return the `src` attribute of a googletrans result, or None if it is missing."""
    if pd.isna(gt_result):
        return None
    return gt_result.src


def _ld_top_language(candidates):
    """Return the `lang` of the first langdetect candidate, or None if there is no list."""
    if not isinstance(candidates, list):
        return None
    return candidates[0].lang


def _ld_candidate_count(candidates):
    """Return how many candidates langdetect reported (0 if there is no list)."""
    if not isinstance(candidates, list):
        return 0
    return len(candidates)


df_paragraphs["gt_det_lang"] = df_paragraphs["gt_trans"].apply(_gt_source_language)
df_paragraphs["ld_det_lang"] = df_paragraphs["ld_lang"].apply(_ld_top_language)

# How many different languages might have been detected?
df_paragraphs["ld_n_langs"] = df_paragraphs["ld_lang"].apply(_ld_candidate_count)

## Get googletrans translations

In [110]:
# Text of the googletrans result, or None where no result is available
gt_text = df_paragraphs["gt_trans"].apply(lambda result: None if pd.isna(result) else result.text)

# Where translations failed, fill with originals
df_paragraphs["translation"] = gt_text.fillna(df_paragraphs["text"])

## Different detections

In [111]:
# Paragraphs on which googletrans and langdetect disagree; dropna() then removes
# every row that has a missing value in any column
detections_differ = df_paragraphs["gt_det_lang"].ne(df_paragraphs["ld_det_lang"])
diff_langs = df_paragraphs.loc[detections_differ].dropna()

## Non-English languages and texts in those languages:

In [123]:
# langdetect did not report English as its top language (or reported nothing)
ld_not_english = df_paragraphs["ld_det_lang"].ne("en")
# googletrans reported a source language and it is not English
gt_not_english = df_paragraphs["gt_det_lang"].notna() & df_paragraphs["gt_det_lang"].ne("en")

non_english = df_paragraphs[ld_not_english | gt_not_english]

non_english["id"].unique()

array([474, 741, 651, 386, 318, 496, 331, 753, 740, 328, 578, 540, 242,
       340, 714, 420, 534, 206, 653, 361, 611, 493, 410, 320, 322, 291,
       396, 308,   0, 468, 475, 701, 381, 294, 526,  96, 323, 532, 345,
       511, 564, 635, 721, 529, 579, 622, 329, 678, 432, 479, 566, 571,
       470, 706, 576, 535,  77, 417,  10, 558, 598, 645, 391, 181, 719,
       590, 543, 575, 379, 111, 626, 503, 567, 607, 312, 694, 518, 380,
       275, 448, 554, 281, 742, 650, 338, 360, 697, 602, 733, 280, 354,
       488,   7, 597, 755, 720, 541, 675, 462, 445, 621])

## (Not necessarily all) texts with (possibly) more than one language:

In [126]:
# Prefer the googletrans detection and fall back to langdetect where it is missing
gt_detected = df_paragraphs["gt_det_lang"]
ld_detected = df_paragraphs["ld_det_lang"]
df_paragraphs["src_lang"] = gt_detected.fillna(ld_detected)

In [143]:
# Number of distinct source languages found in each document
n_langs_per_doc = df_paragraphs.groupby("id")["src_lang"].nunique()
# Ids of the documents with more than one detected language
has_several_langs = n_langs_per_doc.gt(1)
multilingual_docs = n_langs_per_doc.loc[has_several_langs].index

In [146]:
# Display the ids of the documents in which more than one language was detected
multilingual_docs

Int64Index([  0,   7,  10, 291, 318, 360, 361, 379, 432, 470, 474, 479, 503,
            518, 534, 540, 541, 543, 590, 597, 607, 626, 645, 678, 697, 719,
            740, 741],
           dtype='int64', name='id')

In [245]:
# Flag every document in which more than one source language was detected.
# `.at` addresses a single cell only, so the column is built in one step from
# index membership instead; ids that are absent from df_speeches are ignored.
df_speeches["multilingual"] = df_speeches.index.isin(multilingual_docs)

## Remove (un)translated paragraphs in bilingual documents

Some documents include an official translation. For those, only keep the paragraphs with the translation

In [239]:
# Count the number of paragraphs for each language for bilingual documents

# One row per (document id, source language) with the number of paragraphs
df_lan_counts = (
    df_paragraphs.groupby("id")["src_lang"]
    .value_counts()
    .rename("count")
    .reset_index()
)
# Keep only the documents that contain more than one language
df_lan_counts = df_lan_counts[df_lan_counts["id"].isin(multilingual_docs)]

In [240]:
# English paragraphs are kept (and all others discarded) when they make up more
# than this share of a document; otherwise the English paragraphs are discarded
MIN_ENGLISH_SHARE = 0.33

# Add a column to signify whether this paragraph should be discarded or not
df_paragraphs["to_keep"] = True

for doc_id in df_lan_counts["id"].unique():
    subset = df_lan_counts[df_lan_counts["id"] == doc_id]

    # Mapping from source language to number of paragraphs for this document
    counts = subset.set_index("src_lang")["count"].to_dict()

    # Documents without any English paragraph are left untouched
    if "en" not in counts:
        continue

    n_paragraphs = sum(counts.values())
    keep_english = counts["en"] / n_paragraphs > MIN_ENGLISH_SHARE

    in_doc = df_paragraphs["id"] == doc_id
    is_english = df_paragraphs["src_lang"] == "en"

    # Boolean masks need `.loc`; `.at` only addresses a single cell.
    # Paragraphs without a detected language count as non-English here.
    if keep_english:
        df_paragraphs.loc[in_doc & ~is_english, "to_keep"] = False
    else:
        df_paragraphs.loc[in_doc & is_english, "to_keep"] = False

# Store translations

In [246]:
TRANSLATIONS_TEXT_DIR = os.path.join(TRANSLATIONS_DIR, "texts")
# Start from an empty output directory: remove whatever an earlier run left behind
if os.path.exists(TRANSLATIONS_TEXT_DIR):
    shutil.rmtree(TRANSLATIONS_TEXT_DIR)
os.makedirs(TRANSLATIONS_TEXT_DIR)


# Make a DOCxPAR matrix
kept_paragraphs = df_paragraphs[df_paragraphs["to_keep"]]
df_trans_filled = kept_paragraphs.pivot(index="id", columns="paragraph", values="translation")

# Write one text file per document, joining the paragraphs that are present
for doc_id, row in df_trans_filled.iterrows():
    out_text = paragraph_delimiter.join(row.dropna())
    out_path = os.path.join(TRANSLATIONS_TEXT_DIR, f"{doc_id}.txt")

    with open(out_path, "w") as fp:
        fp.write(out_text)

# Store metadata

In [247]:
# Write the document metadata, as it stands in df_speeches at this point,
# next to the translations
df_speeches.to_csv(os.path.join(TRANSLATIONS_DIR, "document_data.csv"))