# Purpose

Some data was downloaded from the old version of the R2P website. This data is no longer available in the new version. This notebook copies that data into the folder with the raw files and appends an entry for each of those documents to the metadata file.

In [1]:
import os
import shutil
from collections import defaultdict, Counter

# Data sheets 
import pandas as pd


# OCR
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract

## Setup directory structure

In [2]:
MAIN_DIR = "."
DATA_DIR = os.path.join(MAIN_DIR, "data")

# Everything downloaded from the new version of the GR2P website
GR2P_DIR = os.path.join(DATA_DIR, "gr2p")
GR2P_RAW_PDF_DIR = os.path.join(GR2P_DIR, "pdf")
# Metadata for the newly downloaded files
GR2P_METADATA_FILE = os.path.join(GR2P_DIR, "globalr2p_docs_data.csv")

# PDF files which were downloaded previously (one sub-folder per year)
OLD_PDF_DIR = os.path.join(DATA_DIR, "old_pdfs_per_year")

# Output: data from all sources after combining it
ALL_DATA_DIR = os.path.join(DATA_DIR, "combined")
RAW_PDF_DIR = os.path.join(ALL_DATA_DIR, "raw_pdf")
# Output: the converted (text) versions of the PDF documents
TEXT_FILES_DIR = os.path.join(ALL_DATA_DIR, "text_files")

# Every run starts from an empty output tree. Both output folders live inside
# ALL_DATA_DIR, so wiping it once is enough: after the rmtree neither of them
# can exist, and no separate check for TEXT_FILES_DIR is needed.
if os.path.exists(ALL_DATA_DIR):
    shutil.rmtree(ALL_DATA_DIR)
os.makedirs(RAW_PDF_DIR)
os.makedirs(TEXT_FILES_DIR)

## Load metadata

In [3]:
# Read the GR2P metadata (indexed by document id) and parse the "Date" column
# into real timestamps so that it can be grouped by year later on
df_gr2p_speeches = (
    pd.read_csv(GR2P_METADATA_FILE, index_col="id")
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
)
df_gr2p_speeches.head(1)

Unnamed: 0_level_0,Title,Type,Date,Source,link
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Accountability for Perpetrators: UN Officials ...,Official Statement,2019-11-07,UN Special Representative of the Secretary-Gen...,https://www.globalr2p.org/wp-content/uploads/2...


## Count number of documents per year

For some years, more documents were present in the old set, so we copy some of those over to our new dataset.

Below is a small table with the number of documents for each year

In [4]:
# Group the GR2P metadata by the calendar year of each speech
speech_year = df_gr2p_speeches["Date"].dt.year
speeches_by_year = df_gr2p_speeches.groupby(speech_year)

# Number of speeches per year downloaded from the GR2P website
gr2p_num_speeches_per_year = speeches_by_year.size()
# Number of distinct sources per year in that same data
gr2p_num_actors_per_year = speeches_by_year["Source"].nunique()

# Number of files per year in the old data we had downloaded:
# every sub-folder of OLD_PDF_DIR is named after a year
old_pdf_counts_per_year = pd.Series({
    int(year_dir): len(os.listdir(os.path.join(OLD_PDF_DIR, year_dir)))
    for year_dir in sorted(os.listdir(OLD_PDF_DIR))
})

# Side-by-side table; a year missing from one of the sources shows up as 0
counts_per_year = pd.DataFrame({
    "Old": old_pdf_counts_per_year,
    "New": gr2p_num_speeches_per_year,
    "New (num.actors)": gr2p_num_actors_per_year,
})
counts_per_year.fillna(0)

Unnamed: 0,Old,New,New (num.actors)
2009,92.0,0.0,0.0
2010,26.0,0.0,0.0
2011,44.0,6.0,6.0
2012,67.0,0.0,0.0
2013,79.0,24.0,24.0
2014,73.0,78.0,71.0
2015,79.0,84.0,76.0
2016,142.0,144.0,88.0
2017,98.0,92.0,80.0
2018,2.0,19.0,4.0


## Copy GR2P data

All PDFs, old and new, will be stored in the final "raw PDF" directory. Start by copying the newly downloaded ones.

In [5]:
# Copy every regular, non-hidden file from the GR2P download folder into the
# combined raw-PDF folder, keeping the file names. This mirrors what the shell
# glob `cp dir/* target/` copies, but also works for paths containing spaces
# and on platforms without `cp`.
for pdf_name in sorted(os.listdir(GR2P_RAW_PDF_DIR)):
    source_pdf_path = os.path.join(GR2P_RAW_PDF_DIR, pdf_name)
    # `*` does not match hidden files and plain `cp` skips directories
    if pdf_name.startswith(".") or not os.path.isfile(source_pdf_path):
        continue
    shutil.copy(source_pdf_path, os.path.join(RAW_PDF_DIR, pdf_name))

## Copy old data over

Based on the number of documents per year (see above), choose for which years we want to copy documents over

In [6]:
# The documents downloaded from GR2P already have IDs, so the "old" documents
# we add to the pool are numbered consecutively right after the biggest one.
biggest_id = df_gr2p_speeches.index.max()

# This will hold a list of dictionaries with data about the old files we will use
old_pdf_data = []
# For which years to copy files from the old set
years_to_copy = ("2009", "2010", "2011", "2012", "2013")

# The next free ID: assigned to the next old file we copy over
doc_id = biggest_id + 1

for year in years_to_copy:
    year_dir = os.path.join(OLD_PDF_DIR, year)
    # Only the year is known for old documents; a bare year parses to January 1st
    year_date = pd.to_datetime(year)

    # NOTE(review): os.listdir returns entries in arbitrary order, so the IDs
    # assigned here can differ between machines. Sorting would make them
    # reproducible, but would also invalidate any manually compiled list that
    # refers to the current IDs, so the order is deliberately left unchanged.
    for filename in os.listdir(year_dir):
        # Copy old document to the set with new documents, renamed to its ID
        old_path = os.path.join(year_dir, filename)
        new_path = os.path.join(RAW_PDF_DIR, f"{doc_id}.pdf")
        print(f"Copying {old_path} to {new_path}")
        shutil.copyfile(old_path, new_path)

        old_pdf_data.append({
            "Date": year_date,
            # File name without its extension. splitext keeps any dots inside
            # the name and does not produce an empty string for dot-less names.
            "Source": os.path.splitext(filename)[0],
            "id": doc_id
        })

        doc_id += 1

Copying ./data/old_pdfs_per_year/2009/sweden-2009-r2p-debate.pdf to ./data/combined/raw_pdf/461.pdf
Copying ./data/old_pdfs_per_year/2009/peru-2009-r2p-debate.pdf to ./data/combined/raw_pdf/462.pdf
Copying ./data/old_pdfs_per_year/2009/papau-new-guinea-2009-r2p-debate.pdf to ./data/combined/raw_pdf/463.pdf
Copying ./data/old_pdfs_per_year/2009/solomon-islands-2009-r2p-debate.pdf to ./data/combined/raw_pdf/464.pdf
Copying ./data/old_pdfs_per_year/2009/malaysia-2009-r2p-debate.pdf to ./data/combined/raw_pdf/465.pdf
Copying ./data/old_pdfs_per_year/2009/germany-2009-r2p-debate.pdf to ./data/combined/raw_pdf/466.pdf
Copying ./data/old_pdfs_per_year/2009/bolivia-2009-r2p-debate.pdf to ./data/combined/raw_pdf/467.pdf
Copying ./data/old_pdfs_per_year/2009/venezuela-2009-r2p-debate.pdf to ./data/combined/raw_pdf/468.pdf
Copying ./data/old_pdfs_per_year/2009/croatia-2009-r2p-debate.pdf to ./data/combined/raw_pdf/469.pdf
Copying ./data/old_pdfs_per_year/2009/cameroon-2009-r2p-debate.pdf to ./dat

Copying ./data/old_pdfs_per_year/2009/benin-2009-r2p-debate.pdf to ./data/combined/raw_pdf/543.pdf
Copying ./data/old_pdfs_per_year/2009/sri-lanka-2009-r2p-debate.pdf to ./data/combined/raw_pdf/544.pdf
Copying ./data/old_pdfs_per_year/2009/israel-2009-r2p-debate.pdf to ./data/combined/raw_pdf/545.pdf
Copying ./data/old_pdfs_per_year/2009/nicaragua-2009-r2p-debate.pdf to ./data/combined/raw_pdf/546.pdf
Copying ./data/old_pdfs_per_year/2009/iran-2009-r2p-debate.pdf to ./data/combined/raw_pdf/547.pdf
Copying ./data/old_pdfs_per_year/2009/colombia-2009-r2p-debate.pdf to ./data/combined/raw_pdf/548.pdf
Copying ./data/old_pdfs_per_year/2009/slovakia-2009-r2p-debate.pdf to ./data/combined/raw_pdf/549.pdf
Copying ./data/old_pdfs_per_year/2009/hungary-2009-r2p-debate.pdf to ./data/combined/raw_pdf/550.pdf
Copying ./data/old_pdfs_per_year/2009/sierra-leone-2009-r2p-debate.pdf to ./data/combined/raw_pdf/551.pdf
Copying ./data/old_pdfs_per_year/2009/south-africa-2009-r2p-debate.pdf to ./data/combi

Copying ./data/old_pdfs_per_year/2012/spain-statement-2012-english-1.pdf to ./data/combined/raw_pdf/656.pdf
Copying ./data/old_pdfs_per_year/2012/alex-bellamy-statement-2012.pdf to ./data/combined/raw_pdf/657.pdf
Copying ./data/old_pdfs_per_year/2012/new-zealand-statement-2012.pdf to ./data/combined/raw_pdf/658.pdf
Copying ./data/old_pdfs_per_year/2012/nigeria-statement-2012.pdf to ./data/combined/raw_pdf/659.pdf
Copying ./data/old_pdfs_per_year/2012/gcr2p-statement-at-2012-interactive-dialogue.pdf to ./data/combined/raw_pdf/660.pdf
Copying ./data/old_pdfs_per_year/2012/peru-statement-transcribed.pdf to ./data/combined/raw_pdf/661.pdf
Copying ./data/old_pdfs_per_year/2012/iran-statement-transcribed.pdf to ./data/combined/raw_pdf/662.pdf
Copying ./data/old_pdfs_per_year/2012/costa-rica-statement-2012-transcribed-1.pdf to ./data/combined/raw_pdf/663.pdf
Copying ./data/old_pdfs_per_year/2012/united-arab-emirates-statement-2012-transcribed.pdf to ./data/combined/raw_pdf/664.pdf
Copying ./d

Copying ./data/old_pdfs_per_year/2013/france_en_official.pdf to ./data/combined/raw_pdf/760.pdf
Copying ./data/old_pdfs_per_year/2013/iran_transcription.pdf to ./data/combined/raw_pdf/761.pdf
Copying ./data/old_pdfs_per_year/2013/togo_en.pdf to ./data/combined/raw_pdf/762.pdf
Copying ./data/old_pdfs_per_year/2013/montenegro.pdf to ./data/combined/raw_pdf/763.pdf
Copying ./data/old_pdfs_per_year/2013/cotedivoire_en.pdf to ./data/combined/raw_pdf/764.pdf
Copying ./data/old_pdfs_per_year/2013/australia_en.pdf to ./data/combined/raw_pdf/765.pdf
Copying ./data/old_pdfs_per_year/2013/japan_en.pdf to ./data/combined/raw_pdf/766.pdf
Copying ./data/old_pdfs_per_year/2013/republic-of-korea-transcritpion.pdf to ./data/combined/raw_pdf/767.pdf
Copying ./data/old_pdfs_per_year/2013/nicaragua.pdf to ./data/combined/raw_pdf/768.pdf


## Add new document metadata to old metadata

In [7]:
# Metadata of the old documents, indexed by document id like the GR2P metadata
df_old_pdfs = pd.DataFrame(old_pdf_data).set_index("id")
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way (columns missing from the old documents are filled with NaN)
df_gr2p_plus_old = pd.concat([df_gr2p_speeches, df_old_pdfs])
df_gr2p_plus_old.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 769 entries, 0 to 768
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Title   461 non-null    object        
 1   Type    461 non-null    object        
 2   Date    769 non-null    datetime64[ns]
 3   Source  769 non-null    object        
 4   link    459 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 36.0+ KB


## Add general assembly speeches

These come from the UN General Assembly website and are extracted from a few documents in the `Extract speeches from GA documents` notebook. Here we copy over the resulting text files.

In [8]:
# Folder holding the General Assembly statements that get copied into the text folder
GA_STMTS_DIR = os.path.join(DATA_DIR, "general_assembly_docs", "statements")

In [9]:
%%time
# We want to give IDs to the "old" data we will add to our pool of documents
# We already gave IDs to the docs we downloaded from GR2P, so start the new ones from there
biggest_id = df_gr2p_plus_old.index.max()

# Will hold dictionaries with metadata about eash statement
ga_docs_list = []

doc_id = biggest_id + 1

# Add each file to the metadata and then copy it over to the text dir
for filename in os.listdir(GA_STMTS_DIR):
    # Parse date and source from filename
    date_slug = filename.split("_")[0]
    date = pd.to_datetime(" ".join(date_slug.split("-")))
    source = "_".join(filename.split("_")[1:])[:-4]
    source = " ".join(source.split("-"))
    
    # Metadata
    ga_docs_list.append({
        "Date": date,
        "Source": source,
        "id": doc_id
    })
    
    # Copy to text folder
    !cp {GA_STMTS_DIR}/{filename} {TEXT_FILES_DIR}/{doc_id}.txt
    
    # Increment ID
    doc_id += 1

CPU times: user 1 s, sys: 1.23 s, total: 2.23 s
Wall time: 22.4 s


## Update metadata dataframe

In [10]:
# Metadata of the General Assembly statements, indexed by document id
df_ga_docs = pd.DataFrame(ga_docs_list).set_index("id")
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way (columns the statements do not have are filled with NaN)
df_speeches = pd.concat([df_gr2p_plus_old, df_ga_docs])
df_speeches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 949 entries, 0 to 948
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Title   461 non-null    object        
 1   Type    461 non-null    object        
 2   Date    949 non-null    datetime64[ns]
 3   Source  949 non-null    object        
 4   link    459 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 44.5+ KB


## Unify source names

In [11]:
# Table that maps raw source names (first, unnamed CSV column) to unified
# actor names (column "changes")
source_actor_table = pd.read_csv(os.path.join(DATA_DIR, "source_actor_map.csv"))
# Transpose / forward-fill / transpose back fills every empty cell with the
# value of the column before it in the same row.
# NOTE(review): presumably this makes a source with no "changes" entry map to
# itself - confirm against the CSV layout.
source_actor_table = source_actor_table.T.ffill().T.set_index("Unnamed: 0")
source_actor_table.index.name = None
source_actor_map = {
    source: row["changes"]
    for source, row in source_actor_table.to_dict("index").items()
}

# Compare the same (stripped) names that are used for the replacement below,
# so the warning lists exactly the sources that will be left unchanged
stripped_sources = df_speeches["Source"].str.strip()
no_rename_list = set(stripped_sources) - set(source_actor_map)

if len(no_rename_list) > 0:
    print(f"No actor was specified for these sources: {list(no_rename_list)}")

df_speeches["Source"] = stripped_sources.replace(source_actor_map)

## Write file with descriptions

In [12]:
# Save the combined metadata (document id as the first column) next to the
# combined data. The same file is written again at the end of the notebook,
# after the "scanned" column has been added.
df_speeches.to_csv(os.path.join(ALL_DATA_DIR, "document_data.csv"), index=True)

# Convert PDFs to text

## Convert non-scanned files using the `pdftotext` utility

### Define bash script to do conversion

The script takes two arguments: a folder with PDF files and a folder (which must already exist) in which to save the converted files.

In [13]:
# Bash script that converts every PDF below the first argument (a folder) to a
# text file with the same base name inside the second argument (an existing
# folder). All variables are quoted so paths containing spaces work, and only
# names ending in ".pdf" are picked up (not every path that merely contains
# the letters "pdf").
pdf_to_text_script = """#!/bin/bash
# Usage: convert_pdfs_to_text.sh <folder with PDFs> <existing output folder>
counter=0
while IFS= read -r file
do
  if [ -f "$file" ]
  then
    echo "$file"
    # Parse filename
    filename="${file##*/}"
    filename_no_ext="${filename%.pdf}"

    # Do conversion
    pdftotext -nopgbrk "$file" "$2/${filename_no_ext}.txt"

    counter=$((counter + 1))
  fi
done < <(find "$1" -name '*.pdf' | sort)
echo "Converted $counter files"
"""

SCRIPTS_DIR = os.path.join(MAIN_DIR, "scripts")

# Create the folder if needed, but do not wipe it: only our own script is
# (over)written below, anything else stored in the folder is left alone
os.makedirs(SCRIPTS_DIR, exist_ok=True)

script_path = os.path.join(SCRIPTS_DIR, "convert_pdfs_to_text.sh")

# Make bash file containing conversion code
with open(script_path, 'w') as fp:
    fp.write(pdf_to_text_script)

# Make script executable (add the execute bits to its current permissions)
os.chmod(script_path, os.stat(script_path).st_mode | 0o111)

### Do conversion

In [14]:
%%time
# Resolve both folders to absolute paths before handing them to the script
raw_dir_abspath = os.path.abspath(RAW_PDF_DIR)
text_folder_abspath = os.path.abspath(TEXT_FILES_DIR)

# Run command: the conversion script gets the folder with the PDFs as its
# first argument and the (existing) folder for the text files as its second
!$script_path $raw_dir_abspath $text_folder_abspath

/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/0.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/100.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/101.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/102.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/103.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/104.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/105.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/106.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/107.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/108.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/109.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/10.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/110.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/111.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/112.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/113.pdf
/home/yasen/misc/r2p-speeches/data/combined

/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/219.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/21.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/220.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/221.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/222.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/223.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/224.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/225.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/226.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/227.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/228.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/229.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/22.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/230.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/231.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/232.pdf
/home/yasen/misc/r2p-speeches/data/combine

/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/343.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/344.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/345.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/346.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/347.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/348.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/349.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/34.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/350.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/351.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/352.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/353.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/354.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/355.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/356.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/357.pdf
/home/yasen/misc/r2p-speeches/data/combin

/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/469.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/46.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/470.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/471.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/472.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/473.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/474.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/475.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/476.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/477.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/478.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/479.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/47.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/480.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/481.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/482.pdf
/home/yasen/misc/r2p-speeches/data/combine

/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/592.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/593.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/594.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/595.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/596.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/597.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/598.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/599.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/59.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/5.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/600.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/601.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/602.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/603.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/604.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/605.pdf
/home/yasen/misc/r2p-speeches/data/combined

/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/718.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/719.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/71.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/720.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/721.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/722.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/723.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/724.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/725.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/726.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/727.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/728.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/729.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/72.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/730.pdf
/home/yasen/misc/r2p-speeches/data/combined/raw_pdf/731.pdf
/home/yasen/misc/r2p-speeches/data/combine

# Convert scanned files to text

## See which files we did not manage to convert with `pdftotext`

We recognize them because they are only a few bytes in size

In [15]:
# Maps document number -> size in bytes of its (now deleted) tiny text file
small_files = {}

# Any files smaller than this should not contain meaningful text
FILE_SIZE_LOWER_LIMIT_B = 100

for text_name in os.listdir(TEXT_FILES_DIR):
    text_path = os.path.join(TEXT_FILES_DIR, text_name)
    size_in_bytes = os.path.getsize(text_path)

    # Big enough to contain real text: keep it
    if size_in_bytes >= FILE_SIZE_LOWER_LIMIT_B:
        continue

    # Too small: delete the file and remember the document number,
    # i.e. the part of the file name before the first dot
    os.remove(text_path)
    small_files[int(text_name.split('.')[0])] = size_in_bytes

## Convert the scanned PDFs to images

In [16]:
%%time
SCANNED_IMAGES_DIR = os.path.join(ALL_DATA_DIR, 'scanned_images')

if os.path.exists(SCANNED_IMAGES_DIR):
    shutil.rmtree(SCANNED_IMAGES_DIR)
os.makedirs(SCANNED_IMAGES_DIR)

for doc_number in small_files.keys():
    pdf_filepath = os.path.abspath(os.path.join(RAW_PDF_DIR, f"{doc_number}.pdf"))

    destination_path = os.path.abspath(os.path.join(SCANNED_IMAGES_DIR, str(doc_number)))
    os.makedirs(destination_path)
    
    destination_filepath = os.path.join(destination_path, "img")
    
    !pdftoppm $pdf_filepath $destination_filepath -png
    
    print(f"Converted {doc_number}. Saved to {destination_filepath}")

Converted 421. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/421/img
Converted 724. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/724/img
Converted 374. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/374/img
Converted 556. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/556/img
Converted 419. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/419/img
Converted 172. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/172/img
Converted 741. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/741/img
Converted 651. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/651/img
Converted 146. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/146/img
Converted 593. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/593/img
Converted 138. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/138/img

Converted 345. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/345/img
Converted 117. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/117/img
Converted 529. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/529/img
Converted 44. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/44/img
Converted 329. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/329/img
Converted 678. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/678/img
Converted 627. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/627/img
Converted 333. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/333/img
Converted 456. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/456/img
Converted 319. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/319/img
Converted 566. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/566/img
C

Converted 580. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/580/img
Converted 597. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/597/img
Converted 290. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/290/img
Converted 304. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/304/img
Converted 309. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/309/img
Converted 669. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/669/img
Converted 720. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/720/img
Converted 150. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/150/img
Converted 301. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/301/img
Converted 182. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/182/img
Converted 458. Saved to /home/yasen/misc/r2p-speeches/data/combined/scanned_images/458/img

## Perform OCR on images, save output

Some documents are not in English, so we need to tell Tesseract what language they are in, otherwise they will not be converted properly. Below is a manually compiled list of ID -> language pairs for non-English documents

**NOTE:** If the previous steps in the pipeline change the ordering of documents, you will need to update this list

In [36]:
# Manually compiled document id -> language table; only the rows with an
# empty "secondary" column are kept.
# NOTE(review): presumably "secondary" marks documents with a second
# language, which then fall back to the default OCR language - confirm.
ocr_langs = (
    pd.read_csv(os.path.join(DATA_DIR, "scanned_docs_langs.csv"), index_col="id")
    .loc[lambda langs: langs["secondary"].isna()]
)

In [38]:
# Mark scanned files in metadata: every document starts out as "not scanned";
# the OCR loop below sets the flag to True for each document it processes
df_speeches["scanned"] = False

In [39]:
%%time
for doc_number in sorted(small_files.keys()):
    destination_path = os.path.abspath(os.path.join(SCANNED_IMAGES_DIR, str(doc_number)))
    
    df_speeches.loc[doc_number, "scanned"] = True
    
    document_text = "" 
    
    # What language is this document in? Matters for Tesseract. Use English by default
    lang = "eng"
    if doc_number in ocr_langs.index:
        lang = ocr_langs.loc[doc_number, "lang"]

    print(f"Processing document {doc_number}. Language: {lang.upper()}")
    
    # OCR
    for file in sorted(os.listdir(destination_path)):
        image_filepath = os.path.join(destination_path, file)
        print(f"\t{image_filepath}")
        image_text = pytesseract.image_to_string(Image.open(image_filepath), lang=lang)    
        document_text += " " + image_text
    
    
    # Save result
    text_filepath = os.path.abspath(os.path.join(TEXT_FILES_DIR, f"{doc_number}.txt"))
    print(f"\t{len(document_text)} characters to {text_filepath}")
    
    if os.path.exists(text_filepath):
        os.remove(text_filepath)
    
    with open(text_filepath, "w") as fp:
        fp.write(document_text)
    

Processing document 39. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/39/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/39/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/39/img-3.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/39/img-4.png
	4196 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/39.txt
Processing document 44. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/44/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/44/img-2.png
	7455 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/44.txt
Processing document 48. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/48/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/48/img-2.png
	4078 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/48.txt
Processing document 49. Language: ENG
	/home/yasen/misc/

	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/117/img-2.png
	3443 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/117.txt
Processing document 135. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/135/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/135/img-2.png
	3314 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/135.txt
Processing document 137. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/137/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/137/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/137/img-3.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/137/img-4.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/137/img-5.png
	4653 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/137.txt
Processing document 138. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combin

	2068 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/198.txt
Processing document 200. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/200/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/200/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/200/img-3.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/200/img-4.png
	14309 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/200.txt
Processing document 201. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/201/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/201/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/201/img-3.png
	8910 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/201.txt
Processing document 204. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/204/img-1.png
	/home/yasen/misc/r2p-speeches/data/combi

	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/308/img-2.png
	3255 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/308.txt
Processing document 309. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/309/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/309/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/309/img-3.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/309/img-4.png
	3781 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/309.txt
Processing document 311. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/311/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/311/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/311/img-3.png
	3534 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/311.txt
Processing document 312. Language: FRA
	/home/yasen/misc/r2p-speeches/data/combin

	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/345/img-8.png
	4559 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/345.txt
Processing document 347. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/347/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/347/img-2.png
	3760 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/347.txt
Processing document 354. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/354/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/354/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/354/img-3.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/354/img-4.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/354/img-5.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/354/img-6.png
	4900 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/354.txt
Process

	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/408/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/408/img-3.png
	4820 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/408.txt
Processing document 410. Language: SPA
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/410/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/410/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/410/img-3.png
	5965 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/410.txt
Processing document 414. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/414/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/414/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/414/img-3.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/414/img-4.png
	3951 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/414.txt
Process

	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/558/img-3.png
	2581 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/558.txt
Processing document 563. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/563/img-1.png
	2117 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/563.txt
Processing document 566. Language: FRA
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/566/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/566/img-2.png
	4699 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/566.txt
Processing document 567. Language: SPA
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/567/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/567/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/567/img-3.png
	8434 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/567.txt
Processing document 571. Language

	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/641/img-4.png
	4268 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/641.txt
Processing document 643. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/643/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/643/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/643/img-3.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/643/img-4.png
	4504 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/643.txt
Processing document 645. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/645/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/645/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/645/img-3.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/645/img-4.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/645/img-5.png
	/home/yasen/mis

	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/719/img-2.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/719/img-3.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/719/img-4.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/719/img-5.png
	9108 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/719.txt
Processing document 720. Language: SPA
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/720/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/720/img-2.png
	3289 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/720.txt
Processing document 723. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/723/img-1.png
	/home/yasen/misc/r2p-speeches/data/combined/scanned_images/723/img-2.png
	4389 characters to /home/yasen/misc/r2p-speeches/data/combined/text_files/723.txt
Processing document 724. Language: ENG
	/home/yasen/misc/r2p-speeches/data/combin

### Clean up images folder

In [40]:
# The page images were only needed as OCR input: delete the whole folder
shutil.rmtree(SCANNED_IMAGES_DIR)

### Check for misses

In [41]:
# A document is identified by the part of its file name before the first dot
raw_documents = {pdf_name.partition('.')[0] for pdf_name in os.listdir(RAW_PDF_DIR)}
text_documents = {text_name.partition('.')[0] for text_name in os.listdir(TEXT_FILES_DIR)}

# Raw PDFs for which no text file was produced
print(f"No text file for these documents: {raw_documents - text_documents}")

No text file for these documents: {'148', '170'}


## Write file with descriptions

In [43]:
# Write the metadata again (document id as the first column), overwriting the
# earlier version of the file: it now includes the "scanned" column
df_speeches.to_csv(os.path.join(ALL_DATA_DIR, "document_data.csv"), index=True)