In [6]:
import pdfplumber
import os
import re
import numpy as np
import pandas as pd

In [None]:
def extract_two_column_text(pdf_path, start_pg_num=0, end_pg_num=None, print_every_n_pages=50):
    """
    Extract text from a two-column PDF, ignoring middle column.
    
    Args:
        pdf_path: Path to PDF file
        start_pg_num: Page from which to start extracting (0-indexed)
    
    Returns:
        Extracted text as string
    """
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages[start_pg_num:end_pg_num]
        
        text_output = ""
        for page in pages:
            # Get page dimensions
            page_width = page.width
            page_height = page.height
            
            # Define column boundaries (adjust these based on your PDF)
            left_column = {
                'x0': 0,
                'top': 0,
                'x1': page_width * 0.45,  # Left 45% of page
                'bottom': page_height
            }
            
            right_column = {
                'x0': page_width * 0.55,  # Right 45% of page (skip middle 10%)
                'top': 0,
                'x1': page_width,
                'bottom': page_height
            }
            
            # Extract text from each column
            left_text = page.within_bbox(
                (left_column['x0'], left_column['top'], 
                left_column['x1'], left_column['bottom'])
            ).extract_text()
            
            right_text = page.within_bbox(
                (right_column['x0'], right_column['top'], 
                right_column['x1'], right_column['bottom'])
            ).extract_text()
            
            # Combine columns
            text_output += f"{left_text}\n\n{right_text}"

            if page.page_number % print_every_n_pages == 0:
                print(f"up to page {page.page_number} done")
        
        return text_output

In [None]:
START_PG_NUM = 10
END_PG_NUM = 731
PRINT_EVERY_N_PAGES = 100

cwd = os.path.abspath(os.getcwd())
os.chdir(cwd)  # changes cwd to script's directory
data_dirname = "data"
data_dir = os.path.join(cwd, data_dirname)
# create data directory if it doesn't exist
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
input_filename = "Warao_Bible_cropped.pdf"
input_path = os.path.join(data_dir, input_filename)
text = extract_two_column_text(
                                input_path, 
                                start_pg_num=START_PG_NUM, 
                                end_pg_num=END_PG_NUM, 
                                print_every_n_pages=PRINT_EVERY_N_PAGES
                                )
# print(text)

up to page 20 done
up to page 40 done
up to page 60 done
up to page 80 done
up to page 100 done
up to page 120 done
up to page 140 done
up to page 160 done
up to page 180 done
up to page 200 done
up to page 220 done
up to page 240 done
up to page 260 done
up to page 280 done
up to page 300 done
up to page 320 done
up to page 340 done
up to page 360 done
up to page 380 done
up to page 400 done
up to page 420 done
up to page 440 done
up to page 460 done
up to page 480 done
up to page 500 done
up to page 520 done
up to page 540 done
up to page 560 done
up to page 580 done
up to page 600 done
up to page 620 done
up to page 640 done
up to page 660 done
up to page 680 done
up to page 700 done
up to page 720 done


In [20]:
# split text into sentences
text_array = np.array(re.split(r'[?!.]+', text))  # split sentences by any end punctuation (?, ., or !)

threshold = 1
filtered = []
for sentence in text_array:
    # sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)  # remove special characters
    sentence = re.sub(r'[\n]', '', sentence)  # remove newline characters
    if len(sentence.split()) > threshold:
        sentence = sentence.strip()  # remove leading and trailing whitespace
        sentence = sentence.lstrip('0123456789')
        filtered.append(sentence)
# display(df)

In [21]:
df = pd.DataFrame(filtered, columns=["warao_sentence"])
df.drop_duplicates(inplace=True)

print(f"number of duplicate sentences: {len(filtered) - len(df)}")
print(f"number of sentences: {len(df)}")

display(df[2000:2010])

number of duplicate sentences: 831
number of sentences: 26399


Unnamed: 0,warao_sentence
2021,Isaac Jacob yake rajatane abayaja akajo tabu e...
2022,Ribane:Canaán anobo tomo tatu kawa-namo tira n...
2023,Yake raja tane abakore Esaúnaminae yama
2024,Jacob arimaarani tane aribu eku nona-yaja Pada...
2025,Aribu eku nona koreEsaú naminae yama
2026,Canaánanobo tomo tatu jakutai Isaacaobo jona e...
2027,Naminai tane Ismael tatanaruae yama
2028,Ismael jakutai taiAbraham auka
2029,Esaú ori ata tusiatekoro tane Ismael auka tira...
2030,Mahalatjakutai tai Nebaiot arakoi sanu-kema


In [24]:
df.to_csv(os.path.join(data_dir, "monolingual_warao_sentences_bible.csv"), index=False)
