In [None]:
import pdfplumber
import os
import re
import numpy as np
import pandas as pd

In [None]:
def extract_two_column_text(pdf_path, start_pg_num=0, left_col_end=0.45, right_col_start=0.55):
    """
    Extract text from a two-column PDF, ignoring the strip between the columns.

    Each page is cropped twice: once to the region left of ``left_col_end``
    and once to the region right of ``right_col_start`` (both expressed as
    fractions of the page width). Whatever lies between the two is skipped.

    Args:
        pdf_path: Path to PDF file
        start_pg_num: Page from which to start extracting (0-indexed)
        left_col_end: Fraction of the page width where the left column ends
            (default 0.45, i.e. the left 45% of the page)
        right_col_start: Fraction of the page width where the right column
            starts (default 0.55, i.e. the right 45% of the page)

    Returns:
        Extracted text as a single string. Within a page the left column
        comes before the right column, separated by a blank line; consecutive
        pages are separated by a blank line as well.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[start_pg_num:]:
            page_width = page.width
            page_height = page.height

            # Bounding boxes are (x0, top, x1, bottom).
            left_bbox = (0, 0, page_width * left_col_end, page_height)
            right_bbox = (page_width * right_col_start, 0, page_width, page_height)

            # Guard against a None result (some pdfplumber versions return
            # None for a region without text) so that the literal string
            # "None" never ends up in the output.
            left_text = page.within_bbox(left_bbox).extract_text() or ""
            right_text = page.within_bbox(right_bbox).extract_text() or ""

            page_texts.append(f"{left_text}\n\n{right_text}")

            # Lightweight progress report every 20 pages.
            if page.page_number % 20 == 0:
                print(f"page {page.page_number} done")

    # Join pages with a blank line: without a separator the last word of one
    # page would be fused with the first word of the next page.
    return "\n\n".join(page_texts)

In [None]:
START_PG_NUM = 7  # 0-indexed page at which extraction starts

# All paths are resolved relative to the directory the kernel was started in.
cwd = os.getcwd()
data_dirname = "data"
data_dir = os.path.join(cwd, data_dirname)
# create data directory if it doesn't exist
os.makedirs(data_dir, exist_ok=True)
input_filename = "Warao_Bible_cropped.pdf"
input_path = os.path.join(data_dir, input_filename)
text = extract_two_column_text(input_path, start_pg_num=START_PG_NUM)

In [None]:
# normalize text and split into sentences
text_norm = text.lower().strip()
# split sentences by any run of end punctuation (?, ., or !)
text_array = re.split(r'[?!.]+', text_norm)

threshold = 1  # keep only sentences with more than this many words
filtered = []
for sentence in text_array:
    # Re-join words that were hyphenated across a line break; this has to
    # happen before the hyphen itself is stripped by the next substitution.
    sentence = re.sub(r'-[ \t]*\n\s*', '', sentence)
    # Remove everything except ASCII letters and whitespace.
    # NOTE(review): accented or other non-ASCII letters are dropped here as
    # well -- confirm that this is intended for the source text.
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
    # Collapse every whitespace run (including line breaks) to one space.
    # Deleting the newlines outright would glue the last word of a line to
    # the first word of the following line.
    sentence = ' '.join(sentence.split())
    if len(sentence.split()) > threshold:
        filtered.append(sentence)

In [None]:
# Build a one-column frame from the cleaned sentences and drop exact repeats.
df = pd.DataFrame(filtered, columns=["warao_sentence"]).drop_duplicates()

n_total = len(filtered)
n_unique = len(df)
print(f"number of duplicate sentences: {n_total - n_unique}")
print(f"number of sentences: {n_unique}")

In [None]:
# Write the de-duplicated sentences to the data directory (no index column),
# then preview the first rows.
output_path = os.path.join(data_dir, "warao_bible_sentences.csv")
df.to_csv(output_path, index=False)
df.head(10)