In [2]:
!python -m pip install pytesseract
!python -m pip install opencv-python



In [3]:
pip install pymupdf pillow

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os

import pytesseract

# Location of the Tesseract executable (default Windows install path).
# Replace this path with the path to your tesseract installation.
TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Only override pytesseract's default when the executable really exists at that
# path; otherwise keep the default so that a `tesseract` found on PATH is used.
if os.path.isfile(TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

In [5]:
import fitz
from PIL import Image
import pandas as pd
from pandas import DataFrame as df
from io import StringIO
from pathlib import Path
import csv

In [6]:
def pdf_to_data(pdf_path, output_folder, include_pngs=False, scale_factor=3):
    '''OCR every page of a PDF and save the word locations and the plain text.

    Each page is rendered to an RGB image at `scale_factor` times its native
    size and passed to Tesseract (page segmentation mode 6) twice: once for
    per-word data and once for plain text.

    Files written (the folders are created if they do not exist yet):
        output_folder/location.csv
            The 'line_num', 'word_num', 'left', 'top', 'text' and 'conf'
            columns reported by Tesseract for every page, concatenated in page
            order. Positions refer to the rendered (scaled) page image.
        output_folder/str_data.txt
            The OCR text of every page, concatenated in page order (UTF-8).
        output_folder/pngs/page_<n>.png
            The image handed to Tesseract for page n (1-based). Only written
            when include_pngs is True.

    Parameters:
        pdf_path: path of the PDF to process.
        output_folder: folder that receives the output files.
        include_pngs: set to True to also save the processed page images.
        scale_factor: zoom applied when rendering each page; try changing this
            to improve OCR quality.
    '''
    output_dir = Path(output_folder)
    png_dir = output_dir / 'pngs'
    output_dir.mkdir(parents=True, exist_ok=True)
    if include_pngs:
        png_dir.mkdir(parents=True, exist_ok=True)

    ocr_config = r'--psm 6'
    columns = ['line_num', 'word_num', 'left', 'top', 'text', 'conf']
    # Render at the target resolution instead of upscaling a native-size raster:
    # resizing afterwards cannot add detail that the first render discarded.
    zoom = fitz.Matrix(scale_factor, scale_factor)

    page_frames = []  # one DataFrame per page, concatenated once after the loop
    page_texts = []   # one OCR string per page, joined once after the loop

    # The context manager closes the document even if OCR fails part-way.
    with fitz.open(pdf_path) as open_pdf:
        for page_num in range(open_pdf.page_count):  # iterate through individual pages
            page = open_pdf[page_num]

            # alpha=False keeps three bytes per pixel, matching the "RGB" mode below
            pix = page.get_pixmap(matrix=zoom, alpha=False)
            page_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # process image into tsv and keep only the columns of interest;
            # QUOTE_NONE stops quote characters in the OCR text from being
            # interpreted as field delimiters
            ocr_tsv = pytesseract.image_to_data(page_img, config=ocr_config)
            page_data = pd.read_csv(StringIO(ocr_tsv), sep='\t', quoting=csv.QUOTE_NONE)
            page_frames.append(page_data[columns])

            page_texts.append(pytesseract.image_to_string(page_img, config=ocr_config))

            if include_pngs:
                page_img.save(png_dir / f"page_{page_num + 1}.png")

    # A PDF without pages still yields a CSV with the expected header row.
    if page_frames:
        tsv_total = pd.concat(page_frames)
    else:
        tsv_total = pd.DataFrame(columns=columns)
    tsv_total.to_csv(output_dir / 'location.csv', sep=',', index=False)

    # Explicit UTF-8 so OCR output outside the platform's default code page
    # cannot raise UnicodeEncodeError (to_csv above already defaults to UTF-8).
    with open(output_dir / 'str_data.txt', 'w', encoding='utf-8') as file:
        file.write(''.join(page_texts))

In [7]:
# Inputs for this run -- point these at your own PDF and output folder.
pdf_path = r'TOCs\NAAL TOCs\2007\2007_Baym_The_Norton_anthology_of_American.pdf'
output_folder = r'TOCs\NAAL TOCs\2007\processed'
include_pngs = False  # forwarded to pdf_to_data's include_pngs option

# Run the OCR pipeline on the configured PDF.
pdf_to_data(
    pdf_path=pdf_path,
    output_folder=output_folder,
    include_pngs=include_pngs,
)

Contents
PREFACE TO THE SEVENTH EDITION xvii
ACKNOWLEDGMENTS xXY
Beginnings to 1700
Introduction 1
Timeline 15
STORIES OF THE BEGINNING OF THE WORLD 17
The Iroquois Creation Story (version by David Cusick) 17
Pima Stories of the Beginning of the World
(versions by Thin Leather and J. W. Lloyd) 21
The Story of the Creation 22
The Story of the Flood 24
CHRISTOPHER COLUMBUS (1451-1506) 31
From Letter to Luis de Santangel Regarding the First Voyage
(February 15, 1493) 32
From Letter to Ferdinand and Isabella Regarding the Fourth Voyage
(July 7, 1503) 33
BARTOLOME DE LAS CASAS (1474-1566) 35
The Very Brief Relation of the Devastation of the Indies 36
From Hispaniola 36
From The Coast of Pearls, Paria, and the Island of Trinidad 38
ALVAR NUNEZ CABEZA DE VACA (ca. 1490-1558) 40
The Relation of Alvar Naiiez Cabeza de Vaca 41
[Dedication] 41
[The Malhado Way of Life] 42
[Our Life among the Avavares and Arbadaos} 44
[Pushing On] 45
[Customs of That Region} 45
[The First Confrontation] 46
[The Fa