In [1]:
!python -m pip install pytesseract
!python -m pip install opencv-python



In [2]:
    pip install pymupdf pillow

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pytesseract

# Tell pytesseract where the Tesseract executable lives.
# Replace this path with the path to your own Tesseract installation.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [2]:
import fitz
from PIL import Image
import pandas as pd
from pandas import DataFrame as df
from io import StringIO
from pathlib import Path
import csv

In [26]:
# Path to the Tesseract executable, handed to processing.pdf_to_data in a later cell.
# Same location as the tesseract_cmd set on pytesseract above; replace it with your own install path.
tesseract_path = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

In [27]:
import processing

In [28]:
# Arguments for the processing.pdf_to_data call in the next cell:
# the PDF to process and the folder that should receive its output.
pdf_path = r'TOCs\NAAL TOCs\2007\2007_Baym_The_Norton_anthology_of_American.pdf'
output_folder = r'TOCs\NAAL TOCs\2007\processed'
# NOTE(review): presumably toggles saving the processed page images, as in the
# pdf_to_data defined later in this notebook — confirm against processing.py.
include_pngs = False

In [29]:
# Run the OCR step through the local `processing` module (its source is not part of this notebook).
# NOTE(review): this call passes tesseract_path, which the pdf_to_data defined later
# in this notebook does not accept — confirm which version is the current one.
processing.pdf_to_data(pdf_path, output_folder, tesseract_path, include_pngs)

Done!


In [31]:
def pdf_to_data(pdf_path, output_folder, include_pngs= False, scale_factor=3):
    '''Run Tesseract OCR over every page of a PDF and save the combined results.

    Each page is rendered to an image, upscaled by ``scale_factor`` and passed to
    Tesseract twice (both times with ``--psm 6``): once for word-level data and
    once for plain text.

    Files written into ``output_folder`` (the folder is created if it is missing):
        location.csv       -- the line_num, word_num, left, top, text and conf columns of
                              Tesseract's data output, all pages concatenated in page order
        str_data.txt       -- the plain OCR text of all pages, concatenated in page order
        pngs/page_<n>.png  -- the upscaled page images (numbered from 1); only written
                              when include_pngs is True

    Args:
        pdf_path: path of the PDF to process.
        output_folder: folder that receives the output files.
        include_pngs: also save the processed page images, to inspect what was OCR'd.
        scale_factor: multiplier applied to the rendered page size before OCR;
            try changing this to improve recognition.

    Returns:
        None. Results are only written to disk.
    '''
    output_dir = Path(output_folder)
    output_dir.mkdir(parents=True, exist_ok=True)
    png_dir = output_dir / 'pngs'
    if include_pngs:
        png_dir.mkdir(exist_ok=True)

    ocr_config = r'--psm 6'
    # Tesseract language codes are three letters ('eng'); 'en' is not a valid code
    # and makes Tesseract fail to load the language.
    ocr_lang = 'eng'
    columns = ['line_num', 'word_num', 'left', 'top', 'text', 'conf']

    # Collect per-page results and combine them once at the end instead of
    # re-copying a growing DataFrame / string on every page.
    page_frames = []
    page_texts = []

    # The context manager closes the document even if OCR fails part-way through.
    with fitz.open(pdf_path) as open_pdf:
        for page_num in range(open_pdf.page_count): # iterate through individual pages
            img = open_pdf[page_num].get_pixmap()

            # make the image: convert the pixmap to a PIL Image, then upscale it
            pil_img = Image.frombytes("RGB", [img.width, img.height], img.samples)
            new_size = (int(img.width * scale_factor), int(img.height * scale_factor))
            resize = pil_img.resize(new_size, Image.LANCZOS)

            # word-level OCR data comes back as TSV text; QUOTE_NONE makes pandas
            # treat quote characters in the recognised text literally
            png_to_data = pytesseract.image_to_data(resize, lang=ocr_lang, config=ocr_config)
            data_read = pd.read_csv(StringIO(png_to_data), sep='\t', quoting=csv.QUOTE_NONE)
            page_frames.append(data_read[columns])

            # plain-text OCR of the same image
            page_texts.append(pytesseract.image_to_string(resize, lang=ocr_lang, config=ocr_config))

            if include_pngs:
                resize.save(png_dir / f"page_{page_num + 1}.png")

    # pd.concat raises on an empty list, so a PDF without pages yields a header-only CSV
    if page_frames:
        tsv_total = pd.concat(page_frames)
    else:
        tsv_total = pd.DataFrame(columns=columns)
    tsv_total.to_csv(output_dir / 'location.csv', sep=',', index=False)

    # NOTE(review): written with the platform default encoding; the parsing cell in
    # this notebook reads str_data.txt the same way, so change both together if
    # you switch to an explicit encoding.
    with open(output_dir / 'str_data.txt', 'w') as file:
        file.write(''.join(page_texts))

In [4]:
import csv
import re

# Where the OCR text is read from and where the parsed table is written
input_path = r'TOCs/Black_Lit_in_America/processed/str_data.txt' # Update with your actual file path
output_path = 'output_data.csv'

# Two kinds of line are recognised:
#   "<title> <page number>"  -> any text, whitespace, then trailing digits
#   "<author name>"          -> one to five purely alphabetic words of 2+ letters
title_page_pattern = re.compile(r'^(.*?)\s+(\d+)$')
author_pattern = re.compile(r'^[A-Za-z]{2,}(?:\s[A-Za-z]{2,}){0,4}$')

# Rows of [work, author, page number]; titles that appear before any author
# line are attributed to "Unknown"
parsed_data = []
last_author = "Unknown"

# Walk the OCR text line by line, carrying the most recent author forward
with open(input_path, 'r') as file:
    for line in file:
        line = line.strip()

        author_match = author_pattern.match(line)
        if author_match is not None:
            # Author heading: remember it for the titles that follow
            last_author = author_match.group(0).strip()
        else:
            title_page_match = title_page_pattern.match(line)
            if title_page_match is not None:
                work = title_page_match.group(1).strip()
                page_number = title_page_match.group(2)
                parsed_data.append([work, last_author, page_number])
            # lines matching neither pattern are skipped

# Header row followed by the parsed rows, written in a single call
with open(output_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([["Work", "Author", "Page Number"], *parsed_data])

print(f'Data successfully written to {output_path}')

Data successfully written to output_data.csv


In [5]:
# Load the CSV written by the parsing cell and display it to check the result by eye.
data = pd.read_csv('output_data.csv')
data

Unnamed: 0,Work,Author,Page Number
0,{ Black American Literature: An Overview,Preface nw,1
1,1 Black Folklore,Preface nw,19
2,References,Preface nw,21
3,“Sheer Crops”,Preface nw,22
4,“Ole Ss Goose”,Preface nw,24
...,...,...,...
132,"“Assasxination” fram Don't Cry, Scream",Julian Mayfield,422
133,"Criminal Insane"" from Poems from Prison",Etheridge Knight,422
134,“The Violent Space” from Poems from Privon,Etheridge Knight,423
135,"""Tt Was a Funky Deal” from Poems from Prison",Etheridge Knight,425
