### Setting up the processing (including Tesseract)

In [2]:
import processing
import pandas as pd
import numpy as np

In [3]:
# Configuration for the PDF processing step. All paths are raw strings so the
# backslashes are taken literally; the two relative paths are resolved against
# the notebook's current working directory.
pdf_path = r'TOC_testing\NAFAM 3\NAFAM.3.2014_v1_cropped.pdf'  # source PDF to process
output_folder = r'TOC_testing\NAFAM 3\processed_v1'  # same folder the parsing cell later reads shortened_str.txt from
# Path to the Tesseract executable, passed through to processing.pdf_to_data.
# NOTE(review): the doubled forward slashes are literal inside a raw string;
# confirm the target OS resolves this path as intended.
tesseract_path = r'C://Program Files//Tesseract-OCR//tesseract.exe'

# Run the project-local processing step on the PDF.
# NOTE(review): presumably this OCRs the PDF and writes its results (including
# shortened_str.txt) into output_folder, with include_pngs=False skipping
# PNG output -- confirm against processing.pdf_to_data.
processing.pdf_to_data(pdf_path, output_folder, tesseract_path, include_pngs = False)

Done!


In [None]:
import csv
import spacy
import re

# Parse the table-of-contents text into (work, author, page number) rows
# and write them to a CSV file.
#
# Heuristic used below:
#   * a line in which spaCy finds a PERSON entity is treated as an author line;
#     the first such entity becomes the current author and the line itself
#     produces no row;
#   * a line ending in whitespace + digits is a "Title PageNumber" entry;
#   * any other non-empty line seen after an author is kept as a title with
#     page number "N/A"; such lines seen before any author are dropped.

# Load the spaCy model for Named Entity Recognition
nlp = spacy.load('en_core_web_sm')

# Paths to input and output files
input_folder = r'TOC_testing\NAFAM 3\processed_v1'
input_path = f'{input_folder}/shortened_str.txt'  # Update with your actual file path
output_path = f'{input_folder}/output_data.csv'

# Matches "Title    PageNumber": lazily captured title text, at least one
# whitespace character, then the trailing run of digits at the end of the line.
title_page_pattern = re.compile(r'^(.*?)\s+(\d+)$')

# parsed_data collects [work, author, page number] rows; last_author carries
# the most recently seen author forward to the title lines that follow it.
parsed_data = []
last_author = None

# Read and parse the file
# NOTE(review): no explicit encoding is passed, so the platform default is
# used; confirm it matches how shortened_str.txt was written.
with open(input_path, 'r') as file:
    for line in file:
        # Remove slashes first and only then trim whitespace. Stripping before
        # the removal left stray spaces behind (e.g. "Title 12 /" -> "Title 12 "),
        # which broke the end-anchored page-number pattern and let lines made
        # only of slashes and spaces through as bogus entries.
        line = line.replace('/', '').strip()
        if not line:
            continue  # Skip empty lines

        # Use spaCy to process the line and extract named entities
        doc = nlp(line)

        # The first PERSON entity (if any) marks this as an author line
        person = next((ent for ent in doc.ents if ent.label_ == "PERSON"), None)
        if person is not None:
            last_author = person.text.strip()  # Update the last_author
            continue  # Author lines do not produce a row themselves

        # Check if the line has a page number (matches "Title PageNumber")
        title_page_match = title_page_pattern.match(line)
        if title_page_match:
            work = title_page_match.group(1).strip()
            page_number = title_page_match.group(2)

            # Associate the title with the last author
            parsed_data.append([work, last_author or "Unknown Author", page_number])
            continue

        # Title without a page number: keep it only once an author is known
        if last_author:
            parsed_data.append([line, last_author, "N/A"])

# Write parsed data to a CSV file (newline='' lets the csv module control line endings)
with open(output_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Work", "Author", "Page Number"])  # Write headers
    writer.writerows(parsed_data)  # Write data rows

print(f'Data successfully written to {output_path}')

Data successfully written to TOC_testing\NAFAM 3\processed_v1/output_data.csv
