# PDF to Text Conversion

This notebook extracts the text of `Los-miserables.pdf` page by page and saves it as a CSV file with one row per page (columns `page_number` and `text`).

In [None]:
# Install required packages if not already installed
!pip install PyPDF2 pandas

: 

In [1]:
import os
import PyPDF2
import pandas as pd
import re

In [2]:
# Input PDF and the CSV file that will receive the extracted text
pdf_path = '../dat/Los-miserables.pdf'
output_csv_path = '../dat/Los-miserables.csv'

# Confirm the input PDF is present; stop right away when it is missing
if os.path.exists(pdf_path):
    print(f"Found PDF file at {pdf_path}")
else:
    raise FileNotFoundError(f"The PDF file was not found at {pdf_path}")

Found PDF file at ../dat/Los-miserables.pdf


In [3]:
def extract_text_from_pdf(pdf_path):
    """
    Read a PDF file and collect the text of each page.

    Parameters
    ----------
    pdf_path : str
        Location of the PDF file to read.

    Returns
    -------
    list of dict
        One ``{'page_number': int, 'text': str}`` entry per page whose
        extracted text is non-empty. Page numbers are 1-based; pages for
        which nothing is extracted are left out of the list.

    Notes
    -----
    Prints the total page count, then a progress line for the first page,
    every tenth page, and the last page.
    """
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        pages = reader.pages
        total = len(pages)
        print(f"Total pages in PDF: {total}")

        records = []
        for number in range(1, total + 1):
            content = pages[number - 1].extract_text()

            # Keep only pages that actually produced some text
            if content:
                records.append({'page_number': number, 'text': content})

            # Progress report: first page, every tenth page, last page
            is_milestone = number == 1 or number % 10 == 0 or number == total
            if is_milestone:
                print(f"Processed page {number} of {total}")

    return records

In [4]:
# Extract text from the PDF at pdf_path, page by page.
# The result is a list of {'page_number', 'text'} dicts, one for each page
# that yielded text, so its length is the number of pages with text.
extracted_text = extract_text_from_pdf(pdf_path)
print(f"Extracted text from {len(extracted_text)} pages")

Total pages in PDF: 305
Processed page 1 of 305
Processed page 10 of 305
Processed page 20 of 305
Processed page 30 of 305
Processed page 40 of 305
Processed page 50 of 305
Processed page 60 of 305
Processed page 70 of 305
Processed page 80 of 305
Processed page 90 of 305
Processed page 100 of 305
Processed page 110 of 305
Processed page 120 of 305
Processed page 130 of 305
Processed page 140 of 305
Processed page 150 of 305
Processed page 160 of 305
Processed page 170 of 305
Processed page 180 of 305
Processed page 190 of 305
Processed page 200 of 305
Processed page 210 of 305
Processed page 220 of 305
Processed page 230 of 305
Processed page 240 of 305
Processed page 250 of 305
Processed page 260 of 305
Processed page 270 of 305
Processed page 280 of 305
Processed page 290 of 305
Processed page 300 of 305
Processed page 305 of 305
Extracted text from 305 pages


In [5]:
# Create a DataFrame from the extracted text: each dict in extracted_text
# becomes one row, with its keys ('page_number', 'text') as the columns
df = pd.DataFrame(extracted_text)

# Display the first few rows (display() is supplied by the IPython/Jupyter session)
print("\nPreview of extracted text:")
display(df.head())


Preview of extracted text:


Unnamed: 0,page_number,text
0,1,\n
1,2,"2 \n\tLos miserables I Hugo, Victor Novela \t..."
2,3,3 \nPRIMERA PARTE Fantine
3,4,4 \nLIBRO PRIMERO Un justo
4,5,"5 \nI El señor Myriel \nEn 1815, monseñor Cha..."


In [6]:
# Clean text (optional): collapse every run of whitespace (spaces, tabs,
# newlines) into a single space and trim both ends of each page's text.
# The vectorized .str accessor replaces the per-row apply/lambda and passes
# missing values through unchanged instead of raising a TypeError.
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True).str.strip()

# Save to CSV; index=False leaves the DataFrame index out of the file
df.to_csv(output_csv_path, index=False)
print(f"\nSaved text to CSV: {output_csv_path}")


Saved text to CSV: ../dat/Los-miserables.csv


In [7]:
# Verify the CSV file was created and show its size
if os.path.exists(output_csv_path):
    file_size = os.path.getsize(output_csv_path) / 1024  # Size in KB
    print(f"CSV file created successfully: {output_csv_path}")
    print(f"File size: {file_size:.2f} KB")

    # Read back the CSV and display a sample.
    # keep_default_na=False stops pandas from turning empty text fields
    # (pages whose text was only whitespace before cleaning) or literal
    # strings such as "NA"/"null" into NaN, so the text column round-trips
    # as strings.
    csv_df = pd.read_csv(output_csv_path, keep_default_na=False)
    print(f"\nCSV contains {len(csv_df)} rows and {len(csv_df.columns)} columns")
    display(csv_df.head())
else:
    print(f"Failed to create CSV file at {output_csv_path}")

CSV file created successfully: ../dat/Los-miserables.csv
File size: 646.12 KB

CSV contains 305 rows and 2 columns


Unnamed: 0,page_number,text
0,1,
1,2,"2 Los miserables I Hugo, Victor Novela Se reco..."
2,3,3 PRIMERA PARTE Fantine
3,4,4 LIBRO PRIMERO Un justo
4,5,"5 I El señor Myriel En 1815, monseñor Charles-..."
