Convert PDF files to TXT files

In [None]:
import fitz  # PyMuPDF
import os
import glob

In [None]:
def pdf_to_text(pdf_path, txt_path):
    """Extract the plain text of a PDF and save it as a UTF-8 text file.

    Pages are written in document order, one after another, with no
    separator added between them.

    Args:
        pdf_path: Path of the PDF file to read.
        txt_path: Path of the text file to write. An existing file is
            overwritten; missing parent directories are created.
    """
    # A bare filename has no directory component, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(txt_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the document even when extraction or
    # writing fails part-way through.
    with fitz.open(pdf_path) as document:
        with open(txt_path, 'w', encoding='utf-8') as text_file:
            for page in document:
                text_file.write(page.get_text("text"))

    print(f"Converted: {pdf_path} -> {txt_path}")

def process_folder(base_folder, output_folder):
    """Convert every PDF under ``base_folder`` into a text file in ``output_folder``.

    The folder tree is searched recursively, but the output is flat: each
    text file is named after its PDF. When several PDFs in different
    subfolders share a name, the later ones get a numeric suffix
    (``name_1.txt``, ``name_2.txt``, ...) instead of overwriting the first.

    Args:
        base_folder: Root folder that is searched recursively for ``*.pdf``.
        output_folder: Folder that receives the generated ``.txt`` files.
    """
    pattern = os.path.join(base_folder, '**/*.pdf')

    # Names already handed out in this run, lower-cased so that the check
    # also holds on case-insensitive file systems.
    used_names = set()

    # Sorted so that suffixes are assigned the same way on every run.
    for pdf_path in sorted(glob.glob(pattern, recursive=True)):
        stem = os.path.splitext(os.path.basename(pdf_path))[0]

        txt_filename = stem + '.txt'
        suffix = 1
        while txt_filename.lower() in used_names:
            txt_filename = f"{stem}_{suffix}.txt"
            suffix += 1
        used_names.add(txt_filename.lower())

        txt_path = os.path.join(output_folder, txt_filename)

        # Convert the PDF to text
        pdf_to_text(pdf_path, txt_path)

# Run the conversion: every PDF below the corpus folder becomes a .txt file
# in the output folder.
base_folder = r"D:\Corpora\nyed\nyed\docs"
output_folder = r"D:\processed txt files"
process_folder(base_folder=base_folder, output_folder=output_folder)

Rearrange the text files into folders by filing year

In [None]:
import re
import shutil

In [None]:
# Matches a "Filed MM/DD/YY" or "Filed MM/DD/YYYY" stamp and captures the year.
# The four-digit alternative is tried first and the trailing (?!\d) stops the
# two-digit alternative from matching the first half of a longer number.
_FILED_DATE_RE = re.compile(r'Filed \d{1,2}/\d{1,2}/(\d{4}|\d{2})(?!\d)')


def extract_year_from_text(text_content):
    """Return the filing year found in ``text_content`` as a four-digit string.

    The first "Filed MM/DD/YY" or "Filed MM/DD/YYYY" stamp in the text is
    used. A four-digit year is returned unchanged; a two-digit year is
    assumed to belong to the 2000s and is prefixed with "20".

    Args:
        text_content: Text to search for a filing stamp.

    Returns:
        The year as a string such as "2013", or "unknown" when the text
        contains no filing stamp.
    """
    match = _FILED_DATE_RE.search(text_content)
    if match is None:
        return "unknown"

    year = match.group(1)
    return year if len(year) == 4 else "20" + year

def reorganize_text_files(file_path, base_output_folder):
    """Move one text file into a subfolder named after its filing year.

    The year is obtained by passing the file's content to
    ``extract_year_from_text``; the file is then moved to
    ``<base_output_folder>/<year>/``, creating that folder if needed.
    Any exception is reported on stdout and not re-raised.

    Args:
        file_path: Path of the UTF-8 text file to move.
        base_output_folder: Folder under which the per-year folders live.
    """
    print(f"Processing {file_path}...")
    try:
        # Read inside the with-block so the handle is closed before the move.
        with open(file_path, 'r', encoding='utf-8') as handle:
            year = extract_year_from_text(handle.read())

        target_dir = os.path.join(base_output_folder, year)
        os.makedirs(target_dir, exist_ok=True)

        target = os.path.join(target_dir, os.path.basename(file_path))
        shutil.move(file_path, target)

        print(f"File moved: {file_path} -> {target}")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

def process_folder(base_folder, base_output_folder):
    """Sort every text file under ``base_folder`` into per-year folders.

    ``base_folder`` is searched recursively for ``*.txt`` and each match is
    passed to ``reorganize_text_files``. A message is printed when nothing
    is found.

    Args:
        base_folder: Root folder that is searched for text files.
        base_output_folder: Folder under which the per-year folders are created.
    """
    pattern = os.path.join(base_folder, '**/*.txt')
    found = glob.glob(pattern, recursive=True)

    if found:
        for path in found:
            reorganize_text_files(path, base_output_folder)
    else:
        print("No text files found in the directory.")

# Run the reorganisation: move the converted text files into one folder per
# filing year.
base_folder = r"D:\processed txt files"
base_output_folder = r"D:\rearrange_files"
process_folder(base_folder=base_folder, base_output_folder=base_output_folder)

In [None]:
Descriptive statistics of the corpus

In [None]:
def count_files_and_words(base_folder):
    """Count ``.txt`` files and whitespace-separated words per folder.

    ``base_folder`` is walked recursively. Results are keyed by the folder's
    own name (not its full path). Folders in different branches that share a
    name are added together under that name, so the printed total always
    covers every file that was read.

    Args:
        base_folder: Root folder to walk.

    Returns:
        A dict mapping folder name to ``{'files': int, 'words': int}``.
        Folders that contain no ``.txt`` files are omitted.
    """
    folder_counts = {}

    for root, _dirs, files in os.walk(base_folder):
        txt_files = [name for name in files if name.endswith('.txt')]
        if not txt_files:
            continue

        # Count line by line so a large file is never held in memory whole;
        # words cannot span lines, so the total matches a whole-file split.
        word_count = 0
        for name in txt_files:
            with open(os.path.join(root, name), 'r', encoding='utf-8') as handle:
                word_count += sum(len(line.split()) for line in handle)

        # normpath drops a trailing separator, which would otherwise make
        # basename() return an empty string for the base folder itself.
        folder_name = os.path.basename(os.path.normpath(root))

        # Accumulate rather than assign: same-named folders must not
        # overwrite each other's counts.
        counts = folder_counts.setdefault(folder_name, {'files': 0, 'words': 0})
        counts['files'] += len(txt_files)
        counts['words'] += word_count

    # Print the per-folder counts followed by the grand total
    total_files = 0
    total_words = 0
    for folder, counts in folder_counts.items():
        print(f"{folder}: {counts['files']} files, {counts['words']} words")
        total_files += counts['files']
        total_words += counts['words']

    print(f"Total: {total_files} files, {total_words} words")
    return folder_counts

# Report per-folder and overall file/word counts for the corpus.
# Point base_folder at the root of your own corpus before running.
base_folder = r"D:\pro_re_rank"
count_files_and_words(base_folder=base_folder)