In [2]:
import os
from PyPDF2 import PdfReader
import re

# Module-level accumulator of every distinct word seen so far;
# extract_text_from_directory() adds to it in place.
unique_words = set()

def extract_text_from_pdf(pdf_file_path):
    """Return the concatenated text of a PDF, stopping at an "N/N" marker.

    Pages are read starting at index 1, so the first page (index 0) is never
    extracted -- presumably a cover page; confirm against the input files.

    For each page, the 1-based page number N is used to build the marker
    string "N/N". If the page's text contains that marker, only the text
    before its first occurrence is kept and no further pages are read.
    (The marker looks like a "page N of N" footer -- TODO confirm.)

    Args:
        pdf_file_path: Path handed straight to ``PdfReader``.

    Returns:
        The extracted text as a single string; empty when the document has
        fewer than two pages.
    """
    pdf = PdfReader(pdf_file_path)
    collected = []
    for index in range(1, len(pdf.pages)):
        page_text = pdf.pages[index].extract_text()
        end_marker = f'{index+1}/{index+1}'
        if end_marker in page_text:
            # Keep only what precedes the marker, then stop reading pages.
            collected.append(page_text.partition(end_marker)[0])
            break
        collected.append(page_text)
    return ''.join(collected)

def remove_punctuation_and_numbers(text):
    """Strip punctuation, underscores and digits, then normalise whitespace.

    Characters are deleted, not replaced by a space, so "well-known" becomes
    "wellknown" and "in_the" becomes "inthe".

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character that is neither a word character nor
        whitespace removed, every digit and underscore removed, runs of
        whitespace collapsed to a single space, and leading/trailing
        whitespace stripped.
    """
    # ``\w`` matches digits and "_" as well as letters, so the first
    # alternative alone would leave them in; the second removes them.
    letters_and_spaces = re.sub(r'[^\w\s]|[\d_]', '', text)
    # Collapse any run of whitespace (including newlines) to one space.
    return re.sub(r'\s+', ' ', letters_and_spaces).strip()

def extract_text_from_directory(directory_path, chunk_size=100):
    """Add the words of every PDF in a directory to the global ``unique_words``.

    Each PDF is passed through ``extract_text_from_pdf`` and
    ``remove_punctuation_and_numbers``; the cleaned text is split on
    whitespace and the resulting words are added to the module-level
    ``unique_words`` set. Progress is printed after every chunk of
    ``chunk_size`` files, followed by the size of ``unique_words`` and the
    total number of PDFs processed.

    Args:
        directory_path: Directory whose entries are scanned (not recursive).
        chunk_size: Number of PDFs between progress messages. Defaults to 100.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Compare the extension case-insensitively so files such as "STORY.PDF"
    # are not skipped, and sort because os.listdir() returns entries in
    # arbitrary order, which would make the chunks differ between runs.
    pdf_files = sorted(
        filename
        for filename in os.listdir(directory_path)
        if filename.lower().endswith('.pdf')
    )

    num_pdfs = 0
    chunk_starts = range(0, len(pdf_files), chunk_size)
    for chunk_num, chunk_start in enumerate(chunk_starts, start=1):
        for filename in pdf_files[chunk_start:chunk_start + chunk_size]:
            num_pdfs += 1
            pdf_file_path = os.path.join(directory_path, filename)
            extracted_text = extract_text_from_pdf(pdf_file_path)
            cleaned_text = remove_punctuation_and_numbers(extracted_text)
            # The set discards duplicates, so only new words grow it.
            unique_words.update(cleaned_text.split())
        print(f"Chunk {chunk_num} completed. Number of PDFs iterated: {num_pdfs}")
    print(len(unique_words))
    print(f"Total number of PDFs iterated: {num_pdfs}")


def main(directory_path=r'C:\Stories'):
    """Run the unique-word extraction over one directory of PDFs.

    Args:
        directory_path: Directory passed to ``extract_text_from_directory``.
            Defaults to the original hard-coded location so existing
            ``main()`` calls behave as before.
    """
    extract_text_from_directory(directory_path)

# Run the extraction when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Chunk 1 completed. Number of PDFs iterated: 100
Chunk 2 completed. Number of PDFs iterated: 200
Chunk 3 completed. Number of PDFs iterated: 300
Chunk 4 completed. Number of PDFs iterated: 400
Chunk 5 completed. Number of PDFs iterated: 500
Chunk 6 completed. Number of PDFs iterated: 600
Chunk 7 completed. Number of PDFs iterated: 700
Chunk 8 completed. Number of PDFs iterated: 800
Chunk 9 completed. Number of PDFs iterated: 900
Chunk 10 completed. Number of PDFs iterated: 1000
Chunk 11 completed. Number of PDFs iterated: 1100
Chunk 12 completed. Number of PDFs iterated: 1200
Chunk 13 completed. Number of PDFs iterated: 1300
Chunk 14 completed. Number of PDFs iterated: 1400
Chunk 15 completed. Number of PDFs iterated: 1500
Chunk 16 completed. Number of PDFs iterated: 1600
Chunk 17 completed. Number of PDFs iterated: 1700
Chunk 18 completed. Number of PDFs iterated: 1800
Chunk 19 completed. Number of PDFs iterated: 1900
Chunk 20 completed. Number of PDFs iterated: 2000
Chunk 21 completed

In [10]:
# Snapshot the collected words as a sorted list. Set iteration order is
# arbitrary, so sorting makes the output file reproducible between runs.
# NOTE: this rebinds `unique_words` from a set to a list; the extraction
# functions call `unique_words.update(...)`, so re-run the cell that creates
# the set before extracting again.
unique_words = sorted(unique_words)

# Define the file name
file_name = "separate_words.txt"

try:
    # Open the file in write mode with UTF-8 encoding
    with open(file_name, 'w', encoding='utf-8') as file:
        # One word per line
        file.writelines(word + '\n' for word in unique_words)
    print("Unique words have been written to", file_name)
except Exception as e:
    print("An error occurred:", e)


Unique words have been written to separate_words.txt
