In [1]:
import os
import shutil
from PyPDF2 import PdfReader
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio between sequences ``a`` and ``b``."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def _text_mentions(content, search_string, threshold):
    """Return True when ``search_string`` occurs, exactly or approximately, inside ``content``."""
    # Case-fold and collapse whitespace so line breaks in extracted text do not split the phrase.
    haystack = " ".join(content.lower().split())
    needle = " ".join(search_string.lower().split())
    if not haystack or not needle:
        return False
    if needle in haystack:
        return True

    # Fuzzy pass: compare a needle-sized window starting at every word of the text.
    matcher = SequenceMatcher(None)
    matcher.set_seq2(needle)  # difflib caches its analysis of seq2, so set the needle once
    width = len(needle)
    starts = [0] + [i + 1 for i, ch in enumerate(haystack) if ch == " "]
    for start in starts:
        matcher.set_seq1(haystack[start:start + width])
        # quick_ratio() is a cheap upper bound on ratio(); it discards hopeless windows early.
        if matcher.quick_ratio() >= threshold and matcher.ratio() >= threshold:
            return True
    return False


def list_and_process_pdfs(input_directory, output_directory, search_string, threshold=0.8):
    """Move every PDF whose extracted text mentions ``search_string`` to ``output_directory``.

    Args:
        input_directory: Folder scanned (non-recursively) for ``*.pdf`` files.
        output_directory: Destination folder; created when missing.
        search_string: Phrase to look for, or an iterable of alternative phrases.
        threshold: Minimum ``SequenceMatcher`` ratio (0-1) for an approximate hit.

    The phrase is searched *inside* the text (exact substring first, then a
    sliding fuzzy window). Comparing the whole document against the phrase, as
    done previously, can only score high when the document is about as short
    as the phrase itself.

    Errors on a single PDF are printed and the remaining files are still processed.
    """
    os.makedirs(output_directory, exist_ok=True)

    needles = [search_string] if isinstance(search_string, str) else list(search_string)
    pdf_files = [f for f in os.listdir(input_directory) if f.lower().endswith('.pdf')]

    for pdf in pdf_files:
        pdf_path = os.path.join(input_directory, pdf)
        try:
            with open(pdf_path, 'rb') as file:
                reader = PdfReader(file)
                # Guard against pages that yield no text (e.g. scanned images).
                content = "\n".join(page.extract_text() or "" for page in reader.pages)

            # Move only after the handle is closed: on Windows a file that is
            # still open generally cannot be renamed.
            if any(_text_mentions(content, needle, threshold) for needle in needles):
                shutil.move(pdf_path, os.path.join(output_directory, pdf))
        except Exception as e:
            print(f"Error processing {pdf}: {e}")

# Configuration
# Folder that is scanned for PDFs, and the sub-folder that receives the matches.
input_directory = r"E:\downloads\CICIMA"
output_directory = r"E:\downloads\CICIMA\processed"
# Minimum similarity score handed to the matcher.
threshold = 0.8
# Phrase used by this cell; `search_strings` is defined here but not passed to the call below.
search_string = "centro de investigacion en ciencia e ingenieria de materiales"
search_strings = ["centro de investigacion en ciencia e ingenieria de materiales", "CICIMA" , "cicima"]

list_and_process_pdfs(
    input_directory,
    output_directory,
    search_string,
    threshold,
)




In [None]:
import os
import shutil
from PIL import Image
import fitz  # PyMuPDF for PDF to PNG conversion
import pytesseract
from difflib import SequenceMatcher
from tqdm import tqdm

def similar(a, b):
    """Score how alike ``a`` and ``b`` are using ``difflib.SequenceMatcher.ratio``."""
    return SequenceMatcher(isjunk=None, a=a, b=b).ratio()

def _text_mentions(content, search_string, threshold):
    """Return True when ``search_string`` occurs, exactly or approximately, inside ``content``."""
    # Case-fold and collapse whitespace so OCR line breaks do not split the phrase.
    haystack = " ".join(content.lower().split())
    needle = " ".join(search_string.lower().split())
    if not haystack or not needle:
        return False
    if needle in haystack:
        return True

    # Fuzzy pass: compare a needle-sized window starting at every word of the text.
    matcher = SequenceMatcher(None)
    matcher.set_seq2(needle)  # difflib caches its analysis of seq2, so set the needle once
    width = len(needle)
    starts = [0] + [i + 1 for i, ch in enumerate(haystack) if ch == " "]
    for start in starts:
        matcher.set_seq1(haystack[start:start + width])
        # quick_ratio() is a cheap upper bound on ratio(); it discards hopeless windows early.
        if matcher.quick_ratio() >= threshold and matcher.ratio() >= threshold:
            return True
    return False


def list_and_process_pdfs(input_directory, output_directory, search_strings, threshold=0.8):
    """OCR each PDF page by page and move the PDF once any page mentions a search string.

    Args:
        input_directory: Folder scanned (non-recursively) for ``*.pdf`` files.
        output_directory: Destination folder; created when missing.
        search_strings: Iterable of phrases; one hit on any page is enough.
        threshold: Minimum ``SequenceMatcher`` ratio (0-1) for an approximate hit.

    Each page is rendered to a temporary ``temp_page_<n>.png`` in the current
    working directory, passed to Tesseract, and deleted again. OCR stops at the
    first matching page. The move happens only after the document has been
    closed, because an open file generally cannot be renamed on Windows.

    Errors on a single PDF are printed and the remaining files are still processed.
    """
    os.makedirs(output_directory, exist_ok=True)

    pdf_files = [f for f in os.listdir(input_directory) if f.lower().endswith('.pdf')]

    for pdf in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(input_directory, pdf)
        try:
            matched = False
            pdf_document = fitz.open(pdf_path)
            try:
                for page_number in range(len(pdf_document)):
                    # Convert PDF page to PNG
                    page = pdf_document.load_page(page_number)
                    pix = page.get_pixmap()
                    temp_png = f"temp_page_{page_number}.png"
                    pix.save(temp_png)

                    # Perform OCR on PNG; always clean up the temporary file
                    try:
                        with Image.open(temp_png) as image:
                            text = pytesseract.image_to_string(image)
                    finally:
                        os.remove(temp_png)

                    # Check for matches; leave the page loop on the first hit
                    if any(_text_mentions(text, phrase, threshold) for phrase in search_strings):
                        matched = True
                        break
            finally:
                pdf_document.close()

            if matched:
                shutil.move(pdf_path, os.path.join(output_directory, pdf))
        except Exception as e:
            print(f"Error processing {pdf}: {e}")

# Configuration
# Minimum similarity score handed to the matcher.
threshold = 0.8
# Phrases that mark a PDF as relevant; any one of them is sufficient.
search_strings = ["centro de investigacion en ciencia e ingenieria de materiales", "CICIMA", "cicima"]
# Folder that is scanned for PDFs, and the sub-folder that receives the matches.
input_directory = r"E:\downloads\CICIMA"
output_directory = r"E:\downloads\CICIMA\processed"

list_and_process_pdfs(
    input_directory,
    output_directory,
    search_strings,
    threshold,
)


Processing PDFs:  56%|█████████████████████████████████▍                          | 44/79 [1:52:07<2:37:17, 269.64s/it]

In [None]:
import os
import shutil
from PIL import Image
import fitz  # PyMuPDF for PDF to PNG conversion
import pytesseract
from difflib import SequenceMatcher
from tqdm import tqdm

def similar(a, b):
    """Compute ``SequenceMatcher(None, a, b).ratio()`` for the two inputs."""
    comparison = SequenceMatcher()
    comparison.set_seqs(a, b)
    return comparison.ratio()

def _text_mentions(content, search_string, threshold):
    """Return True when ``search_string`` occurs, exactly or approximately, inside ``content``."""
    # Case-fold and collapse whitespace so OCR line breaks do not split the phrase.
    haystack = " ".join(content.lower().split())
    needle = " ".join(search_string.lower().split())
    if not haystack or not needle:
        return False
    if needle in haystack:
        return True

    # Fuzzy pass: compare a needle-sized window starting at every word of the text.
    matcher = SequenceMatcher(None)
    matcher.set_seq2(needle)  # difflib caches its analysis of seq2, so set the needle once
    width = len(needle)
    starts = [0] + [i + 1 for i, ch in enumerate(haystack) if ch == " "]
    for start in starts:
        matcher.set_seq1(haystack[start:start + width])
        # quick_ratio() is a cheap upper bound on ratio(); it discards hopeless windows early.
        if matcher.quick_ratio() >= threshold and matcher.ratio() >= threshold:
            return True
    return False


def list_and_process_pdfs(input_directory, output_directory, ocr_directory, search_strings, threshold=0.8):
    """OCR every PDF (with an on-disk text cache) and move those mentioning a search string.

    Args:
        input_directory: Folder scanned (non-recursively) for ``*.pdf`` files.
        output_directory: Destination folder for matching PDFs; created when missing.
        ocr_directory: Folder holding one ``<pdf stem>.txt`` OCR transcript per PDF;
            an existing transcript is reused instead of running OCR again.
        search_strings: Iterable of phrases; one hit is enough.
        threshold: Minimum ``SequenceMatcher`` ratio (0-1) for an approximate hit.

    The phrases are searched *inside* the transcript (exact substring first,
    then a sliding fuzzy window) rather than compared against the whole
    document, which could never reach the threshold for real documents.

    Errors on a single PDF are printed and the remaining files are still processed.
    """
    os.makedirs(output_directory, exist_ok=True)
    os.makedirs(ocr_directory, exist_ok=True)

    pdf_files = [f for f in os.listdir(input_directory) if f.lower().endswith('.pdf')]

    for pdf in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(input_directory, pdf)
        txt_path = os.path.join(ocr_directory, f"{os.path.splitext(pdf)[0]}.txt")

        try:
            # Run OCR only when no cached transcript exists
            if not os.path.exists(txt_path):
                page_texts = []
                # 2x zoom on both axes (144 DPI) gives Tesseract a sharper image
                matrix = fitz.Matrix(2.0, 2.0)
                pdf_document = fitz.open(pdf_path)
                try:
                    for page_number in range(len(pdf_document)):
                        page = pdf_document.load_page(page_number)
                        pix = page.get_pixmap(matrix=matrix, alpha=False)

                        temp_png = f"temp_page_{page_number}.png"
                        pix.save(temp_png)
                        try:
                            with Image.open(temp_png) as image:
                                page_texts.append(pytesseract.image_to_string(image))
                        finally:
                            os.remove(temp_png)  # Clean up temporary PNG file
                finally:
                    # Release the document even when a page fails to render or OCR
                    pdf_document.close()

                # Save the transcript only after every page succeeded
                with open(txt_path, 'w', encoding='utf-8') as txt_file:
                    txt_file.write("\n".join(page_texts))

            # Read text from the OCR file
            with open(txt_path, 'r', encoding='utf-8') as txt_file:
                content = txt_file.read()

            # Check for matches in the content
            if any(_text_mentions(content, phrase, threshold) for phrase in search_strings):
                shutil.move(pdf_path, os.path.join(output_directory, pdf))
        except Exception as e:
            print(f"Error processing {pdf}: {e}")

# Configuration
# Folder that is scanned for PDFs, the sub-folder that receives the matches,
# and the sub-folder where OCR transcripts are cached.
input_directory = r"E:\downloads\CICIMA"
output_directory = r"E:\downloads\CICIMA\processed"
ocr_directory = r"E:\downloads\CICIMA\OCR"
# Minimum similarity score handed to the matcher.
threshold = 0.8
# Phrases that mark a PDF as relevant; any one of them is sufficient.
search_strings = ["centro de investigacion en ciencia e ingenieria de materiales", "CICIMA", "cicima"]

list_and_process_pdfs(
    input_directory,
    output_directory,
    ocr_directory,
    search_strings,
    threshold,
)


In [None]:
import os
import shutil
from PIL import Image
import fitz  # PyMuPDF for PDF to PNG conversion
import pytesseract
from difflib import SequenceMatcher
from tqdm import tqdm
import language_tool_python


def similar(a, b):
    """Return how similar ``a`` is to ``b`` as reported by ``SequenceMatcher.ratio()``."""
    ratio = SequenceMatcher(None, a, b).ratio()
    return ratio


# language code -> LanguageTool instance, so the tool is created once per language
_LANGUAGE_TOOLS = {}


def correct_text_with_languagetool(text, language="en-US"):
    """Return ``text`` with LanguageTool's suggested corrections applied.

    Args:
        text: Text to correct. Blank input is returned unchanged without
            touching LanguageTool.
        language: LanguageTool language code. Defaults to ``"en-US"`` for
            backward compatibility.
            NOTE(review): the search phrases configured in this notebook are
            Spanish, so a Spanish code is probably the better choice - confirm.

    A ``LanguageTool`` object is created once per language and reused, instead
    of constructing a fresh one for every document.
    """
    if not text.strip():
        return text

    tool = _LANGUAGE_TOOLS.get(language)
    if tool is None:
        tool = language_tool_python.LanguageTool(language)
        _LANGUAGE_TOOLS[language] = tool

    matches = tool.check(text)
    return language_tool_python.utils.correct(text, matches)


def _text_mentions(content, search_string, threshold):
    """Return True when ``search_string`` occurs, exactly or approximately, inside ``content``."""
    # Case-fold and collapse whitespace so OCR line breaks do not split the phrase.
    haystack = " ".join(content.lower().split())
    needle = " ".join(search_string.lower().split())
    if not haystack or not needle:
        return False
    if needle in haystack:
        return True

    # Fuzzy pass: compare a needle-sized window starting at every word of the text.
    matcher = SequenceMatcher(None)
    matcher.set_seq2(needle)  # difflib caches its analysis of seq2, so set the needle once
    width = len(needle)
    starts = [0] + [i + 1 for i, ch in enumerate(haystack) if ch == " "]
    for start in starts:
        matcher.set_seq1(haystack[start:start + width])
        # quick_ratio() is a cheap upper bound on ratio(); it discards hopeless windows early.
        if matcher.quick_ratio() >= threshold and matcher.ratio() >= threshold:
            return True
    return False


def list_and_process_pdfs(input_directory, output_directory, ocr_directory, search_strings, threshold=0.8):
    """OCR, spell-correct and screen every PDF, moving those that mention a search string.

    Args:
        input_directory: Folder scanned (non-recursively) for ``*.pdf`` files.
        output_directory: Destination folder for matching PDFs; created when missing.
        ocr_directory: Cache folder. ``<stem>.txt`` holds the raw OCR transcript and
            ``<stem>.corrected.txt`` the corrected one; both are reused when present.
        search_strings: Iterable of phrases; one hit is enough.
        threshold: Minimum ``SequenceMatcher`` ratio (0-1) for an approximate hit.

    The raw transcript is no longer overwritten by the corrected text, so a
    rerun neither repeats the correction nor corrects already-corrected text.
    Both the corrected and the raw transcript are searched, because automatic
    correction may rewrite the very words being looked for.

    Errors on a single PDF are printed and the remaining files are still processed.
    """
    os.makedirs(output_directory, exist_ok=True)
    os.makedirs(ocr_directory, exist_ok=True)

    pdf_files = [f for f in os.listdir(input_directory) if f.lower().endswith('.pdf')]

    print("Starting PDF processing...")
    for pdf in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(input_directory, pdf)
        stem = os.path.splitext(pdf)[0]
        txt_path = os.path.join(ocr_directory, f"{stem}.txt")
        corrected_path = os.path.join(ocr_directory, f"{stem}.corrected.txt")

        try:
            # Step 1: OCR Conversion (skipped when a cached transcript exists)
            if not os.path.exists(txt_path):
                page_texts = []
                matrix = fitz.Matrix(2.0, 2.0)  # Improved resolution
                pdf_document = fitz.open(pdf_path)
                try:
                    for page_number in tqdm(range(len(pdf_document)), desc=f"OCR Pages for {pdf}", leave=False):
                        page = pdf_document.load_page(page_number)
                        pix = page.get_pixmap(matrix=matrix, alpha=False)
                        temp_png = f"temp_page_{page_number}.png"
                        pix.save(temp_png)
                        try:
                            with Image.open(temp_png) as image:
                                page_texts.append(pytesseract.image_to_string(image))
                        finally:
                            os.remove(temp_png)  # Clean up temporary PNG file
                finally:
                    # Release the document even when a page fails to render or OCR
                    pdf_document.close()

                # Save the extracted text to a .txt file
                with open(txt_path, 'w', encoding='utf-8') as txt_file:
                    txt_file.write("\n".join(page_texts))

            with open(txt_path, 'r', encoding='utf-8') as txt_file:
                content = txt_file.read()

            # Step 2: Grammar and Spelling Correction (cached next to the raw transcript)
            if os.path.exists(corrected_path):
                with open(corrected_path, 'r', encoding='utf-8') as txt_file:
                    corrected_text = txt_file.read()
            else:
                print(f"Correcting text for {pdf}...")
                corrected_text = correct_text_with_languagetool(content)
                with open(corrected_path, 'w', encoding='utf-8') as txt_file:
                    txt_file.write(corrected_text)

            # Step 3: Check for Matches
            print(f"Checking for matches in {pdf}...")
            if any(
                _text_mentions(candidate, phrase, threshold)
                for candidate in (corrected_text, content)
                for phrase in search_strings
            ):
                shutil.move(pdf_path, os.path.join(output_directory, pdf))

        except Exception as e:
            print(f"Error processing {pdf}: {e}")


# Configuration
# Phrases that mark a PDF as relevant; any one of them is sufficient.
search_strings = ["centro de investigacion en ciencia e ingenieria de materiales", "CICIMA", "cicima"]
# Minimum similarity score handed to the matcher.
threshold = 0.8
# Folder that is scanned for PDFs, the sub-folder that receives the matches,
# and the sub-folder where OCR transcripts are cached.
input_directory = r"E:\downloads\CICIMA"
output_directory = r"E:\downloads\CICIMA\processed"
ocr_directory = r"E:\downloads\CICIMA\OCR"

list_and_process_pdfs(
    input_directory,
    output_directory,
    ocr_directory,
    search_strings,
    threshold,
)


In [None]:
import os
import shutil
from difflib import SequenceMatcher
from io import BytesIO

import fitz  # PyMuPDF for PDF to PNG conversion
import language_tool_python
import pytesseract
from PIL import Image
from textblob import TextBlob
from tqdm import tqdm

def similar(a, b):
    """Return the ``difflib`` similarity ratio of ``a`` versus ``b``."""
    scorer = SequenceMatcher(None, a, b)
    score = scorer.ratio()
    return score

# language code -> LanguageTool instance, so the tool is created once per language
_LANGUAGE_TOOLS = {}


def correct_text_with_languagetool(text, language="en-US"):
    """Correct ``text`` with LanguageTool, falling back to TextBlob on any error.

    Args:
        text: Text to correct. Blank input is returned unchanged without
            touching either corrector.
        language: LanguageTool language code. Defaults to ``"en-US"`` for
            backward compatibility.
            NOTE(review): the search phrases configured in this notebook are
            Spanish, so a Spanish code is probably the better choice - confirm.

    A ``LanguageTool`` object is created once per language and reused, instead
    of constructing a fresh one for every document. If creating or running the
    tool raises, the error is printed and ``correct_text_with_textblob`` is
    used for this call.
    """
    if not text.strip():
        return text

    try:
        tool = _LANGUAGE_TOOLS.get(language)
        if tool is None:
            tool = language_tool_python.LanguageTool(language)
            _LANGUAGE_TOOLS[language] = tool
        matches = tool.check(text)
        return language_tool_python.utils.correct(text, matches)
    except Exception as e:
        print(f"LanguageTool error: {e}. Falling back to TextBlob.")
        return correct_text_with_textblob(text)

def correct_text_with_textblob(text):
    """Run TextBlob's ``correct()`` on ``text`` and return the result as a ``str``."""
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)

def _text_mentions(content, search_string, threshold):
    """Return True when ``search_string`` occurs, exactly or approximately, inside ``content``."""
    # Case-fold and collapse whitespace so OCR line breaks do not split the phrase.
    haystack = " ".join(content.lower().split())
    needle = " ".join(search_string.lower().split())
    if not haystack or not needle:
        return False
    if needle in haystack:
        return True

    # Fuzzy pass: compare a needle-sized window starting at every word of the text.
    matcher = SequenceMatcher(None)
    matcher.set_seq2(needle)  # difflib caches its analysis of seq2, so set the needle once
    width = len(needle)
    starts = [0] + [i + 1 for i, ch in enumerate(haystack) if ch == " "]
    for start in starts:
        matcher.set_seq1(haystack[start:start + width])
        # quick_ratio() is a cheap upper bound on ratio(); it discards hopeless windows early.
        if matcher.quick_ratio() >= threshold and matcher.ratio() >= threshold:
            return True
    return False


def list_and_process_pdfs(input_directory, output_directory, ocr_directory, search_strings, threshold=0.8):
    """OCR, spell-correct and screen every PDF, moving those that mention a search string.

    Args:
        input_directory: Folder scanned (non-recursively) for ``*.pdf`` files.
        output_directory: Destination folder for matching PDFs; created when missing.
        ocr_directory: Cache folder. ``<stem>.txt`` holds the raw OCR transcript and
            ``<stem>.corrected.txt`` the corrected one; both are reused when present.
        search_strings: Iterable of phrases; one hit is enough.
        threshold: Minimum ``SequenceMatcher`` ratio (0-1) for an approximate hit.

    Pages are rendered at 4x zoom and handed to Tesseract from memory, so no
    temporary image files are written. The raw transcript is not overwritten
    by the corrected text, so a rerun neither repeats the correction nor
    corrects already-corrected text. Both transcripts are searched, because
    automatic correction may rewrite the very words being looked for.

    Errors on a single PDF are printed and the remaining files are still processed.
    """
    os.makedirs(output_directory, exist_ok=True)
    os.makedirs(ocr_directory, exist_ok=True)

    pdf_files = [f for f in os.listdir(input_directory) if f.lower().endswith('.pdf')]

    print("Starting PDF processing...")
    for pdf in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(input_directory, pdf)
        stem = os.path.splitext(pdf)[0]
        txt_path = os.path.join(ocr_directory, f"{stem}.txt")
        corrected_path = os.path.join(ocr_directory, f"{stem}.corrected.txt")

        try:
            # Step 1: OCR Conversion (skipped when a cached transcript exists)
            if not os.path.exists(txt_path):
                page_texts = []
                # 4x zoom on both axes (288 DPI). The former `.preRotate(0)` call was a
                # zero-degree rotation under a deprecated camel-case name, so it is dropped.
                matrix = fitz.Matrix(4.0, 4.0)
                pdf_document = fitz.open(pdf_path)
                try:
                    for page_number in tqdm(range(len(pdf_document)), desc=f"OCR Pages for {pdf}", leave=False):
                        page = pdf_document.load_page(page_number)
                        pix = page.get_pixmap(matrix=matrix, alpha=False)  # Generate high-resolution pixmap

                        # Decode the PNG bytes in memory; the `with` block closes the image.
                        with Image.open(BytesIO(pix.tobytes(output="png"))) as image:
                            page_texts.append(pytesseract.image_to_string(image))
                finally:
                    # Release the document even when a page fails to render or OCR
                    pdf_document.close()

                # Save the extracted text to a .txt file
                with open(txt_path, 'w', encoding='utf-8') as txt_file:
                    txt_file.write("\n".join(page_texts))

            with open(txt_path, 'r', encoding='utf-8') as txt_file:
                content = txt_file.read()

            # Step 2: Grammar and Spelling Correction (cached next to the raw transcript)
            if os.path.exists(corrected_path):
                with open(corrected_path, 'r', encoding='utf-8') as txt_file:
                    corrected_text = txt_file.read()
            else:
                print(f"Correcting text for {pdf}...")
                corrected_text = correct_text_with_languagetool(content)
                with open(corrected_path, 'w', encoding='utf-8') as txt_file:
                    txt_file.write(corrected_text)

            # Step 3: Check for Matches
            print(f"Checking for matches in {pdf}...")
            if any(
                _text_mentions(candidate, phrase, threshold)
                for candidate in (corrected_text, content)
                for phrase in search_strings
            ):
                shutil.move(pdf_path, os.path.join(output_directory, pdf))

        except Exception as e:
            print(f"Error processing {pdf}: {e}")


# Configuration
# Folder that is scanned for PDFs.
input_directory = r"E:\downloads\CICIMA"
# Sub-folders for the matching PDFs and for the cached OCR transcripts.
output_directory = r"E:\downloads\CICIMA\processed"
ocr_directory = r"E:\downloads\CICIMA\OCR"
# Phrases that mark a PDF as relevant, and the minimum similarity score for a hit.
search_strings = ["centro de investigacion en ciencia e ingenieria de materiales", "CICIMA", "cicima"]
threshold = 0.8

list_and_process_pdfs(
    input_directory,
    output_directory,
    ocr_directory,
    search_strings,
    threshold,
)
