## Extract Text from the PDFs

In [None]:
import re
import os
import easyocr
from pdf2image import convert_from_path
from PIL import Image
from PyPDF2 import PdfReader
from rich.console import Console
import json
from tqdm.auto import tqdm
import numpy as np

# Shared rich console; read_pdf uses it to print extraction errors in red.
console = Console()

# Initialize EasyOCR reader
# Created once at module level so read_pdf reuses the same reader for every
# OCR fallback instead of constructing a new one per PDF.
reader = easyocr.Reader(["en"])  # Specify language(s) here


def _clean_report_text(text):
    """Strip identifiers, noise tokens and punctuation from raw report text."""
    # Remove UUID-like patterns
    text = re.sub(
        r"\b([A-F0-9]{8}-([A-F0-9]{4}-){3}[A-F0-9]{12})\b",
        "",
        text,
        flags=re.IGNORECASE,
    )

    # Remove all-caps words (2+ letters) and long digit runs (4+ digits),
    # which are treated as noise.
    text = re.sub(r"\b[A-Z]{2,}\b", "", text)
    text = re.sub(r"\b[0-9]{4,}\b", "", text)

    # Replace punctuation / symbol characters with a space
    text = re.sub(r"[-=_+.,;:!?\"\'^~|\\\/]", " ", text)

    # Collapse runs of spaces, tabs, and newlines into single spaces
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def read_pdf(pdf_file):
    """Return the cleaned text of one PDF, using OCR when no text is embedded.

    The embedded text layer is read page by page with PdfReader. If that
    yields nothing but whitespace, every page is rendered to an image,
    converted to grayscale, upscaled 2x, binarized at a threshold of 128 and
    passed to the module-level EasyOCR ``reader``. The resulting text is then
    cleaned by ``_clean_report_text``.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        The cleaned text, or "" when the OCR fallback raises or nothing
        survives cleaning. Errors are printed to ``console`` rather than
        raised.
    """
    page_texts = []
    try:
        # Opening the file and walking its page list can fail on damaged PDFs;
        # treat that like a missing text layer so OCR still gets a chance and
        # one bad file does not abort a whole batch.
        for page in PdfReader(pdf_file).pages:
            try:
                page_texts.append(page.extract_text() or "")
            except Exception as e:
                console.print(f"Error reading {pdf_file} | {e}", style="red")
                continue
    except Exception as e:
        console.print(f"Error reading {pdf_file} | {e}", style="red")

    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; the cleaner collapses it to a single space.
    text = "\n".join(page_texts)

    # If no usable text was extracted (empty or whitespace only), fall back to
    # OCR on the rendered page images.
    if not text.strip():
        try:
            ocr_pages = []
            for image in convert_from_path(pdf_file):
                processed_image = image.convert("L")
                processed_image = processed_image.resize(
                    tuple(2 * s for s in processed_image.size), Image.Resampling.LANCZOS
                )
                # Binarize: pixels brighter than 128 become white, the rest black.
                processed_image = processed_image.point(
                    lambda p: 255 if p > 128 else 0
                )
                ocr_result = reader.readtext(np.array(processed_image), detail=0)
                ocr_pages.append(" ".join(ocr_result))
            text = "\n".join(ocr_pages)
        except Exception as e:
            console.print(f"Error reading {pdf_file} | {e}", style="red")
            return ""

    return _clean_report_text(text)


def save_to_json(data_batch, file_path="data.json"):
    """Append ``data_batch`` to the JSON list stored at ``file_path``.

    The existing list (if the file exists) is loaded, extended with the new
    entries and written back with 4-space indentation. The new content is
    serialized to a sibling ``<file_path>.tmp`` file and moved over the target
    only after serialization succeeds, so an interrupt or a serialization
    error cannot truncate the batches saved so far.

    Args:
        data_batch: Iterable of JSON-serializable entries to append.
        file_path: Destination JSON file; created when it does not exist.
    """
    if os.path.exists(file_path):
        with open(file_path, "r") as json_file:
            existing_data = json.load(json_file)
    else:
        existing_data = []

    existing_data.extend(data_batch)

    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, "w") as json_file:
            json.dump(existing_data, json_file, indent=4)
        # Same directory, hence same filesystem: the swap is a single rename.
        os.replace(tmp_path, file_path)
    finally:
        # Only still present when the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# Organ sub-folders scanned for PDFs, one folder per entry.
ORGANS = [
    "BLADDER",
    "BRAIN",
    "CERVIX",
    "CRC",
    "HEADnNECK",
    "KIDNEY",
    "LIVER",
    "LUNG",
    "PROSTATE",
]

# Extracted records are buffered here and flushed to disk every batch_size
# entries.
data_batch = []
batch_size = 10

for organ in tqdm(ORGANS, desc="Processing PDFs", leave=False, dynamic_ncols=True):
    pdf_paths = [
        f"/TCGA Pathology Reports/{organ}/pdfs/"
    ]

    for pdf_path in pdf_paths:
        organ_files = tqdm(
            os.listdir(pdf_path),
            desc=f"Reading {organ} PDFs",
            leave=False,
            dynamic_ncols=True,
        )
        for pdf_file in organ_files:
            # Skip anything that is not a PDF.
            if not pdf_file.endswith((".pdf", ".PDF")):
                continue

            pdf_file_path = os.path.join(pdf_path, pdf_file)
            text = read_pdf(pdf_file_path)

            # Files that yielded no text are not recorded.
            if not text:
                continue

            record = {
                "pdf_file_name": f"{pdf_path}{pdf_file}",
                "organ": organ,
                "text": text,
            }
            data_batch.append(record)

            if len(data_batch) >= batch_size:
                save_to_json(data_batch)
                data_batch.clear()

# Flush whatever is left from the final, partially filled batch.
if data_batch:
    save_to_json(data_batch)

## LLM

In [1]:
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms import Ollama
from typing import Any
import json
import os
from tqdm.auto import tqdm
from prompts import (
    stage_1_prompt_template,
    stage_2_prompt_template,
    stage_3_prompt_template,
)


# Define the model handler
class ModelHandler:
    """Small wrapper that owns one Ollama LLM client and forwards prompts to it."""

    def __init__(self, model_name: str, temperature: float = 0.0, num_ctx: int = 16384):
        """Create the underlying Ollama client.

        Args:
            model_name: Model identifier passed to Ollama as ``model``.
            temperature: Sampling temperature forwarded to Ollama.
            num_ctx: Context size forwarded to Ollama as ``num_ctx``.
        """
        ollama_options = {
            "model": model_name,
            "temperature": temperature,
            "num_ctx": num_ctx,
            # NOTE(review): presumably keeps the model loaded indefinitely
            # between calls; confirm against the Ollama documentation.
            "keep_alive": -1,
        }
        self.chat = Ollama(**ollama_options)

    def invoke(self, prompt: Any) -> str:
        """Send ``prompt`` to the wrapped client and return its response."""
        response = self.chat.invoke(prompt)
        return response


# Instantiate chat model
# Module-level handler; process_stage sends every formatted prompt through it.
chat = ModelHandler("qwen2.5:7b-instruct-fp16")

# Define file path and chunk size
# output_file: JSON file loaded into `data` below and rewritten by process_stage.
# chunk_size: number of newly generated outputs process_stage collects before
# it writes the data back to output_file.
output_file = "data.json"
chunk_size = 10


# Load existing data from JSON file
def load_data(file_path: str) -> list:
    """Return the parsed JSON content of ``file_path``, or [] if it is absent.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file; an empty list when the
        path does not exist.
    """
    if not os.path.exists(file_path):
        return []

    with open(file_path, "r") as handle:
        records = json.load(handle)
    return records


# Save data to JSON file
def save_data(file_path: str, data: list):
    """Write ``data`` to ``file_path`` as JSON with 4-space indentation.

    The JSON is serialized to a sibling ``<file_path>.tmp`` file and moved over
    the target only once serialization has finished. An interrupt or a
    serialization error during a save therefore leaves the previously saved
    file intact instead of truncated.

    Args:
        file_path: Destination JSON file.
        data: JSON-serializable list to store.
    """
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(data, f, indent=4)
        # Same directory, hence same filesystem: the swap is a single rename.
        os.replace(tmp_path, file_path)
    finally:
        # Only still present when the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# Process a specific stage
def process_stage(
    data: list, stage_prompt_template: ChatPromptTemplate, stage_output_key: str
):
    """Generate one stage's LLM output for every entry that does not have it yet.

    Entries that already contain ``stage_output_key`` are skipped, so a rerun
    resumes where a previous run stopped. Each generated output is stored on
    the entry in place, and the whole ``data`` list is written to the
    module-level ``output_file`` after every ``chunk_size`` new outputs. Any
    outputs not yet written are flushed when the loop ends, including when it
    ends because of an exception or a keyboard interrupt.

    Args:
        data: List of entry dicts; mutated in place.
        stage_prompt_template: Prompt template for this stage.
        stage_output_key: One of "stage_1_output", "stage_2_output" or
            "stage_3_output"; selects the prompt variables and names the key
            under which the output is stored.

    Raises:
        ValueError: If an entry needs processing and ``stage_output_key`` is
            not one of the three supported keys.
        KeyError: If an entry lacks a field the stage needs (for example
            stage 2 on an entry without "stage_1_output").
    """
    unsaved_count = 0

    try:
        for entry in tqdm(
            data, desc=f"Processing {stage_output_key}", dynamic_ncols=True
        ):
            if stage_output_key in entry:
                continue

            # Each stage feeds its template a different set of variables.
            if stage_output_key == "stage_1_output":
                prompt_vars = {"report": entry["text"]}
            elif stage_output_key == "stage_2_output":
                prompt_vars = {
                    "report": entry["text"],
                    "stage_1_output": entry["stage_1_output"],
                }
            elif stage_output_key == "stage_3_output":
                prompt_vars = {
                    "stage_1_output": entry["stage_1_output"],
                    "stage_2_output": entry["stage_2_output"],
                }
            else:
                raise ValueError(f"Unknown stage output key: {stage_output_key!r}")

            input_message = stage_prompt_template.format_messages(**prompt_vars)
            entry[stage_output_key] = chat.invoke(input_message)
            unsaved_count += 1

            if unsaved_count >= chunk_size:
                save_data(output_file, data)
                unsaved_count = 0
    finally:
        # Persist outputs produced since the last chunk save, even when the
        # loop was cut short, so generated results are not lost.
        if unsaved_count:
            save_data(output_file, data)


# Load the entries stored in output_file (an empty list if the file does not
# exist); the stage cells below pass this list to process_stage.
data = load_data(output_file)

## Stage 1

In [2]:
# Build the stage-1 prompt from its template string, then generate
# "stage_1_output" for every entry in data that does not have it yet.
stage_1_prompt = ChatPromptTemplate.from_template(stage_1_prompt_template)
process_stage(data, stage_1_prompt, "stage_1_output")

Processing stage_1_output:   0%|          | 0/5716 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Stage 2

In [None]:
# Build the stage-2 prompt; process_stage fills it with each entry's report
# text and its "stage_1_output", so stage 1 must already be stored on the entry.
stage_2_prompt = ChatPromptTemplate.from_template(stage_2_prompt_template)
process_stage(data, stage_2_prompt, "stage_2_output")

## Stage 3

In [None]:
# Build the stage-3 prompt; process_stage fills it with each entry's
# "stage_1_output" and "stage_2_output" only (the report text is not passed).
stage_3_prompt = ChatPromptTemplate.from_template(stage_3_prompt_template)
process_stage(data, stage_3_prompt, "stage_3_output")

## Convert txt to json

In [7]:
import os
import json

# Folder holding the .txt reports, and the JSON file the records are written to
folder_path = "./Test Pathology Files/"
output_file = "test_data.json"

# One record per .txt file, using the same keys as the PDF extraction output
data = []

for filename in os.listdir(folder_path):
    # Ignore anything that is not a .txt file
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(folder_path, filename)
    print(f"Reading {file_path}")

    with open(file_path, "r") as file:
        text_content = file.read().strip()  # Drop leading/trailing whitespace

    record = {
        "pdf_file_name": os.path.splitext(filename)[0],  # Filename without extension
        "organ": "N/A",
        "text": text_content,
    }
    data.append(record)

# Write all collected records to the output JSON file
with open(output_file, "w") as json_file:
    json.dump(data, json_file, indent=4)

print(f"Data successfully written to {output_file}")


Reading ./Test Pathology Files/Intermediate2.txt
Reading ./Test Pathology Files/Intermediate5.txt
Reading ./Test Pathology Files/Intermediate4.txt
Reading ./Test Pathology Files/Simple2.txt
Reading ./Test Pathology Files/Simple1.txt
Reading ./Test Pathology Files/Difficult5.txt
Reading ./Test Pathology Files/Difficult1.txt
Reading ./Test Pathology Files/Difficult4.txt
Reading ./Test Pathology Files/Simple5.txt
Reading ./Test Pathology Files/Difficult2.txt
Reading ./Test Pathology Files/Intermediate1.txt
Reading ./Test Pathology Files/Difficult3.txt
Reading ./Test Pathology Files/Simple4.txt
Reading ./Test Pathology Files/Intermediate3.txt
Reading ./Test Pathology Files/Simple3.txt
Data successfully written to test_data.json
