In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint name handed to `from_pretrained`; the tokenizer cell reuses it.
model_name = "Qwen/Qwen2.5-14B-Instruct"


# Load the causal-LM checkpoint. "auto" is passed for both the dtype and the
# device map, leaving those decisions to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [2]:
# Load the tokenizer from the same checkpoint name used for the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [51]:
def build_paraphrase_prompt(original_report):
    """Assemble the chat messages that ask the model to rewrite one report.

    Args:
        original_report: Report text; it is interpolated into the user turn
            with f-string formatting.

    Returns:
        list[dict]: Two messages, each with "role" and "content" keys: a
        fixed system instruction followed by a user turn that embeds the
        report.
    """
    # The instruction text is sent to the model verbatim, so it is kept
    # exactly as written (including its leading space and trailing newline).
    system_prompt = """ You are an expert medical assistant AI capable of modifying clinical documents to
user specifications. You make minimal changes to the original document to satisfy
user requests. You never add information that is not already directly stated in
the original document.

Extract only two sections from the input radiology report: 'Findings' and 'Impression'. If 'Finding' or 'Impression' is None, keep 'None'.  An Indication section can refer to the History, Indication or Reason for Study sections in the
original report. Remove any information not directly observable from the current
imaging study. For instance, remove any patient demographic data, past medical
history, or comparison to prior images or studies. The generated 'Findings' and
'Impression' sections should not reference any changes based on prior images,
studies, or external knowledge about the patient. For example, paraphase sentence containing words like "new", "unchanged", "increase", "decrease" such that the section is related to this specific image.
Rewrite such comparisons as a status observation based only on the current image or study. The output should be sentences for each section.

Also remove deidentified patient information represented by "___".
"""
    request = f"Rewrite this chest X-ray report\n\n{original_report}"

    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": request}
    return [system_turn, user_turn]


In [52]:
def paraphrase_mimic_report(original_report, max_new_tokens=512):
    """Rewrite a single report with the notebook-level `model` and `tokenizer`.

    Args:
        original_report: Report text forwarded to `build_paraphrase_prompt`.
        max_new_tokens: Upper bound on generated tokens (default 512).

    Returns:
        str: The decoded completion with the prompt tokens removed, special
        tokens skipped, and surrounding whitespace stripped.
    """
    chat_messages = build_paraphrase_prompt(original_report)

    # Render the messages as one prompt string that ends with the generation
    # prompt, so the model continues with its own reply.
    prompt_text = tokenizer.apply_chat_template(
        chat_messages, tokenize=False, add_generation_prompt=True
    )

    # Batch of one, moved onto the device the model reports.
    encoded = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    # Sampling is disabled and no gradients are tracked during generation.
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # Each output row starts with its prompt tokens; keep only what follows.
    completions = []
    for prompt_ids, full_ids in zip(encoded.input_ids, generated):
        completions.append(full_ids[len(prompt_ids):])

    decoded = tokenizer.batch_decode(completions, skip_special_tokens=True)
    return decoded[0].strip()

In [53]:
# Read one preprocessed report from disk as a sample input; leading and
# trailing whitespace is stripped before it is stored in `text`.
with open("/home/arpanp/Fed_Vision_Language/report/files_preprocessed/p12/p12000432/s51638543.txt", "r", encoding="utf-8") as f:
            text = f.read().strip()

In [54]:
# Show the sample report exactly as it was read, before any paraphrasing.
print(text)

FINDINGS:
Slight increase in interstitial markings may be due to minimal
 interstitial edema.  No focal consolidation is seen.  There is no pleural
 effusion or pneumothorax.  The aorta is calcified and tortuous.  The cardiac
 silhouette is top normal-to-mildly enlarged.  The bones are diffusely
 osteopenic.

IMPRESSION:
Possible minimal interstitial edema.  Otherwise, no acute
 cardiopulmonary process.


In [55]:
# Paraphrase the sample report using the function's default max_new_tokens.
cleaned_text = paraphrase_mimic_report(text)



In [56]:
# Show the paraphrased version of the sample report for manual inspection.
print(cleaned_text)

FINDINGS:
Slight increase in interstitial markings. No focal consolidation is seen. There is no pleural effusion or pneumothorax. The aorta is calcified and tortuous. The cardiac silhouette is normal-to-mildly enlarged. The bones are diffusely osteopenic.

IMPRESSION:
Minimal interstitial edema is present. Otherwise, no acute cardiopulmonary process is identified.


In [1]:
import os
import shutil

def copy_incomplete_files(src_dir, processed_dir, incomplete_dir, suffix=".txt"):
    """
    Copy every source file that has no processed counterpart into a separate tree.

    The source directory is scanned recursively for files whose name ends with
    ``suffix`` (compared case-insensitively). For each one, if a file with the
    same relative path does not exist under ``processed_dir``, the source file
    is copied to the same relative path under ``incomplete_dir``.

    Args:
        src_dir (str): Path to the original source files.
        processed_dir (str): Path where the processed files are stored.
        incomplete_dir (str): Destination path for unprocessed files. If it is
            located inside ``src_dir`` it is excluded from the scan.
        suffix (str): File extension to filter text files (default: ".txt").
            Matching ignores case, so ".TXT" and ".txt" are equivalent.

    Returns:
        None. One "Copying ..." line is printed per copied file.
    """
    # File names are lowercased before the comparison, so the suffix must be
    # lowercased too; otherwise a suffix like ".TXT" could never match.
    suffix = suffix.lower()
    incomplete_root = os.path.abspath(incomplete_dir)

    for root, dirs, files in os.walk(src_dir):
        # Prune the destination tree from the walk when it lives under src_dir,
        # so copies made earlier are not treated as new source files and
        # re-copied into a nested incomplete/incomplete/... hierarchy.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != incomplete_root
        ]

        for file in files:
            if not file.lower().endswith(suffix):
                continue

            src_path = os.path.join(root, file)
            # Path relative to the source root; reused to locate both the
            # processed counterpart and the copy destination.
            rel_path = os.path.relpath(src_path, src_dir)
            processed_path = os.path.join(processed_dir, rel_path)

            if os.path.exists(processed_path):
                continue

            # This file was not processed; mirror it into the incomplete tree.
            incomplete_path = os.path.join(incomplete_dir, rel_path)
            os.makedirs(os.path.dirname(incomplete_path), exist_ok=True)
            print(f"Copying {src_path} to {incomplete_path}")
            # copy2 preserves file metadata in addition to the contents.
            shutil.copy2(src_path, incomplete_path)

In [2]:
# Tree of original report files that is scanned (passed as src_dir).
src_directory = "/home/arpanp/Fed_Vision_Language/report/files_preprocessed"
# Tree checked for an existing counterpart of each source file (passed as processed_dir).
processed_directory = "/home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned"
# Destination for copies of the files that have no counterpart (passed as incomplete_dir).
incomplete_directory = "/home/arpanp/Fed_Vision_Language/report/incomplete"

# Execute the incomplete file copying.
copy_incomplete_files(src_directory, processed_directory, incomplete_directory)



In [2]:
import os
import re

def fix_headings_in_text(text: str) -> str:
    """
    Normalize all variants of 'Findings:' and 'Impression:' (with or without **, any case)
    to uppercase 'FINDINGS:' and 'IMPRESSION:'.

    Bold markers may surround the whole heading (``**Findings:**``) or only the
    word (``**Findings**:``); both forms, as well as the plain ``findings:``,
    collapse to the bare uppercase heading. Spaces or tabs between the word and
    the colon are dropped as well.

    Args:
        text: Report text to normalize.

    Returns:
        The text with every matched heading replaced; already-normalized
        headings are left unchanged, so the function is safe to re-apply.
    """
    # Pattern list: (regex, replacement). The optional "**" after the word
    # covers markdown that closes the bold span before the colon, which the
    # word-immediately-followed-by-colon form would miss.
    patterns = (
        (r"\*{0,2}findings\*{0,2}[ \t]*:\*{0,2}", "FINDINGS:"),
        (r"\*{0,2}impression\*{0,2}[ \t]*:\*{0,2}", "IMPRESSION:"),
    )
    for pattern, replacement in patterns:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text

def process_all_reports(root_dir: str):
    """
    Normalize headings in every .txt file found under root_dir.

    The tree is walked recursively. Each .txt file (extension compared
    case-insensitively) is read as UTF-8, passed through fix_headings_in_text,
    and rewritten in place only when the result differs from what was read.
    One line is printed per rewritten file.
    """
    for folder, _subdirs, names in os.walk(root_dir):
        txt_names = (n for n in names if n.lower().endswith(".txt"))
        for name in txt_names:
            report_path = os.path.join(folder, name)
            with open(report_path, "r", encoding="utf-8") as src:
                original = src.read()

            normalized = fix_headings_in_text(original)
            if normalized == original:
                # Nothing to fix; leave the file untouched.
                continue

            with open(report_path, "w", encoding="utf-8") as dst:
                dst.write(normalized)
            print(f"Fixed headings in {report_path}")

if __name__ == "__main__":
    # Path to the already-cleaned reports; files under it are rewritten in place
    # whenever a heading gets normalized.
    root_cleaned = "/home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned"
    process_all_reports(root_cleaned)


Fixed headings in /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10234145/s56455036.txt
Fixed headings in /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10025791/s56326147.txt
Fixed headings in /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10754405/s52606630.txt
Fixed headings in /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10754405/s53479531.txt
Fixed headings in /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10754405/s52857349.txt
Fixed headings in /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10264646/s58554392.txt
Fixed headings in /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10691691/s56706450.txt
Fixed headings in /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10712190/s54273520.txt
Fixed headings in /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10839205/s57314232.txt
Fixed headings in /home/arpanp/Fed_Vision_Lang

In [1]:
import os
import re

def remove_empty_reports(root_dir: str):
    """
    Delete report files that carry neither section heading.

    Walks root_dir recursively and, for every .txt file (extension compared
    case-insensitively), reads it as UTF-8 and removes it from disk unless it
    contains 'FINDINGS:' or 'IMPRESSION:' (case-sensitive, preceded by a word
    boundary). Prints one line per removed file and a final total.
    """
    required_markers = (r'\bFINDINGS:', r'\bIMPRESSION:')
    deleted = 0

    for folder, _subdirs, names in os.walk(root_dir):
        for name in names:
            if name.lower().endswith(".txt"):
                report_path = os.path.join(folder, name)
                with open(report_path, "r", encoding="utf-8") as handle:
                    body = handle.read()

                # Keep the file as soon as either heading is present.
                if any(re.search(marker, body) for marker in required_markers):
                    continue

                os.remove(report_path)
                print(f"Removed (no FINDINGS/IMPRESSION): {report_path}")
                deleted += 1

    print(f"\nTotal files removed: {deleted}")

if __name__ == "__main__":
    # Directory containing your cleaned reports. NOTE: this call deletes files
    # from disk (os.remove) for every .txt lacking both headings.
    cleaned_reports_dir = "/home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned"
    remove_empty_reports(cleaned_reports_dir)


Removed (no FINDINGS/IMPRESSION): /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10577868/s59149000.txt
Removed (no FINDINGS/IMPRESSION): /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10208285/s54110008.txt
Removed (no FINDINGS/IMPRESSION): /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10755700/s59252898.txt
Removed (no FINDINGS/IMPRESSION): /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10802399/s50653526.txt
Removed (no FINDINGS/IMPRESSION): /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10612379/s58115883.txt
Removed (no FINDINGS/IMPRESSION): /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10582595/s56200493.txt
Removed (no FINDINGS/IMPRESSION): /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10381729/s57853169.txt
Removed (no FINDINGS/IMPRESSION): /home/arpanp/Fed_Vision_Language/report/qwen_report_cleaned/p10/p10709298/s52041528.txt
Removed (no FINDINGS/IMP

KeyboardInterrupt: 