In [1]:
import os
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Load Qwen2-VL-7B-Instruct model

In [2]:
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)

# Load the processor
min_pixels = 1000000
max_pixels = 1000000
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
)

STAGES = ['Wake', 'N1', 'N2', 'N3', 'REM']

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  2.27s/it]


# Load images generated based on ConMIL interpretations and prediction sets

In [3]:
# Load prediction set figures
sample_path = 'sleepEDF'

# Load ground truth and prediction set
gt_path = os.path.join(sample_path, "ground_truth.txt")
with open(gt_path, "r") as f:
    gt_text = f.read().strip()
    gt = STAGES.index(gt_text.split(":")[1].strip())

prediction_set = [fname.replace(".png", "") for fname in os.listdir(sample_path) if fname.endswith(".png") and fname != "org.png"]
pset_indices = [STAGES.index(code) for code in prediction_set]

# Load images for predictions
conmil_paths = [os.path.join(sample_path, f"{STAGES[idx]}.png") for idx in pset_indices]
images = [Image.open(path).convert("RGB") for path in conmil_paths]

# Generate instructions

In [4]:
# Create dynamic instruction
diagnosis_options = ", ".join([STAGES[idx] for idx in sorted(pset_indices)])
instruction = (
    f"Given a Fpz-Cz EEG and visual interpretations for the following possible sleep stages: {diagnosis_options}, "
    f"determine the most likely sleep stage. Use the provided model interpretations as reference and "
    f"base your decision solely on these visual features without additional analysis or introducing new criteria.\n\n"
    f"Provide your answer in the following format:\n"
    f"Conclusion: <Selected Sleep Stage>\n"
    f"Reason: <Brief reason for the choice based on visual features>"
)

# Prepare messages for the processor
content = [{"type": f"Model interpretation for sleep stage: {STAGES[idx]}", "image": img} for idx, img in zip(pset_indices, images)]
content.append({"type": "Instruction", "text": instruction})
messages = [{"role": "user", "content": content}]

# Process vision info and prepare inputs
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=[image_inputs],
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")  # Move inputs to GPU if using CUDA

In [5]:
# Perform inference
generated_ids = model.generate(**inputs, max_new_tokens=512)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

# Parse LLM response
response = output_text[0].strip()
diagnosis = response.split("Conclusion: ")[1].split("\n")[0].strip() if "Conclusion: " in response else "Unknown"

# Show LLM response 

In [6]:
print("Instruction:", instruction)
print("Generated Response:", response)
print("Extracted Conclusion:", diagnosis)
print("Ground Truth:", STAGES[gt])

Instruction: Given a Fpz-Cz EEG and visual interpretations for the following possible sleep stages: Wake, N1, REM, determine the most likely sleep stage. Use the provided model interpretations as reference and base your decision solely on these visual features without additional analysis or introducing new criteria.

Provide your answer in the following format:
Conclusion: <Selected Sleep Stage>
Reason: <Brief reason for the choice based on visual features>
Generated Response: Conclusion: N1
Reason: The EEG trace shows a relatively low amplitude with some theta waves, which is characteristic of the N1 sleep stage. The green bars indicate the model's attention overlay, which aligns with the N1 stage.
Extracted Conclusion: N1
Ground Truth: N1


# Now we see how LLM works without ConMIL's support

In [7]:
# Load original image
org_image_path = os.path.join(sample_path, "org.png")
org_image = Image.open(org_image_path).convert("RGB")

In [8]:
# Create instruction
instruction = (
    f"Given a Fpz-Cz EEG, determine the most likely sleep stage among the following categories:"
    f"Wake, N1, N2, N3, or REM."
    f"Base your decision solely on the visual features of the provided EEG without performing additional analysis or introducing new criteria.\n\n"
    f"Provide your answer in the following format:\n"
    f"Conclusion: <Selected Sleep Stage>\n"
    f"Reason: <Brief reason for the choice based on visual features>"
)

# Prepare messages for the processor
messages = [
    {
        "role": "user",
        "content": [
            {"type": "EEG plot", "image": org_image},
            {"type": "Instruction", "text": instruction},
        ],
    }
]

# Process vision info and prepare inputs
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=[image_inputs],
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")  # Move inputs to GPU if using CUDA

In [9]:
# Perform inference
generated_ids = model.generate(**inputs, max_new_tokens=512)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

# Parse LLM response
response = output_text[0].strip()
diagnosis = response.split("Conclusion: ")[1].split("\n")[0].strip() if "Conclusion: " in response else "Unknown"

In [10]:
print("Instruction:", instruction)
print("Generated Response:", response)
print("Extracted Diagnosis:", diagnosis)
print("Ground Truth:", STAGES[gt])

Instruction: Given a Fpz-Cz EEG, determine the most likely sleep stage among the following categories:Wake, N1, N2, N3, or REM.Base your decision solely on the visual features of the provided EEG without performing additional analysis or introducing new criteria.

Provide your answer in the following format:
Conclusion: <Selected Sleep Stage>
Reason: <Brief reason for the choice based on visual features>
Generated Response: Conclusion: N2
Reason: The EEG shows a relatively smooth and regular pattern with occasional small amplitude fluctuations, which is characteristic of stage N2 sleep.
Extracted Diagnosis: N2
Ground Truth: N1


# We can also see how LLM works with only ConMIL prediction set

In [11]:
# Create instruction
instruction = (
    f"Given a Fpz-Cz EEG, determine the most likely sleep stage among the following categories:"
    f"{diagnosis_options}"
    f"Base your decision solely on the visual features of the provided EEG without performing additional analysis or introducing new criteria.\n\n"
    f"Provide your answer in the following format:\n"
    f"Conclusion: <Selected Sleep Stage>\n"
    f"Reason: <Brief reason for the choice based on visual features>"
)

# Prepare messages for the processor
messages = [
    {
        "role": "user",
        "content": [
            {"type": "EEG plot", "image": org_image},
            {"type": "Instruction", "text": instruction},
        ],
    }
]

# Process vision info and prepare inputs
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=[image_inputs],
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")  # Move inputs to GPU if using CUDA

In [12]:
# Perform inference
generated_ids = model.generate(**inputs, max_new_tokens=512)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

# Parse LLM response
response = output_text[0].strip()
diagnosis = response.split("Conclusion: ")[1].split("\n")[0].strip() if "Conclusion: " in response else "Unknown"

In [13]:
print("Instruction:", instruction)
print("Generated Response:", response)
print("Extracted Diagnosis:", diagnosis)
print("Ground Truth:", STAGES[gt])

Instruction: Given a Fpz-Cz EEG, determine the most likely sleep stage among the following categories:Wake, N1, REMBase your decision solely on the visual features of the provided EEG without performing additional analysis or introducing new criteria.

Provide your answer in the following format:
Conclusion: <Selected Sleep Stage>
Reason: <Brief reason for the choice based on visual features>
Generated Response: Conclusion: Wake
Reason: The EEG shows a high level of variability and irregularity, with no clear patterns or consistent waveforms that are characteristic of sleep stages such as N1, N2, or REM. The absence of a consistent pattern and the high level of noise suggest that the individual was likely awake during the recording.
Extracted Diagnosis: Wake
Ground Truth: N1
