In [1]:
import json
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single source of truth for the checkpoint id, so the tokenizer and the
# encoder are always loaded from the same pretrained model.
MODEL_NAME = 'emilyalsentzer/Bio_ClinicalBERT'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

  return self.fget.__get__(instance, owner)()


In [3]:
def generate_prompt(image_data):
    """
    Build one plain-text report per image from its list of findings.

    Args:
        image_data: Mapping of image identifier -> list of finding dicts.
            Each finding dict must provide a 'box' entry that unpacks into
            exactly four values and a 'class_label' entry.

    Returns:
        dict mapping each image identifier to its report string. Images whose
        finding list is empty (or otherwise falsy) get the fixed sentence
        "No abnormalities detected.".

    Raises:
        KeyError: if a finding lacks 'box' or 'class_label'.
        ValueError: if a 'box' does not unpack into four values.
    """
    reports = {}

    for image_id, findings in image_data.items():
        if findings:
            # Header first, then one bullet per finding, then the closing line.
            lines = [f"Chest X-ray for image {image_id} reveals the following findings:"]

            for entry in findings:
                coords = entry['box']
                label = entry['class_label']
                x0, y0, x1, y1 = coords
                lines.append(
                    f"- {label} observed between coordinates ({x0}, {y0}) and ({x1}, {y1})."
                )

            lines.append("No other significant abnormalities detected.")
            reports[image_id] = "\n".join(lines)
        else:
            # Nothing was reported for this image.
            reports[image_id] = "No abnormalities detected."

    return reports

In [4]:
# Location of the detection results consumed by this notebook (relative path).
json_file_path = "../segmentation_model/results.json"

In [5]:
# Read the results file in full and parse it as JSON into `image_data`.
with open(json_file_path, 'r') as results_fh:
    image_data = json.loads(results_fh.read())

In [6]:
# Turn the loaded findings into one text report per image.
clinical_reports = generate_prompt(image_data=image_data)

In [7]:
# Accumulator for the per-image results; starts out empty.
output_data = dict()

In [8]:
# The encoder only has a fixed number of position embeddings. A report with
# many findings can tokenize to more tokens than that and make the forward
# pass fail, so cap the tokenized length at the model's own limit.
max_tokens = model.config.max_position_embeddings

for image_id, report in clinical_reports.items():
    # Tokenize the report text; overly long reports are truncated to max_tokens.
    inputs = tokenizer(
        report,
        return_tensors='pt',
        truncation=True,
        max_length=max_tokens,
    )

    # Inference only: no gradients are needed. The last hidden state is
    # converted to nested Python lists so it can be serialized as JSON.
    # NOTE(review): presumably shaped (1, num_tokens, hidden_size) - confirm.
    with torch.no_grad():
        text_features = model(**inputs).last_hidden_state.cpu().numpy().tolist()

    # Store the full (untruncated) report text alongside its features.
    output_data[image_id] = {
        "report": report,
        "text_features": text_features
    }

In [None]:
# Write the collected reports and features to a JSON file, indented by 4 spaces.
output_json_path = 'text_features.json'
with open(output_json_path, mode='w') as sink:
    json.dump(output_data, sink, indent=4)