In [None]:
# === Tag2Text captioning and tagging on the RAD dataset ===

In [None]:
from google.colab import drive
# Mount Google Drive: the repository checkout, model checkpoint and dataset
# paths used in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')
import io
import sys
import os
import json
import torch
from PIL import Image
import pandas as pd
from tqdm.notebook import tqdm


# Put the recognize-anything checkout stored on Drive on the import path so the
# `ram` package can be imported from it.
sys.path.append('/content/drive/MyDrive/PhD/Research1/recognize-anything-main')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Prefer the recognize-anything checkout on Google Drive; if importing it fails,
# clone the upstream repository locally, install it in editable mode and retry.
try:
    from ram.models import tag2text
    from ram import get_transform
    print("Successfully imported tag2text from Google Drive")
except ImportError as e:
    # NOTE(review): this branch is also taken when the Drive copy exists but one
    # of its dependencies is missing (the recorded run hit "No module named
    # 'timm'"), not only when the repository itself cannot be found.
    print(f"Error importing from Google Drive: {e}")
    print("Falling back to cloning the repository in Colab's local filesystem")
    !git clone https://github.com/xinyu1205/recognize-anything.git /content/recognize-anything
    %cd /content/recognize-anything
    # Editable install, run from inside the cloned repository directory.
    !pip install -e .
    %cd /content
    sys.path.append('/content/recognize-anything')
    from ram.models import tag2text
    from ram import get_transform

Error importing from Google Drive: No module named 'timm'
Falling back to cloning the repository in Colab's local filesystem
Cloning into '/content/recognize-anything'...
remote: Enumerating objects: 737, done.[K
remote: Counting objects: 100% (447/447), done.[K
remote: Compressing objects: 100% (223/223), done.[K
remote: Total 737 (delta 310), reused 290 (delta 224), pack-reused 290 (from 1)[K
Receiving objects: 100% (737/737), 27.14 MiB | 24.14 MiB/s, done.
Resolving deltas: 100% (397/397), done.
/content/recognize-anything
Obtaining file:///content/recognize-anything
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting clip@ git+https://github.com/openai/CLIP.git (from ram==0.0.1)
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-install-xvwk44zd/clip_818c47fbc2474fb6a792c08071d16b03
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-install-xvwk44zd/clip_818c47fbc2474fb6a792c08071d16b03
  Resolved https://github.

  return torch.cuda.amp.custom_fwd(orig_func)  # type: ignore
  return torch.cuda.amp.custom_bwd(orig_func)  # type: ignore


In [None]:

# Kernel-wide cache holding the loaded model ('tag2text') and its preprocessing
# pipeline ('transform') so they are built at most once per session.
global_models = {}

def load_models():
    """Load Tag2Text and its image transform into ``global_models`` (idempotent).

    Builds the Swin-B Tag2Text model from the checkpoint stored on Google
    Drive, switches it to eval mode, moves it to the GPU when one is available
    (CPU otherwise) and stores it under ``global_models['tag2text']``. The
    384px preprocessing transform is stored under ``global_models['transform']``.

    Both entries are registered together, and only after everything has been
    built successfully. If any step fails, the cache is left untouched and a
    later call retries from scratch instead of silently skipping the load and
    leaving ``process_image`` without a transform.

    Returns:
        None. The results are published through ``global_models``.
    """
    # process_image needs both entries, so only skip when both are present.
    if 'tag2text' in global_models and 'transform' in global_models:
        return

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Passed straight through to tag2text(); presumably tag indices to suppress
    # in the predictions -- TODO confirm against the recognize-anything sources.
    delete_tag_index = [127, 2961, 3351, 3265, 3338, 3355, 3359]
    model = tag2text(pretrained="/content/drive/MyDrive/PhD/Research1/recognize-anything-main/pretrained/tag2text_swin_14m.pth",
                     image_size=384,
                     vit='swin_b',
                     delete_tag_index=delete_tag_index)
    model.threshold = 0.68  # threshold
    model.eval()
    model = model.to(device)
    print("Tag2Text model loaded successfully")

    # Same image_size as the model above.
    transform = get_transform(image_size=384)

    # Publish only once both pieces exist, so the cache is never half-filled.
    global_models['tag2text'] = model
    global_models['transform'] = transform

def process_image(image_data):
    """Caption and tag one encoded image with Tag2Text.

    ``load_models()`` must have been called first so that
    ``global_models['tag2text']`` and ``global_models['transform']`` exist.

    Args:
        image_data: The encoded image in one of three forms: a dict with a
            ``'bytes'`` entry, a JSON string that decodes to such a dict, or
            the bytes-like payload itself.

    Returns:
        tuple: ``(tags, caption)``. ``tags`` is a list with one stripped tag
        name per element and ``caption`` is a string. If inference fails the
        result is ``([], "Error generating caption")``.

    Raises:
        Exception: Errors raised while decoding the image or applying the
            transform happen before the ``try`` block and propagate to the
            caller.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Reduce the three accepted input forms to one raw byte payload.
    if isinstance(image_data, dict):
        image_bytes = image_data['bytes']
    elif isinstance(image_data, str):
        image_bytes = json.loads(image_data)['bytes']
    else:
        image_bytes = image_data
    image_bytes = bytes(image_bytes)
    image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
    print(f"Image size: {image.size}")

    # unsqueeze(0) adds the leading batch dimension for this single image.
    image_tensor = global_models['transform'](image).unsqueeze(0).to(device)

    try:

        with torch.no_grad():
            result = global_models['tag2text'].generate(image_tensor, return_tag_predict=True)

        if isinstance(result, tuple) and len(result) == 2:
            captions, tags = result
            caption = captions[0] if captions else "No caption generated"
        else:
            tags = result[0] if isinstance(result, list) else "No tags generated"
            caption = "No caption generated"

        # The model hands tags back as ' | '-joined text, either as a bare
        # string or (as seen in practice) wrapped in a list with one string per
        # image. Flatten both forms into individual, whitespace-free tag names.
        if tags is None:
            tags = []
        elif isinstance(tags, str):
            tags = [tags]
        tags = [name.strip()
                for entry in tags
                for name in str(entry).split('|')
                if name.strip()]

        print(f"Tag2Text generated tags: {tags}")
        print(f"Tag2Text generated caption: {caption}")

        return tags, caption
    except Exception as e:
        # Best effort: report the failure and hand back an error placeholder so
        # a batch run can continue with the next image.
        print(f"Error processing image: {e}")
        return [], "Error generating caption"


In [None]:
def main():
    """Run Tag2Text over every row of the RAD test parquet and save the results.

    For each row the image is captioned and tagged with ``process_image`` and
    one record is collected with the row index, question, original answer,
    caption and tags (``blip3_caption`` is written as ``None``). Rows whose
    processing raises still get a record, with an error caption and no tags.

    Partial results are written to a temporary JSON file every few rows so an
    interrupted session does not lose the whole run; the complete list is
    written to ``processed_data.json`` at the end.

    Returns:
        None. Output is written to Google Drive.
    """
    load_models()

    input_parquet = '/content/drive/MyDrive/PhD/Research1/RADdataset/test-00000-of-00001-e5bc3d208bb4deeb.parquet'
    output_json = '/content/drive/MyDrive/PhD/Research1/RADdataset/processed_data.json'
    # Checkpoint file for partial results, refreshed every `checkpoint_every` rows.
    temp_output_json = '/content/drive/MyDrive/PhD/Research1/RADdataset/processed_data_temp.json'
    checkpoint_every = 10

    df = pd.read_parquet(input_parquet)
    print(f"Loaded {len(df)} rows from the parquet file.")

    results = []

    row_iter = tqdm(df.iterrows(), total=len(df), desc="Processing images")
    # `processed` counts rows handled so far; it is independent of the frame's
    # index labels, which are only copied into the output records.
    for processed, (index, row) in enumerate(row_iter, start=1):
        image_data = row['image']
        question = row['question']
        answer = row['answer']

        print(f"\nProcessing image {index}")
        print(f"Question: {question}")

        try:
            tags, tag2text_caption = process_image(image_data)
        except Exception as e:
            # Keep going: one undecodable image must not abort the whole batch.
            print(f"Error processing image {index}: {e}")
            tags, tag2text_caption = [], "Error generating caption"

        results.append({
            "index": index,
            "question": question,
            "original_answer": answer,
            "tag2text_caption": tag2text_caption,
            "tags": tags,
            "blip3_caption": None
        })

        if processed % checkpoint_every == 0:
            try:
                with open(temp_output_json, 'w', encoding='utf-8') as f:
                    json.dump(results, f, indent=2)
                print(f"Temporary results saved to {temp_output_json} after processing {processed} samples")
            except OSError as save_error:
                # A failed checkpoint is not fatal; the final save still runs.
                print(f"Error saving temporary results: {save_error}")

        print("-" * 50)

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    print(f"Processing complete. Results saved to {output_json}")

In [None]:

# Run the RAD pipeline when this cell is executed directly.
if __name__ == "__main__":
    main()

Using device: cuda
/encoder/layer/0/crossattention/self/query is tied
/encoder/layer/0/crossattention/self/key is tied
/encoder/layer/0/crossattention/self/value is tied
/encoder/layer/0/crossattention/output/dense is tied
/encoder/layer/0/crossattention/output/LayerNorm is tied
/encoder/layer/0/intermediate/dense is tied
/encoder/layer/0/output/dense is tied
/encoder/layer/0/output/LayerNorm is tied
/encoder/layer/1/crossattention/self/query is tied
/encoder/layer/1/crossattention/self/key is tied
/encoder/layer/1/crossattention/self/value is tied
/encoder/layer/1/crossattention/output/dense is tied
/encoder/layer/1/crossattention/output/LayerNorm is tied
/encoder/layer/1/intermediate/dense is tied
/encoder/layer/1/output/dense is tied
/encoder/layer/1/output/LayerNorm is tied
--------------
/content/drive/MyDrive/PhD/Research1/recognize-anything-main/pretrained/tag2text_swin_14m.pth
--------------
load checkpoint from /content/drive/MyDrive/PhD/Research1/recognize-anything-main/pretr

Processing images:   0%|          | 0/451 [00:00<?, ?it/s]


Processing image 0
Question: is there evidence of an aortic aneurysm?
Image size: (1024, 1291)
Tag2Text generated tags: ['chest']
Tag2Text generated caption: a chest xray of a person's chest
--------------------------------------------------

Processing image 1
Question: is there airspace consolidation on the left side?
Image size: (480, 503)
Tag2Text generated tags: ['chest | patient']
Tag2Text generated caption: a chest xray of a patient with a broken right middle x ray
--------------------------------------------------

Processing image 2
Question: is there any intraparenchymal abnormalities in the lung fields?
Image size: (1024, 1024)
Tag2Text generated tags: ['chest']
Tag2Text generated caption: a chest xray of a person's chest
--------------------------------------------------

Processing image 3
Question: which side of the heart border is obscured?
Image size: (867, 979)
Tag2Text generated tags: ['chest | patient']
Tag2Text generated caption: a chest xray of a patient with a he

In [None]:
# === Tag2Text captioning and tagging on the SLAKE dataset ===

In [None]:

from google.colab import drive
drive.mount('/content/drive')  # repo checkout, checkpoint and dataset paths used below live under MyDrive
import io
import sys
import os
import json
import torch
from PIL import Image
import pandas as pd
from tqdm.notebook import tqdm



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

sys.path.append('/content/drive/MyDrive/PhD/Research1/recognize-anything-main')
try:

    import importlib
    if not importlib.util.find_spec("fairscale"):
        print("Installing fairscale...")
        !pip install fairscale
    if not importlib.util.find_spec("timm"):
        print("Installing timm...")
        !pip install timm
    if not importlib.util.find_spec("transformers"):
        print("Installing transformers...")
        !pip install transformers

    from ram.models import tag2text
    from ram import get_transform
    print("Successfully imported tag2text from Google Drive")
except ImportError as e:
    print(f"Error importing from Google Drive: {e}")
    print("Falling back to cloning the repository in Colab's local filesystem")
    !git clone https://github.com/xinyu1205/recognize-anything.git /content/recognize-anything
    %cd /content/recognize-anything

    !pip install fairscale timm transformers
    !pip install -e .
    %cd /content
    sys.path.append('/content/recognize-anything')
    from ram.models import tag2text
    from ram import get_transform


global_models = {}



Successfully imported tag2text from Google Drive


In [None]:
def load_models():
    """Load Tag2Text and its image transform into ``global_models`` (idempotent).

    Builds the Swin-B Tag2Text model from the checkpoint stored on Google
    Drive, switches it to eval mode, moves it to the GPU when one is available
    (CPU otherwise) and stores it under ``global_models['tag2text']``. The
    384px preprocessing transform is stored under ``global_models['transform']``.

    Both entries are registered together, and only after everything has been
    built successfully. If any step fails, the cache is left untouched and a
    later call retries from scratch instead of silently skipping the load and
    leaving ``process_image`` without a transform.

    Returns:
        None. The results are published through ``global_models``.
    """
    # process_image needs both entries, so only skip when both are present.
    if 'tag2text' in global_models and 'transform' in global_models:
        return

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Passed straight through to tag2text(); presumably tag indices to suppress
    # in the predictions -- TODO confirm against the recognize-anything sources.
    delete_tag_index = [127, 2961, 3351, 3265, 3338, 3355, 3359]
    model = tag2text(pretrained="/content/drive/MyDrive/PhD/Research1/recognize-anything-main/pretrained/tag2text_swin_14m.pth",
                     image_size=384,
                     vit='swin_b',
                     delete_tag_index=delete_tag_index)
    model.threshold = 0.68
    model.eval()
    model = model.to(device)
    print("Tag2Text model loaded successfully")

    # Same image_size as the model above.
    transform = get_transform(image_size=384)

    # Publish only once both pieces exist, so the cache is never half-filled.
    global_models['tag2text'] = model
    global_models['transform'] = transform

def process_image(image_path):
    """Caption and tag the image stored at ``image_path`` with Tag2Text.

    ``load_models()`` must have been called first so that
    ``global_models['tag2text']`` and ``global_models['transform']`` exist.

    Args:
        image_path: Filesystem path of the image to process.

    Returns:
        tuple: ``(tags, caption)``. ``tags`` is a list with one stripped tag
        name per element and ``caption`` is a string. No exception escapes:
        a missing file yields ``([], "Error: Image file not found")`` and any
        other failure yields ``([], "Error generating caption: <message>")``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    try:
        print(f"Loading image from: {image_path}")

        if not os.path.exists(image_path):
            print(f"ERROR: Image file not found: {image_path}")
            return [], "Error: Image file not found"

        image = Image.open(image_path).convert('RGB')
        print(f"Image successfully loaded. Size: {image.size}")

        print("Applying image transformation...")
        # unsqueeze(0) adds the leading batch dimension for this single image.
        image_tensor = global_models['transform'](image).unsqueeze(0).to(device)
        print(f"Image transformed to tensor of shape: {image_tensor.shape}")

        print("Running Tag2Text model inference...")
        with torch.no_grad():
            result = global_models['tag2text'].generate(image_tensor, return_tag_predict=True)
        print("Tag2Text inference completed")

        if isinstance(result, tuple) and len(result) == 2:
            captions, tags = result
            caption = captions[0] if captions else "No caption generated"
            print(f"Got caption: {caption}")
        else:
            tags = result[0] if isinstance(result, list) else "No tags generated"
            caption = "No caption generated"
            print("No caption in result, only tags")

        # The model hands tags back as ' | '-joined text, either as a bare
        # string or (as seen in practice) wrapped in a list with one string per
        # image. Flatten both forms into individual, whitespace-free tag names.
        if tags is None:
            tags = []
        elif isinstance(tags, str):
            tags = [tags]
        tags = [name.strip()
                for entry in tags
                for name in str(entry).split('|')
                if name.strip()]

        print(f"Tag2Text generated tags: {tags}")
        print(f"Tag2Text generated caption: {caption}")

        return tags, caption
    except Exception as e:
        # Best effort: log the full traceback and hand back an error
        # placeholder so a batch run can continue with the next image.
        import traceback
        print(f"Error processing image: {e}")
        print(traceback.format_exc())
        return [], f"Error generating caption: {str(e)}"

def main():
    """Run Tag2Text over the English SLAKE test samples and save the results.

    Reads ``test.json`` from the SLAKE directory, keeps the entries whose
    ``q_lang`` is ``'en'``, captions and tags each sample's image and collects
    one record per sample (``blip3_caption`` is written as ``None``).

    Several questions can refer to the same image, so successful results are
    cached per resolved image path and inference runs once per image rather
    than once per question.

    Partial results are written to a temporary JSON file every 10 samples. The
    full list is written to the output directory at the end; if that write
    fails, a backup copy is written under ``/content``.

    Returns:
        None. Output is written to disk.
    """
    load_models()

    slake_base_dir = '/content/drive/MyDrive/PhD/Research1/slakedataset/Slake1.0'
    json_path = os.path.join(slake_base_dir, 'test.json')
    output_dir = '/content/drive/MyDrive/PhD/Research1/output'
    output_json = os.path.join(output_dir, 'slake_tag2text_results.json')

    os.makedirs(output_dir, exist_ok=True)

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    english_samples = [item for item in data if item.get('q_lang') == 'en']
    print(f"Loaded {len(english_samples)} English samples from test.json")

    results = []

    # (tags, caption) per resolved image path. Reuse assumes inference gives the
    # same answer for the same image (model in eval mode, run under no_grad).
    caption_cache = {}

    temp_output_json = os.path.join(output_dir, 'slake_tag2text_results_temp.json')

    for idx, sample in tqdm(enumerate(english_samples), total=len(english_samples), desc="Processing images"):
        img_name = sample.get('img_name', '')
        img_id = sample.get('img_id', '')
        question = sample.get('question', '')
        answer = sample.get('answer', '')
        modality = sample.get('modality', '')

        img_path = os.path.join(slake_base_dir, 'imgs', img_name)

        print(f"\nProcessing image {idx} (ID: {img_id})")
        print(f"Image path: {img_path}")
        print(f"Question: {question}")
        print(f"Modality: {modality}")

        try:
            if not os.path.exists(img_path):
                print(f"WARNING: Image file not found at {img_path}")

                # Try other directory names the images might be stored under.
                alternative_paths = [
                    os.path.join(slake_base_dir, 'img', img_name),
                    os.path.join(slake_base_dir, 'images', img_name)
                ]
                for alt_path in alternative_paths:
                    if os.path.exists(alt_path):
                        img_path = alt_path
                        print(f"Found image at alternative path: {img_path}")
                        break

            if img_path in caption_cache:
                cached_tags, tag2text_caption = caption_cache[img_path]
                # Copy so records never share one mutable list.
                tags = list(cached_tags)
                print(f"Reusing cached Tag2Text result for {img_path}")
            else:
                tags, tag2text_caption = process_image(img_path)
                # process_image reports failures through captions starting with
                # "Error"; leave those uncached so a later sample can retry.
                if not str(tag2text_caption).startswith("Error"):
                    caption_cache[img_path] = (list(tags), tag2text_caption)
        except Exception as e:
            import traceback
            print(f"Error processing sample {idx}, image {img_name}: {e}")
            print(traceback.format_exc())
            tags, tag2text_caption = [], f"Error generating caption: {str(e)}"

        results.append({
            "id": idx,
            "img_id": img_id,
            "img_name": img_name,
            "question": question,
            "original_answer": answer,
            "modality": modality,
            "tag2text_caption": tag2text_caption,
            "tags": tags,
            "blip3_caption": None
        })

        # Checkpoint every 10 samples so an interrupted run keeps most results.
        if (idx + 1) % 10 == 0:
            try:
                with open(temp_output_json, 'w', encoding='utf-8') as f:
                    json.dump(results, f, indent=2)
                print(f"Temporary results saved to {temp_output_json} after processing {idx+1} samples")
            except Exception as save_error:
                print(f"Error saving temporary results: {save_error}")

        print("-" * 50)

    try:
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        print(f"Processing complete. Results saved to {output_json}")
    except Exception as save_error:
        print(f"Error saving final results: {save_error}")

        # Last resort: keep a copy on the local filesystem.
        backup_output = os.path.join('/content', 'slake_tag2text_results_backup.json')
        with open(backup_output, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        print(f"Results saved to backup location: {backup_output}")


# Run the SLAKE pipeline when this cell is executed directly.
if __name__ == "__main__":
    main()

Using device: cuda


BertLMHeadModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


/encoder/layer/0/crossattention/self/query is tied
/encoder/layer/0/crossattention/self/key is tied
/encoder/layer/0/crossattention/self/value is tied
/encoder/layer/0/crossattention/output/dense is tied
/encoder/layer/0/crossattention/output/LayerNorm is tied
/encoder/layer/0/intermediate/dense is tied
/encoder/layer/0/output/dense is tied
/encoder/layer/0/output/LayerNorm is tied
/encoder/layer/1/crossattention/self/query is tied
/encoder/layer/1/crossattention/self/key is tied
/encoder/layer/1/crossattention/self/value is tied
/encoder/layer/1/crossattention/output/dense is tied
/encoder/layer/1/crossattention/output/LayerNorm is tied
/encoder/layer/1/intermediate/dense is tied
/encoder/layer/1/output/dense is tied
/encoder/layer/1/output/LayerNorm is tied
--------------
/content/drive/MyDrive/PhD/Research1/recognize-anything-main/pretrained/tag2text_swin_14m.pth
--------------
load checkpoint from /content/drive/MyDrive/PhD/Research1/recognize-anything-main/pretrained/tag2text_swin

Processing images:   0%|          | 0/1061 [00:00<?, ?it/s]

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
--------------------------------------------------

Processing image 730 (ID: 471)
Image path: /content/drive/MyDrive/PhD/Research1/slakedataset/Slake1.0/imgs/xmlab471/source.jpg
Question: Where is the brain tumor?
Modality: MRI
Loading image from: /content/drive/MyDrive/PhD/Research1/slakedataset/Slake1.0/imgs/xmlab471/source.jpg
Image successfully loaded. Size: (240, 240)
Applying image transformation...
Image transformed to tensor of shape: torch.Size([1, 3, 384, 384])
Running Tag2Text model inference...
Tag2Text inference completed
Got caption: an image of the human brain
Tag2Text generated tags: ['brain | figure | image | photo']
Tag2Text generated caption: an image of the human brain
--------------------------------------------------

Processing image 731 (ID: 471)
Image path: /content/drive/MyDrive/PhD/Research1/slakedataset/Slake1.0/imgs/xmlab471/source.jpg
Question: What is the organ system visualized?
Modality: MRI
Loading image from: 