In [6]:
import torch
import gradio as gr
from llava.model.builder import load_pretrained_model
from llava.mm_utils import process_images
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.utils import disable_torch_init

disable_torch_init()

model_path = "llava-medmnist-ftmodel"
tokenizer, model, image_processor, _ = load_pretrained_model(
    model_path,
    None,
    model_name="llava-v1.5-13b",
    device="cuda",
    device_map="auto",
    load_8bit=False,
    load_4bit=False
)

print(f"Model type: {type(model)}")
print(f"Tokenizer type: {type(tokenizer)}")
print(f"Image processor type: {type(image_processor)}")

def generate_response(image, prompt):
    conv = conv_templates["v1"].copy()
    roles = conv.roles

    image_tensor = process_images([image], image_processor, model.config)
    inp = f"{roles[0]}: {prompt}\n{roles[1]}:"
    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], None)

    input_ids = tokenizer(conv.get_prompt()).input_ids
    input_ids = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
    input_ids = torch.tensor(input_ids).unsqueeze(0).cuda()

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.cuda(),
            do_sample=True,
            temperature=0.2,
            max_new_tokens=1024,
        )

    response = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    conv.messages[-1][-1] = response
    return response

iface = gr.Interface(
    fn=generate_response,
    inputs=[gr.Image(type="pil"), "text"],
    outputs="text",
    title="LLaVA-MedMNIST Pathology Classifier",
    description="Upload a medical image and ask about its pathology class."
)

iface.launch()



Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some weights of the model checkpoint at llava-medmnist-ftmodel were not used when initializing LlavaLlamaForCausalLM: ['model.vision_tower.vision_tower.vision_model.embeddings.class_embedding', 'model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight', 'model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.

Model type: <class 'llava.model.language_model.llava_llama.LlavaLlamaForCausalLM'>
Tokenizer type: <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
Image processor type: <class 'transformers.models.clip.image_processing_clip.CLIPImageProcessor'>
Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




IMPORTANT: You are using gradio version 4.16.0, however version 4.29.0 is available, please upgrade.
--------


Traceback (most recent call last):
  File "/home/ubuntu/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/queueing.py", line 495, in call_prediction
    output = await route_utils.call_process_api(
  File "/home/ubuntu/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/route_utils.py", line 232, in call_process_api
    output = await app.get_blocks().process_api(
  File "/home/ubuntu/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/blocks.py", line 1561, in process_api
    result = await self.call_function(
  File "/home/ubuntu/.pyenv/versions/3.10.14/lib/python3.10/site-packages/gradio/blocks.py", line 1179, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/home/ubuntu/.pyenv/versions/3.10.14/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/home/ubuntu/.pyenv/versions/3.10.14/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", lin

In [3]:
print(f"Model type: {type(model)}")
print(f"Tokenizer type: {type(tokenizer)}")
print(f"Image processor type: {type(image_processor)}")

Model type: <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
Tokenizer type: <class 'llava.model.language_model.llava_llama.LlavaLlamaForCausalLM'>
Image processor type: <class 'transformers.models.clip.image_processing_clip.CLIPImageProcessor'>


In [5]:
torch.cuda.empty_cache()

In [7]:
pip install huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [9]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
from huggingface_hub import HfApi, create_repo

# Initialize the Hugging Face API
api = HfApi()

# Set your username and the desired repository name
username = "athreesh"
repo_name = "llava-medmnist"
repo_id = f"{username}/{repo_name}"

# Create a new repository
create_repo(repo_id, repo_type="model", private=False)

# Upload the model files
api.upload_folder(
    folder_path="./llava-medmnist-ftmodel",
    repo_id=repo_id,
    repo_type="model",
)

print(f"Model uploaded to: https://huggingface.co/{repo_id}")

model-00001-of-00006.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Upload 7 LFS files:   0%|          | 0/7 [00:00<?, ?it/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

In [13]:
# Create a README file
readme_content = """
# LLaVA-MedMNIST

This is a fine-tuned version of LLaVA on the MedMNIST dataset for medical image analysis.

## Model description

Trained on MedMNIST dataset (https://medmnist.com/) which contains 18x Standardized Datasets for 2D / 3D Biomedical Image Classification
## Intended uses & limitations

[Explain what tasks the model is good for and any limitations]

## Training data

[Describe the MedMNIST dataset you used]

## Training procedure

[Explain your training procedure, hyperparameters, etc.]

## Evaluation results

[Share any evaluation metrics or results]
"""

with open("README.md", "w") as f:
    f.write(readme_content)

# Upload the README file
api.upload_file(
    path_or_fileobj="README.md",
    path_in_repo="README.md",
    repo_id=repo_id,
    repo_type="model"
)

- empty or missing yaml metadata in repo card


CommitInfo(commit_url='https://huggingface.co/athreesh/llava-medmnist/commit/153727a58314e6b7e7eee1092f01406783eefed5', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='153727a58314e6b7e7eee1092f01406783eefed5', pr_url=None, pr_revision=None, pr_num=None)