In [26]:
import pandas as pd

# Login using e.g. `huggingface-cli login` to access this dataset
# Read the question/answer file from the Hugging Face Hub; lines=True makes pandas
# parse it as JSON Lines (one JSON record per line).
df = pd.read_json("hf://datasets/Moaaz55/skin_cancer_questions_answers/dataset.json", lines=True)
# Bare last expression: the notebook renders the DataFrame as this cell's output.
df

Unnamed: 0,Question,Answer,Input
0,What is skin cancer?,Skin cancer is a malignant neoplasia that orig...,
1,What are the main types of skin cancer?,The main types are melanoma and non-melanoma s...,
2,What is the most common type of skin cancer?,Basal cell carcinoma (BCC) is the most common ...,
3,What is the most dangerous type of skin cancer?,Malignant melanoma (MM) is the most dangerous ...,
4,What is a cutaneous carcinoma?,Cutaneous carcinomas are malignant neoplasias ...,
...,...,...,...
793,"What does the term ""perineural invasion"" refer...",The spread of cancer cells around nerves,
794,What is a major risk associated with skin canc...,Increased risk of squamous cell carcinoma,
795,What does MTX refer to?,Methotrexate,
796,What is a typical use of acitretin in transpla...,It is used for those at higher risk of skin ca...,


In [27]:
# CONVERTIR LAS RESPUESTAS A AUDIO CON LA LIBRERIA TTS
import os
import io
import hashlib
from datetime import datetime
from gtts import gTTS
from mutagen.mp3 import MP3

# CONFIGURATION PARAMETERS
# Root output folder; ensure_dirs() creates "audio" and "metadata" sub-folders under it.
OUT_DIR = "output_audio"
# Language code handed to gTTS by tts_bytes().
# NOTE(review): the answers displayed in the first cell are in English — confirm "es" is intended.
LANG = "es"
# DataFrame column whose text is converted to audio.
TEXT_COL = "Answer"

# CREATE THE AUDIO AND METADATA DIRECTORIES
def ensure_dirs(root):
    """Make sure ``<root>/audio`` and ``<root>/metadata`` exist.

    Args:
        root: Base directory under which both sub-folders live.

    Returns:
        A ``(audio_dir, metadata_dir)`` tuple with the two paths.
    """
    subdirs = tuple(os.path.join(root, name) for name in ("audio", "metadata"))
    for path in subdirs:
        # exist_ok=True keeps repeated calls harmless.
        os.makedirs(path, exist_ok=True)
    audio_dir, metadata_dir = subdirs
    return audio_dir, metadata_dir

# MD5 HASH HELPER (USED TO AVOID DUPLICATES)
def md5(s: str) -> str:
    """Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()

# CONVERT TEXT TO AUDIO BYTES
def tts_bytes(text: str, lang: str) -> bytes:
    """Synthesize ``text`` with gTTS in language ``lang`` and return the audio as bytes.

    The audio is written to an in-memory buffer, so nothing touches the disk here.
    """
    buffer = io.BytesIO()
    speech = gTTS(text=text, lang=lang)
    speech.write_to_fp(buffer)
    # getvalue() returns the whole buffer regardless of the current stream position.
    return buffer.getvalue()

# GET THE MP3 DURATION
def mp3_duration(b: bytes):
    """Return the length in seconds of the MP3 held in ``b``, or ``None`` on any failure.

    Best effort by design: any exception raised while parsing is swallowed and
    reported as ``None`` instead of propagating to the caller.
    """
    try:
        parsed = MP3(io.BytesIO(b))
        seconds = float(parsed.info.length)
    except Exception:
        return None
    return seconds

# MAIN FUNCTION: CONVERT THE ANSWERS TO AUDIO
def answers_to_audio(df: pd.DataFrame):
    """Synthesize one MP3 per non-empty answer and return an enriched copy of ``df``.

    For every row whose ``TEXT_COL`` value is present and non-blank, the text is
    converted to speech with ``tts_bytes(text, LANG)`` and stored as
    ``<OUT_DIR>/audio/answer_<md5-of-text>.mp3``. A file that already exists is
    reused instead of being regenerated. The enriched frame is also written
    twice as Parquet under ``<OUT_DIR>/metadata`` (a "latest" and a dated copy).

    Args:
        df: Input frame containing the ``TEXT_COL`` column. It is not modified.

    Returns:
        A copy of ``df`` with four extra object-dtype columns:
        ``Answer_audio_relpath`` (POSIX-style path relative to ``OUT_DIR``),
        ``Answer_duration_sec`` (float, or None when the duration is unknown),
        ``Answer_size_bytes`` (int) and ``Answer_text_md5`` (hex digest).
        Rows with a missing or blank answer keep ``None`` in all four columns.

    Raises:
        KeyError: if ``TEXT_COL`` is not a column of ``df``.
    """
    # Local import: the file-level import only brings `datetime` into scope.
    from datetime import timezone

    audio_dir, meta_dir = ensure_dirs(OUT_DIR)
    df_out = df.copy()
    total = len(df_out)

    # Collect results by *position* and attach them as columns at the end.
    # Writing with `df_out.at[i, ...]` would treat the loop counter as an index
    # label, which corrupts the output whenever the index is not 0..n-1
    # (filtered, shuffled or re-indexed frames).
    relpaths = [None] * total
    durations = [None] * total
    sizes = [None] * total
    hashes = [None] * total

    for pos, raw in enumerate(df_out[TEXT_COL].tolist()):
        # Skip missing values explicitly; str() would turn them into the
        # literal words "None"/"nan"/"<NA>" and those would be synthesized.
        if raw is None or raw is pd.NA or (isinstance(raw, float) and raw != raw):
            continue
        text = str(raw)
        if not text.strip():
            continue

        h = md5(text)
        filename = f"answer_{h}.mp3"
        abspath = os.path.join(audio_dir, filename)
        relpath = f"audio/{filename}"  # always forward slashes, also on Windows

        # Only generate when the file does not exist yet.
        # NOTE: the cache key is the text alone, so changing LANG does not
        # regenerate files that are already on disk.
        if os.path.exists(abspath):
            with open(abspath, "rb") as f:
                data = f.read()
        else:
            data = tts_bytes(text, LANG)
            # Write to a temporary name and rename, so an interrupted run never
            # leaves a truncated MP3 that later runs would treat as cached.
            tmp_path = abspath + ".part"
            with open(tmp_path, "wb") as f:
                f.write(data)
            os.replace(tmp_path, abspath)

        dur = mp3_duration(data)

        relpaths[pos] = relpath
        durations[pos] = dur
        sizes[pos] = len(data)
        hashes[pos] = h

        if (pos + 1) % 50 == 0 or pos + 1 == total:
            # mp3_duration() may return None; formatting None with ".2f" raises.
            dur_txt = f"{dur:.2f}s" if dur is not None else "?s"
            print(f"[{pos + 1}/{total}] {relpath} ({dur_txt})")

    # Add the audio metadata columns (object dtype, None for skipped rows).
    new_columns = {
        "Answer_audio_relpath": relpaths,
        "Answer_duration_sec": durations,
        "Answer_size_bytes": sizes,
        "Answer_text_md5": hashes,
    }
    for col, values in new_columns.items():
        # Same index object as df_out, so the values are attached positionally.
        df_out[col] = pd.Series(values, index=df_out.index, dtype=object)

    # Save the enriched dataset.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")  # utcnow() is deprecated
    out_latest = os.path.join(meta_dir, "answers_dataset-latest.parquet")
    out_dated = os.path.join(meta_dir, f"answers_dataset-{today}.parquet")

    df_out.to_parquet(out_latest, index=False)
    df_out.to_parquet(out_dated, index=False)

    print("\n✅ Listo")
    print("Audios →", os.path.abspath(audio_dir))
    print("Nuevo dataset →", out_latest)
    return df_out

# Visible confirmation that this cell ran and the helper functions above are defined.
print("✅ Funciones de conversión a audio cargadas correctamente")

✅ Funciones de conversión a audio cargadas correctamente


In [None]:
# Run the text-to-speech pipeline over the Q&A DataFrame; the result is the copy of
# `df` returned by answers_to_audio(), extended with the audio metadata columns.
df_amb_audio = answers_to_audio(df)

[50/798] audio/answer_5544150527b1ad26afc15a5d31f063e4.mp3 (9.41s)
[100/798] audio/answer_76b1db7a0c39310bd97d342885acf2dc.mp3 (3.86s)
[150/798] audio/answer_03de1c1bb9511a2324b5d749eb060289.mp3 (10.22s)
[200/798] audio/answer_8827568ef5f7fb84622569ceb7efa3a6.mp3 (14.40s)
[250/798] audio/answer_9db5907d11826e9999baba066259794b.mp3 (7.20s)


In [14]:
from transformers import AutoModelForVision2Seq, AutoProcessor

# Hugging Face Hub model identifier. It is only assigned here; no cell shown in this
# part of the notebook loads it yet.
# NOTE(review): the name suggests a Qwen2-VL checkpoint fine-tuned on HAM10000 — confirm.
model_id = "abaryan/DrDiag_qwen2vl_Ham10000"


In [None]:
from datasets import load_dataset
# Login using e.g. `huggingface-cli login` to access this dataset
# No split is requested, so `ds` is indexed by split name later on (e.g. ds['train']);
# the saved output of this cell reports a train split (8012 examples) and a test split (2003).
ds = load_dataset("abaryan/ham10000_bbox")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 8012/8012 [00:05<00:00, 1389.05 examples/s]
Generating test split: 100%|██████████| 2003/2003 [00:03<00:00, 522.65 examples/s]


In [None]:
# Prepare the HAM10000 data: separate the images from the tabular metadata so that
# each part can later be stored in its own MinIO location.

# The dataset has two splits (train and test); only the train split is used here.
data = ds['train']

# Split the image column from the remaining (tabular) columns.
# NOTE(review): depending on the installed `datasets` version, selecting the whole
# column may decode every image eagerly — confirm memory use is acceptable.
images = data['image']
tabular_data = data.remove_columns('image')

# Preview the first 3 images together with their metadata rows. The original cell
# announced this preview and its saved output shows it, but no code produced it.
for i in range(min(3, len(tabular_data))):
    print(images[i])
    print(tabular_data[i])
    print("---")

# TODO: push `images` and `tabular_data` to MinIO (not implemented in this cell yet).


<PIL.PngImagePlugin.PngImageFile image mode=RGB size=600x450 at 0x1205FF400>
{'lesion_id': 'HAM_0007418', 'image_id': 'ISIC_0031372', 'diagnosis': 'df', 'dx_type': 'consensus', 'age': 50.0, 'sex': 'male', 'localization': 'lower extremity', 'bbox': [235.0, 175.0, 368.0, 264.0], 'area_coverage': 0.03166111186146736}
---
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=600x450 at 0x1205FCDC0>
{'lesion_id': 'HAM_0004785', 'image_id': 'ISIC_0030788', 'diagnosis': 'nv', 'dx_type': 'follow_up', 'age': 60.0, 'sex': 'male', 'localization': 'back', 'bbox': [58.0, 0.0, 496.0, 420.0], 'area_coverage': 0.502837061882019}
---
<PIL.PngImagePlugin.PngImageFile image mode=RGB size=600x450 at 0x1205FC520>
{'lesion_id': 'HAM_0004585', 'image_id': 'ISIC_0032881', 'diagnosis': 'nv', 'dx_type': 'consensus', 'age': 55.0, 'sex': 'male', 'localization': 'unknown', 'bbox': [189.0, 131.0, 357.0, 297.0], 'area_coverage': 0.07799629867076874}
---
