# Obtaining the data from datasources

The first part of this project is to obtain the data from the data sources we have chosen. Some of these data sources were created by us for the purposes of this project. In this notebook we explain how the data is created, where needed, and how it is collected from each data source. Specifically, we obtain the data from three different data sources:
1. [Hugging Face dataset](https://huggingface.co/datasets/Moaaz55/skin_cancer_questions_answers). This dataset is used for **TODO (Joel): explain the purpose here**.
2. Self-made audio dataset. This dataset is used for ...
3. Wikipedia web-scraping information. This dataset is used for ...

In the next sections, we discuss how each dataset is obtained so that it can then be inserted into the first zone of the pipeline.

## Huggingface Dataset

TODO: explain the Hugging Face dataset.

In [3]:
from datasets import load_dataset
# Load the `abaryan/ham10000_bbox` dataset through the `datasets` library.
# Login using e.g. `huggingface-cli login` to access this dataset
# NOTE(review): the introduction links Moaaz55/skin_cancer_questions_answers,
# but this cell loads abaryan/ham10000_bbox — confirm which dataset this
# section is meant to describe.
ds = load_dataset("abaryan/ham10000_bbox")

## Self made audio dataset

TODO: explain the self-made audio dataset.

In [None]:
import pandas as pd

# Read the question/answer dataset directly from its `hf://` path.
# `lines=True` tells pandas to parse the file as one JSON object per line.
# Login using e.g. `huggingface-cli login` to access this dataset
df = pd.read_json("hf://datasets/Moaaz55/skin_cancer_questions_answers/dataset.json", lines=True)

def limpiar_dataset(df, columna='Answer', min_len=10):
    """Return the rows of ``df`` whose ``columna`` holds a usable text value.

    The input frame is not modified: the function works on a copy in which
    ``columna`` has been cast to ``str``, and the returned frame keeps that
    string version of the column.

    A row is kept only when its text

    * is not missing,
    * is not, once surrounding whitespace is removed, empty or one of the
      placeholder spellings ``nan`` / ``None`` / ``null``, and
    * is strictly longer than ``min_len`` characters (measured on the
      unstripped text).

    Args:
        df: Source DataFrame.
        columna: Name of the text column to validate.
        min_len: Length threshold; rows need more than this many characters.

    Returns:
        The filtered DataFrame (original index labels preserved).

    Raises:
        KeyError: If ``columna`` is not a column of ``df``.
    """
    df_temp = df.copy()
    df_temp[columna] = df_temp[columna].astype(str)

    texto = df_temp[columna]
    # Strip once and reuse the result for every placeholder comparison.
    recortado = texto.str.strip()

    # The string cast usually turns missing values into the literal text
    # "nan"/"None"; the explicit notna() check covers pandas versions whose
    # string dtype keeps them as real missing values instead.
    marcadores_vacios = ['', 'nan', 'None', 'null']
    es_valido = (
        texto.notna()
        & ~recortado.isin(marcadores_vacios)
        & (texto.str.len() > min_len)
    )
    df_limpio = df_temp[es_valido]

    eliminadas = len(df) - len(df_limpio)
    print(" Limpieza completada:")
    print(f"  Original: {len(df)} filas")
    print(f"  Limpio: {len(df_limpio)} filas")
    print(f"  Eliminadas: {eliminadas} filas")

    return df_limpio


import os

# Keep only the rows with a usable answer.
df = limpiar_dataset(df)

# Draw a reproducible sample. The size is capped at the number of rows
# available because `sample(n=100)` raises ValueError when fewer than 100 rows
# survive the cleaning step.
df = df.sample(n=min(100, len(df)), random_state=42)

# Render each sampled row as a "Q: ... / A: ..." text block.
df_text = df.apply(lambda row: f"Q: {row['Question']}\nA: {row['Answer']}\n", axis=1)

# Write all blocks to a single UTF-8 file; create the output folder first so
# the export also works on a fresh checkout.
os.makedirs("../output", exist_ok=True)
with open("../output/dataset1_preguntes.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(df_text.tolist()))

In [None]:
# CONVERT THE TEXT TO AUDIO WITH THE gTTS LIBRARY
import os
import io
import hashlib
from datetime import datetime
from gtts import gTTS
from mutagen.mp3 import MP3

# CONFIGURATION PARAMETERS
# Root output folder; answers_to_audio passes it to ensure_dirs, which creates
# the "audio" and "metadata" sub-folders inside it.
OUT_DIR = "output_audio"
# Language code handed to gTTS for speech synthesis.
# NOTE(review): confirm "es" matches the language of the text in TEXT_COL.
LANG = "es"
# DataFrame column whose text is converted to audio.
TEXT_COL = "Answer"

# Create the audio and metadata directories
def ensure_dirs(root):
    """Make sure ``<root>/audio`` and ``<root>/metadata`` exist.

    Folders that are already present are left as they are.

    Args:
        root: Parent folder under which both sub-folders live.

    Returns:
        A ``(audio_dir, metadata_dir)`` tuple of path strings.
    """
    audio_dir, metadata_dir = (
        os.path.join(root, leaf) for leaf in ("audio", "metadata")
    )
    for folder in (audio_dir, metadata_dir):
        os.makedirs(folder, exist_ok=True)
    return audio_dir, metadata_dir

# Hash helper used to derive a stable file name from a text (avoids duplicates)
def md5(s: str) -> str:
    """Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()

# Convert text to audio bytes
def tts_bytes(text: str, lang: str) -> bytes:
    """Synthesize ``text`` with gTTS and return the resulting audio as bytes.

    The audio is written into an in-memory buffer, so nothing touches the
    disk here.

    Args:
        text: Text to be spoken.
        lang: Language code forwarded to gTTS.

    Returns:
        The complete content gTTS wrote into the buffer.
    """
    speech = gTTS(text=text, lang=lang)
    with io.BytesIO() as stream:
        speech.write_to_fp(stream)
        # getvalue() returns the whole buffer regardless of the cursor position.
        return stream.getvalue()

# Get MP3 duration
def mp3_duration(b: bytes):
    """Return the length reported by mutagen for the MP3 data ``b``.

    This is a best-effort helper: any failure while parsing the data or
    converting the length yields ``None`` instead of an exception.

    Args:
        b: Raw MP3 bytes.

    Returns:
        The length as a ``float``, or ``None`` when it cannot be determined.
    """
    try:
        audio = MP3(io.BytesIO(b))
        seconds = float(audio.info.length)
    except Exception:
        return None
    return seconds

# Main function to convert answers to audio
def answers_to_audio(df: pd.DataFrame):
    """Generate one MP3 per answer and return ``df`` enriched with audio metadata.

    For every non-blank text in column ``TEXT_COL`` the function derives a file
    name from the MD5 of the text, synthesizes the audio with ``tts_bytes``
    unless that file already exists under ``OUT_DIR/audio``, and records the
    relative path, duration, size in bytes and text hash in four new columns.
    The enriched frame is also saved twice as Parquet under
    ``OUT_DIR/metadata`` (a ``-latest`` file and a UTC-dated file).

    Args:
        df: DataFrame containing the ``TEXT_COL`` column. It is not modified.

    Returns:
        A copy of ``df`` with the columns ``Answer_audio_relpath``,
        ``Answer_duration_sec``, ``Answer_size_bytes`` and ``Answer_text_md5``.
        Rows whose text is blank keep ``None`` in all four.
    """
    # Local import: the surrounding cell only imports `datetime` itself.
    from datetime import timezone

    audio_dir, meta_dir = ensure_dirs(OUT_DIR)
    df_out = df.copy()

    # Add columns for audio metadata
    meta_cols = (
        "Answer_audio_relpath",
        "Answer_duration_sec",
        "Answer_size_bytes",
        "Answer_text_md5",
    )
    for col in meta_cols:
        df_out[col] = None
    col_positions = [df_out.columns.get_loc(col) for col in meta_cols]

    total = len(df_out)
    for pos, text in enumerate(df_out[TEXT_COL].astype(str)):
        if not text.strip():
            continue

        h = md5(text)
        filename = f"answer_{h}.mp3"
        abspath = os.path.join(audio_dir, filename)
        relpath = os.path.join("audio", filename)

        # Reuse the audio file when it already exists; synthesize otherwise.
        if os.path.exists(abspath):
            with open(abspath, "rb") as f:
                mp3 = f.read()
        else:
            mp3 = tts_bytes(text, LANG)
            with open(abspath, "wb") as f:
                f.write(mp3)
        size = len(mp3)
        dur = mp3_duration(mp3)

        # `pos` is a row *position*, not an index label. The frame may have
        # been filtered or sampled, so label-based `.at[pos, ...]` would write
        # to the wrong row or silently append a new one. Write by position.
        row_values = (relpath.replace("\\", "/"), dur, size, h)
        for col_pos, value in zip(col_positions, row_values):
            df_out.iat[pos, col_pos] = value

        if (pos + 1) % 50 == 0 or pos + 1 == total:
            # mp3_duration returns None on failure; formatting None with
            # `:.2f` would raise TypeError.
            dur_txt = f"{dur:.2f}s" if dur is not None else "duration unknown"
            print(f"[{pos+1}/{total}] {relpath} ({dur_txt})")

    # Save enriched dataset
    # datetime.utcnow() is deprecated; an aware UTC timestamp gives the same date.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    out_latest = os.path.join(meta_dir, "answers_dataset-latest.parquet")
    out_dated = os.path.join(meta_dir, f"answers_dataset-{today}.parquet")

    df_out.to_parquet(out_latest, index=False)
    df_out.to_parquet(out_dated, index=False)

    print("\n Listo")
    print("Audios →", os.path.abspath(audio_dir))
    print("Nuevo dataset →", out_latest)
    return df_out

# Confirmation message printed once the definitions in this cell have run.
print(" Funciones de conversión a audio cargadas correctamente")