In [1]:
import boto3
import os

In [2]:
acces_key_id = "minioadmin"
secret_access_key = "minioadmin"
minio_url = "http://localhost:9000"

minio_client = boto3.client(
    "s3",
    aws_access_key_id = acces_key_id,
    aws_secret_access_key = secret_access_key,
    endpoint_url = minio_url
)

In [5]:
import io
import pandas as pd
from tqdm import tqdm

bucket_origen = "persistent-landing"
bucket_desti = "formatted-zone"
prefix_origen = "tabular/"

# Si saps els tipus esperats, pots definir-los aquí
TIPUS_ESPERATS = {
    "lesion_id": "string",
    "image_id": "string",
    "diagnosis": "string",
    "dx_type": "string",
    "age": "float",
    "sex": "string",
    "localization": "string",
    "bbox": "string",
    "area_coverage": "float"
}

# Funció de neteja
def netejar_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    # Noms de columnes nets
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

    #Forçar tipus
    for col, tipus in TIPUS_ESPERATS.items():
        if col in df.columns:
            try:
                if tipus == "float":
                    df[col] = pd.to_numeric(df[col], errors="coerce")
                elif tipus == "string":
                    df[col] = df[col].astype("string")
            except Exception as e:
                print(f"Error convertint {col}: {e}")

    #Tractament de valors nuls
    df = df.fillna("unknown")

    #Normalitzar valors de text
    if "sex" in df.columns:
        df["sex"] = df["sex"].str.lower().replace({
            "m": "male", "f": "female", "man": "male", "woman": "female"
        })

    #Eliminar duplicats
    df = df.drop_duplicates()

    return df

#Processar tots els fitxers del bucket
paginator = minio_client.get_paginator("list_objects_v2")

for page in paginator.paginate(Bucket=bucket_origen, Prefix=prefix_origen):
    for obj in tqdm(page.get("Contents", []), desc="Processant fitxers tabulars"):
        key = obj["Key"]
        filename = key.split("/")[-1]

        if not filename.lower().endswith(".csv"):
            continue

        # Llegir CSV des del bucket
        response = minio_client.get_object(Bucket=bucket_origen, Key=key)
        content = response["Body"].read()
        df = pd.read_csv(io.BytesIO(content), encoding="utf-8")

        # Netejar DataFrame
        df_clean = netejar_dataframe(df)
        df_clean = df_clean[~df_clean.isin(["unknown"]).any(axis=1)]  
        
        print(df)

        # Convertir a Parquet
        parquet_buffer = io.BytesIO()
        df_clean.to_parquet(parquet_buffer, index=False)
        parquet_buffer.seek(0)

        # Pujar a formatted-zone
        new_key = f"tabular/{filename.replace('.csv', '.parquet')}"
        minio_client.put_object(
            Bucket=bucket_desti,
            Key=new_key,
            Body=parquet_buffer
        )

Processant fitxers tabulars:   0%|          | 0/1 [00:00<?, ?it/s]

Processant fitxers tabulars: 100%|██████████| 1/1 [00:00<00:00,  3.78it/s]

        lesion_id      image_id diagnosis    dx_type   age     sex  \
0     HAM_0007418  ISIC_0031372        df  consensus  50.0    male   
1     HAM_0004785  ISIC_0030788        nv  follow_up  60.0    male   
2     HAM_0004585  ISIC_0032881        nv  consensus  55.0    male   
3     HAM_0000086  ISIC_0028081        nv  follow_up  45.0  female   
4     HAM_0004204  ISIC_0033772        nv  consensus  40.0  female   
...           ...           ...       ...        ...   ...     ...   
8007  HAM_0000593  ISIC_0031401       mel      histo  50.0    male   
8008  HAM_0000420  ISIC_0026864        nv  follow_up  40.0    male   
8009  HAM_0004821  ISIC_0031885        nv  follow_up  50.0    male   
8010  HAM_0007400  ISIC_0027329        nv  follow_up  50.0  female   
8011  HAM_0003157  ISIC_0031578     akiec      histo  65.0    male   

         localization                   bbox  area_coverage  
0     lower extremity  [235. 175. 368. 264.]       0.031661  
1                back  [ 58.   0. 4


