In [4]:
!pip install h5py
!pip install tqdm


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1


Importing the necessary libraries

In [5]:
import os
import shutil

import h5py
import numpy as np
import requests
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [8]:
# Forced re-download of the Galaxy10 DECaLS HDF5 file.
# Defines `base_dir`, `dataset_path` and `url`, which later cells reuse.
base_dir = "/home/astroesul/Asaf_mestrado/Códigos_atuais/Hardware - 02"
dataset_path = os.path.join(base_dir, "Galaxy10_DECals.h5")

# Delete any existing copy: this cell exists to replace a file that is
# suspected to be corrupted or incomplete.
if os.path.exists(dataset_path):
    os.remove(dataset_path)
    print("⚠️ Arquivo anterior removido (possivelmente incompleto).")

# Fresh streaming download
url = "https://zenodo.org/records/10845026/files/Galaxy10_DECals.h5"

print("📥 Baixando novamente Galaxy10_DECals (~2 GB)...")

# Stream into a temporary ".part" file so that an interrupted transfer never
# leaves a truncated file at `dataset_path` (later cells only check that the
# path exists before trusting it).
partial_path = dataset_path + ".part"

# timeout=(connect, read) in seconds: fail instead of hanging forever on a
# stalled connection.
with requests.get(url, stream=True, timeout=(10, 60)) as r:
    r.raise_for_status()
    total = int(r.headers.get("content-length", 0))
    # Content-Length describes the encoded body; only compare sizes when the
    # server did not apply a content encoding.
    size_is_comparable = total > 0 and "content-encoding" not in r.headers
    with open(partial_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

# Integrity check: the number of bytes on disk must match what the server
# announced, otherwise the download was cut short.
written = os.path.getsize(partial_path)
if size_is_comparable and written != total:
    os.remove(partial_path)
    raise IOError(f"Download incompleto: {written} de {total} bytes recebidos.")

# Atomically publish the completed file under its final name.
os.replace(partial_path, dataset_path)

# Report the final size
size_gb = os.path.getsize(dataset_path) / (1024**3)
print(f"✅ Download concluído ({size_gb:.2f} GB). Arquivo salvo em: {dataset_path}")


📥 Baixando novamente Galaxy10_DECals (~2 GB)...
✅ Download concluído (2.55 GB). Arquivo salvo em: /home/astroesul/Asaf_mestrado/Códigos_atuais/Hardware - 02/Galaxy10_DECals.h5


In [9]:
# Sanity check: list the top-level datasets stored in the HDF5 file.
# The listing should include at least 'ans' (labels) and 'images'.
with h5py.File(dataset_path, mode="r") as hdf:
    print(hdf.keys())

<KeysViewHDF5 ['ans', 'dec', 'images', 'pxscale', 'ra', 'redshift']>


In [10]:
# ==========================
# ⬇️ Dataset download
# ==========================
def _download_file(src_url, dest_path, chunk_size=1024 * 1024, timeout=(10, 60)):
    """Stream `src_url` to `dest_path` without ever exposing a partial file.

    The body is written to `dest_path + ".part"` and renamed onto `dest_path`
    only after the whole stream has been consumed.

    Args:
        src_url: HTTP(S) URL to fetch.
        dest_path: Final location of the downloaded file.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: `(connect, read)` timeout in seconds passed to `requests.get`.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    partial_path = dest_path + ".part"
    with requests.get(src_url, stream=True, timeout=timeout) as response:
        # Without this check an HTML error page would be saved as the dataset.
        response.raise_for_status()
        with open(partial_path, "wb") as out:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    out.write(chunk)
    os.replace(partial_path, dest_path)


# `dataset_path`, `url` and `base_dir` are defined in an earlier cell.
if not os.path.exists(dataset_path):
    print("📥 Baixando Galaxy10_DECals (~2 GB)...")
    _download_file(url, dataset_path)
    print(f"✅ Download concluído: {dataset_path}")
else:
    print("✅ Arquivo já existe, pulando download.")

# ==========================
# 📂 Extract images and labels
# ==========================
print("🔍 Lendo arquivo HDF5...")
with h5py.File(dataset_path, "r") as f:
    # Both datasets are materialised in memory before the file is closed.
    images = f["images"][:]
    labels = f["ans"][:]

print(f"Total de imagens: {len(images)}")
print(f"Dimensões: {images.shape}")

# ==========================
# 🪞 Map the relevant classes
# ==========================

# Galaxy10 DECaLS label order, as published in the astroNN dataset
# documentation, collapsed into three coarse morphology classes.
# Edge-on disks are grouped with the spirals; disturbed and merging
# systems form the "irregular" class.
class_map = {
    0: "irregular",  # disturbed
    1: "irregular",  # merging
    2: "smooth",     # round smooth
    3: "smooth",     # in-between round smooth
    4: "smooth",     # cigar-shaped smooth
    5: "spiral",     # barred spiral
    6: "spiral",     # unbarred tight spiral
    7: "spiral",     # unbarred loose spiral
    8: "spiral",     # edge-on disk without bulge
    9: "spiral",     # edge-on disk with bulge
}


selected_idx = [i for i, lbl in enumerate(labels) if int(lbl) in class_map]
# Fancy indexing copies the whole image array; skip the copy when every
# sample is kept (the case whenever `class_map` covers all labels).
if len(selected_idx) != len(labels):
    images = images[selected_idx]
labels = [class_map[int(lbl)] for lbl in labels[selected_idx]]

print(f"✅ Após filtragem: {len(images)} imagens em {len(set(labels))} classes.")

# ==========================
# 📁 Build the dataset layout (YOLO-CLS)
# ==========================
train_imgs, val_imgs, train_lbls, val_lbls = train_test_split(
    images, labels, test_size=0.2, stratify=labels, random_state=42
)

dataset_root = os.path.join(base_dir, "dataset_cls")

# Start from an empty output tree. File names only encode the position
# inside the split, so PNGs left over from a previous run (for example one
# made with a different class mapping) would otherwise stay in their old
# class folders and silently pollute the dataset.
if os.path.isdir(dataset_root):
    shutil.rmtree(dataset_root)

for subset, imgs, lbls in [("train", train_imgs, train_lbls), ("val", val_imgs, val_lbls)]:
    # One folder per class: <dataset_root>/<subset>/<class>/
    for lbl in set(lbls):
        os.makedirs(os.path.join(dataset_root, subset, lbl), exist_ok=True)

    print(f"💾 Salvando {subset}...")
    for i, (img, lbl) in enumerate(tqdm(zip(imgs, lbls), total=len(lbls))):
        path = os.path.join(dataset_root, subset, lbl, f"img_{i:05d}.png")
        Image.fromarray(img).save(path)

print("✅ Dataset estruturado com sucesso!")
print(f"Estrutura final: {dataset_root}")


✅ Arquivo já existe, pulando download.
🔍 Lendo arquivo HDF5...
Total de imagens: 17736
Dimensões: (17736, 256, 256, 3)
✅ Após filtragem: 17736 imagens em 3 classes.
💾 Salvando train...


100%|██████████| 14188/14188 [02:02<00:00, 115.73it/s]


💾 Salvando val...


100%|██████████| 3548/3548 [00:30<00:00, 114.55it/s]

✅ Dataset estruturado com sucesso!
Estrutura final: /home/astroesul/Asaf_mestrado/Códigos_atuais/Hardware - 02/dataset_cls



