In [None]:
# =============
# CARGA DE LIBRERIAS

import os
import pickle
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from decimal import ROUND_FLOOR, Decimal
from getpass import getpass

import pandas as pd
import requests
from astroquery.ipac.ned import Ned
from astroquery.utils.tap.core import TapPlus
from SciServer import Authentication
from SciServer import CasJobs as cj

# Log in to SciServer.
# Credentials are read from the SCISERVER_USERNAME / SCISERVER_PASSWORD
# environment variables so they never have to be stored in the notebook;
# when a variable is missing the user is prompted interactively instead.
username = os.environ.get("SCISERVER_USERNAME") or input("SciServer username: ")
password = os.environ.get("SCISERVER_PASSWORD") or getpass("SciServer password: ")
Authentication.login(UserName=username, Password=password)

In [None]:
# =============
# BUCLE PARA OBTENER LOS .fits DE LA SDSS A PARTIR DE UN CIERTO REDSHIFT MINIMO ACTUALIZADO EN CADA ITERACION

# Modificar redshift minimo
# ultimo_redshift = 0
# with open('extra/redshift_init.pickle', 'wb') as handle:
#     pickle.dump(ultimo_redshift, handle)

# Initial parameters
batch_size_inicial = 50000       # initial batch size
min_batch_size = 1000            # smallest batch size allowed
max_retries = 10                 # maximum number of retries per batch
offset = 0                       # rows retrieved so far; the loop stops at 1,000,000
all_results = []
batch_results = []
precision_redshift = Decimal("0.000001")   # granularity used to page through z

while offset < 1000000:

    # NOTE: pickle.load can execute arbitrary code; only read files written by this notebook.
    with open('extra/redshift_init.pickle', 'rb') as handle:
        redshift_init = pickle.load(handle)

    # Floor the lower bound to 6 decimals. Flooring (never rounding up) guarantees no
    # row is skipped, and going through Decimal(repr(...)) stays correct for values
    # that print in scientific notation (e.g. 5.1e-05), where slicing str(z) would
    # silently produce a completely different number.
    redshift_init = float(
        Decimal(repr(float(redshift_init))).quantize(precision_redshift, rounding=ROUND_FLOOR)
    )
    current_batch_size = batch_size_inicial
    retries = 0
    success = False

    # Try the query with the current batch size; on error halve the batch and retry.
    # Only the remote call is inside the try, so local errors are not mistaken for
    # query failures.
    while not success and retries < max_retries:
        sql_query = f"""
        SELECT
            p.objid,
            s.specObjID,
            dbo.fGetUrlFitsSpectrum(s.specObjID) AS fits_url,
            s.z AS redshift
        FROM
            PhotoObj AS p
        JOIN
            SpecObj AS s ON s.bestobjid = p.objid
        WHERE
            s.z BETWEEN {redshift_init:.6f} AND 9
            AND s.zWarning = 0
            AND (s.class = 'QSO' OR s.class = 'GALAXY')
        ORDER BY s.z
        OFFSET 0 ROWS
        FETCH NEXT {current_batch_size} ROWS ONLY
        """
        try:
            batch_results = pd.DataFrame(cj.executeQuery(sql_query, context="DR18"))
            success = True
        except Exception as e:
            print(f"Error en la consulta con batch_size = {current_batch_size}: {e}")
            # Shrink the batch, but never below min_batch_size
            current_batch_size = max(min_batch_size, current_batch_size // 2)
            retries += 1
            time.sleep(2)  # brief pause before retrying

    # Check failure first: after a failed batch `batch_results` still holds stale
    # data (or the initial empty list) and must not be interpreted.
    if not success:
        print("No se pudo recuperar el batch tras varios reintentos. Terminando.")
        break

    if len(batch_results) == 0:
        print("No hay más registros para recuperar.")
        break

    all_results.append(batch_results)
    offset += len(batch_results)

    with open('extra/all_results.pickle', 'wb') as handle:
        pickle.dump(all_results, handle)
    print(f"Recuperados {offset} registros hasta ahora.")

    # The next page starts at the floored redshift of the last row. Because the
    # bound is inclusive, consecutive batches overlap and contain duplicate rows.
    ultimo_redshift = float(
        Decimal(repr(float(batch_results["redshift"].iloc[-1]))).quantize(
            precision_redshift, rounding=ROUND_FLOOR
        )
    )
    print(f"Redshift inicial: {redshift_init}")
    print(f"Redshift último: {ultimo_redshift}")
    if ultimo_redshift <= redshift_init:
        # No progress: the whole batch sits in one 1e-6 bin. Nudge the bound forward
        # to avoid an infinite loop (remaining rows in that bin are skipped).
        ultimo_redshift = round(redshift_init + 0.000002, 6)
        print(f"Se incrementó el redshift a {ultimo_redshift}")

    with open('extra/redshift_init.pickle', 'wb') as handle:
        pickle.dump(ultimo_redshift, handle)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only read files written by this notebook.
with open('extra/all_results.pickle', 'rb') as handle:
    all_results = pickle.load(handle)

# Combine every non-empty batch. Consecutive batches overlap (each page restarts at
# the truncated redshift of the previous one with an inclusive bound), so repeated
# rows are dropped to avoid downloading the same spectrum twice.
lotes_no_vacios = [lote for lote in all_results if len(lote) > 0]
if not lotes_no_vacios:
    raise ValueError("extra/all_results.pickle no contiene ningún registro.")
combined_results = (
    pd.concat(lotes_no_vacios, ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
print(f"Total de registros recuperados: {len(combined_results)}")

# Extract the FITS file URLs
fits_urls = combined_results["fits_url"].tolist()
print(f"Se han obtenido {len(fits_urls)} URLs de FITS.")

# Resume point written back for the query cell.
# NOTE(review): this is the FIRST redshift of the SECOND-TO-LAST stored batch (the
# last one when only a single batch exists), although the message calls it the
# last redshift retrieved - confirm the rewind is intentional.
ultimo_redshift = all_results[len(all_results) - 2]["redshift"].iloc[0]
print(f"Último redshift recuperado: {ultimo_redshift:.7f}")

with open('extra/redshift_init.pickle', 'wb') as handle:
    pickle.dump(ultimo_redshift, handle)

In [None]:
# =============
# DESCARGA EN PARALELO DE LOS .fits OBTENIDOS

# Download directory; created up front so the download workers can write into it.
output_dir = "spectrums"
os.makedirs(name=output_dir, exist_ok=True)

# File names whose download finished without error (filled by the download loop).
archivos_descargados = list()

def download_file(url):
    """Download ``url`` into ``output_dir`` unless the file is already there.

    The payload is first written to ``<name>.part`` and only renamed to its final
    name once complete, so an interrupted write never leaves a truncated file that
    a later run would skip as "already downloaded".

    Parameters
    ----------
    url : str
        Address of the file; its last path component is used as the local name.

    Returns
    -------
    tuple
        ``(filename, None)`` when the file was downloaded or already existed,
        ``(filename, exception)`` when the download failed.
    """
    filename = os.path.basename(url)
    local_path = os.path.join(output_dir, filename)

    # Skip files that were already downloaded
    if os.path.exists(local_path):
        print(f"Saltando {filename} (ya descargado).")
        return filename, None

    partial_path = local_path + ".part"
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(partial_path, "wb") as f:
            f.write(response.content)
        os.replace(partial_path, local_path)
        print(f"Descargado: {filename}")
        return filename, None
    except Exception as e:
        # Best-effort cleanup of the incomplete temporary file
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        print(f"Error al descargar {url}: {e}")
        return filename, e

# Number of parallel downloads
paralelo = 100
i = 0

# The same URL can be listed more than once (redshift pages overlap). Submit each
# URL a single time so two workers never write the same file concurrently;
# dict.fromkeys keeps the original order.
urls_unicas = list(dict.fromkeys(fits_urls))

with ThreadPoolExecutor(max_workers=paralelo) as executor:
    future_to_url = {executor.submit(download_file, url): url for url in urls_unicas}

    # Report progress as each download finishes and collect the successful ones
    for i, future in enumerate(as_completed(future_to_url), start=1):
        print(f"Descargando archivo {i} de {len(urls_unicas)}")
        filename, error = future.result()
        if error is None:
            archivos_descargados.append(filename)

print("Descarga finalizada")

In [None]:
# =============
# CODIGO COMPACTO PARA LA DESCARGA DE ESPECTROS DE LA NED

# Funciones y configuración inicial

def sanitize(name):
    """Return ``name`` with every character other than ASCII letters, digits,
    ``_`` and ``-`` replaced by ``_`` (keeps file names valid on Windows)."""
    caracteres_invalidos = re.compile(r'[^A-Za-z0-9_-]')
    return caracteres_invalidos.sub('_', name)

output_dir    = "spectrums NASA"
redshift_path = 'extra/redshift_init_ned.pickle'
os.makedirs(output_dir, exist_ok=True)

# Reset the minimum redshift. This runs every time the cell is executed, so the
# pagination below always restarts from this value.
redshift_init = 0
with open(redshift_path, 'wb') as handle:
    pickle.dump(redshift_init, handle)

batch_size_in = 10000      # initial ADQL batch size
min_batch_ned = 1000       # smallest batch size used after repeated failures
max_retries   = 5          # retries per batch
precision_redshift = Decimal("0.000001")   # granularity used to page through z

all_meta = []              # accumulated metadata batches

# Page through the metadata by redshift
while True:

    # NOTE: pickle.load can execute arbitrary code; only read files written by this notebook.
    with open(redshift_path, 'rb') as f:
        # Floor to 6 decimals. Going through Decimal(repr(...)) stays correct for
        # values printed in scientific notation, where slicing str(z) would yield
        # a completely different number; flooring never skips a row.
        redshift_init = float(
            Decimal(repr(float(pickle.load(f)))).quantize(
                precision_redshift, rounding=ROUND_FLOOR
            )
        )
        print(f"Redshift actual: {redshift_init}")
        print(f"Redshift actual: {redshift_init:.7f}")

    current_batch = batch_size_in
    retries       = 0
    success       = False

    # Query attempts; the batch is halved after each failure
    while not success and retries < max_retries:
        adql = f"""
        SELECT TOP {current_batch}
            prefname, z
        FROM NEDTAP.objdir
        WHERE
            z >= {redshift_init:.7f}
            AND z < 9.0
            AND n_spectra > 0
        ORDER BY z
        """
        try:
            tap     = TapPlus(url="https://ned.ipac.caltech.edu/tap")
            batch   = tap.launch_job(adql).get_results().to_pandas()
            success = True
        except Exception as e:
            retries += 1
            print(e)
            current_batch = max(min_batch_ned, current_batch // 2)
            time.sleep(2)

    if not success:
        print("No se pudo recuperar el batch tras varios reintentos. Terminando.")
        break

    if batch.empty:
        break

    all_meta.append(batch)

    # Next page starts at the floored redshift of the last row (inclusive bound,
    # so consecutive pages overlap).
    last_z = float(
        Decimal(repr(float(batch['z'].iloc[-1]))).quantize(
            precision_redshift, rounding=ROUND_FLOOR
        )
    )
    if last_z <= redshift_init:
        # No progress: nudge the bound forward to avoid an infinite loop
        # (remaining rows in that 1e-6 bin are skipped).
        last_z = round(redshift_init + 0.000002, 6)
    with open(redshift_path, 'wb') as f:
        pickle.dump(last_z, f)

# Overlapping pages repeat rows; drop them so no object is queried twice.
# An empty frame is produced when nothing could be retrieved.
if all_meta:
    meta_df = pd.concat(all_meta, ignore_index=True).drop_duplicates().reset_index(drop=True)
else:
    meta_df = pd.DataFrame(columns=["prefname", "z"])

# Descarga en paralelo con manejo de existentes
def download_spectra(prefname, z):
    """Fetch every NED spectrum of one object and store each as a FITS file.

    Files are named ``<sanitized prefname>_<index>_z<z with 5 decimals>.fits``
    inside ``output_dir``; a file that already exists is left untouched.

    Parameters
    ----------
    prefname : str
        Object name passed to ``Ned.get_spectra``.
    z : float
        Redshift of the object, used only in the file name.

    Returns
    -------
    list of str
        Paths of the spectra that exist on disk after the call (newly written or
        previously present). Empty when the NED query fails.
    """
    base = sanitize(prefname)
    saved = []
    try:
        spectra = Ned.get_spectra(prefname, show_progress=False)
    except Exception as e:
        # Best effort: report the failure instead of hiding it and move on
        print(f"Error al obtener espectros de {prefname}: {e}")
        return saved

    for idx, hdulist in enumerate(spectra, start=1):
        filename = f"{base}_{idx}_z{z:.5f}.fits"
        path     = os.path.join(output_dir, filename)
        try:
            if os.path.exists(path):
                print(f"Saltando (ya existe): {filename}")
            else:
                hdulist.writeto(path, overwrite=True)
                print(f"Guardado: {filename}")
            saved.append(path)
        finally:
            # Release the underlying file handle whether or not the write succeeded
            hdulist.close()
    return saved

# Download the spectra of every object with two workers. Each future is mapped to
# its object name so a failure can be reported; calling result() is what surfaces
# exceptions raised inside a worker, which would otherwise be lost.
with ThreadPoolExecutor(max_workers=2) as executor:
    futures = {
        executor.submit(download_spectra, row.prefname, row.z): row.prefname
        for row in meta_df.itertuples(index=False)
    }
    for future in as_completed(futures):
        try:
            future.result()
        except Exception as e:
            print(f"Error al procesar {futures[future]}: {e}")

print(f"Redshift final: {redshift_init}")