In [1]:
import numpy as np
import pandas as pd
import pylab as plt
from tqdm.auto import tqdm
from astropy.io import ascii

In [2]:
import sys

sys.path.append("../")
import sidhelpers

In [3]:
# Load the spectra metadata table from the combined WISeREP CSV
spectra_info_path = "../1. download ALL wise data/wiserep_spectra_combined.csv"
spectra_info = pd.read_csv(spectra_info_path)

# Optional restriction to the P60 telescope (currently disabled):
# spectra_info = spectra_info[spectra_info["Telescope"] == "P60"]

In [4]:
# Display the metadata table (the notebook view elides the middle rows and columns)
spectra_info

Unnamed: 0,wise_objid,IAU name,Internal name/s,Obj. RA,Obj. DEC,Obj. Type,Redshift,Spec. ID,Obs-date,JD,...,Grating,Blaze,Lambda-min,Lambda-max,Del-Lambda,Contrib,Publish,Remarks,Created by,Creation date
0,1,SN 2012az,,47.217042,17.300889,SN Ia,0.047000,1,2012-03-16 18:36:12.00,2.456003e+06,...,,,3284.000000,8226.184607,4.831070,,,Data was ingested from the &lt;a href=&#039;ht...,Migration From WISeREP1.0,2016-03-31 15:16:52
1,3,SN 2006kv,,0.777750,0.913780,SN II,0.062000,3,2006-10-18 05:27:54.00,2.454027e+06,...,,,3616.267654,9720.425641,3.490085,Ostman et al. 2011,2011A%26A...526A..28O,,Migration From WISeREP1.0,2012-03-22 13:37:00
2,6,SN 2007jy,,312.839290,0.399390,SN Ib,0.183200,7,2007-09-20 02:06:03.00,2.454364e+06,...,,,3615.128175,9720.328669,3.490681,Ostman et al. 2011,2011A%26A...526A..28O,,Migration From WISeREP1.0,2012-03-22 13:37:01
3,7,SN 2016drl,Gaia16aec,178.203800,-33.124300,SN IIn,0.061000,8,2016-02-25 04:42:48.00,2.457444e+06,...,Free,,3644.048340,9239.227457,5.512492,,,,Migration From WISeREP1.0,2016-02-25 10:30:28
4,7,SN 2016drl,Gaia16aec,178.203800,-33.124300,SN IIn,0.061000,71988,2016-02-25 04:42:49.00,2.457444e+06,...,,,3644.066400,9240.395000,5.513624,,,Released as part of DR3,WIS_Bot1,2023-05-04 13:56:43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54000,27660,SN 2025bys,"GOTO25amu, ATLAS25bsx, ZTF25aafrcem",106.328322,58.144333,SN II,0.040000,85889,2025-03-09 21:00:43.00,2.460744e+06,...,,,3952.612590,8099.832500,4.649350,,,[TNS reporting group: 48 - ZTF ],TNS_Bot1,2025-03-10 11:10:05
54001,27661,SN 2025ddd,"ZTF25aaisfid, GOTO25azp, ATLAS25chv",244.097700,12.591682,SN Ia,0.050000,85890,2025-03-10 11:32:44.00,2.460745e+06,...,,,3776.700000,9223.300000,25.451402,,,[TNS reporting group: 48 - ZTF ],TNS_Bot1,2025-03-10 13:10:04
54002,27662,SN 2025dyo,"ZTF25aaiushx, ATLAS25cnz",234.368078,5.819809,SN Ia,0.039000,85891,2025-03-10 12:19:13.00,2.460745e+06,...,,,3776.700000,9223.300000,25.451402,,,[TNS reporting group: 48 - ZTF ],TNS_Bot1,2025-03-10 13:10:04
54003,27663,SN 2025cha,"PS25aaq, GOTO25ari, ATLAS25bvo, ZTF19abvqtkz",210.279550,21.242968,SN II,0.027749,85892,2025-03-10 03:21:16.00,2.460745e+06,...,,,3953.603750,8099.882780,4.648295,,,[TNS reporting group: 48 - ZTF ],TNS_Bot1,2025-03-10 15:10:05


In [4]:
# Initialization - needs to be before the loop
import scipy.interpolate as interp

In [5]:
# Accumulators that the read/interpolate loop appends to
all_spectra = []
flux_counter = 0
readerror_counter = 0
errs = []
metadata = []

# First pass - determine the global wavelength range and typical resolution.
# Only three metadata columns are needed, so this uses vectorized pandas
# operations instead of a Python-level loop over every row.
lambda_min_col = spectra_info["Lambda-min"]
lambda_max_col = spectra_info["Lambda-max"]
del_lambda_col = spectra_info["Del-Lambda"]

# A row contributes to the range only when both bounds are non-zero and
# strictly ordered. `lambda_min < lambda_max` is False whenever either bound
# is NaN, so rows with a missing bound are excluded too.
has_valid_range = (
    (lambda_min_col != 0)
    & (lambda_max_col != 0)
    & (lambda_min_col < lambda_max_col)
)
if not has_valid_range.any():
    # Without this guard the grid below would be built from an infinite start.
    raise ValueError(
        "No rows with a valid Lambda-min/Lambda-max range; "
        "cannot build a common wavelength grid"
    )

min_global_lambda = float(lambda_min_col[has_valid_range].min())
# The upper bound never goes below 0.
max_global_lambda = max(0.0, float(lambda_max_col[has_valid_range].max()))

# Keep strictly positive step sizes only (NaN > 0 is False, so missing
# values drop out as well).
typical_delta = del_lambda_col[del_lambda_col > 0].tolist()

# Set up common wavelength grid: median step, or 1.0 if no valid step exists
delta_lambda = np.median(typical_delta) if typical_delta else 1.0
common_wavelengths = np.arange(min_global_lambda, max_global_lambda, delta_lambda)

  0%|          | 0/6304 [00:00<?, ?it/s]

In [6]:
# Second pass - read and interpolate spectra to common grid
for fn, wl_unit, spec_unit, flux_ucoeff, lambda_min, lambda_max, del_lambda in tqdm(
    zip(
        spectra_info["Ascii file"],
        spectra_info["WL Units"],
        spectra_info["Spec. units"],
        spectra_info["Flux Unit Coefficient"],
        spectra_info["Lambda-min"],
        spectra_info["Lambda-max"],
        spectra_info["Del-Lambda"],
    ),
    total=len(spectra_info),
):
    try:
        df = sidhelpers.read_spectra(
            f"../1. download ALL wise data/wiserep_data/spectra/{fn}"
        )
        
        # Extract wavelength and flux columns (assuming standard naming)        
        wavelengths = df.to_numpy()[:, 0]
        flux = df.to_numpy()[:, 1]
        
        # Check for valid data
        valid_mask = ~np.isnan(wavelengths) & ~np.isnan(flux) & (wavelengths > 0)
        if not np.any(valid_mask):
            continue
            
        wavelengths = wavelengths[valid_mask]
        flux = flux[valid_mask]
        
        # Interpolate onto common grid (only within original wavelength range)
        valid_common_mask = (common_wavelengths >= np.min(wavelengths)) & (common_wavelengths <= np.max(wavelengths))
        if np.sum(valid_common_mask) > 10:  # Require at least 10 points
            f = interp.interp1d(wavelengths, flux, bounds_error=False, fill_value=np.nan)
            interp_flux = f(common_wavelengths[valid_common_mask])
            
            spectrum_data = pd.DataFrame({
                'wavelength': common_wavelengths[valid_common_mask],
                'flux': interp_flux,
            })
            all_spectra.append(spectrum_data)
            
            # Store metadata separately
            meta = {
                'spectrum_id': fn,
                'wl_unit': wl_unit,
                'spec_unit': spec_unit,
                'flux_ucoeff': flux_ucoeff
            }
            metadata.append(meta)
            
    except Exception as ee:
        readerror_counter += 1
        errs.append([fn, str(ee)])
        continue

  0%|          | 0/6304 [00:00<?, ?it/s]

In [8]:
# Normalise every interpolated spectrum by its peak absolute flux and tag it
# with an object name derived from the file name.
object_names = []
normalized_spectra = []

# all_spectra and metadata are appended together, so they are index-aligned
for spectrum, meta in zip(all_spectra, metadata):
    # Object name = the part of spectrum_id before the first underscore
    object_name = meta['spectrum_id'].split('_')[0]

    flux = spectrum['flux'].values
    if not np.any(~np.isnan(flux)):
        # Nothing but NaN: there is no peak to normalise by
        continue

    # nanmax ignores NaN samples. A plain max would return NaN as soon as one
    # sample is NaN, and the `> 0` test below would then silently discard an
    # otherwise usable spectrum.
    peak_abs_flux = np.nanmax(np.abs(flux))
    if not peak_abs_flux > 0:
        # All-zero spectrum: dividing would produce NaN/inf
        continue

    norm_flux = flux / peak_abs_flux

    # Create normalized DataFrame with object identifier
    norm_spectrum = pd.DataFrame({
        'wavelength': spectrum['wavelength'].values,
        'flux': norm_flux,
        'object_name': object_name
    })
    normalized_spectra.append(norm_spectrum)
    object_names.append(object_name)

In [56]:
# Combine all normalized spectra
normalized_combined = pd.concat(normalized_spectra, ignore_index=True)
# Some objects may have multiple spectra, take the mean in those cases
pivot_spectra = normalized_combined.pivot_table(
    index='object_name', 
    columns='wavelength', 
    values='flux',
    aggfunc='mean'  # Handle duplicates by taking mean
)

# Fill NaN values with 0 (or another appropriate strategy)
# pivot_spectra = pivot_spectra.fillna(0)


In [58]:
# Count NaN values in each column
nan_counts = pivot_spectra.isna().sum()

# Find columns with more than 5000 NaN values
columns_to_drop = nan_counts[nan_counts > 5Ae000].index

# Drop those columns
pivot_spectra_filtered = pivot_spectra.drop(columns=columns_to_drop)

print(f"Dropped {len(columns_to_drop)} wavelength columns with >5000 NaN values")
print(f"Remaining shape: {pivot_spectra_filtered.shape}")

# Replace the original DataFrame with the filtered one
pivot_spectra = pivot_spectra_filtered

Dropped 64 wavelength columns with >5000 NaN values
Remaining shape: (6040, 214)


In [68]:
# Preview the first rows of the filtered object-by-wavelength matrix
pivot_spectra.head()

wavelength,3798.816981,3824.268383,3849.719785,3875.171187,3900.622589,3926.073990,3951.525392,3976.976794,4002.428196,4027.879598,...,8990.902962,9016.354364,9041.805766,9067.257168,9092.708570,9118.159972,9143.611374,9169.062776,9194.514177,9219.965579
object_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016al,,-0.446487,1.0,0.688117,0.124609,0.213864,0.537586,0.348094,0.603551,0.706524,...,0.091109,0.088522,0.08584,0.079184,0.082659,0.078244,0.049994,0.045429,,
2016am,,-0.014088,0.587648,0.537381,0.87962,0.396862,0.29242,0.812706,0.567693,0.520243,...,0.250707,0.259472,0.253911,0.221505,0.220823,0.225413,0.22371,0.217723,,
2016bld,,-0.071703,-0.044067,-0.150936,-0.137172,-0.004433,-0.084392,0.173121,0.326348,0.347203,...,0.758474,0.734724,0.724043,0.759839,0.795027,0.799886,0.735951,0.75405,,
2016cok,,0.885432,0.805263,0.868404,0.688214,0.756615,0.827972,0.72237,0.909177,0.721569,...,0.30489,0.296657,0.291972,0.292109,0.299268,0.305233,0.304421,0.304362,,
2016coo,,-0.865105,-0.418752,0.016306,-0.087951,-0.54548,0.059865,0.942116,0.434564,0.955382,...,-0.057183,-0.01763,0.023201,0.062859,0.104479,0.137886,0.151441,0.115151,,


In [69]:
# Save the pivoted data
pivot_spectra.to_csv("p60_spectra_for_unsupervised.csv")

print(f"Created spectral matrix with {pivot_spectra.shape[0]} objects and {pivot_spectra.shape[1]} wavelength bins")

Created spectral matrix with 6040 objects and 214 wavelength bins


In [None]:
# # Combine all spectra into a single dataframe
# if all_spectra:
#     combined_spectra = pd.concat(all_spectra, ignore_index=True)
#     metadata_df = pd.DataFrame(metadata)
    
#     # Save the combined data
#     combined_spectra.to_csv("p60_combined_spectra.csv", index=False)
#     metadata_df.to_csv("p60_spectra_metadata.csv", index=False)
    
#     print(f"Successfully combined {len(metadata)} spectra")
#     print(f"Read errors: {readerror_counter}")
# else:
#     print("No valid spectra found")