In [41]:
from astropy.io import fits
import os
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np

## Opening a sample .fits file from the unzipped JADES spectra to see what's inside

In [42]:
# Inspect one example FITS file: print the HDU listing, the primary header,
# and the header plus the first rows of data of the first extension.
fits_file = '/Users/aryanahaghjoo/Documents/GitHub/super_resolution/data/JADES/JADES_spectra_unzipped/hlsp_jades_jwst_nirspec_clear-prism/hlsp_jades_jwst_nirspec_goods-n-mediumhst-00000604_clear-prism_v1.0_s2d.fits'

# Open the FITS file; the context manager closes it when the block exits
with fits.open(fits_file) as hdul:
    # Print the FITS file structure (one line per HDU)
    hdul.info()

    # Access the primary HDU (Header Data Unit)
    primary_hdu = hdul[0]

    # Print the header
    print("\nHeader of the Primary HDU:")
    print(repr(primary_hdu.header))

    # The primary HDU may carry no data array, so only report a shape if it does
    if primary_hdu.data is not None:
        print("\nData shape:", primary_hdu.data.shape)

    # If there is at least one extension, show the first one
    if len(hdul) > 1:
        table_hdu = hdul[1]
        print("\nHeader of Extension HDU:")
        print(repr(table_hdu.header))
        # An extension without a data array exposes `data` as None; slicing it
        # would raise TypeError, so guard it the same way as the primary HDU.
        if table_hdu.data is not None:
            print("\nData in Extension HDU (First 5 Rows):")
            print(table_hdu.data[:5])

Filename: /Users/aryanahaghjoo/Documents/GitHub/super_resolution/data/JADES/JADES_spectra_unzipped/hlsp_jades_jwst_nirspec_clear-prism/hlsp_jades_jwst_nirspec_goods-n-mediumhst-00000604_clear-prism_v1.0_s2d.fits
No.    Name      Ver    Type      Cards   Dimensions   Format
  0  PRIMARY       1 PrimaryHDU      34   ()      
  1  FLUX          1 ImageHDU         9   (674, 27)   float64   
  2  FLUX_ERR      1 ImageHDU         9   (674, 27)   float64   
  3  WAVELENGTH    1 ImageHDU         8   (674,)   float64   
  4  RA            1 ImageHDU         9   (674, 27)   float64   
  5  DEC           1 ImageHDU         9   (674, 27)   float64   
  6  ASDF          1 BinTableHDU      9   0R x 0C   []   

Header of the Primary HDU:
SIMPLE  =                    T / conforms to FITS standard                      
BITPIX  =                    8 / array data type                                
NAXIS   =                    0 / number of array dimensions                     
EXTEND  =               

## 1. Extracting Spectrum from "prism"

In [43]:
# Build one DataFrame row per 1D "prism" spectrum, holding the WAVELENGTH,
# FLUX and FLUX_ERR columns of each file's EXTRACT1D table extension.

# Directory containing your FITS files with 1D spectra
fits_dir = '/Users/aryanahaghjoo/Documents/GitHub/super_resolution/data/JADES/JADES_spectra_unzipped/hlsp_jades_jwst_nirspec_clear-prism'
# Collect both lowercase and uppercase file extensions. glob returns paths in
# arbitrary order, and where pattern matching is case-insensitive (e.g. Windows)
# both patterns match the same files, so de-duplicate and sort to get a
# reproducible, duplicate-free row order.
fits_files = sorted(set(
    glob.glob(os.path.join(fits_dir, '*.fits')) + glob.glob(os.path.join(fits_dir, '*.FITS'))
))

# Columns every EXTRACT1D table must provide (compared case-insensitively)
required_columns = ['WAVELENGTH', 'FLUX', 'FLUX_ERR']

data_list = []

for file in tqdm(fits_files, desc="Processing FITS files"):
    with fits.open(file) as hdul:
        # Search for the EXTRACT1D extension
        extract1d_hdu = None
        for hdu in hdul:
            if hdu.header.get('EXTNAME', '').strip().upper() == 'EXTRACT1D':
                extract1d_hdu = hdu
                break

        # Skip files that are not 1D spectra
        if extract1d_hdu is None:
            continue

        # Confirm the extension is a table with columns
        if not hasattr(extract1d_hdu, 'columns'):
            print(f"Warning: 'EXTRACT1D' in {os.path.basename(file)} is not a table. Skipping file.")
            continue

        table_data = extract1d_hdu.data
        available_columns = [col.upper() for col in extract1d_hdu.columns.names]
        if not all(col in available_columns for col in required_columns):
            print(f"Warning: Missing required columns in {os.path.basename(file)}. Skipping file.")
            continue

        # Copy the columns into independent in-memory arrays. astropy can
        # memory-map FITS data, and arrays that still reference the mapping
        # keep the underlying file handle alive after the `with` block exits;
        # holding thousands of such views risks "Too many open files".
        wavelength = np.array(table_data['WAVELENGTH'])
        flux = np.array(table_data['FLUX'])
        flux_err = np.array(table_data['FLUX_ERR'])

        data_list.append({
            'file_name': os.path.basename(file),
            'WAVELENGTH': wavelength,
            'FLUX': flux,
            'FLUX_ERR': flux_err
        })

# Create a DataFrame where each row corresponds to a file
df_prism= pd.DataFrame(data_list)
#df_prism.to_csv('/Users/aryanahaghjoo/Documents/GitHub/super_resolution/toy_model/JADES_spectra_dataframe/prism.csv', index=False)

Processing FITS files: 100%|██████████| 7504/7504 [00:08<00:00, 893.95it/s]


In [44]:
def classify_nan_positions(arr):
    """
    Label where the NaN values sit inside a 1D numpy array.

    Returns one of four strings:
      - 'no_nans'       : the array contains no NaN at all.
      - 'all_nans'      : every element is NaN.
      - 'trailing_nans' : every NaN comes after the last valid (non-NaN) value.
      - 'middle_nans'   : at least one NaN precedes the last valid value
                          (this includes leading NaNs).
    """
    is_nan = np.isnan(arr)

    nan_idx = np.where(is_nan)[0]
    if nan_idx.size == 0:
        return 'no_nans'

    valid_idx = np.where(~is_nan)[0]
    if valid_idx.size == 0:
        return 'all_nans'

    # np.where yields indices in ascending order, so the first NaN index is the
    # smallest one: if even that lies beyond the last valid sample, every NaN does.
    first_nan, last_valid = nan_idx[0], valid_idx[-1]
    return 'trailing_nans' if first_nan > last_valid else 'middle_nans'

In [45]:
# NaN bookkeeping for the prism spectra: per-file NaN counts, overall totals,
# and a positional label (see classify_nan_positions) for FLUX and FLUX_ERR.

# Per-file NaN counts
df_prism['nan_in_flux'] = [np.isnan(values).sum() for values in df_prism['FLUX']]
df_prism['nan_in_flux_err'] = [np.isnan(values).sum() for values in df_prism['FLUX_ERR']]

# Uncomment to list the count for every file
#print(df_prism[['file_name', 'nan_in_flux', 'nan_in_flux_err']])

# Totals over all files
total_nan_flux = df_prism['nan_in_flux'].sum()
total_nan_flux_err = df_prism['nan_in_flux_err'].sum()

print("Total NaNs in FLUX:", total_nan_flux)
print("Total NaNs in FLUX_ERR:", total_nan_flux_err)

# Positional label of the NaNs in each array
df_prism['flux_nan_position'] = df_prism['FLUX'].map(classify_nan_positions)
df_prism['flux_err_nan_position'] = df_prism['FLUX_ERR'].map(classify_nan_positions)

# Number of files per label
flux_counts = df_prism['flux_nan_position'].value_counts()
flux_err_counts = df_prism['flux_err_nan_position'].value_counts()

print(
    "NaN positions in FLUX:",
    flux_counts,
    "\nNaN positions in FLUX_ERR:",
    flux_err_counts,
    sep="\n",
)

# Subsets by label: "middle" if either array has interior NaNs,
# "trailing" only if both arrays have exclusively trailing NaNs.
df_prism_middle = df_prism[
    df_prism['flux_nan_position'].eq('middle_nans')
    | df_prism['flux_err_nan_position'].eq('middle_nans')
]
df_prism_trailing = df_prism[
    df_prism['flux_nan_position'].eq('trailing_nans')
    & df_prism['flux_err_nan_position'].eq('trailing_nans')
]

#print("\nFiles with NaNs in the middle:")
#print(df_prism_middle[['file_name', 'flux_nan_position', 'flux_err_nan_position']])
#print("\nFiles with only trailing NaNs:")
#print(df_prism_trailing[['file_name', 'flux_nan_position', 'flux_err_nan_position']])

Total NaNs in FLUX: 28487
Total NaNs in FLUX_ERR: 28487
NaN positions in FLUX:
flux_nan_position
middle_nans      3028
trailing_nans     724
Name: count, dtype: int64

NaN positions in FLUX_ERR:
flux_err_nan_position
middle_nans      3028
trailing_nans     724
Name: count, dtype: int64


## 2. Extracting Spectrum from "g140m"

In [46]:
# Build one DataFrame row per 1D "g140m" spectrum, holding the WAVELENGTH,
# FLUX and FLUX_ERR columns of each file's EXTRACT1D table extension.

# Directory containing your FITS files with 1D spectra
fits_dir = '/Users/aryanahaghjoo/Documents/GitHub/super_resolution/data/JADES/JADES_spectra_unzipped/hlsp_jades_jwst_nirspec_f070lp-g140m'
# Collect both lowercase and uppercase file extensions. glob returns paths in
# arbitrary order, and where pattern matching is case-insensitive (e.g. Windows)
# both patterns match the same files, so de-duplicate and sort to get a
# reproducible, duplicate-free row order.
fits_files = sorted(set(
    glob.glob(os.path.join(fits_dir, '*.fits')) + glob.glob(os.path.join(fits_dir, '*.FITS'))
))

# Columns every EXTRACT1D table must provide (compared case-insensitively)
required_columns = ['WAVELENGTH', 'FLUX', 'FLUX_ERR']

data_list = []

for file in tqdm(fits_files, desc="Processing FITS files"):
    with fits.open(file) as hdul:
        # Search for the EXTRACT1D extension
        extract1d_hdu = None
        for hdu in hdul:
            if hdu.header.get('EXTNAME', '').strip().upper() == 'EXTRACT1D':
                extract1d_hdu = hdu
                break

        # Skip files that are not 1D spectra
        if extract1d_hdu is None:
            continue

        # Confirm the extension is a table with columns
        if not hasattr(extract1d_hdu, 'columns'):
            print(f"Warning: 'EXTRACT1D' in {os.path.basename(file)} is not a table. Skipping file.")
            continue

        table_data = extract1d_hdu.data
        available_columns = [col.upper() for col in extract1d_hdu.columns.names]
        if not all(col in available_columns for col in required_columns):
            print(f"Warning: Missing required columns in {os.path.basename(file)}. Skipping file.")
            continue

        # Copy the columns into independent in-memory arrays. astropy can
        # memory-map FITS data, and arrays that still reference the mapping
        # keep the underlying file handle alive after the `with` block exits;
        # holding thousands of such views risks "Too many open files".
        wavelength = np.array(table_data['WAVELENGTH'])
        flux = np.array(table_data['FLUX'])
        flux_err = np.array(table_data['FLUX_ERR'])

        data_list.append({
            'file_name': os.path.basename(file),
            'WAVELENGTH': wavelength,
            'FLUX': flux,
            'FLUX_ERR': flux_err
        })

# Create a DataFrame where each row corresponds to a file
df_g140m= pd.DataFrame(data_list)
#df_g140m.to_csv('/Users/aryanahaghjoo/Documents/GitHub/super_resolution/toy_model/JADES_spectra_dataframe/g140m.csv', index=False)

Processing FITS files: 100%|██████████| 6968/6968 [00:08<00:00, 790.45it/s]


In [47]:
# NaN bookkeeping for the g140m spectra: per-file NaN counts, overall totals,
# and a positional label (see classify_nan_positions) for FLUX and FLUX_ERR.

# Per-file NaN counts
df_g140m['nan_in_flux'] = [np.isnan(values).sum() for values in df_g140m['FLUX']]
df_g140m['nan_in_flux_err'] = [np.isnan(values).sum() for values in df_g140m['FLUX_ERR']]

# Uncomment to list the count for every file
#print(df_g140m[['file_name', 'nan_in_flux', 'nan_in_flux_err']])

# Totals over all files
total_nan_flux = df_g140m['nan_in_flux'].sum()
total_nan_flux_err = df_g140m['nan_in_flux_err'].sum()

print("Total NaNs in FLUX:", total_nan_flux)
print("Total NaNs in FLUX_ERR:", total_nan_flux_err)

# Positional label of the NaNs in each array
df_g140m['flux_nan_position'] = df_g140m['FLUX'].map(classify_nan_positions)
df_g140m['flux_err_nan_position'] = df_g140m['FLUX_ERR'].map(classify_nan_positions)

# Number of files per label
flux_counts = df_g140m['flux_nan_position'].value_counts()
flux_err_counts = df_g140m['flux_err_nan_position'].value_counts()

print(
    "NaN positions in FLUX:",
    flux_counts,
    "\nNaN positions in FLUX_ERR:",
    flux_err_counts,
    sep="\n",
)

# Subsets by label: "middle" if either array has interior NaNs,
# "trailing" only if both arrays have exclusively trailing NaNs.
df_g140m_middle = df_g140m[
    df_g140m['flux_nan_position'].eq('middle_nans')
    | df_g140m['flux_err_nan_position'].eq('middle_nans')
]
df_g140m_trailing = df_g140m[
    df_g140m['flux_nan_position'].eq('trailing_nans')
    & df_g140m['flux_err_nan_position'].eq('trailing_nans')
]

#print("\nFiles with NaNs in the middle:")
#print(df_g140m_middle[['file_name', 'flux_nan_position', 'flux_err_nan_position']])
#print("\nFiles with only trailing NaNs:")
#print(df_g140m_trailing[['file_name', 'flux_nan_position', 'flux_err_nan_position']])

Total NaNs in FLUX: 382330
Total NaNs in FLUX_ERR: 382330
NaN positions in FLUX:
flux_nan_position
middle_nans      3414
trailing_nans      59
no_nans            11
Name: count, dtype: int64

NaN positions in FLUX_ERR:
flux_err_nan_position
middle_nans      3414
trailing_nans      59
no_nans            11
Name: count, dtype: int64


## 3. Extracting Spectrum from "g235m"

In [48]:
# Build one DataFrame row per 1D "g235m" spectrum, holding the WAVELENGTH,
# FLUX and FLUX_ERR columns of each file's EXTRACT1D table extension.

# Directory containing your FITS files with 1D spectra
fits_dir = '/Users/aryanahaghjoo/Documents/GitHub/super_resolution/data/JADES/JADES_spectra_unzipped/hlsp_jades_jwst_nirspec_f170lp-g235m'
# Collect both lowercase and uppercase file extensions. glob returns paths in
# arbitrary order, and where pattern matching is case-insensitive (e.g. Windows)
# both patterns match the same files, so de-duplicate and sort to get a
# reproducible, duplicate-free row order.
fits_files = sorted(set(
    glob.glob(os.path.join(fits_dir, '*.fits')) + glob.glob(os.path.join(fits_dir, '*.FITS'))
))

# Columns every EXTRACT1D table must provide (compared case-insensitively)
required_columns = ['WAVELENGTH', 'FLUX', 'FLUX_ERR']

data_list = []

for file in tqdm(fits_files, desc="Processing FITS files"):
    with fits.open(file) as hdul:
        # Search for the EXTRACT1D extension
        extract1d_hdu = None
        for hdu in hdul:
            if hdu.header.get('EXTNAME', '').strip().upper() == 'EXTRACT1D':
                extract1d_hdu = hdu
                break

        # Skip files that are not 1D spectra
        if extract1d_hdu is None:
            continue

        # Confirm the extension is a table with columns
        if not hasattr(extract1d_hdu, 'columns'):
            print(f"Warning: 'EXTRACT1D' in {os.path.basename(file)} is not a table. Skipping file.")
            continue

        table_data = extract1d_hdu.data
        available_columns = [col.upper() for col in extract1d_hdu.columns.names]
        if not all(col in available_columns for col in required_columns):
            print(f"Warning: Missing required columns in {os.path.basename(file)}. Skipping file.")
            continue

        # Copy the columns into independent in-memory arrays. astropy can
        # memory-map FITS data, and arrays that still reference the mapping
        # keep the underlying file handle alive after the `with` block exits;
        # holding thousands of such views risks "Too many open files".
        wavelength = np.array(table_data['WAVELENGTH'])
        flux = np.array(table_data['FLUX'])
        flux_err = np.array(table_data['FLUX_ERR'])

        data_list.append({
            'file_name': os.path.basename(file),
            'WAVELENGTH': wavelength,
            'FLUX': flux,
            'FLUX_ERR': flux_err
        })

# Create a DataFrame where each row corresponds to a file
df_g235m= pd.DataFrame(data_list)
#df_g235m.to_csv('/Users/aryanahaghjoo/Documents/GitHub/super_resolution/toy_model/JADES_spectra_dataframe/g235m.csv', index=False)

Processing FITS files: 100%|██████████| 6590/6590 [00:08<00:00, 784.15it/s]


In [49]:
# NaN bookkeeping for the g235m spectra: per-file NaN counts, overall totals,
# and a positional label (see classify_nan_positions) for FLUX and FLUX_ERR.

# Per-file NaN counts
df_g235m['nan_in_flux'] = [np.isnan(values).sum() for values in df_g235m['FLUX']]
df_g235m['nan_in_flux_err'] = [np.isnan(values).sum() for values in df_g235m['FLUX_ERR']]

# Uncomment to list the count for every file
#print(df_g235m[['file_name', 'nan_in_flux', 'nan_in_flux_err']])

# Totals over all files
total_nan_flux = df_g235m['nan_in_flux'].sum()
total_nan_flux_err = df_g235m['nan_in_flux_err'].sum()

print("Total NaNs in FLUX:", total_nan_flux)
print("Total NaNs in FLUX_ERR:", total_nan_flux_err)

# Positional label of the NaNs in each array
df_g235m['flux_nan_position'] = df_g235m['FLUX'].map(classify_nan_positions)
df_g235m['flux_err_nan_position'] = df_g235m['FLUX_ERR'].map(classify_nan_positions)

# Number of files per label
flux_counts = df_g235m['flux_nan_position'].value_counts()
flux_err_counts = df_g235m['flux_err_nan_position'].value_counts()

print(
    "NaN positions in FLUX:",
    flux_counts,
    "\nNaN positions in FLUX_ERR:",
    flux_err_counts,
    sep="\n",
)

# Subsets by label: "middle" if either array has interior NaNs,
# "trailing" only if both arrays have exclusively trailing NaNs.
df_g235m_middle = df_g235m[
    df_g235m['flux_nan_position'].eq('middle_nans')
    | df_g235m['flux_err_nan_position'].eq('middle_nans')
]
df_g235m_trailing = df_g235m[
    df_g235m['flux_nan_position'].eq('trailing_nans')
    & df_g235m['flux_err_nan_position'].eq('trailing_nans')
]
#print("\nFiles with NaNs in the middle:")
#print(df_g235m_middle[['file_name', 'flux_nan_position', 'flux_err_nan_position']])
#print("\nFiles with only trailing NaNs:")
#print(df_g235m_trailing[['file_name', 'flux_nan_position', 'flux_err_nan_position']])

Total NaNs in FLUX: 273275
Total NaNs in FLUX_ERR: 273275
NaN positions in FLUX:
flux_nan_position
middle_nans      3033
trailing_nans     237
no_nans            19
all_nans            6
Name: count, dtype: int64

NaN positions in FLUX_ERR:
flux_err_nan_position
middle_nans      3033
trailing_nans     237
no_nans            19
all_nans            6
Name: count, dtype: int64


## 4. Extracting Spectrum from "g395m"

In [50]:
# Build one DataFrame row per 1D "g395m" spectrum, holding the WAVELENGTH,
# FLUX and FLUX_ERR columns of each file's EXTRACT1D table extension.

# Directory containing your FITS files with 1D spectra
fits_dir = '/Users/aryanahaghjoo/Documents/GitHub/super_resolution/data/JADES/JADES_spectra_unzipped/hlsp_jades_jwst_nirspec_f290lp-g395m'
# Collect both lowercase and uppercase file extensions. glob returns paths in
# arbitrary order, and where pattern matching is case-insensitive (e.g. Windows)
# both patterns match the same files, so de-duplicate and sort to get a
# reproducible, duplicate-free row order.
fits_files = sorted(set(
    glob.glob(os.path.join(fits_dir, '*.fits')) + glob.glob(os.path.join(fits_dir, '*.FITS'))
))

# Columns every EXTRACT1D table must provide (compared case-insensitively)
required_columns = ['WAVELENGTH', 'FLUX', 'FLUX_ERR']

data_list = []

for file in tqdm(fits_files, desc="Processing FITS files"):
    with fits.open(file) as hdul:
        # Search for the EXTRACT1D extension
        extract1d_hdu = None
        for hdu in hdul:
            if hdu.header.get('EXTNAME', '').strip().upper() == 'EXTRACT1D':
                extract1d_hdu = hdu
                break

        # Skip files that are not 1D spectra
        if extract1d_hdu is None:
            continue

        # Confirm the extension is a table with columns
        if not hasattr(extract1d_hdu, 'columns'):
            print(f"Warning: 'EXTRACT1D' in {os.path.basename(file)} is not a table. Skipping file.")
            continue

        table_data = extract1d_hdu.data
        available_columns = [col.upper() for col in extract1d_hdu.columns.names]
        if not all(col in available_columns for col in required_columns):
            print(f"Warning: Missing required columns in {os.path.basename(file)}. Skipping file.")
            continue

        # Copy the columns into independent in-memory arrays. astropy can
        # memory-map FITS data, and arrays that still reference the mapping
        # keep the underlying file handle alive after the `with` block exits;
        # holding thousands of such views risks "Too many open files".
        wavelength = np.array(table_data['WAVELENGTH'])
        flux = np.array(table_data['FLUX'])
        flux_err = np.array(table_data['FLUX_ERR'])

        data_list.append({
            'file_name': os.path.basename(file),
            'WAVELENGTH': wavelength,
            'FLUX': flux,
            'FLUX_ERR': flux_err
        })

# Create a DataFrame where each row corresponds to a file
df_g395m= pd.DataFrame(data_list)
#df_g395m.to_csv('/Users/aryanahaghjoo/Documents/GitHub/super_resolution/toy_model/JADES_spectra_dataframe/g395m.csv', index=False)

Processing FITS files: 100%|██████████| 6968/6968 [00:07<00:00, 883.07it/s]


In [51]:
# NaN bookkeeping for the g395m spectra: per-file NaN counts, overall totals,
# and a positional label (see classify_nan_positions) for FLUX and FLUX_ERR.

# Per-file NaN counts
df_g395m['nan_in_flux'] = [np.isnan(values).sum() for values in df_g395m['FLUX']]
df_g395m['nan_in_flux_err'] = [np.isnan(values).sum() for values in df_g395m['FLUX_ERR']]

# Uncomment to list the count for every file
#print(df_g395m[['file_name', 'nan_in_flux', 'nan_in_flux_err']])

# Totals over all files
total_nan_flux = df_g395m['nan_in_flux'].sum()
total_nan_flux_err = df_g395m['nan_in_flux_err'].sum()

print("Total NaNs in FLUX:", total_nan_flux)
print("Total NaNs in FLUX_ERR:", total_nan_flux_err)

# Positional label of the NaNs in each array
df_g395m['flux_nan_position'] = df_g395m['FLUX'].map(classify_nan_positions)
df_g395m['flux_err_nan_position'] = df_g395m['FLUX_ERR'].map(classify_nan_positions)

# Number of files per label
flux_counts = df_g395m['flux_nan_position'].value_counts()
flux_err_counts = df_g395m['flux_err_nan_position'].value_counts()

print(
    "NaN positions in FLUX:",
    flux_counts,
    "\nNaN positions in FLUX_ERR:",
    flux_err_counts,
    sep="\n",
)

# Subsets by label: "middle" if either array has interior NaNs,
# "trailing" only if both arrays have exclusively trailing NaNs.
df_g395m_middle = df_g395m[
    df_g395m['flux_nan_position'].eq('middle_nans')
    | df_g395m['flux_err_nan_position'].eq('middle_nans')
]
df_g395m_trailing = df_g395m[
    df_g395m['flux_nan_position'].eq('trailing_nans')
    & df_g395m['flux_err_nan_position'].eq('trailing_nans')
]

#print("\nFiles with NaNs in the middle:")
#print(df_g395m_middle[['file_name', 'flux_nan_position', 'flux_err_nan_position']])
#print("\nFiles with only trailing NaNs:")
#print(df_g395m_trailing[['file_name', 'flux_nan_position', 'flux_err_nan_position']])

Total NaNs in FLUX: 271025
Total NaNs in FLUX_ERR: 271025
NaN positions in FLUX:
flux_nan_position
middle_nans    3473
all_nans         11
Name: count, dtype: int64

NaN positions in FLUX_ERR:
flux_err_nan_position
middle_nans    3473
all_nans         11
Name: count, dtype: int64
