In [1]:
import os
import pandas as pd
import numpy as np
from astropy.io import fits
import matplotlib.pyplot as plt
import h5py

In [2]:
subsets = ['testing']
band = 'g' # 'g', 'r', 'i', 'z', 'y'
directories = f'/data/HSC_generated/HSC_generated_v1/ddpm/g_band_ddpm'
window_diameters = [10, 20, 30] # '10px_diameter' '20px_diameter' '30px_disameter'
num_failed = 0
data = {subset: [] for subset in subsets}

In [3]:
directories

'/data/HSC_generated/HSC_generated_v1/ddpm/g_band_ddpm'

In [4]:
paths = {f'{subset}_path': f'/data/HSC_generated/HSC_generated_v1/ddpm/64x64/5x64x64_{subset}.hdf5' for subset in subsets}
object_ids = {}
# mags = {}
redshifts = {}
for subset in subsets:
    with h5py.File(paths[f'{subset}_path']) as file:
        object_ids[subset] = pd.Series(file['object_id'][:])
        # mags[subset] = pd.Series(file[f'{band}_cmodel_mag'][:])
        redshifts[subset] = pd.Series(file['specz_redshift'][:])

In [5]:
len(object_ids['testing'])

2000

In [6]:
object_ids['testing']

0          1
1          2
2          3
3          4
4          5
        ... 
1995    1996
1996    1997
1997    1998
1998    1999
1999    2000
Length: 2000, dtype: int64

In [7]:
for subset in subsets:
    for obj_id, idx in zip(object_ids[subset], range(len(object_ids[subset]))):
        try:
            img = fits.getdata(f'{directories}/test_segmented_{obj_id}.fits')
        except:
            num_failed += 1
            print(num_failed)
            continue

        num_pixel_list = []
        for obj_label in np.unique(img):
            if obj_label == 0:
                continue
            num_pixels = np.sum(img == obj_label)
            num_pixel_list.append(num_pixels)

        center_x = 32
        center_y = 32

        num_in_center_per_gal = []
        for window in window_diameters:
            radius = int(window/2)
            in_center = []

            for x, y in zip(range(img.shape[0]), range(img.shape[1])):
                dist = np.sqrt((x - center_x)**2 + (y - center_y)**2)
                if dist <= radius:
                    if img[x, y] != 0:
                        if img[x, y] not in in_center:
                            in_center.append(img[x, y])
            num_in_center_per_gal.append(len(in_center))

        param_names = []
        data_lines = []
        num_params = 0
        with open(f'{directories}/test_petro_{obj_id}.cat') as f:
            for line in f:
                if line.startswith('#'):
                    param_names.append(line.strip().split()[2]) # index of parameter names
                    num_params += 1
                if not line.startswith('#'):
                    data_lines.append(list(map(float, line.strip().split()))) # save list of data in that line, length of the number of object detected, also converted to float

        for window in window_diameters:
            param_names.insert(0, f'NUMBER_IN_CENTER_{window}PX_DIAMETER')
        # param_names.insert(0, f'{band}_cmodel_mag')
        param_names.insert(0, 'specz')
        param_names.insert(0, 'object_id')
        
        if len(data_lines) > len(num_pixel_list):
            data_lines = data_lines[:-1]  # Drop the last element if it's longer than num_pixel_list

        if len(data_lines) == 0:
            data_per_gal = []
            for i in range(len(window_diameters)):
                data_per_gal.insert(0, 0)
            data_per_gal += list(np.zeros((num_params,), dtype=float))
            # data_per_gal.insert(0, mags[subset][idx])
            data_per_gal.insert(0, redshifts[subset][idx])
            data_per_gal.insert(0, obj_id)
            data[subset].append(data_per_gal)
            
        for individual_galaxy_shape_param_info in range(len(data_lines)):
            data_per_gal = []
            for i in range(len(window_diameters)):
                data_per_gal.insert(0, num_in_center_per_gal[i]) 
            for param in data_lines[individual_galaxy_shape_param_info]:
                data_per_gal.append(param)
            # data_per_gal.insert(0, mags[subset][idx])
            data_per_gal.insert(0, redshifts[subset][idx])
            data_per_gal.insert(0, obj_id)
            data[subset].append(data_per_gal)

In [8]:
param_names

['object_id',
 'specz',
 'NUMBER_IN_CENTER_30PX_DIAMETER',
 'NUMBER_IN_CENTER_20PX_DIAMETER',
 'NUMBER_IN_CENTER_10PX_DIAMETER',
 'NUMBER',
 'PETRO_RADIUS',
 'X_IMAGE',
 'Y_IMAGE',
 'XMIN_IMAGE',
 'XMAX_IMAGE',
 'YMIN_IMAGE',
 'YMAX_IMAGE',
 'ISOAREA_IMAGE',
 'ISOAREA_WORLD',
 'A_IMAGE',
 'B_IMAGE',
 'ERRCXX_IMAGE',
 'ERRCYY_IMAGE',
 'ERRCXY_IMAGE',
 'THETA_IMAGE',
 'THETA_WORLD',
 'MU_MAX',
 'ELLIPTICITY',
 'FLUX_RADIUS',
 'SPHEROID_SERSICN']

In [9]:
len(data)

1

In [10]:
print(num_failed)

0


In [11]:
df = pd.DataFrame(data['testing'], columns=param_names)

In [12]:
df # isoarea and pixel area seem very similar, maybe delete one

Unnamed: 0,object_id,specz,NUMBER_IN_CENTER_30PX_DIAMETER,NUMBER_IN_CENTER_20PX_DIAMETER,NUMBER_IN_CENTER_10PX_DIAMETER,NUMBER,PETRO_RADIUS,X_IMAGE,Y_IMAGE,XMIN_IMAGE,...,B_IMAGE,ERRCXX_IMAGE,ERRCYY_IMAGE,ERRCXY_IMAGE,THETA_IMAGE,THETA_WORLD,MU_MAX,ELLIPTICITY,FLUX_RADIUS,SPHEROID_SERSICN
0,1,2.148836,0,0,0,1.0,7.26,11.8601,60.8076,2.0,...,2.559,1117.4640,3651.0090,-0.994596,-0.92,-0.92,-4.9909,0.366,3.848,1.306
1,2,3.800389,2,1,1,1.0,4.62,54.0021,41.3385,41.0,...,3.482,9931.0180,5042.7630,-2070.772000,79.28,79.28,-7.0728,0.276,4.182,1.556
2,2,3.800389,2,1,1,2.0,5.94,5.2221,44.1668,1.0,...,3.147,4709.6640,2357.7590,-955.434100,77.61,77.61,-5.5289,0.275,4.501,0.725
3,2,3.800389,2,1,1,3.0,10.56,12.9170,8.8385,11.0,...,1.003,260.0703,109.0648,-185.791600,64.07,64.07,-2.5475,0.545,4.215,0.326
4,2,3.800389,2,1,1,4.0,9.24,33.0236,32.1127,28.0,...,1.989,1056.3950,1530.9580,503.560700,-23.73,-23.73,-5.0103,0.172,3.247,1.989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4299,1998,0.334537,1,1,1,2.0,10.56,32.6492,30.3359,30.0,...,1.456,544.7028,178.0702,134.550500,-80.39,-80.39,-2.9726,0.425,4.435,1.737
4300,1998,0.334537,1,1,1,3.0,9.24,14.1499,2.0896,11.0,...,0.966,267.8008,837.4723,-55.061480,1.40,1.40,-3.5053,0.417,2.145,0.890
4301,1998,0.334537,1,1,1,4.0,6.60,33.3488,55.8921,25.0,...,3.295,1076.7420,720.6925,181.418800,-81.55,-81.55,-3.9657,0.224,5.421,0.827
4302,1999,2.924762,0,0,0,1.0,7.26,4.9936,62.3401,1.0,...,1.426,622.1265,1689.5580,-403.985800,6.05,6.05,-4.4539,0.417,2.547,1.431


In [13]:
len(np.unique(df['object_id']))

2000

In [14]:
zero_df = df[df['NUMBER'] == 0] # for images with no sources detected

In [15]:
distance_from_center = lambda row: np.sqrt((row['X_IMAGE']-32)**2 + (row['Y_IMAGE']-32)**2)

In [16]:
df['distance_from_center'] = df.apply(distance_from_center, axis=1)

In [17]:
central_gals_idx = df.groupby('object_id')['distance_from_center'].idxmin().tolist() # get idx of object with the min distance from center for each object id
central_gals_idx = np.sort(central_gals_idx)
df = df[df.index.isin(central_gals_idx)]

In [18]:
df # we are missing about 150 galaxies in total due to the cat files having one more object than was detected in the fits file

Unnamed: 0,object_id,specz,NUMBER_IN_CENTER_30PX_DIAMETER,NUMBER_IN_CENTER_20PX_DIAMETER,NUMBER_IN_CENTER_10PX_DIAMETER,NUMBER,PETRO_RADIUS,X_IMAGE,Y_IMAGE,XMIN_IMAGE,...,ERRCXX_IMAGE,ERRCYY_IMAGE,ERRCXY_IMAGE,THETA_IMAGE,THETA_WORLD,MU_MAX,ELLIPTICITY,FLUX_RADIUS,SPHEROID_SERSICN,distance_from_center
0,1,2.148836,0,0,0,1.0,7.26,11.8601,60.8076,2.0,...,1117.4640,3651.0090,-0.994596,-0.92,-0.92,-4.9909,0.366,3.848,1.306,35.149586
4,2,3.800389,2,1,1,4.0,9.24,33.0236,32.1127,28.0,...,1056.3950,1530.9580,503.560700,-23.73,-23.73,-5.0103,0.172,3.247,1.989,1.029786
5,3,3.127409,1,1,1,1.0,4.62,32.3087,32.7188,24.0,...,36325.9700,40438.7600,-323.845700,6.04,6.04,-7.5134,0.020,2.165,1.370,0.782285
9,4,2.831122,0,0,0,0.0,0.00,0.0000,0.0000,0.0,...,0.0000,0.0000,0.000000,0.00,0.00,0.0000,0.000,0.000,0.000,45.254834
10,5,3.133684,2,1,1,1.0,4.62,32.7581,32.5274,24.0,...,26965.1700,30105.6100,1764.626000,-10.06,-10.06,-6.9867,0.012,2.359,1.143,0.923508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4292,1996,2.268385,0,0,0,0.0,0.00,0.0000,0.0000,0.0,...,0.0000,0.0000,0.000000,0.00,0.00,0.0000,0.000,0.000,0.000,45.254834
4293,1997,3.915436,1,1,1,1.0,5.28,32.6388,32.5898,27.0,...,3219.9790,3416.8280,167.899900,-18.77,-18.77,-5.4145,0.033,2.554,1.355,0.869442
4299,1998,0.334537,1,1,1,2.0,10.56,32.6492,30.3359,30.0,...,544.7028,178.0702,134.550500,-80.39,-80.39,-2.9726,0.425,4.435,1.737,1.786250
4302,1999,2.924762,0,0,0,1.0,7.26,4.9936,62.3401,1.0,...,622.1265,1689.5580,-403.985800,6.05,6.05,-4.4539,0.417,2.547,1.431,40.618559


In [19]:
df.to_csv(f'/data/HSC_generated/HSC_generated_v1/ddpm/g_band_ddpm/shape_parameters.csv', index=False)

In [None]:
subsets = ['testing']
band = 'g' # 'g', 'r', 'i', 'z', 'y'
directories = f'/data/HSC_generated/HSC_generated_v1/cvae/g_band_cvae'
window_diameters = [10, 20, 30] # '10px_diameter' '20px_diameter' '30px_disameter'
num_failed = 0
data = {subset: [] for subset in subsets}

paths = {f'{subset}_path': f'/data/HSC_generated/HSC_generated_v1/cvae/64x64/5x64x64_{subset}.hdf5' for subset in subsets}
object_ids = {}
# mags = {}
redshifts = {}
for subset in subsets:
    with h5py.File(paths[f'{subset}_path']) as file:
        object_ids[subset] = pd.Series(file['object_id'][:])
        # mags[subset] = pd.Series(file[f'{band}_cmodel_mag'][:])
        redshifts[subset] = pd.Series(file['specz_redshift'][:])
        
for subset in subsets:
    for obj_id, idx in zip(object_ids[subset], range(len(object_ids[subset]))):
        try:
            img = fits.getdata(f'{directories}/test_segmented_{obj_id}.fits')
        except:
            num_failed += 1
            print(num_failed)
            continue

        num_pixel_list = []
        for obj_label in np.unique(img):
            if obj_label == 0:
                continue
            num_pixels = np.sum(img == obj_label)
            num_pixel_list.append(num_pixels)

        center_x = 32
        center_y = 32

        num_in_center_per_gal = []
        for window in window_diameters:
            radius = int(window/2)
            in_center = []

            for x, y in zip(range(img.shape[0]), range(img.shape[1])):
                dist = np.sqrt((x - center_x)**2 + (y - center_y)**2)
                if dist <= radius:
                    if img[x, y] != 0:
                        if img[x, y] not in in_center:
                            in_center.append(img[x, y])
            num_in_center_per_gal.append(len(in_center))

        param_names = []
        data_lines = []
        num_params = 0
        with open(f'{directories}/test_petro_{obj_id}.cat') as f:
            for line in f:
                if line.startswith('#'):
                    param_names.append(line.strip().split()[2]) # index of parameter names
                    num_params += 1
                if not line.startswith('#'):
                    data_lines.append(list(map(float, line.strip().split()))) # save list of data in that line, length of the number of object detected, also converted to float

        for window in window_diameters:
            param_names.insert(0, f'NUMBER_IN_CENTER_{window}PX_DIAMETER')
        # param_names.insert(0, f'{band}_cmodel_mag')
        param_names.insert(0, 'specz')
        param_names.insert(0, 'object_id')
        
        if len(data_lines) > len(num_pixel_list):
            data_lines = data_lines[:-1]  # Drop the last element if it's longer than num_pixel_list

        if len(data_lines) == 0:
            data_per_gal = []
            for i in range(len(window_diameters)):
                data_per_gal.insert(0, 0)
            data_per_gal += list(np.zeros((num_params,), dtype=float))
            # data_per_gal.insert(0, mags[subset][idx])
            data_per_gal.insert(0, redshifts[subset][idx])
            data_per_gal.insert(0, obj_id)
            data[subset].append(data_per_gal)
            
        for individual_galaxy_shape_param_info in range(len(data_lines)):
            data_per_gal = []
            for i in range(len(window_diameters)):
                data_per_gal.insert(0, num_in_center_per_gal[i]) 
            for param in data_lines[individual_galaxy_shape_param_info]:
                data_per_gal.append(param)
            # data_per_gal.insert(0, mags[subset][idx])
            data_per_gal.insert(0, redshifts[subset][idx])
            data_per_gal.insert(0, obj_id)
            data[subset].append(data_per_gal)
            
print(num_failed)

df = pd.DataFrame(data['testing'], columns=param_names)

len(np.unique(df['object_id']))

zero_df = df[df['NUMBER'] == 0]

distance_from_center = lambda row: np.sqrt((row['X_IMAGE']-32)**2 + (row['Y_IMAGE']-32)**2)

df['distance_from_center'] = df.apply(distance_from_center, axis=1)

central_gals_idx = df.groupby('object_id')['distance_from_center'].idxmin().tolist() # get idx of object with the min distance from center for each object id
central_gals_idx = np.sort(central_gals_idx)
df = df[df.index.isin(central_gals_idx)]

print(df)

df.to_csv(f'/data/HSC_generated/HSC_generated_v1/cvae/g_band_cvae/shape_parameters.csv', index=False)

KeyError: 'specz_redshift'

In [4]:
from tqdm import tqdm

subsets = ['testing']
band = 'g' # 'g', 'r', 'i', 'z', 'y'
directories = f'/data/HSC_generated/HSC_generated_v1/g_band'
window_diameters = [10, 20, 30] # '10px_diameter' '20px_diameter' '30px_disameter'
num_failed = 0
data = {subset: [] for subset in subsets}

paths = {f'{subset}_path': f'/data/HSC/HSC_v6/step2/64x64/5x64x64.hdf5' for subset in subsets}
object_ids = {}
# mags = {}
redshifts = {}
for subset in subsets:
    with h5py.File(paths[f'{subset}_path']) as file:
        object_ids[subset] = pd.Series(file['object_id'][:])
        # mags[subset] = pd.Series(file[f'{band}_cmodel_mag'][:])
        redshifts[subset] = pd.Series(file['specz_redshift'][:])
        
for subset in subsets:
    for obj_id, idx in tqdm(zip(object_ids[subset], range(len(object_ids[subset])))):
        try:
            img = fits.getdata(f'{directories}/test_segmented_{obj_id}.fits')
        except:
            num_failed += 1
            # print(num_failed)
            continue

        num_pixel_list = []
        for obj_label in np.unique(img):
            if obj_label == 0:
                continue
            num_pixels = np.sum(img == obj_label)
            num_pixel_list.append(num_pixels)

        center_x = 32
        center_y = 32

        num_in_center_per_gal = []
        for window in window_diameters:
            radius = int(window/2)
            in_center = []

            for x, y in zip(range(img.shape[0]), range(img.shape[1])):
                dist = np.sqrt((x - center_x)**2 + (y - center_y)**2)
                if dist <= radius:
                    if img[x, y] != 0:
                        if img[x, y] not in in_center:
                            in_center.append(img[x, y])
            num_in_center_per_gal.append(len(in_center))

        param_names = []
        data_lines = []
        num_params = 0
        with open(f'{directories}/test_petro_{obj_id}.cat') as f:
            for line in f:
                if line.startswith('#'):
                    param_names.append(line.strip().split()[2]) # index of parameter names
                    num_params += 1
                if not line.startswith('#'):
                    data_lines.append(list(map(float, line.strip().split()))) # save list of data in that line, length of the number of object detected, also converted to float

        for window in window_diameters:
            param_names.insert(0, f'NUMBER_IN_CENTER_{window}PX_DIAMETER')
        # param_names.insert(0, f'{band}_cmodel_mag')
        param_names.insert(0, 'specz')
        param_names.insert(0, 'object_id')
        
        if len(data_lines) > len(num_pixel_list):
            data_lines = data_lines[:-1]  # Drop the last element if it's longer than num_pixel_list

        if len(data_lines) == 0:
            data_per_gal = []
            for i in range(len(window_diameters)):
                data_per_gal.insert(0, 0)
            data_per_gal += list(np.zeros((num_params,), dtype=float))
            # data_per_gal.insert(0, mags[subset][idx])
            data_per_gal.insert(0, redshifts[subset][idx])
            data_per_gal.insert(0, obj_id)
            data[subset].append(data_per_gal)
            
        for individual_galaxy_shape_param_info in range(len(data_lines)):
            data_per_gal = []
            for i in range(len(window_diameters)):
                data_per_gal.insert(0, num_in_center_per_gal[i]) 
            for param in data_lines[individual_galaxy_shape_param_info]:
                data_per_gal.append(param)
            # data_per_gal.insert(0, mags[subset][idx])
            data_per_gal.insert(0, redshifts[subset][idx])
            data_per_gal.insert(0, obj_id)
            data[subset].append(data_per_gal)
            
print(num_failed)

df = pd.DataFrame(data['testing'], columns=param_names)

len(np.unique(df['object_id']))

zero_df = df[df['NUMBER'] == 0]

distance_from_center = lambda row: np.sqrt((row['X_IMAGE']-32)**2 + (row['Y_IMAGE']-32)**2)

df['distance_from_center'] = df.apply(distance_from_center, axis=1)

central_gals_idx = df.groupby('object_id')['distance_from_center'].idxmin().tolist() # get idx of object with the min distance from center for each object id
central_gals_idx = np.sort(central_gals_idx)
df = df[df.index.isin(central_gals_idx)]

# print(df)

df.to_csv(f'/data/HSC_generated/HSC_generated_v1/g_band_shape_parameters.csv', index=False)

286401it [01:10, 4091.39it/s]


243441
