In [1]:
import os
import pandas as pd
import numpy as np
from astropy.io import fits
import matplotlib.pyplot as plt
import h5py

In [2]:
subsets = ['testing', 'validation', 'training']
band = 'g' # 'g', 'r', 'i', 'z', 'y'
directories = {subset: f'/data/HSC/HSC_v6/step1/{band}_band_sextractor/{subset}_set_subset' for subset in subsets}
window_diameters = [10, 20, 30] # '10px_diameter' '20px_diameter' '30px_disameter'
num_failed = 0
data = {subset: [] for subset in subsets}

In [3]:
directories

{'testing': '/data/HSC/HSC_v6/step1/g_band_sextractor/testing_set_subset',
 'validation': '/data/HSC/HSC_v6/step1/g_band_sextractor/validation_set_subset',
 'training': '/data/HSC/HSC_v6/step1/g_band_sextractor/training_set_subset'}

In [4]:
paths = {f'{subset}_path': f'/data/HSC/HSC_v6/step2A/127x127/5x127x127_{subset}.hdf5' for subset in subsets}
object_ids = {}
mags = {}
redshifts = {}
for subset in subsets:
    with h5py.File(paths[f'{subset}_path']) as file:
        object_ids[subset] = pd.Series(file['object_id'][:])
        mags[subset] = pd.Series(file[f'{band}_cmodel_mag'][:])
        redshifts[subset] = pd.Series(file['specz_redshift'][:])

In [5]:
len(object_ids['training']) + len(object_ids['testing']) + len(object_ids['validation'])

286401

In [6]:
for subset in subsets:
    for obj_id, idx in zip(object_ids[subset], range(len(object_ids[subset]))):
        try:
            img = fits.getdata(f'{directories[subset]}/test_segmented_{obj_id}.fits')
        except:
            num_failed += 1
            print(num_failed)
            continue

        num_pixel_list = []
        for obj_label in np.unique(img):
            if obj_label == 0:
                continue
            num_pixels = np.sum(img == obj_label)
            num_pixel_list.append(num_pixels)

        center_x = 63
        center_y = 63

        num_in_center_per_gal = []
        for window in window_diameters:
            radius = int(window/2)
            in_center = []

            for x, y in zip(range(img.shape[0]), range(img.shape[1])):
                dist = np.sqrt((x - center_x)**2 + (y - center_y)**2)
                if dist <= radius:
                    if img[x, y] != 0:
                        if img[x, y] not in in_center:
                            in_center.append(img[x, y])
            num_in_center_per_gal.append(len(in_center))

        param_names = []
        data_lines = []
        num_params = 0
        with open(f'{directories[subset]}/test_petro_{obj_id}.cat') as f:
            for line in f:
                if line.startswith('#'):
                    param_names.append(line.strip().split()[2]) # index of parameter names
                    num_params += 1
                if not line.startswith('#'):
                    data_lines.append(list(map(float, line.strip().split()))) # save list of data in that line, length of the number of object detected, also converted to float

        for window in window_diameters:
            param_names.insert(0, f'NUMBER_IN_CENTER_{window}PX_DIAMETER')
        param_names.insert(0, f'{band}_cmodel_mag')
        param_names.insert(0, 'specz')
        param_names.insert(0, 'object_id')
        
        if len(data_lines) > len(num_pixel_list):
            data_lines = data_lines[:-1]  # Drop the last element if it's longer than num_pixel_list

        if len(data_lines) == 0:
            data_per_gal = []
            for i in range(len(window_diameters)):
                data_per_gal.insert(0, 0)
            data_per_gal += list(np.zeros((num_params,), dtype=float))
            data_per_gal.insert(0, mags[subset][idx])
            data_per_gal.insert(0, redshifts[subset][idx])
            data_per_gal.insert(0, obj_id)
            data[subset].append(data_per_gal)
            
        for individual_galaxy_shape_param_info in range(len(data_lines)):
            data_per_gal = []
            for i in range(len(window_diameters)):
                data_per_gal.insert(0, num_in_center_per_gal[i]) 
            for param in data_lines[individual_galaxy_shape_param_info]:
                data_per_gal.append(param)
            data_per_gal.insert(0, mags[subset][idx])
            data_per_gal.insert(0, redshifts[subset][idx])
            data_per_gal.insert(0, obj_id)
            data[subset].append(data_per_gal)

In [7]:
param_names

['object_id',
 'specz',
 'g_cmodel_mag',
 'NUMBER_IN_CENTER_30PX_DIAMETER',
 'NUMBER_IN_CENTER_20PX_DIAMETER',
 'NUMBER_IN_CENTER_10PX_DIAMETER',
 'NUMBER',
 'PETRO_RADIUS',
 'X_IMAGE',
 'Y_IMAGE',
 'XMIN_IMAGE',
 'XMAX_IMAGE',
 'YMIN_IMAGE',
 'YMAX_IMAGE',
 'ISOAREA_IMAGE',
 'ISOAREA_WORLD',
 'A_IMAGE',
 'B_IMAGE',
 'THETA_IMAGE',
 'THETA_WORLD',
 'MU_MAX',
 'ELLIPTICITY',
 'FLUX_RADIUS',
 'SPHEROID_SERSICN']

In [8]:
len(data)

3

In [9]:
print(num_failed)

0


In [10]:
df = pd.DataFrame(data['testing'] + data['validation'] + data['training'], columns=param_names)

In [11]:
df # isoarea and pixel area seem very similar, maybe delete one

Unnamed: 0,object_id,specz,g_cmodel_mag,NUMBER_IN_CENTER_30PX_DIAMETER,NUMBER_IN_CENTER_20PX_DIAMETER,NUMBER_IN_CENTER_10PX_DIAMETER,NUMBER,PETRO_RADIUS,X_IMAGE,Y_IMAGE,...,ISOAREA_IMAGE,ISOAREA_WORLD,A_IMAGE,B_IMAGE,THETA_IMAGE,THETA_WORLD,MU_MAX,ELLIPTICITY,FLUX_RADIUS,SPHEROID_SERSICN
0,36407046198803509,0.43602,20.320715,1,1,1,1.0,9.90,60.7368,60.2556,...,327.0,7.291898e-07,4.186,3.586,89.53,-89.61,-5.2324,0.143,5.986,1.598
1,36407046198803509,0.43602,20.320715,1,1,1,2.0,10.56,77.8462,4.8655,...,8.0,1.783951e-08,0.837,0.684,-48.47,48.44,-2.7485,0.183,2.440,0.772
2,36407046198803509,0.43602,20.320715,1,1,1,3.0,10.56,78.7059,72.7290,...,73.0,1.627855e-07,2.258,2.237,41.72,-42.00,-3.3832,0.009,5.322,1.401
3,36407046198803509,0.43602,20.320715,1,1,1,4.0,4.62,94.6603,114.8387,...,179.0,3.991589e-07,2.083,2.052,-6.58,6.53,-7.9784,0.015,2.075,1.162
4,36407046198804043,0.23209,19.497759,1,1,1,1.0,9.90,60.5698,61.0655,...,977.0,2.178650e-06,7.631,6.184,-45.91,45.88,-5.7559,0.190,11.489,2.622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1144777,74649168714427725,1.06695,22.906536,1,1,1,6.0,10.56,9.8920,100.1101,...,8.0,1.783951e-08,0.851,0.673,43.47,-43.92,-1.9269,0.209,3.189,1.440
1144778,74649168714427725,1.06695,22.906536,1,1,1,7.0,10.56,47.7302,85.1164,...,13.0,2.898920e-08,1.504,0.648,13.36,-14.14,-1.8160,0.569,3.347,0.980
1144779,74649168714427725,1.06695,22.906536,1,1,1,8.0,10.56,66.1889,81.4462,...,11.0,2.452932e-08,1.123,0.739,-48.89,48.45,-1.8965,0.342,2.896,0.468
1144780,74649168714427725,1.06695,22.906536,1,1,1,9.0,6.60,60.6106,60.2108,...,127.0,2.832022e-07,3.189,2.272,-50.09,49.65,-3.4308,0.288,4.041,0.584


In [12]:
len(np.unique(df['object_id']))

286401

In [13]:
zero_df = df[df['NUMBER'] == 0] # for images with no sources detected

In [14]:
distance_from_center = lambda row: np.sqrt((row['X_IMAGE']-63)**2 + (row['Y_IMAGE']-63)**2)

In [15]:
df['distance_from_center'] = df.apply(distance_from_center, axis=1)

In [16]:
central_gals_idx = df.groupby('object_id')['distance_from_center'].idxmin().tolist() # get idx of object with the min distance from center for each object id
central_gals_idx = np.sort(central_gals_idx)
df = df[df.index.isin(central_gals_idx)]

In [17]:
df # we are missing about 150 galaxies in total due to the cat files having one more object than was detected in the fits file

Unnamed: 0,object_id,specz,g_cmodel_mag,NUMBER_IN_CENTER_30PX_DIAMETER,NUMBER_IN_CENTER_20PX_DIAMETER,NUMBER_IN_CENTER_10PX_DIAMETER,NUMBER,PETRO_RADIUS,X_IMAGE,Y_IMAGE,...,ISOAREA_WORLD,A_IMAGE,B_IMAGE,THETA_IMAGE,THETA_WORLD,MU_MAX,ELLIPTICITY,FLUX_RADIUS,SPHEROID_SERSICN,distance_from_center
0,36407046198803509,0.43602,20.320715,1,1,1,1.0,9.90,60.7368,60.2556,...,7.291898e-07,4.186,3.586,89.53,-89.61,-5.2324,0.143,5.986,1.598,3.557219
4,36407046198804043,0.23209,19.497759,1,1,1,1.0,9.90,60.5698,61.0655,...,2.178650e-06,7.631,6.184,-45.91,45.88,-5.7559,0.190,11.489,2.622,3.106149
6,36407050493759629,0.56511,21.629736,1,1,1,1.0,10.56,61.0002,60.2766,...,3.612500e-07,3.222,3.003,86.02,-86.12,-4.0039,0.068,7.286,2.227,3.378773
12,36407050493773284,0.55775,22.469265,1,1,1,6.0,10.56,61.0426,60.7620,...,1.494059e-07,2.148,1.909,-67.51,67.46,-3.9265,0.111,5.118,3.225,2.973224
16,36407054788744371,0.44495,21.639175,1,1,1,3.0,7.26,61.1105,60.2413,...,3.701698e-07,3.075,2.656,-27.79,27.84,-4.8882,0.136,4.128,1.628,3.343746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1144757,74649168714401583,1.86393,21.211262,2,1,1,3.0,4.62,61.6107,59.9093,...,4.816667e-07,2.517,2.391,85.09,-84.27,-6.1559,0.050,2.523,1.186,3.388596
1144763,74649168714403940,0.89864,22.631859,1,1,1,4.0,5.94,59.1780,61.2204,...,2.854321e-07,2.764,2.395,-12.47,11.63,-3.8753,0.133,3.486,0.593,4.216000
1144765,74649168714420584,0.24194,19.315403,1,1,1,1.0,6.60,59.3223,61.2335,...,1.926667e-06,5.979,5.182,-51.76,51.27,-6.3171,0.133,6.529,1.834,4.079951
1144770,74649168714426572,0.77584,22.623613,1,1,1,4.0,6.60,61.9607,60.6727,...,3.211111e-07,3.426,2.406,72.84,-72.12,-3.3291,0.298,4.448,0.602,2.548817


In [18]:
df.to_csv(f'/data/HSC/HSC_v6/step1/{band}_band_sextractor/shape_parameters.csv', index=False)