In [1]:
import os
import pandas as pd
import numpy as np
from astropy.io import fits
import matplotlib.pyplot as plt
import h5py

In [2]:
subsets = ['testing', 'validation', 'training']
band = 'y' # 'g', 'r', 'i', 'z', 'y'
directories = {subset: f'/data/HSC/HSC_v6/step1/{band}_band_sextractor/{subset}_set_subset' for subset in subsets}
window_diameters = [10, 20, 30] # '10px_diameter' '20px_diameter' '30px_disameter'
num_failed = 0
data = {subset: [] for subset in subsets}

In [3]:
directories

{'testing': '/data/HSC/HSC_v6/step1/y_band_sextractor/testing_set_subset',
 'validation': '/data/HSC/HSC_v6/step1/y_band_sextractor/validation_set_subset',
 'training': '/data/HSC/HSC_v6/step1/y_band_sextractor/training_set_subset'}

In [4]:
paths = {f'{subset}_path': f'/data/HSC/HSC_v6/step2A/127x127/5x127x127_{subset}.hdf5' for subset in subsets}
object_ids = {}
mags = {}
redshifts = {}
for subset in subsets:
    with h5py.File(paths[f'{subset}_path']) as file:
        object_ids[subset] = pd.Series(file['object_id'][:])
        mags[subset] = pd.Series(file[f'{band}_cmodel_mag'][:])
        redshifts[subset] = pd.Series(file['specz_redshift'][:])

In [5]:
for subset in subsets:
    for obj_id, idx in zip(object_ids[subset], range(len(object_ids[subset]))):
        try:
            img = fits.getdata(f'{directories[subset]}/test_segmented_{obj_id}.fits')
        except:
            num_failed += 1
            print(num_failed)
            continue

        num_pixel_list = []
        for obj_label in np.unique(img):
            if obj_label == 0:
                continue
            num_pixels = np.sum(img == obj_label)
            num_pixel_list.append(num_pixels)

        center_x = 63
        center_y = 63

        num_in_center_per_gal = []
        for window in window_diameters:
            radius = int(window/2)
            in_center = []

            for x, y in zip(range(img.shape[0]), range(img.shape[1])):
                dist = np.sqrt((x - center_x)**2 + (y - center_y)**2)
                if dist <= radius:
                    if img[x, y] != 0:
                        if img[x, y] not in in_center:
                            in_center.append(img[x, y])
            num_in_center_per_gal.append(len(in_center))

        param_names = []
        data_lines = []
        with open(f'{directories[subset]}/test_petro_{obj_id}.cat') as f:
            for line in f:
                if line.startswith('#'):
                    param_names.append(line.strip().split()[2]) # index of parameter names
                if not line.startswith('#'):
                    data_lines.append(list(map(float, line.strip().split()))) # save list of data in that line, length of the number of object detected, also converted to float

        for window in window_diameters:
            param_names.insert(0, f'NUMBER_IN_CENTER_{window}PX_DIAMETER')
        param_names.insert(0, 'PIXEL_AREA')
        param_names.insert(0, f'{band}_cmodel_mag')
        param_names.insert(0, 'specz')
        param_names.insert(0, 'object_id')
        for individual_galaxy_shape_param_info in range(len(data_lines)):
            data_per_gal = []
            for i in range(len(window_diameters)):
                data_per_gal.insert(0, num_in_center_per_gal[i]) 
            for param in data_lines[individual_galaxy_shape_param_info]:
                data_per_gal.append(param)
            try:
                data_per_gal.insert(0, num_pixel_list[individual_galaxy_shape_param_info])
            except:
                print(len(num_pixel_list), len(data_lines))
            data_per_gal.insert(0, mags[subset][idx])
            data_per_gal.insert(0, redshifts[subset][idx])
            data_per_gal.insert(0, obj_id)
            data[subset].append(data_per_gal)

3 4
1 2
4 5
2 3
2 3
7 8
4 5
3 4
5 6
6 7
2 3
1 2
3 4
3 4
2 3
1 2
3 4
3 4
1 2
3 4
2 3
2 3
4 5
4 5
3 4
3 4
8 9
2 3
2 3
3 4
4 5
4 5
5 6
2 3
6 7
6 7
2 3
2 3
2 3
2 3
3 4
2 3
3 4
5 6
2 3
2 3
2 3
1 2
3 4
1 2
3 4
2 3
4 5
3 4
2 3
2 3
4 5
6 7
3 4
2 3
2 3
1 2
3 4
4 5
3 4
6 7
5 6
3 4
1 2
6 7
1 2
1 2
3 4
5 6
4 5
4 5
3 4
6 7
2 3
1 2
4 5
2 3
3 4
4 5
5 6
4 5
2 3
4 5
2 3
4 5
4 5
4 5
2 3
4 5
1 2
3 4
4 5
3 4
3 4
1 2
4 5
5 6
2 3
1 2
1 2
2 3
4 5
4 5
2 3
6 7
2 3
2 3
4 5
2 3
2 3
4 5
1 2
4 5
2 3
1 2
6 7
1 2
2 3
3 4
4 5
3 4
1 2
7 8
2 3
4 5
6 7
3 4
4 5
4 5
3 4
4 5
4 5
2 3
1 2
4 5
2 3
5 6
3 4
2 3
4 5
1 2
4 5
3 4
2 3
1 2


In [6]:
param_names

['object_id',
 'specz',
 'y_cmodel_mag',
 'PIXEL_AREA',
 'NUMBER_IN_CENTER_30PX_DIAMETER',
 'NUMBER_IN_CENTER_20PX_DIAMETER',
 'NUMBER_IN_CENTER_10PX_DIAMETER',
 'NUMBER',
 'PETRO_RADIUS',
 'X_IMAGE',
 'Y_IMAGE',
 'XMIN_IMAGE',
 'XMAX_IMAGE',
 'YMIN_IMAGE',
 'YMAX_IMAGE',
 'ISOAREA_IMAGE',
 'ISOAREA_WORLD',
 'A_IMAGE',
 'B_IMAGE',
 'THETA_IMAGE',
 'THETA_WORLD',
 'MU_MAX',
 'ELLIPTICITY',
 'FLUX_RADIUS',
 'SPHEROID_SERSICN']

In [7]:
len(data)

3

In [8]:
print(num_failed)

0


In [9]:
df = pd.DataFrame(data['testing'] + data['validation'] + data['training'], columns=param_names)

In [10]:
df # isoarea and pixel area seem very similar, maybe delete one

Unnamed: 0,object_id,specz,y_cmodel_mag,PIXEL_AREA,NUMBER_IN_CENTER_30PX_DIAMETER,NUMBER_IN_CENTER_20PX_DIAMETER,NUMBER_IN_CENTER_10PX_DIAMETER,NUMBER,PETRO_RADIUS,X_IMAGE,...,ISOAREA_IMAGE,ISOAREA_WORLD,A_IMAGE,B_IMAGE,THETA_IMAGE,THETA_WORLD,MU_MAX,ELLIPTICITY,FLUX_RADIUS,SPHEROID_SERSICN
0,36407046198803509,0.43602,17.614677,379,1,1,1.0,1.0,7.92,60.6967,...,372.0,8.295370e-07,4.266,3.966,-82.86,82.79,-7.8749,0.070,6.054,1.491
1,36407046198803509,0.43602,17.614677,79,1,1,1.0,2.0,5.28,94.6723,...,79.0,1.761651e-07,1.904,1.790,-36.20,36.19,-8.0534,0.060,2.218,0.875
2,36407046198804043,0.23209,16.851135,824,1,1,1.0,1.0,7.92,60.7407,...,824.0,1.837469e-06,6.518,5.251,-38.78,38.74,-8.4882,0.194,8.032,2.242
3,36407050493759629,0.56511,18.164425,399,1,1,1.0,1.0,9.24,60.7617,...,399.0,8.897454e-07,4.771,3.885,-87.73,87.64,-7.3739,0.186,6.735,1.836
4,36407050493773284,0.55775,19.115318,53,1,1,1.0,1.0,9.24,84.9366,...,53.0,1.181867e-07,2.142,1.540,-86.63,86.55,-5.8324,0.281,3.552,1.344
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841394,74649168714420584,0.24194,16.965544,79,1,1,1.0,2.0,8.58,34.2023,...,79.0,1.761651e-07,2.202,2.016,43.04,-43.55,-6.4581,0.084,3.770,1.667
841395,74649168714426572,0.77584,21.360008,7,1,1,1.0,1.0,10.56,85.6859,...,7.0,1.560957e-08,1.104,0.560,63.08,-62.50,-4.6799,0.493,2.737,0.485
841396,74649168714426572,0.77584,21.360008,70,1,1,1.0,2.0,6.60,119.4976,...,70.0,1.560957e-07,2.097,1.768,80.66,-79.85,-6.7588,0.157,2.652,2.000
841397,74649168714426572,0.77584,21.360008,16,1,1,1.0,3.0,10.56,113.1930,...,10.0,2.229938e-08,1.145,0.663,-70.09,70.79,-4.7134,0.421,3.953,2.850


In [11]:
distance_from_center = lambda row: np.sqrt((row['X_IMAGE']-63)**2 + (row['Y_IMAGE']-63)**2)

In [12]:
df['distance_from_center'] = df.apply(distance_from_center, axis=1)

In [13]:
central_gals_idx = df.groupby('object_id')['distance_from_center'].idxmin().tolist() # get idx of object with the min distance from center for each object id
central_gals_idx = np.sort(central_gals_idx)
df = df[df.index.isin(central_gals_idx)]

In [14]:
df # we are missing about 150 galaxies in total due to the cat files having one more object than was detected in the fits file

Unnamed: 0,object_id,specz,y_cmodel_mag,PIXEL_AREA,NUMBER_IN_CENTER_30PX_DIAMETER,NUMBER_IN_CENTER_20PX_DIAMETER,NUMBER_IN_CENTER_10PX_DIAMETER,NUMBER,PETRO_RADIUS,X_IMAGE,...,ISOAREA_WORLD,A_IMAGE,B_IMAGE,THETA_IMAGE,THETA_WORLD,MU_MAX,ELLIPTICITY,FLUX_RADIUS,SPHEROID_SERSICN,distance_from_center
0,36407046198803509,0.43602,17.614677,379,1,1,1.0,1.0,7.92,60.6967,...,8.295370e-07,4.266,3.966,-82.86,82.79,-7.8749,0.070,6.054,1.491,3.734336
2,36407046198804043,0.23209,16.851135,824,1,1,1.0,1.0,7.92,60.7407,...,1.837469e-06,6.518,5.251,-38.78,38.74,-8.4882,0.194,8.032,2.242,3.043547
3,36407050493759629,0.56511,18.164425,399,1,1,1.0,1.0,9.24,60.7617,...,8.897454e-07,4.771,3.885,-87.73,87.64,-7.3739,0.186,6.735,1.836,3.518309
6,36407050493773284,0.55775,19.115318,196,1,1,1.0,3.0,9.90,60.8754,...,4.236883e-07,3.286,2.800,61.63,-61.69,-7.1324,0.148,5.268,2.476,3.037911
13,36407054788744371,0.44495,18.476147,234,1,1,1.0,4.0,6.60,61.1102,...,5.195756e-07,3.420,3.023,-39.18,39.23,-7.9104,0.116,4.189,1.403,3.358037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841387,74649168714401486,0.72650,19.710840,121,1,1,1.0,1.0,7.92,59.6986,...,2.698226e-07,2.587,2.430,35.51,-36.00,-6.6440,0.061,3.767,1.405,4.060160
841389,74649168714401583,1.86393,20.932972,62,1,1,1.0,2.0,5.94,61.6153,...,1.382562e-07,1.786,1.721,85.12,-84.31,-6.6126,0.036,2.410,1.835,3.386439
841391,74649168714403940,0.89864,21.601278,34,1,1,1.0,1.0,9.90,59.3400,...,7.581790e-08,1.875,1.305,-28.82,28.23,-5.0249,0.304,3.323,0.679,4.004444
841393,74649168714420584,0.24194,16.965544,972,1,1,1.0,1.0,6.60,59.4535,...,2.167500e-06,6.226,5.502,-52.62,52.12,-8.7467,0.116,6.762,2.092,4.048591


In [15]:
df.to_csv(f'/data/HSC/HSC_v6/step1/{band}_band_sextractor/shape_parameters.csv', index=False)