In [1]:
import os
import csv
import h5py
import ngmix
import galsim
import random
import numpy as np
import pandas as pd
from astropy.io import fits
import matplotlib.pyplot as plt

In [2]:
# --- Mosaic geometry and simulation constants ------------------------------
nx = 60  # number of postage stamps along x in one mosaic image
ny = 50  # number of postage stamps along y in one mosaic image
stamp_xsize = 127 # pixel
stamp_ysize = 127 # pixel
pixel_scale = .2 # arcsec / pixel
psf_sigma = 2. * pixel_scale # arcsec
shift_radius = .2 # arcsec
noise_sigma = .001  # sigma handed to galsim.GaussianNoise in the simulation loop

# full mosaic size in pixels
image_xsize = stamp_xsize * nx
image_ysize = stamp_ysize * ny

# number of mosaics generated by the simulation loop
n_realizations = 1000

# galsim.Gaussian's first positional parameter is half_light_radius, so the
# width has to be passed by keyword for psf_sigma to really be used as sigma.
psf_gauss = galsim.Gaussian(sigma=psf_sigma)
shift_radius_sq = shift_radius ** 2

In [3]:
# define ranges
shear_range = (-0.1, 0.1) # uniform distribution

sersic_bulge_range = (1.,4.)   # Sersic index range of the bulge component
sersic_disk_range = (1., 1.5)  # Sersic index range of the disk component

sersic_n_range = (4.0, 6.0) #uniform distribution
R_range = (2.0, 8.0) # pix, half-light radius, uniform distribution
S_range = (1., 15.0) # pix^-2, uniform distribution
e_range = (0., .6)
e_mode = .2

# define random number generators
# seed_1 for galaxy simulation (also offsets the disk generator of composite galaxies)
# seed_3 offsets the bulge generator of composite galaxies
seed_1 = 387920
seed_2 = 6424512
seed_3 = 8793182

rng_1 = galsim.BaseDeviate(seed_1)
random_seed = rng_1.raw() # used for gal simulation

rng_2 = np.random.RandomState(seed_2) # used in metacal

# The notebook also draws from the *global* NumPy and stdlib generators
# (np.random.random for the stamp shifts, random.sample for the dataset
# split).  Seed them here so that Restart & Run All gives the same numbers.
np.random.seed(seed_3)
random.seed(seed_2)

In [4]:
def get_psf_angle(x,y):
    """Return the angle ``arctan(y / x)`` in radians, in [-pi/2, pi/2].

    Parameters
    ----------
    x, y : float or array-like
        Coordinates of a point.

    Returns
    -------
    theta : numpy.float64 or numpy.ndarray
        ``arctan(y / x)``.  On the axis ``x == 0`` the result is ``+pi/2`` or
        ``-pi/2`` depending on the sign of ``y`` (instead of raising
        ``ZeroDivisionError``), and ``0`` at the origin.
    """
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    # Let IEEE arithmetic produce +/-inf for y/0; arctan(+/-inf) is +/-pi/2.
    with np.errstate(divide='ignore', invalid='ignore'):
        theta = np.arctan(y_arr / x_arr)
    # 0/0 is NaN; the origin has no direction, so define its angle as 0.
    theta = np.where((x_arr == 0) & (y_arr == 0), 0.0, theta)
    # [()] turns a 0-d array back into a scalar and leaves real arrays alone.
    return theta[()]

In [5]:
def get_psf_e(x, lower_limit, upper_limit, image_size):
    """Linearly map a position to an ellipticity value.

    The result is ``lower_limit`` at ``x == 0`` and ``upper_limit`` at
    ``x == image_size``, varying linearly with ``x``.
    """
    slope = (upper_limit - lower_limit) / image_size
    return lower_limit + slope * x

In [6]:
def get_psf(e1, e2):
    """Return the module-level ``psf_gauss`` profile sheared by (e1, e2)."""
    return psf_gauss.shear(e1=e1, e2=e2)

In [7]:
def get_shear(rng, shear_range):
    """Draw two independent uniform values from ``shear_range``.

    ``rng`` must expose a NumPy-style generator as ``rng.np``; the result is
    an array of length 2.
    """
    low, high = shear_range[0], shear_range[1]
    return rng.np.uniform(low, high, size=2)

In [8]:
def get_flux(rng, S_range, R):
    """Draw a uniform value S from ``S_range`` and return ``S * pi * R**2``.

    ``rng`` must expose a NumPy-style generator as ``rng.np``.
    """
    surface_brightness = rng.np.uniform(S_range[0], S_range[1])
    return surface_brightness * np.pi * R ** 2

In [9]:
def get_gal(rng, R_range,sersic_n_range, pixel_scale):
    """Build a unit-flux Sersic profile with random size and index.

    The half-light radius is drawn uniformly from ``R_range`` and multiplied
    by ``pixel_scale``; the Sersic index is drawn uniformly from
    ``sersic_n_range``.  Returns ``(profile, half_light_radius, index)``.
    """
    radius_lo, radius_hi = R_range[0], R_range[1]
    index_lo, index_hi = sersic_n_range[0], sersic_n_range[1]

    # draw order matters for reproducibility: radius first, then index
    half_light_radius = rng.np.uniform(radius_lo, radius_hi) * pixel_scale
    sersic_index = rng.np.uniform(index_lo, index_hi)

    profile = galsim.Sersic(n=sersic_index, half_light_radius=half_light_radius, flux=1.)
    return profile, half_light_radius, sersic_index

In [10]:
def get_spiral(rng_s, rng_s1, R_range, sersic_bulge_range, sersic_disk_range, S_range, pixel_scale):
    """Build a two-component (disk + bulge) Sersic galaxy.

    Parameters
    ----------
    rng_s : generator exposing ``.np``; used for the disk fraction and all disk draws
    rng_s1 : generator exposing ``.np``; used for all bulge draws
    R_range : (low, high) half-light radius range, scaled by each component's fraction
    sersic_bulge_range, sersic_disk_range : (low, high) Sersic index ranges
    S_range : (low, high) range passed to ``get_flux``, scaled by each component's fraction
    pixel_scale : factor applied to the drawn radii

    Returns
    -------
    gal : the weighted sum ``frac_disk * disk + frac_bulge * bulge``
    (R_bulge, R_disk) : half-light radii of the two components
    (n_b, n_d) : Sersic indices of the two components
    flux : total flux carried by ``gal``
    """
    # s for disk, s1 for bulge
    frac_disk = rng_s.np.uniform()
    frac_bulge = 1 - frac_disk

    # --- disk ---
    R_disk_range = tuple(element * frac_disk for element in R_range)
    R_disk = rng_s.np.uniform(R_disk_range[0], R_disk_range[1]) * pixel_scale
    S_disk = tuple(element * frac_disk for element in S_range)
    F_disk = get_flux(rng_s, S_disk, R_disk)
    n_d = rng_s.np.uniform(sersic_disk_range[0], sersic_disk_range[1])

    # --- bulge ---
    R_bulge_range = tuple(element * frac_bulge for element in R_range)
    # draw from the bulge-scaled range (previously the unscaled R_range was
    # used and R_bulge_range was computed but never read)
    R_bulge = rng_s1.np.uniform(R_bulge_range[0], R_bulge_range[1]) * pixel_scale
    S_bulge = tuple(element * frac_bulge for element in S_range)
    # the bulge flux must be based on the bulge radius, not the disk radius
    F_bulge = get_flux(rng_s1, S_bulge, R_bulge)
    n_b = rng_s1.np.uniform(sersic_bulge_range[0], sersic_bulge_range[1])

    bulge = galsim.Sersic(n = n_b, half_light_radius = R_bulge, flux = F_bulge)
    disk = galsim.Sersic(n = n_d, half_light_radius = R_disk, flux = F_disk)

    gal = frac_disk * disk + frac_bulge * bulge

    # Multiplying a profile by a scalar rescales its flux, so the sum carries
    # the weighted fluxes; report that value rather than F_bulge + F_disk.
    total_flux = frac_disk * F_disk + frac_bulge * F_bulge

    return gal, (R_bulge, R_disk), (n_b, n_d), total_flux

In [11]:
def get_gal_ellipticity(rng, e_mode, e_range):
    """Draw an intrinsic ellipticity (e1, e2) with a random orientation.

    Parameters
    ----------
    rng : generator exposing a NumPy-style generator as ``rng.np``
    e_mode : float
        The magnitude is Rayleigh distributed with scale ``e_mode / sqrt(2)``.
    e_range : (low, high)
        The drawn magnitude is clipped to this interval.

    Returns
    -------
    (e1, e2) : tuple of float
        Components with ``hypot(e1, e2)`` equal to the clipped magnitude.
    """
    ellip = rng.np.rayleigh(e_mode/np.sqrt(2))
    ellip = np.clip(ellip, e_range[0], e_range[1])
    # One shared phase: e1 and e2 are the cosine and sine of the same angle,
    # which keeps |e| equal to the clipped magnitude.
    phase = 2 * np.pi * rng.np.uniform(0.,1.)
    e1 = ellip * np.cos(phase)
    e2 = ellip * np.sin(phase)
    return (e1, e2)

In [12]:
def get_snr(rng):
    """Draw a signal-to-noise ratio from Normal(45, 30), floored at 0.1.

    ``rng`` must expose a NumPy-style generator as ``rng.np``.
    """
    floor = 0.1
    draw = rng.np.normal(45, 30)
    return floor if floor > draw else draw

In [13]:
# Output layout:
#   /data3/shear_simulated_galaxy/
#       fits_files/galaxy_images/   one FITS file per simulated galaxy stamp
#       fits_files/psf_images/      one FITS file per matching PSF stamp
#       NonUniformPsf_image127x127_with_metadata.hdf5
_base_directory = '/data3/shear_simulated_galaxy'

fit_files_directory = os.path.join(_base_directory, 'fits_files')
galaxy_path = os.path.join(fit_files_directory, 'galaxy_images')
psf_path = os.path.join(fit_files_directory, 'psf_images')

# makedirs creates missing parents and is a no-op for existing directories
for _directory in (galaxy_path, psf_path):
    os.makedirs(_directory, exist_ok=True)

hdf5_path = os.path.join(_base_directory, 'NonUniformPsf_image127x127_with_metadata.hdf5')
# Opening in 'w' mode creates the file or truncates an existing one.  Close it
# straight away so no write handle stays open while later cells reopen the file.
hf = h5py.File(hdf5_path, 'w')
hf.close()

In [14]:
# Simulate n_realizations mosaics of nx * ny postage stamps.  Every stamp is
# drawn as a galaxy image and a PSF image, written to its own FITS file, and
# described by one record in `metadata`.
metadata = []
ImForFits = []

image_id_counter = 0

for i in range(n_realizations):

    gal_image = galsim.ImageF(stamp_xsize * nx, stamp_ysize * ny, scale = pixel_scale)
    psf_image = galsim.ImageF(stamp_xsize * nx, stamp_ysize * ny, scale = pixel_scale)

    for iy in range(ny):
        for ix in range(nx):

            # Index of this stamp inside the mosaic.  It is derived from the
            # position (rather than incremented at the end of the body) so a
            # skipped stamp cannot leave the following stamps on the same seed,
            # and so the same stamp gets the same generators in every realization.
            k = iy * nx + ix

            rng = galsim.BaseDeviate(random_seed+k+1)
            snr = get_snr(rng)
            if snr<=10:
                continue

            rng_s = galsim.BaseDeviate(seed_1+k+1)
            rng_s1 = galsim.BaseDeviate(seed_3+k+1)

            # create galaxy in this subimage
            e = get_gal_ellipticity(rng, e_mode, e_range)
            isSpiral = False
            if (rng.np.uniform() < 0.5):
                gal, R, n = get_gal(rng, R_range, sersic_n_range, pixel_scale)
                S = rng.np.uniform(S_range[0], S_range[1])
                F = S * np.pi * R**2
                # GalSim profiles are immutable: withFlux returns a new object
                gal = gal.withFlux(F)
            else:
                gal, R, n, F = get_spiral(rng_s, rng_s1, R_range, sersic_bulge_range, sersic_disk_range, S_range, pixel_scale)
                isSpiral = True

            gal = gal.shear(e1=e[0], e2=e[1])

            g1, g2 = get_shear(rng, shear_range)

            # create subimage
            # +1  and -1 to create one pixel border between stamps
            b = galsim.BoundsI(ix * stamp_xsize + 1,(ix+1) * stamp_xsize -1,
                                iy * stamp_ysize + 1, (iy+1) * stamp_ysize -1)
            sub_gal_image = gal_image[b]
            sub_psf_image = psf_image[b]

            # create psf: ellipticity varies linearly with the stamp centre,
            # orientation follows the direction from the mosaic centre
            e1 = get_psf_e((b.xmin+b.xmax)/2, -.25, .25, image_xsize)
            e2 = get_psf_e((b.ymin+b.ymax)/2, 0., .25, image_ysize)

            theta = get_psf_angle((b.xmin+b.xmax)/2 - nx*stamp_xsize/2 + 1e-10, (b.ymin+b.ymax)/2 - ny*stamp_ysize/2)
            this_psf = get_psf(e1, e2).rotate(theta * galsim.radians)

            # rotate each galaxy such that all iterations cover a full circle.
            # rng is re-seeded identically for this stamp in every realization,
            # so this draw yields the same per-galaxy base angle each time.
            rot = rng.np.uniform(0, 2 * np.pi)
            rot_angle = 2 * i * np.pi / n_realizations + rot

            gal = gal.rotate(rot_angle * galsim.radians)

            # shear the galaxy
            shear_gal = gal.shear(g1 = g1, g2 = g2)

            # shift galaxy
            # NOTE(review): dx and dy come from NumPy's global generator, not
            # from the per-stamp rng.  Each offset lies within
            # +/- shift_radius/2, so rsq never exceeds shift_radius_sq and the
            # rejection loop always exits after one draw -- confirm the
            # intended shift distribution.
            rsq = 2 * shift_radius_sq
            while (rsq > shift_radius_sq):
                dx = (2*np.random.random()-1) * shift_radius/2
                dy = (2*np.random.random()-1) * shift_radius/2
                rsq = dx**2 + dy**2

            this_gal = shear_gal.shift(dx,dy)

            # convolve psf with gal
            final_gal = galsim.Convolve([this_psf, this_gal])
            subim = final_gal.drawImage(sub_gal_image)

            # add noise
            noise = galsim.GaussianNoise(sigma = noise_sigma)
            subim.addNoiseSNR(noise, snr, preserve_flux=True)

            subpsf_im = this_psf.drawImage(sub_psf_image)

            # generate object_id
            # galaxy_id: shared by all realizations of the same stamp.  The
            # stamp index is unique per (ix, iy); concatenating the digits of
            # ix and iy is not (ix=1, iy=10 and ix=11, iy=0 both gave 110).
            object_id = k
            object_id_g = image_id_counter # actual object id
            object_id_p = int(str(object_id_g)+"0") #psf id, have an additional 0 at the end of galaxy id

            image_id_counter +=1

            # write images to fits files
            psf_name = str(object_id_p) + '.fits'
            psf_file_name = os.path.join(psf_path, psf_name)

            gal_name = str(object_id_g) + '.fits'
            gal_file_name = os.path.join(galaxy_path, gal_name)

            subpsf_im.write(psf_file_name)
            subim.write(gal_file_name)

            # Single-component galaxies fill the plain columns, composite
            # galaxies the bulge/disk columns; unused columns are set to 0.
            metadata.append({
                'object_id': object_id_g,
                'galaxy_id': object_id,
                'g1': g1,
                'g2': g2,
                'sersic_n': 0. if isSpiral else n,
                'sersic_bulge_n': n[0] if isSpiral else 0.,
                'sersic_disk_n': n[1] if isSpiral else 0.,
                'half_light_radius': 0. if isSpiral else R,
                'half_light_radius(bulge)': R[0] if isSpiral else 0.,
                'half_light_radius(disk)': R[1] if isSpiral else 0.,
                'flux': F,
                'shift_radius_dx': dx,
                'shift_radius_dy': dy,
                'snr': snr
            })
            isSpiral = False

In [15]:
# Persist the per-stamp records collected during the simulation as a CSV table
# (one row per record, no index column).
csv_file_path = os.path.join('/data3/shear_simulated_galaxy', 'metadata.csv')
metadata_df = pd.DataFrame(metadata)
metadata_df.to_csv(csv_file_path, index=False)

In [16]:
def _pad_to_stamp(image_data, size=127):
    """Zero-pad a 2-D array to ``size`` x ``size``, splitting the padding so
    that any odd remainder goes after the data."""
    rows_before = int((size - len(image_data)) / 2)
    rows_after = size - len(image_data) - rows_before
    cols_before = int((size - len(image_data[0])) / 2)
    cols_after = size - len(image_data[0]) - cols_before
    return np.pad(image_data, ((rows_before, rows_after), (cols_before, cols_after)),
                  "constant", constant_values=((0, 0), (0, 0)))


def _numeric_fits_names(directory):
    """Return the ``.fits`` file names in ``directory`` ordered by the integer
    value of their stem (so ``2.fits`` comes before ``10.fits``)."""
    names = [name for name in os.listdir(directory) if name.endswith('.fits')]
    return sorted(names, key=lambda name: int(os.path.splitext(name)[0]))


def make_hdf5_from_raw_images():
    """Pack the FITS stamps and ``metadata.csv`` into the HDF5 file.

    Every metadata column becomes a 1-D dataset; the galaxy and PSF stamps are
    zero-padded to 127 x 127 and stored in ``galaxy_image`` / ``psf_image``.
    Row ``j`` of every dataset refers to the same object: files are ordered by
    their numeric id and the metadata by ``object_id``.

    Returns
    -------
    str or None
        A message string when the galaxy and PSF directories hold a different
        number of images, otherwise ``None``.

    Raises
    ------
    ValueError
        If the galaxy file ids do not match the ``object_id`` column, since the
        image rows could then not be aligned with the metadata rows.
    """
    base_dir = "/data3/shear_simulated_galaxy"
    galaxy_dir = os.path.join(base_dir, "fits_files", "galaxy_images")
    psf_dir = os.path.join(base_dir, "fits_files", "psf_images")
    stamp_size = 127

    # object id is the filename of the galaxy images; a PSF id is the galaxy id
    # with a trailing 0, so numeric order pairs both lists correctly.
    # (A plain string sort would put "10.fits" before "2.fits".)
    image_name_list = _numeric_fits_names(galaxy_dir)
    psf_name_list = _numeric_fits_names(psf_dir)

    # check if galaxy and psf list have equal number of images
    if len(image_name_list)!=len(psf_name_list):
        return "not every galaxy has coresponding psf"

    #load metadata, ordered like the image files
    sheardata = pd.read_csv(os.path.join(base_dir, 'metadata.csv'))
    sheardata = sheardata.sort_values('object_id')

    image_ids = [int(os.path.splitext(name)[0]) for name in image_name_list]
    if image_ids != sheardata['object_id'].tolist():
        raise ValueError("galaxy image files do not match the object_id column of metadata.csv")

    n_images = len(image_name_list)
    hdf5_file = os.path.join(base_dir, 'NonUniformPsf_image127x127_with_metadata.hdf5')

    # 'a' keeps whatever is already in the file; create_dataset raises if a
    # dataset of the same name exists.
    with h5py.File(hdf5_file, 'a') as hf:

        # create metadata's corresponding dataset in hdf5 file
        # (DataFrame.items replaces the deprecated DataFrame.iteritems)
        for (columnName, columnData) in sheardata.items():
            print(columnName)
            hf.create_dataset(columnName,data=sheardata[columnName])

        for i, (galaxy_name, psf_name) in enumerate(zip(image_name_list, psf_name_list)):

            # read and pad inside the context manager so the FITS file is
            # closed even if padding fails; np.pad returns an independent copy
            with fits.open(os.path.join(galaxy_dir, galaxy_name)) as g_image:
                g_im = _pad_to_stamp(g_image[0].data, stamp_size)
            with fits.open(os.path.join(psf_dir, psf_name)) as p_image:
                p_im = _pad_to_stamp(p_image[0].data, stamp_size)

            if i == 0:
                # allocate all rows once instead of resizing per image; the
                # datasets stay resizable along the first axis as before
                hf.create_dataset("galaxy_image", shape=(n_images, stamp_size, stamp_size),
                                  dtype=g_im.dtype, chunks=True,
                                  maxshape=(None, stamp_size, stamp_size))
                hf.create_dataset("psf_image", shape=(n_images, stamp_size, stamp_size),
                                  dtype=p_im.dtype, chunks=True,
                                  maxshape=(None, stamp_size, stamp_size))

            hf['galaxy_image'][i, :, :] = g_im
            hf['psf_image'][i, :, :] = p_im


In [17]:
# Build the combined HDF5 file from metadata.csv and the FITS stamps on disk.
make_hdf5_from_raw_images()

  for (columnName, columnData) in sheardata.iteritems():


object_id
galaxy_id
g1
g2
sersic_n
sersic_bulge_n
sersic_disk_n
half_light_radius
half_light_radius(bulge)
half_light_radius(disk)
flux
shift_radius_dx
shift_radius_dy
snr


In [18]:
# List the names of the top-level entries stored in the combined HDF5 file.
file_path = '/data3/shear_simulated_galaxy/NonUniformPsf_image127x127_with_metadata.hdf5'
with h5py.File(file_path, 'r') as hf:
    column_names = [key for key in hf.keys()]
    print("Column Names (Datasets):")
    print(column_names)

Column Names (Datasets):
['flux', 'g1', 'g2', 'galaxy_id', 'galaxy_image', 'half_light_radius', 'half_light_radius(bulge)', 'half_light_radius(disk)', 'object_id', 'psf_image', 'sersic_bulge_n', 'sersic_disk_n', 'sersic_n', 'shift_radius_dx', 'shift_radius_dy', 'snr']


In [19]:
def make_hsc_v6_large(ntrain=3,ntest=1,nvalidation=1):
    """Split the combined HDF5 file into training / testing / validation files.

    All rows sharing a ``galaxy_id`` (the different realizations of one galaxy)
    are kept together.  ``ntrain``, ``ntest`` and ``nvalidation`` distinct
    galaxy ids are sampled without overlap using the stdlib ``random`` module.
    Each output file (input name plus ``_training`` / ``_testing`` /
    ``_validation``) holds one group ``galaxy_image/<galaxy_id>`` per selected
    galaxy, containing that galaxy's rows of every dataset in the input file.

    Raises
    ------
    ValueError
        If more galaxy ids are requested than the input file contains.
    """
    inputfile = 'NonUniformPsf_image127x127_with_metadata.hdf5'
    directory = '/data3/shear_simulated_galaxy/'
    current_file = os.path.join(directory, inputfile)
    part = os.path.splitext(current_file)
    file_ends = ['_training', '_testing', '_validation']

    with h5py.File(current_file,'r') as hf:

        # group different realizations of the same galaxy together
        galaxy_id_list = hf['galaxy_id'][:]
        all_galaxy_ids = np.unique(galaxy_id_list).tolist()

        requested = ntrain + ntest + nvalidation
        if requested > len(all_galaxy_ids):
            raise ValueError(
                f"requested {requested} galaxy ids but only {len(all_galaxy_ids)} are available")

        # sample the three disjoint id sets in the order train, test, validation
        inds_train = random.sample(all_galaxy_ids, ntrain)
        train_set = set(inds_train)
        remaining_galaxy_ids = [galaxy_id for galaxy_id in all_galaxy_ids if galaxy_id not in train_set]
        inds_test = random.sample(remaining_galaxy_ids, ntest)
        test_set = set(inds_test)
        validation_ids = [galaxy_id for galaxy_id in remaining_galaxy_ids if galaxy_id not in test_set]
        inds_validation = random.sample(validation_ids, nvalidation)

        ind_list = [inds_train, inds_test, inds_validation]
        dataset_names = list(hf.keys())

        for file_end, ind in zip(file_ends, ind_list):
            # the context manager really closes the file (the bare attribute
            # access `f.close` used previously never called the method)
            with h5py.File(part[0]+file_end+part[1], 'w') as f:
                for galaxy_id in ind:
                    # read only the rows of the selected galaxy, so images of
                    # galaxies that are not written are never loaded
                    index_array = np.where(galaxy_id_list == galaxy_id)[0]
                    new_group = f.create_group('galaxy_image/' + str(galaxy_id))
                    for dataset_name in dataset_names:
                        new_group.create_dataset(dataset_name, data=hf[dataset_name][index_array])

In [20]:
# Write the split files using 3 training, 1 testing and 1 validation galaxy ids.
make_hsc_v6_large(ntrain=3,ntest=1,nvalidation=1)

In [21]:
# Print every top-level entry of the combined HDF5 file with its summary line.
# The context manager closes the file even if printing raises.
with h5py.File('/data3/shear_simulated_galaxy/NonUniformPsf_image127x127_with_metadata.hdf5','r') as hf:
    all_items = hf.items()

    # Print the items
    for name, item in all_items:
        print(name)
        print(item)  # This will print the metadata about the group or dataset
        print("-----------")

flux
<HDF5 dataset "flux": shape (15,), type "<f8">
-----------
g1
<HDF5 dataset "g1": shape (15,), type "<f8">
-----------
g2
<HDF5 dataset "g2": shape (15,), type "<f8">
-----------
galaxy_id
<HDF5 dataset "galaxy_id": shape (15,), type "<i8">
-----------
galaxy_image
<HDF5 dataset "galaxy_image": shape (15, 127, 127), type ">f4">
-----------
half_light_radius
<HDF5 dataset "half_light_radius": shape (15,), type "<f8">
-----------
half_light_radius(bulge)
<HDF5 dataset "half_light_radius(bulge)": shape (15,), type "<f8">
-----------
half_light_radius(disk)
<HDF5 dataset "half_light_radius(disk)": shape (15,), type "<f8">
-----------
object_id
<HDF5 dataset "object_id": shape (15,), type "<i8">
-----------
psf_image
<HDF5 dataset "psf_image": shape (15, 127, 127), type ">f4">
-----------
sersic_bulge_n
<HDF5 dataset "sersic_bulge_n": shape (15,), type "<f8">
-----------
sersic_disk_n
<HDF5 dataset "sersic_disk_n": shape (15,), type "<f8">
-----------
sersic_n
<HDF5 dataset "sersic_n":

In [22]:
# Function to recursively print datasets within a group
def print_datasets(group, prefix=""):
    """Print the path (relative to ``group``) of every dataset below ``group``.

    Sub-groups are descended into recursively; ``prefix`` is prepended to each
    printed name and grows by ``"<key>/"`` per level.
    """
    for key in group.keys():
        item = group[key]
        if isinstance(item, h5py.Dataset):
            print(prefix + key)
        elif isinstance(item, h5py.Group):
            print_datasets(item, prefix + key + "/")


# The context manager closes the file even if the traversal raises.
with h5py.File('/data3/shear_simulated_galaxy/NonUniformPsf_image127x127_with_metadata_training.hdf5','r') as hf_test:
    # Access the "galaxy_image" group
    galaxy_image_group = hf_test['galaxy_image']

    # Print the names of all datasets within the group and its subgroups
    print_datasets(galaxy_image_group)

1/flux
1/g1
1/g2
1/galaxy_id
1/galaxy_image
1/half_light_radius
1/half_light_radius(bulge)
1/half_light_radius(disk)
1/object_id
1/psf_image
1/sersic_bulge_n
1/sersic_disk_n
1/sersic_n
1/shift_radius_dx
1/shift_radius_dy
1/snr
2/flux
2/g1
2/g2
2/galaxy_id
2/galaxy_image
2/half_light_radius
2/half_light_radius(bulge)
2/half_light_radius(disk)
2/object_id
2/psf_image
2/sersic_bulge_n
2/sersic_disk_n
2/sersic_n
2/shift_radius_dx
2/shift_radius_dy
2/snr
3/flux
3/g1
3/g2
3/galaxy_id
3/galaxy_image
3/half_light_radius
3/half_light_radius(bulge)
3/half_light_radius(disk)
3/object_id
3/psf_image
3/sersic_bulge_n
3/sersic_disk_n
3/sersic_n
3/shift_radius_dx
3/shift_radius_dy
3/snr
