In [161]:
import tensorflow as tf
import tensorflow_probability as tfp
import pandas as pd
import random
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from IPython.display import clear_output
from scipy.interpolate import interp1d
from datetime import datetime
import os

import numpy as np
from tensorflow import keras
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
tfd = tfp.distributions

import keras
from keras.models import Sequential,Input,Model
from keras.layers import Dense, Dropout, Flatten

Num GPUs Available:  1


In [162]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5000)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [163]:
photozdata = pd.read_csv('/mnt/data/HSC/HSC_v6/HSC_v6.csv')

In [164]:
# filter & duplicate drop from bayesian_nn
z_max = 2.5
filt = (photozdata['specz_redshift'] < z_max)\
& (photozdata['specz_redshift'] > 0.01)\
& (photozdata['specz_redshift_err'] > 0) \
& (photozdata['specz_redshift_err'] < 1)\
&(photozdata["specz_redshift_err"]<0.005*(1+photozdata["specz_redshift"]))\
&(photozdata['g_cmodel_mag'] >0)\
&(photozdata['r_cmodel_mag'] >0)\
&(photozdata['i_cmodel_mag'] >0)\
&(photozdata['z_cmodel_mag'] >0)\
&(photozdata['y_cmodel_mag'] >0)\
&(photozdata['g_cmodel_mag'] < 100)\
&(photozdata['r_cmodel_mag'] < 100)\
&(photozdata['i_cmodel_mag'] < 100)\
&(photozdata['z_cmodel_mag'] < 100)\
&(photozdata['y_cmodel_mag'] < 100)\
&(photozdata['specz_flag_homogeneous'] == True)
photozdata.drop_duplicates(subset=['object_id'])
photozdata_subset = photozdata[filt]

In [165]:
photozdata.columns

Index(['object_id', 'specz_redshift_err', 'specz_redshift', 'specz_mag_i',
       'specz_name', 'specz_ra', 'specz_dec', 'specz_flag_homogeneous', 'ra',
       'dec', 'coord', 'skymap_id', 'g_cmodel_mag', 'r_cmodel_mag',
       'i_cmodel_mag', 'z_cmodel_mag', 'y_cmodel_mag', 'g_cmodel_magsigma',
       'r_cmodel_magsigma', 'i_cmodel_magsigma', 'z_cmodel_magsigma',
       'y_cmodel_magsigma'],
      dtype='object')

In [166]:
photozdata_trainset, photozdata_testset = train_test_split(photozdata_subset, test_size=0.2) # Already randomized

In [167]:
photozdata_trainset.to_csv('/mnt/data/HSC/3_model_comparison/training_'+str(z_max)+'_v2.csv')
photozdata_testset.to_csv('/mnt/data/HSC/3_model_comparison/testing_'+str(z_max)+'_v2.csv')

In [168]:
train_array = np.asarray(photozdata_trainset)

In [169]:
n = len(train_array)

In [170]:
train_sorted = sorted(train_array, key=lambda x: x[2])

In [171]:
train_sorted[0]

array([41601547150321034, 9.99999975e-06, 0.0100999996, 16.9960003,
       'SDSS-DR12-1237656906348626261', 345.33543, -0.925957, True,
       345.33555992089055, -0.9259759774764964,
       '(199519.6875, -52210.6796875, -3333.368408203125)', 94590303,
       22.2296257, 21.982687, 22.1828537, 23.84021, 23.0793934,
       0.00695806509, 0.00607395219, 0.00986235403, 0.0989359617,
       0.113840707], dtype=object)

In [172]:
nstrata = 4
stratas = []
cur = z_max/nstrata
pre = 0
for i in range(0,n):
    if train_sorted[i][2] >= cur:
        stratas.append(train_sorted[pre:i])
        pre = i
        cur += z_max/nstrata
    if i == n - 1:
        stratas.append(train_sorted[pre:i])
strata_lengths = []
for i in range(0,nstrata):
    random.shuffle(stratas[i])
    strata_lengths.append(len(stratas[i]))

In [173]:
nnetwork = 8

In [174]:
print(strata_lengths)

[157399, 51075, 8292, 7371]


In [175]:
def monte_carlo(sample_array):
    sampled = random.sample(sample_array, 1)[0]
    new = sampled
    new[0] = 0
    new[2] = np.random.normal(loc=sampled[2], scale=sampled[1], size=None) # New specz
    new[12] = np.random.normal(loc=sampled[12], scale=sampled[17], size=None) # New g
    new[13] = np.random.normal(loc=sampled[13], scale=sampled[18], size=None) # New r
    new[14] = np.random.normal(loc=sampled[14], scale=sampled[19], size=None) # New i
    new[15] = np.random.normal(loc=sampled[15], scale=sampled[20], size=None) # New z
    new[16] = np.random.normal(loc=sampled[16], scale=sampled[21], size=None) # New y
    return new

In [176]:
sample_array = [1, 1, 1, 1]
max_array = [max(strata_lengths)] * nstrata
sample_size_array = [int(i*j) for i,j in zip(np.true_divide(sample_array,max(sample_array)), max_array)]
generating_set_size = np.subtract(sample_size_array,strata_lengths)

In [177]:
generating_set_size

array([     0, 106324, 149107, 150028])

In [178]:
# Sampling training data from each strata

for i in range(0, nstrata):
    for j in range(0, generating_set_size[i]):
        generated = monte_carlo(list(stratas[i]))
        stratas[i].append(generated)

In [179]:
training_data = []
for i in range(0, nnetwork):
    training_data_bin = []
    for j in range(0, nstrata):
        training_data_bin += stratas[j]
    random.shuffle(training_data_bin)
    training_data.append(training_data_bin)

In [180]:
for i in range(1, nnetwork + 1):
    df = pd.DataFrame(training_data[i - 1], columns=['object_id', 'specz_redshift_err', 'specz_redshift', 'specz_mag_i',
       'specz_name', 'specz_ra', 'specz_dec', 'specz_flag_homogeneous', 'ra',
       'dec', 'coord', 'skymap_id', 'g_cmodel_mag', 'r_cmodel_mag',
       'i_cmodel_mag', 'z_cmodel_mag', 'y_cmodel_mag', 'g_cmodel_magsigma',
       'r_cmodel_magsigma', 'i_cmodel_magsigma', 'z_cmodel_magsigma',
       'y_cmodel_magsigma'])
    df.to_csv('/mnt/data/HSC/3_model_comparison/training_set_'+str(z_max)+'_'+str(sample_array)+'#'+str(i)+'.csv')

In [181]:
training_data_control = photozdata_trainset

In [182]:
for i in range(1, nnetwork + 1):
    df = pd.DataFrame(training_data_control, columns=['object_id', 'specz_redshift_err', 'specz_redshift', 'specz_mag_i',
       'specz_name', 'specz_ra', 'specz_dec', 'specz_flag_homogeneous', 'ra',
       'dec', 'coord', 'skymap_id', 'g_cmodel_mag', 'r_cmodel_mag',
       'i_cmodel_mag', 'z_cmodel_mag', 'y_cmodel_mag', 'g_cmodel_magsigma',
       'r_cmodel_magsigma', 'i_cmodel_magsigma', 'z_cmodel_magsigma',
       'y_cmodel_magsigma'])
    df.to_csv('/mnt/data/HSC/3_model_comparison/training_set_'+str(z_max)+'_control#'+str(i)+'.csv')

In [183]:
# END