In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import random
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
from datetime import datetime
import os

from sklearn.model_selection import train_test_split

Num GPUs Available:  1


In [2]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5000)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
z_max = 2.5

In [4]:
train_set = pd.read_csv('/data/HSC/HSC_v6/3_model_comparison/training_'+str(z_max)+'z_.csv')

In [5]:
train_array = np.asarray(train_set)

In [6]:
n = len(train_array) # number of galaxies

In [7]:
train_sorted = sorted(train_array, key=lambda x: x[2]) # sorted by 'specz_redshift'

In [8]:
train_sorted[0]

array([41601547150321034, 9.99999975e-06, 0.0100999996, 16.9960003,
       'SDSS-DR12-1237656906348626261', 345.33543, -0.925957, True,
       345.33555992089055, -0.9259759774764964,
       '(199519.6875, -52210.6796875, -3333.368408203125)', 94590303,
       22.2296257, 21.982687, 22.1828537, 23.84021, 23.0793934,
       0.00695806509, 0.00607395219, 0.00986235403, 0.0989359617,
       0.113840707], dtype=object)

In [9]:
# stratification of training set
nstrata = 4
stratas = []
cur = z_max/nstrata
pre = 0
for i in range(0,n):
    if train_sorted[i][2] >= cur:
        stratas.append(train_sorted[pre:i])
        pre = i
        cur += z_max/nstrata
    if i == n - 1:
        stratas.append(train_sorted[pre:i])
strata_lengths = []
# randomized to undo sorting
for i in range(0,nstrata):
    random.shuffle(stratas[i])
    strata_lengths.append(len(stratas[i]))
print(strata_lengths) # individual strata sizes

[157485, 50981, 8241, 7430]


In [10]:
nnetwork = 7 # networks in the ensemble minus 1 (6 networks)

In [11]:
print(strata_lengths)

[157485, 50981, 8241, 7430]


In [12]:
def monte_carlo(sample_array):
    sampled = random.sample(sample_array, 1)[0]
    new = sampled
    new[0] = 0
    new[2] = np.random.normal(loc=sampled[2], scale=sampled[1], size=None) # New specz [2] using specz error [1]
    new[12] = np.random.normal(loc=sampled[12], scale=sampled[17], size=None) # New g mag [12] using mag error [17]
    new[13] = np.random.normal(loc=sampled[13], scale=sampled[18], size=None) # New r mag [13] using mag error [18]
    new[14] = np.random.normal(loc=sampled[14], scale=sampled[19], size=None) # New i mag [14] using mag error [19]
    new[15] = np.random.normal(loc=sampled[15], scale=sampled[20], size=None) # New z mag [15] using mag error [20]
    new[16] = np.random.normal(loc=sampled[16], scale=sampled[21], size=None) # New y mag [16] using mag error [21]
    return new

In [13]:
sample_array = [1, 1, 1, 1] # proportionality configurtion

max_array = [max(strata_lengths)] * nstrata

# Sets an array where the elements are the modified strata sizes
sample_size_array = [int(i*j) for i,j in zip(np.true_divide(sample_array,max(sample_array)), max_array)]

# Number of data points generated in each stratum
generating_set_size = np.subtract(sample_size_array,strata_lengths)

In [14]:
generating_set_size

array([     0, 106504, 149244, 150055])

In [15]:
# Sampling training data from each strata

for i in range(0, nstrata):
    for j in range(0, generating_set_size[i]):
        generated = monte_carlo(list(stratas[i]))
        stratas[i].append(generated)

In [16]:
training_data = []
for i in range(0, nnetwork):
    training_data_bin = []
    for j in range(0, nstrata):
        training_data_bin += stratas[j]
    random.shuffle(training_data_bin)
    training_data.append(training_data_bin)

In [17]:
for i in range(1, nnetwork + 1):
    df = pd.DataFrame(training_data[i - 1], columns=['object_id', 'specz_redshift_err', 'specz_redshift', 'specz_mag_i',
       'specz_name', 'specz_ra', 'specz_dec', 'specz_flag_homogeneous', 'ra',
       'dec', 'coord', 'skymap_id', 'g_cmodel_mag', 'r_cmodel_mag',
       'i_cmodel_mag', 'z_cmodel_mag', 'y_cmodel_mag', 'g_cmodel_magsigma',
       'r_cmodel_magsigma', 'i_cmodel_magsigma', 'z_cmodel_magsigma',
       'y_cmodel_magsigma'])
    #df.to_csv('/data/HSC/HSC_v6/3_model_comparison/oversampled_training_set_'+str(z_max)+'z_'+str(sample_array)+'#'+str(i)+'.csv', index=False)

In [18]:
# END

In [19]:
df # TOTAL SIZE FOR OVERSAMPLED 

Unnamed: 0,object_id,specz_redshift_err,specz_redshift,specz_mag_i,specz_name,specz_ra,specz_dec,specz_flag_homogeneous,ra,dec,...,g_cmodel_mag,r_cmodel_mag,i_cmodel_mag,z_cmodel_mag,y_cmodel_mag,g_cmodel_magsigma,r_cmodel_magsigma,i_cmodel_magsigma,z_cmodel_magsigma,y_cmodel_magsigma
0,0,0.001974,1.616345,20.542801,SDSS-DR14-8840719017867976704,359.577450,2.636621,True,359.577471,2.636632,...,20.780803,20.572377,20.357231,20.349388,20.401316,0.002932,0.002974,0.003009,0.004020,0.008708
1,40045205621054702,0.000600,0.204950,18.313000,GAMA-DR2-185194,179.434540,-1.634670,True,179.434561,-1.634690,...,19.256144,18.227737,17.730555,17.473797,17.317680,0.001538,0.000902,0.000592,0.000874,0.001444
2,43747518969839933,0.000150,0.566220,19.355000,SDSS-DR12-1237678618487292570,350.226180,2.117411,True,350.226113,2.117440,...,21.870705,20.339184,19.343140,18.913363,18.697018,0.010627,0.003711,0.002113,0.002533,0.003318
3,0,0.000430,2.185321,21.091000,SDSS-DR12-1237679006093410781,343.685520,5.221148,True,343.685519,5.221183,...,21.758679,21.981128,21.576073,20.971487,21.047967,0.003693,0.007665,0.005231,0.005966,0.013445
4,42296288175199443,0.000110,0.549100,19.702000,SDSS-DR12-1237648721248060098,219.569360,0.002011,True,219.569395,0.002028,...,22.358559,20.748165,19.787872,19.356800,19.163898,0.009529,0.003113,0.001115,0.001496,0.002065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
629935,41153238463942763,0.000160,0.546720,19.284000,SDSS-DR12-1237648721236787887,193.866220,-0.127248,True,193.866229,-0.127246,...,22.420559,20.679976,19.556683,19.157494,18.960735,0.017309,0.003706,0.003430,0.002406,0.004324
629936,74649147239568431,0.000150,0.462030,22.797001,DEEP2DR4uniq_4292,213.921600,52.232005,True,213.921583,52.232013,...,23.768225,23.140442,22.880527,22.687515,22.729708,0.015898,0.016885,0.011619,0.029968,0.077199
629937,0,0.000740,2.409083,21.468000,"SDSS-DR12-1237679254136422981,VIPERS-PDR1-1270...",38.618523,-4.279513,True,38.618488,-4.279488,...,21.687149,21.728382,21.646048,21.450242,21.213031,0.002381,0.003859,0.003649,0.007755,0.009166
629938,0,0.000770,1.786904,19.464001,SDSS-DR12-1237678617411256811,345.050670,1.116826,True,345.050652,1.116876,...,19.984182,19.986213,19.754134,19.631443,19.623455,0.001255,0.001259,0.000827,0.001284,0.002482
