In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import astropy.stats as astrostats
import scipy.stats as stats

In [2]:
def calculate_bias(predictions,true_values):
    
# calculate a systematic bias in delz

# predictions is an array for point estimates of redshifts
# true_values is an array for true values of redshifts

    delz = (predictions-true_values)/(1+true_values)

    bias = sum(delz)/len(delz)
    return bias

In [3]:
def calculate_conv_disp(predictions, true_values):

# calculate the conventional dispersion
    
    delz = (predictions-true_values)/(1+true_values)
    MAD = stats.median_abs_deviation(delz)
    disp = 1.48*MAD
    
    return disp

In [4]:
def calculate_disp (predictions,true_values):
    
# calculate the dispersion using biweight statistics
    
    delz = (predictions-true_values)/(1+true_values)
    filtered_delz = astrostats.sigma_clip(delz, sigma = 3)
    filtered_delz = astrostats.sigma_clip(filtered_delz, sigma = 3)
    filtered_delz = astrostats.sigma_clip(filtered_delz, sigma = 3)
    
    MAD = astrostats.median_absolute_deviation(filtered_delz)
    disp = 1.48*MAD
    
    return disp


In [5]:
def calculate_outlier_rate (predictions, true_values):

#predictions and true_values should have the same length
#the output of this function is the outlier rate of the predictions compared to the true values

    delz = abs(predictions - true_values)-0.15*(1+true_values)
    number_outlier_point = len(np.where(delz > 0)[0])

    outlier_rate_point = number_outlier_point*1.0/len(true_values)

    return outlier_rate_point

In [6]:
def calculate_loss_function(predictions, true_values):
    
    delz = abs(predictions - true_values)-0.15*(1+true_values)
    L = 1-(1/(1+(delz/0.15)**2))
    
    return L

In [7]:
def generate_pdf_random_forest(number_estimator,X_train,y_train,X_test):

#number_estimator is an integer for the number of estimators for the random forest model
#X_train is an array for training set of inputs
#y_train is an array for training set of true values
#X_test is an array for testing set for inputs
#the output of this function is the predictions of pdfs of the X_test.
#output is a N x number_estimator array, where N is the number of elements in X_test. 
    
    randomForest = RandomForestRegressor(n_estimators = number_estimator)
    randomForest.fit(X_train, y_train)
    
    predictions = []
    for i in range (number_estimator):
        predictions.append(
        np.array(
        randomForest.estimators_[i].predict(X_test)))
    
    predictions = np.asarray(predictions).T
    predictions = np.sort(predictions, axis = 1)
    
    return predictions

In [8]:
def calculate_PIT(predictions,true_value):

# predictions is a two dimentional array. Each row is a PDF for the estimation of redshift for a galaxy. 
# y_test is an array containing the true_value of redshift for all the galaxies. 
# the ouput of this function is a PIT distribution for all the PDFs. 

    PIT  = np.zeros(len(predictions))
    
    for i in range (len(predictions)):          
        PIT[i] = len(np.where(predictions[i]<true_value[i])[0])*1.0/len(predictions[0])
        
    return PIT

In [9]:
def calculate_CRPS(predictions,true_values):
    
    length = len(predictions[0])
    crps = np.zeros(len(predictions))
    for i in range (len(predictions)):
        for j in range (200):
            z = 4.0*j/200
            if z < true_values[i]:
                crps[i] += ((len(np.where(predictions[i]<z)[0])*1.0/length)**2)*(4.0/200)
            else:
                crps[i] += ((len(np.where(predictions[i]<z)[0])*1.0/length-1)**2)*(4.0/200)
    return crps

In [10]:
photozdata = pd.read_csv('/data/HSC/HSC_IMAGES_FIXED/HSC_photozdata_full_header_trimmed.csv')

In [11]:
filt =(photozdata['specz_redshift_err'] >0)\
&(photozdata['specz_redshift_err'] < 1)\
&(photozdata["specz_redshift"] > 0)\
&(photozdata["specz_redshift"]<4)

photozdata_subset = photozdata[filt]

In [12]:
y = photozdata_subset['specz_redshift']
X = photozdata_subset[['g_cmodel_mag','r_cmodel_mag','i_cmodel_mag','z_cmodel_mag','y_cmodel_mag']]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
y_test = np.asarray(y_test)

In [13]:
randomForest = RandomForestRegressor()
randomForest.fit(X_train, y_train)

RandomForestRegressor()

In [14]:
y_test_predict = randomForest.predict(X_test)

In [15]:
bias = calculate_bias(y_test_predict,y_test)
disp = calculate_disp(y_test_predict,y_test)
conv_disp = calculate_conv_disp(y_test_predict,y_test)
outlier_rate = calculate_outlier_rate(y_test_predict,y_test)
loss_function = calculate_loss_function(y_test_predict,y_test)


print(bias)
print(disp)
print(conv_disp)
print (outlier_rate)
print (loss_function)

0.03474311913367123
0.0681739241633844
0.08047461860730765
0.2281969740772863
[0.4538905  0.17089263 0.93905724 ... 0.55278074 0.60952835 0.77111103]


In [None]:
predictions = generate_pdf_random_forest(200,X_train,y_train,X_test)

In [None]:
PIT  = calculate_PIT(predictions,y_test)
plt.hist(PIT, bins = 50)

In [None]:
CRPS = calculate_CRPS(predictions,y_test)

In [None]:
plt.hist(CRPS,bins = 50)