# Imports 

In [47]:
## Essential Imports: 
import os
import numpy as np
import qp
import tables_io
from pathlib import Path 
from pzflow.examples import get_galaxy_data
import ceci

## RAIL-Specific Imports: 
import rail

# old : from rail.creation.degradation import LSSTErrorModel, InvRedshiftIncompleteness


from rail.creation.degradation.lsst_error_model import LSSTErrorModel
from rail.creation.degradation.spectroscopic_degraders import InvRedshiftIncompleteness

import rail.creation 
import rail.creation.engines
from rail.creation.engines.flowEngine import FlowModeler, FlowCreator, FlowPosterior
from rail.core.data import TableHandle
from rail.core.stage import RailStage
from rail.core.utilStages import ColumnMapper, TableConverter

# old : from rail.estimation.algos.flexzboost import Inform_FZBoost, FZBoost

from rail.estimation.algos.pzflow_nf import PZFlowInformer, PZFlowEstimator 
from rail.estimation.algos.flexzboost import FlexZBoostInformer, FlexZBoostEstimator
from rail.estimation.algos.gpz import GPzInformer, GPzEstimator
from rail.estimation.algos.cmnn import CMNNInformer, CMNNEstimator 
from rail.estimation.algos.train_z import TrainZEstimator, TrainZInformer  
# from rail.estimation.algos.k_nearneigh import #Inform_KNearNeighPDF, Inform_KNearNeighPDF 
#from rail.estimation.algos.minisom_som import MiniSOMInformer, MiniSOMEstimator 
#from rail.estimation.algos.sklearn_neurnet import #Inform_SimpleNN, Inform_SimpleNN 
#from rail.estimation.algos.somoclu_som import SOMocluInformer, SOMocluInformer

#from rail.estimation.algos.bpz_lite import BPZliteInformer, BPZliteEstimator


from rail.evaluation.evaluator import Evaluator


## Data Storage: 
DS = RailStage.data_store
DS.__class__.allow_overwrite = True


### CMNN, PZFlow, FlexZBoost, GPZ 

ModuleNotFoundError: No module named 'rail.estimation.algos.knnpz'

In [48]:
help(rail.estimation.algos)

Help on package rail.estimation.algos in rail.estimation:

NAME
    rail.estimation.algos

PACKAGE CONTENTS
    _gpz_util
    bpz_lite
    cmnn
    delightPZ
    equal_count
    flexzboost
    gpz
    k_nearneigh
    minisom_som
    naive_stack
    nz_dir
    point_est_hist
    pzflow_nf
    random_forest
    random_gauss
    sklearn_neurnet
    somoclu_som
    train_z
    uniform_binning
    var_inf

FILE
    (built-in)




In [2]:
help(rail.creation.engines)

Help on package rail.creation.engines in rail.creation:

NAME
    rail.creation.engines

PACKAGE CONTENTS
    dsps_photometry_creator
    dsps_sed_modeler
    flowEngine
    gcr_engine

FILE
    (built-in)




In [3]:
#from rail.stages import *
#rail.stages.import_and_attach_all()
#for val in RailStage.pipeline_stages.values():
#    print(val[0])

# Model

In [2]:
def makeModel():
    #path to access the data 
    DATA_DIR = Path().resolve() / "data"
    DATA_DIR.mkdir(exist_ok=True)

    catalog_file = DATA_DIR / "base_catalog.pq"

    bands = ['u','g','r','i','z','y']
    band_dict = {band:f'mag_{band}_lsst' for band in bands}
    
    #array of galaxies w/ 7 attributes for each: redshift & ugrizy
    catalog = get_galaxy_data().rename(band_dict, axis=1) 

    #turns array into a table 
    tables_io.write(catalog, str(catalog_file.with_suffix("")), catalog_file.suffix[1:])

    catalog_file = str(catalog_file)
    flow_file = str(DATA_DIR / "trained_flow.pkl")

    print(flow_file)

    #we set up the stage 
    flow_modeler_params = {
        "name": "flow_modeler",
        "input": catalog_file,
        "model": flow_file,
        "seed": 0,
        "phys_cols": {"redshift": [0, 3]},
        "phot_cols": {
            "mag_u_lsst": [17, 35],
            "mag_g_lsst": [16, 32],
            "mag_r_lsst": [15, 30],
            "mag_i_lsst": [15, 30],
            "mag_z_lsst": [14, 29],
            "mag_y_lsst": [14, 28],
        },
        "calc_colors": {"ref_column_name": "mag_i_lsst"},
    }
    flow_modeler = FlowModeler.make_stage(**flow_modeler_params)
    # flow_modeler.fit_model()
    return flow_modeler, flow_file ##.get_handle("model")

In [3]:
modelData, flow_file = makeModel() 

/global/u2/a/acraffor/Photo-z-Stress-Test/data/trained_flow.pkl


# Training Set 

In [6]:
def trainSet(ntrain, seed):
    data = FlowCreator.make_stage(
            name = 'train_set',
            model = flow_file,
            n_samples = ntrain,
            seed = seed 
    )
    return data #.sample(ntrain, seed)

def invRedshift(pivot = 1.0):
    degr = InvRedshiftIncompleteness.make_stage(
        name = 'inv_redshift',
        pivot_redshift = pivot
    )
    return degr #(data)

In [7]:
# data = FlowCreator.make_stage(
#             name = 'train_set',
#             model = flow_file,
#             n_samples = 2,
#             seed = 78 )

In [8]:
# origTrainData = trainSet(modelData, 100, 372)
# bubble = origTrainData.sample(100, 372)

# degTrainData = invRedshift(1.0)
# dot = degTrainData(bubble)

In [9]:
# degTrainData.get_handle('output')

In [7]:
def getPosts(data, model, grid):
    posts = FlowPosterior.make_stage(
        name='get_posts'+str(data), 
        column='redshift',
        grid = grid,
        model = model,
        data = data
    )
    return posts #posts.get_posterior(data, column = 'redshift')

In [8]:
def makeGrid(zmin, zmax, nbins):
    import numpy as np
    grid = np.linspace(zmin, zmax, nbins + 1)
    return grid 

In [9]:
grid = makeGrid(0, 2.5, 100)

In [13]:
# origTrainPosts = getPosts(origTrainData, modelData, grid)
# degTrainPosts = getPosts(degTrainData, modelData, grid)

## Posts 

Only run if you need output_orig_train_posts

In [14]:
# flow_post_orig_train = FlowPosterior.make_stage(name='orig_train_posts', 
#                                              column='redshift',
#                                              grid = np.linspace(0, 2.5, 101),
#                                              model=flow_file,
#                                              data = orig_train)

# orig_train_pdfs = flow_post_orig_train.get_posterior(orig_train, column='redshift')

Only run if you need output_deg_train_posts ** rerun this cell!! 

In [15]:
# flow_post_deg_train = FlowPosterior.make_stage(name='deg_train_posts', 
#                                              column='redshift',
#                                              grid = np.linspace(0, 2.5, 101),
#                                              model=flow_file,
#                                              err_samples = 0,
#                                              data = deg_train)



# deg_train_pdfs = flow_post_deg_train.get_posterior(deg_train, column='redshift')

# Test Set 

In [10]:
def testSet(ntest, seed):
    data = FlowCreator.make_stage(
            name = 'test_set',
            model = flow_file,
            n_samples = ntest,
            seed = seed 
    )
    return data #.sample(ntest, seed)


## you need to ask alex about where you can find the defaults for these params 

bands = ['u','g','r','i','z','y']
band_dict = {band:f'mag_{band}_lsst' for band in bands}

def lsstError(dict, seed): #tvis = 1, nYrObs = 1, airmass = 1, extendedSource = 1, sigmaSys = 1, magLim = 1, ndFlag = 1, A_min = 1, A_max = 1):
    deg = LSSTErrorModel.make_stage(
        name='lsst_error',
        renameDict= dict, 
        ndFlag=np.nan,
        seed=seed,
    )
    return deg #(data)

In [11]:
# testSetMaker = testSet(modelData, 100, 17)
# testData = testSetMaker.sample(100, 17)
# degTestData = 

## Posts

Only run if you need output_orig_test_posts

In [18]:
# flow_post_orig_test = FlowPosterior.make_stage(name='orig_test_posts', 
#                                              column='redshift',
#                                              grid = np.linspace(0, 2.5, 101),
#                                              model=flow_file,
#                                              data = orig_test)

# orig_test_pdfs = flow_post_orig_test.get_posterior(orig_test, column='redshift')

Only run if you need output_deg_test_posts

In [19]:
# flow_post_deg_test = FlowPosterior.make_stage(name='deg_test_posts', 
#                                              column='redshift',
#                                              grid = np.linspace(0, 2.5, 101),
#                                              model=flow_file,
#                                              data = deg_test)

# deg_test_pdfs = flow_post_deg_test.get_posterior(deg_test, column='redshift')

# Make tables

In [20]:
# def makeTable(datafile):
    
#     bands = ['u','g','r','i','z','y']
#     rename_dict = {f'mag_{band}_lsst_err':f'mag_err_{band}_lsst' for band in bands}

#     col_remapper = ColumnMapper.make_stage(
#     name='col_remapper', 
#     columns=rename_dict,
#     )
#     table_conv = TableConverter.make_stage(
#     name='table_conv', 
#     output_format='numpyDict',
#     )
#     pq = col_remapper(datafile)
#     tabledata = table_conv(pq)
#     table = tables_io.convertObj(tabledata.data, tables_io.types.PD_DATAFRAME)
#     return table


# ## make two separate functions for each stage, make bands, rename_dict inputs 

In [12]:
bands = ['u','g','r','i','z','y']
band_dict_err = {f'mag_{band}_lsst_err':f'mag_err_{band}_lsst' for band in bands}

def colRemapper(dict):
    col_remap = ColumnMapper.make_stage(
    name='col_remapper', 
    columns=dict,
    )
    return col_remap

def tableConverter():
    table_conv = TableConverter.make_stage(
    name='table_conv', 
    output_format='numpyDict',
    )
    return table_conv

In [13]:
col_remap = colRemapper(band_dict_err)
table_conv = tableConverter()

In [23]:
# squiggle = colRemapper(band_dict_err)
# noodle = tableConverter()

In [24]:
# trainTable = makeTable(trainData)
# testTable = makeTable(testData)

# Inform & Estimate

In [14]:
def informFZBoost():
    info = Inform_FZBoost.make_stage(
    name ='inform_FZBoost', 
    model ='fzboost.pkl', 
    hdf5_groupname='',
    )
    # info.inform(data)
    return info

In [15]:
# informedEst = informFZBoost()
# informedEst.inform(degTrainData.get_handle('output'))

In [16]:
def estimateFZBoost(info, nbins):
    est = FZBoost.make_stage(
    name='est_FZBoost', 
    nondetect_val=np.nan,
    model= info, #.get_handle('model'), 
    hdf5_groupname='',
    aliases=dict(input='test_data', output='fzboost_estim'),
    nzbins = nbins 
    )
    return est #.estimate(data)

In [28]:
# estData = estimateFZBoost(informedEst, 100)

# estData.estimate(testSetMaker.get_handle('output'))

In [29]:
from rail.core import RailStage

help(RailStage.connect_input)

Help on function connect_input in module rail.core.stage:

connect_input(self, other, inputTag=None, outputTag=None)
    Connect another stage to this stage as an input
    
    Parameters
    ----------
    other : RailStage
         The stage whose output is being connected
    inputTag : str
         Which input tag of this stage to connect to.  None -> self.inputs[0]
    outputTag : str
         Which output tag of the other stage to connect to.  None -> other.outputs[0]
    
    Returns
    -------
    handle : The input handle for this stage



In [30]:
def informPZFlow():
    inf = PZFlowInformer.make_stage(
    name = 'inform_PZFlow',
    model = 'pzflow.pkl',
    hdf5_groupname=""
    )
    return inf

def estimatePZFlow(info):
    est = PZFlowEstimator.make_stage(
    name = 'estimate_PZFlow',
    model = 'pzflow.pkl', #info.get_handle('model'),
    hdf5_groupname=""
    )
    return est

In [31]:
#help(PZFlowEstimator)

# Survey-Based Degraders

In [32]:
from rail.creation.degradation.spectroscopic_selections import * 

In [None]:
rail.creation.degradation.spectroscopic_selections.SpecSelection_BOSS
rail.creation.degradation.spectroscopic_selections.SpecSelection_DEEP2
rail.creation.degradation.spectroscopic_selections.SpecSelection_GAMA
rail.creation.degradation.spectroscopic_selections.SpecSelection_HSC
rail.creation.degradation.spectroscopic_selections.SpecSelection_VVDSf02 
rail.creation.degradation.spectroscopic_selections.SpecSelection_zCOSMOS

In [37]:
def specSelectBOSS(ntrain):
    degr = SpecSelection_BOSS.make_stage(
        name = 'specselection_boss',
        N_tot = ntrain
    )
    return degr 

In [38]:
def specSelectDEEP2(ntrain):
    degr = SpecSelection_DEEP2.make_stage(
        name = 'specselection_deep2',
        N_tot = ntrain
    )
    return degr 

In [39]:
def specSelectGAMA(ntrain):
    degr = SpecSelection_GAMA.make_stage(
        name = 'specselection_gama',
        N_tot = ntrain
    )
    return degr 

In [40]:
def specSelectHSC(ntrain):
    degr = SpecSelection_HSC.make_stage(
        name = 'specselection_HSC',
        N_tot = ntrain
    )
    return degr 

In [41]:
def specSelectVVDSf02(ntrain):
    degr = SpecSelection_VVDSf02.make_stage(
        name = 'specselection_VVDSf02',
        N_tot = ntrain
    )
    return degr 

In [42]:
def specSelectzCOSMOS(ntrain):
    degr = SpecSelection_zCOSMOS.make_stage(
        name = 'specselection_zCOSMOS',
        N_tot = ntrain
    )
    return degr 

# Big F 1.0

In [43]:
def bigF1(pivotz, ntrain, ntest, seed1, seed2, seed3, nbins):
    
    ##things you need
    #grid = makeGrid(0, 2.5, nbins) 
    bands = ['u','g','r','i','z','y']
    band_dict = {band: f"mag_{band}_lsst" for band in bands}
    band_dict_err = {f'mag_{band}_lsst_err':f'mag_err_{band}_lsst' for band in bands}

    # modelData = makeModel()
    
    ##stages 
    trainData = trainSet(ntrain, seed1)
    invRed = invRedshift(pivotz)

    # origTrainPosts = getPosts(output_train_set.pq (???), modelData, grid)
    # degTrainPosts = getPosts(###)

    testData = testSet(ntest, seed2)
    lsstErr = lsstError(band_dict, seed3)

    # origTestPosts = getPosts(###)
    # degTestPosts = getPosts(###)

    # informFZB = informFZBoost()
    # estFZB = estimateFZBoost(informFZB, nbins)

    infPZFlow = informPZFlow()
    estPZFlow = estimatePZFlow(infPZFlow)

    
    ##pipeline and yml
    pipe = ceci.Pipeline.interactive()
    stages = [
        trainData, 
        invRed, 
        testData, 
        lsstErr,  
        infPZFlow, 
        estPZFlow]
        #informFZB, 
        #estFZB]
    
    for stage in stages:
        pipe.add_stage(stage)
        

    invRed.connect_input(trainData)
    lsstErr.connect_input(testData)

    infPZFlow.connect_input(invRed) 
    estPZFlow.connect_input(infPZFlow, inputTag = 'model')
    estPZFlow.connect_input(lsstErr, inputTag = 'input') ## trucated out of docs :(

    # informFZB.connect_input(invRed)
    # estFZB.connect_input(informFZB, lsstErr) 
    
    pipe.initialize(
    dict(model=flow_file), dict(output_dir=".", log_dir=".", resume=False), None) 

    outpath = os.path.join(path_1, "invz='% s'_lsstErr_pzflow.yml" % '%.3f'%(pivotz))
    pipe.save(outpath)
    return outpath 
   

# Big F 2.0

In [44]:
def bigF2(ntrain, ntest, seed1, seed2, seed3, nbins):
    
    ##things you need
    #grid = makeGrid(0, 2.5, nbins) 
    bands = ['u','g','r','i','z','y']
    band_dict = {band: f"mag_{band}_lsst" for band in bands}
    band_dict_err = {f'mag_{band}_lsst_err':f'mag_err_{band}_lsst' for band in bands}

    # modelData = makeModel()
    
    ##stages 
    
    trainData = trainSet(ntrain, seed1)
    
    deg_ls = [specSelectBOSS(ntrain), 
          specSelectDEEP2(ntrain), 
          specSelectGAMA(ntrain), 
          specSelectHSC(ntrain), 
          specSelectVVDSf02(ntrain), 
          specSelectzCOSMOS(ntrain)]
    
    name_ls = ['BOSS', 'DEEP2', 'GAMA', 'HSC', 'VVDSf02', 'zCOSMOS']

    for i in range(len(deg_ls)):
        
        deg = deg_ls[i]

        testData = testSet(ntest, seed2)
        lsstErr = lsstError(band_dict, seed3)

        infPZFlow = informPZFlow()
        estPZFlow = estimatePZFlow(infPZFlow)


        ##pipeline and yml
        pipe = ceci.Pipeline.interactive()
        stages = [
            trainData, 
            deg, 
            testData, 
            lsstErr,  
            infPZFlow, 
            estPZFlow]
            #informFZB, 
            #estFZB]

        for stage in stages:
            pipe.add_stage(stage)


        deg.connect_input(trainData)
        lsstErr.connect_input(testData)

        infPZFlow.connect_input(deg) 
        estPZFlow.connect_input(infPZFlow, inputTag = 'model')
        estPZFlow.connect_input(lsstErr, inputTag = 'input') ## trucated out of docs :(

        # informFZB.connect_input(invRed)
        # estFZB.connect_input(informFZB, lsstErr) 

        pipe.initialize(
        dict(model=flow_file), dict(output_dir=".", log_dir=".", resume=False), None) 

        outpath = os.path.join(path_2, "'% s'_lsstErr_pzflow.yml" % name_ls[i])
        pipe.save(outpath)
        return outpath 

In [34]:
# help(rail.creation.degradation.spectroscopic_selections)

In [46]:
##run 

path_lst_2 = []
directory = "specSelection_lsstErr_pzflow"
parent_dir = "/global/u2/a/acraffor/Photo-z-Stress-Test/Pipeline_Outputs"
path_2 = os.path.join(parent_dir, directory)
os.makedirs(path_2)

path_lst_2.append(bigF2(100000, 100000, 17, 39, 172, 10))

NameError: name 'informPZFlow' is not defined

# Choose Pivot z's

In [33]:
import numpy as np

## seed1 and ndata should be the same as  seed1 and ntrain used to call bigF!! 
## Otherwise this might not be representative of the real data 

def choosePivots(seed1, ndata):
    nums = trainSet(ndata, seed1)
    data = nums.sample(ndata, seed1)
    data_pq = col_remap(data)
    data_table = table_conv(data_pq)
    table = tables_io.convertObj(data_table.data, tables_io.types.PD_DATAFRAME)
    return np.asarray(table['redshift'])

In [34]:
percentiles = np.arange(10, 100, 10)
pivots = [] 

for i in percentiles:
    pivot = np.percentile(choosePivots(17, 100000), i) 
    pivots.append(pivot)

Inserting handle into data store.  model: /global/u2/a/acraffor/Photo-z-Stress-Test/data/trained_flow.pkl, train_set
Inserting handle into data store.  output_train_set: inprogress_output_train_set.pq, train_set
Inserting handle into data store.  output_col_remapper: inprogress_output_col_remapper.pq, col_remapper
Inserting handle into data store.  output_table_conv: inprogress_output_table_conv.hdf5, table_conv
Inserting handle into data store.  output_train_set: inprogress_output_train_set.pq, train_set
Inserting handle into data store.  output_col_remapper: inprogress_output_col_remapper.pq, col_remapper
Inserting handle into data store.  output_table_conv: inprogress_output_table_conv.hdf5, table_conv
Inserting handle into data store.  output_train_set: inprogress_output_train_set.pq, train_set
Inserting handle into data store.  output_col_remapper: inprogress_output_col_remapper.pq, col_remapper
Inserting handle into data store.  output_table_conv: inprogress_output_table_conv.hdf

In [35]:
print(pivots)

[0.33672517538070684, 0.47006111145019536, 0.6267686605453491, 0.8275491118431091, 1.0106754302978516, 1.2042927742004392, 1.4413679003715512, 1.6783331394195558, 1.9954649806022646]


# Run Big F 1.0

In [37]:
path_lst_1 = []
directory = "invz_lsstErr_pzflow"
parent_dir = "/global/u2/a/acraffor/Photo-z-Stress-Test/Pipeline_Outputs"
path_1 = os.path.join(parent_dir, directory)
os.makedirs(path_1)

for i in (pivots):
    path_lst.append(bigF(i, 100000, 100000, 17, 39, 172, 10))


Inserting handle into data store.  output_test_set: inprogress_output_test_set.pq, test_set
Inserting handle into data store.  output_inv_redshift: inprogress_output_inv_redshift.pq, inv_redshift
Inserting handle into data store.  model_inform_PZFlow: inprogress_pzflow.pkl, inform_PZFlow
Inserting handle into data store.  output_lsst_error: inprogress_output_lsst_error.pq, lsst_error


In [38]:
print(path_lst_1)

["/global/u2/a/acraffor/Photo-z-Stress-Test/Pipeline_Outputs/invz_lsstErr_pzflow/invz='0.33672517538070684'_lsstErr_pzflow.yml", "/global/u2/a/acraffor/Photo-z-Stress-Test/Pipeline_Outputs/invz_lsstErr_pzflow/invz='0.47006111145019536'_lsstErr_pzflow.yml", "/global/u2/a/acraffor/Photo-z-Stress-Test/Pipeline_Outputs/invz_lsstErr_pzflow/invz='0.6267686605453491'_lsstErr_pzflow.yml", "/global/u2/a/acraffor/Photo-z-Stress-Test/Pipeline_Outputs/invz_lsstErr_pzflow/invz='0.8275491118431091'_lsstErr_pzflow.yml", "/global/u2/a/acraffor/Photo-z-Stress-Test/Pipeline_Outputs/invz_lsstErr_pzflow/invz='1.0106754302978516'_lsstErr_pzflow.yml", "/global/u2/a/acraffor/Photo-z-Stress-Test/Pipeline_Outputs/invz_lsstErr_pzflow/invz='1.2042927742004392'_lsstErr_pzflow.yml", "/global/u2/a/acraffor/Photo-z-Stress-Test/Pipeline_Outputs/invz_lsstErr_pzflow/invz='1.4413679003715512'_lsstErr_pzflow.yml", "/global/u2/a/acraffor/Photo-z-Stress-Test/Pipeline_Outputs/invz_lsstErr_pzflow/invz='1.6783331394195558'_ls

In [39]:
pr = ceci.Pipeline.read(path_lst_1[0])#parent_dir+directory+"/invz=0.33672517538070684_lsstErr_pzflow.yml")
pr.run()

## 1) terminal: go to path up to invz_lsstErr_pzflow, then run these 2 lines 
## 2)  make list/txt file with list of paths to files made by big F

## do 1) 
## open virtual env
## python 
## import ceci 
## run the 2 lines of code above 


### at the end we can put this into a .py file that we can run at the command line 

## %cd ? 

ConstructorError: could not determine a constructor for the tag 'tag:yaml.org,2002:python/object/apply:numpy.core.multiarray.scalar'
  in "/global/u2/a/acraffor/Photo-z-Stress-Test/Pipeline_Outputs/invz_lsstErr_pzflow/invz='0.33672517538070684'_lsstErr_pzflow_config.yml", line 76, column 19

In [None]:
## more config parameters/better config parameters
## have to give path above to estimator model instead of get_handle('model')
## fix truncated parameter printing in help(...)