# Imports 

In [1]:
## Essential Imports: 
import os
import numpy as np
import qp
import tables_io
from pathlib import Path 
from pzflow.examples import get_galaxy_data
import ceci

## RAIL-Specific Imports: 
import rail
from rail.creation.degradation import LSSTErrorModel, InvRedshiftIncompleteness
from rail.creation.engines.flowEngine import FlowModeler, FlowCreator, FlowPosterior
from rail.core.data import TableHandle
from rail.core.stage import RailStage
from rail.core.utilStages import ColumnMapper, TableConverter
from rail.estimation.algos.flexzboost import Inform_FZBoost, FZBoost
from rail.evaluation.evaluator import Evaluator


## Data Storage: 
DS = RailStage.data_store
DS.__class__.allow_overwrite = True

# Model

In [11]:
def makeModel():
    #path to access the data 
    DATA_DIR = Path().resolve() / "data"
    DATA_DIR.mkdir(exist_ok=True)

    catalog_file = DATA_DIR / "base_catalog.pq"

    bands = ['u','g','r','i','z','y']
    band_dict = {band:f'mag_{band}_lsst' for band in bands}
    
    #array of galaxies w/ 7 attributes for each: redshift & ugrizy
    catalog = get_galaxy_data().rename(band_dict, axis=1) 

    #turns array into a table 
    tables_io.write(catalog, str(catalog_file.with_suffix("")), catalog_file.suffix[1:])

    catalog_file = str(catalog_file)
    flow_file = str(DATA_DIR / "trained_flow.pkl")

    print(flow_file)

    #we set up the stage 
    flow_modeler_params = {
        "name": "flow_modeler",
        "input": catalog_file,
        "model": flow_file,
        "seed": 0,
        "phys_cols": {"redshift": [0, 3]},
        "phot_cols": {
            "mag_u_lsst": [17, 35],
            "mag_g_lsst": [16, 32],
            "mag_r_lsst": [15, 30],
            "mag_i_lsst": [15, 30],
            "mag_z_lsst": [14, 29],
            "mag_y_lsst": [14, 28],
        },
        "calc_colors": {"ref_column_name": "mag_i_lsst"},
    }
    flow_modeler = FlowModeler.make_stage(**flow_modeler_params)
    # flow_modeler.fit_model()
    return flow_modeler, flow_file ##.get_handle("model")

In [18]:
modelData, flow_file = makeModel() 

/Users/alicec03/Desktop/Summer_Research/Photo-z-Stress-Test/Photo-z-Stress-Test/data/trained_flow.pkl


# Training Set 

In [16]:
def trainSet(ntrain, seed):
    data = FlowCreator.make_stage(
            name = 'train_set',
            model = flow_file,
            n_samples = ntrain,
            seed = seed 
    )
    return data #.sample(ntrain, seed)

def invRedshift(pivot = 1.0):
    degr = InvRedshiftIncompleteness.make_stage(
        name = 'inv_redshift',
        pivot_redshift = pivot
    )
    return degr #(data)

In [14]:
# data = FlowCreator.make_stage(
#             name = 'train_set',
#             model = flow_file,
#             n_samples = 2,
#             seed = 78 )

Inserting handle into data store.  model: /Users/alicec03/Desktop/Summer_Research/Photo-z-Stress-Test/Photo-z-Stress-Test/data/trained_flow.pkl, train_set


In [31]:
# origTrainData = trainSet(modelData, 100, 372)
# bubble = origTrainData.sample(100, 372)

# degTrainData = invRedshift(1.0)
# dot = degTrainData(bubble)

In [32]:
# degTrainData.get_handle('output')

In [21]:
def getPosts(data, model, grid):
    posts = FlowPosterior.make_stage(
        name='get_posts'+str(data), 
        column='redshift',
        grid = grid,
        model = model,
        data = data
    )
    return posts #posts.get_posterior(data, column = 'redshift')

In [22]:
def makeGrid(zmin, zmax, nbins):
    import numpy as np
    grid = np.linspace(zmin, zmax, nbins + 1)
    return grid 

In [23]:
grid = makeGrid(0, 2.5, 100)

In [36]:
# origTrainPosts = getPosts(origTrainData, modelData, grid)
# degTrainPosts = getPosts(degTrainData, modelData, grid)

## Posts 

Only run if you need output_orig_train_posts

In [37]:
# flow_post_orig_train = FlowPosterior.make_stage(name='orig_train_posts', 
#                                              column='redshift',
#                                              grid = np.linspace(0, 2.5, 101),
#                                              model=flow_file,
#                                              data = orig_train)

# orig_train_pdfs = flow_post_orig_train.get_posterior(orig_train, column='redshift')

Only run if you need output_deg_train_posts ** rerun this cell!! 

In [38]:
# flow_post_deg_train = FlowPosterior.make_stage(name='deg_train_posts', 
#                                              column='redshift',
#                                              grid = np.linspace(0, 2.5, 101),
#                                              model=flow_file,
#                                              err_samples = 0,
#                                              data = deg_train)



# deg_train_pdfs = flow_post_deg_train.get_posterior(deg_train, column='redshift')

# Test Set 

In [76]:
def testSet(ntest, seed):
    data = FlowCreator.make_stage(
            name = 'test_set',
            model = flow_file,
            n_samples = ntest,
            seed = seed 
    )
    return data #.sample(ntest, seed)


## you need to ask alex about where you can find the defaults for these params 

bands = ['u','g','r','i','z','y']
band_dict = {band:f'mag_{band}_lsst' for band in bands}

def lsstError(dict, seed): #tvis = 1, nYrObs = 1, airmass = 1, extendedSource = 1, sigmaSys = 1, magLim = 1, ndFlag = 1, A_min = 1, A_max = 1):
    deg = LSSTErrorModel.make_stage(
        name='lsst_error',
        renameDict= dict, 
        ndFlag=np.nan,
        seed=seed,
    )
    return deg #(data)

In [40]:
# testSetMaker = testSet(modelData, 100, 17)
# testData = testSetMaker.sample(100, 17)
# degTestData = 

## Posts

Only run if you need output_orig_test_posts

In [41]:
# flow_post_orig_test = FlowPosterior.make_stage(name='orig_test_posts', 
#                                              column='redshift',
#                                              grid = np.linspace(0, 2.5, 101),
#                                              model=flow_file,
#                                              data = orig_test)

# orig_test_pdfs = flow_post_orig_test.get_posterior(orig_test, column='redshift')

Only run if you need output_deg_test_posts

In [42]:
# flow_post_deg_test = FlowPosterior.make_stage(name='deg_test_posts', 
#                                              column='redshift',
#                                              grid = np.linspace(0, 2.5, 101),
#                                              model=flow_file,
#                                              data = deg_test)

# deg_test_pdfs = flow_post_deg_test.get_posterior(deg_test, column='redshift')

# Make tables

In [43]:
# def makeTable(datafile):
    
#     bands = ['u','g','r','i','z','y']
#     rename_dict = {f'mag_{band}_lsst_err':f'mag_err_{band}_lsst' for band in bands}

#     col_remapper = ColumnMapper.make_stage(
#     name='col_remapper', 
#     columns=rename_dict,
#     )
#     table_conv = TableConverter.make_stage(
#     name='table_conv', 
#     output_format='numpyDict',
#     )
#     pq = col_remapper(datafile)
#     tabledata = table_conv(pq)
#     table = tables_io.convertObj(tabledata.data, tables_io.types.PD_DATAFRAME)
#     return table


# ## make two separate functions for each stage, make bands, rename_dict inputs 

In [44]:
bands = ['u','g','r','i','z','y']
band_dict_err = {f'mag_{band}_lsst_err':f'mag_err_{band}_lsst' for band in bands}

def colRemapper(dict):
    col_remapper = ColumnMapper.make_stage(
    name='col_remapper', 
    columns=dict,
    )
    return col_remapper

def tableConverter():
    table_conv = TableConverter.make_stage(
    name='table_conv', 
    output_format='numpyDict',
    )
    return table_conv

In [45]:
squiggle = colRemapper(band_dict_err)
noodle = tableConverter()

In [46]:
# trainTable = makeTable(trainData)
# testTable = makeTable(testData)

# Inform & Estimate

In [25]:
def informFZBoost():
    info = Inform_FZBoost.make_stage(
    name ='inform_FZBoost', 
    model ='fzboost.pkl', 
    hdf5_groupname='',
    )
    # info.inform(data)
    return info

In [48]:
# informedEst = informFZBoost()
# informedEst.inform(degTrainData.get_handle('output'))

In [87]:
def estimateFZBoost(info, nbins):
    est = FZBoost.make_stage(
    name='est_FZBoost', 
    nondetect_val=np.nan,
    model= info, #.get_handle('model'), 
    hdf5_groupname='',
    aliases=dict(input='test_data', output='fzboost_estim'),
    nzbins = nbins 
    )
    return est #.estimate(data)

In [50]:
# estData = estimateFZBoost(informedEst, 100)

# estData.estimate(testSetMaker.get_handle('output'))

In [92]:
help(RailStage.connect_input)


Help on function connect_input in module rail.core.stage:

connect_input(self, other, inputTag=None, outputTag=None)
    Connect another stage to this stage as an input
    
    Parameters
    ----------
    other : RailStage
         The stage whose output is being connected
    inputTag : str
         Which input tag of this stage to connect to.  None -> self.inputs[0]
    outputTag : str
         Which output tag of the other stage to connect to.  None -> other.outputs[0]
    
    Returns
    -------
    handle : The input handle for this stage



# Big F

In [99]:
def bigF(pivotz, ntrain, ntest, seed1, seed2, seed3, nbins):
    ##things you need
    grid = makeGrid(0, 2.5, nbins) 
    bands = ['u','g','r','i','z','y']
    band_dict = {band: f"mag_{band}_lsst" for band in bands}
    band_dict_err = {f'mag_{band}_lsst_err':f'mag_err_{band}_lsst' for band in bands}

    # modelData = makeModel()
    
    ##stages 
    trainData = trainSet(ntrain, seed1)
    invRed = invRedshift(pivotz)

    # origTrainPosts = getPosts(output_train_set.pq (???), modelData, grid)
    # degTrainPosts = getPosts(###)

    testData = testSet(ntest, seed2)
    lsstErr = lsstError(band_dict, seed3)

    # origTestPosts = getPosts(###)
    # degTestPosts = getPosts(###)
        
    ## maybe do table things? later tho 

    informFZB = informFZBoost()
    estFZB = estimateFZBoost(informFZB, nbins)

    
    ##pipeline and yml
    pipe = ceci.Pipeline.interactive()
    stages = [
        trainData, 
        invRed, 
        testData, 
        lsstErr,  
        informFZB, 
        estFZB]
    
    for stage in stages:
        pipe.add_stage(stage)
        

    invRed.connect_input(trainData)
    lsstErr.connect_input(testData)

    informFZB.connect_input(invRed)
    estFZB.connect_input(lsstErr)#testData)#informFZB, testData) #lsstErr)

    pipe.initialize(
    dict(model=flow_file), dict(output_dir=".", log_dir=".", resume=False), None) 

    pipe.save("test_pipeline_est.yml") 
   

In [100]:
bigF(1.0, 100, 100, 17, 39, 172, 10)

ValueError: 
Some required inputs to the pipeline could not be found,
(or possibly your pipeline is cyclic):

Stage est_FZBoost is missing input(s): test_data


In [38]:
pr = ceci.Pipeline.read("test_pipeline_est.yml")
pr.run()


Executing test_set
Command is:
OMP_NUM_THREADS=1   python3 -m ceci rail.creation.engines.flowEngine.FlowCreator   --model=/Users/alicec03/Desktop/Summer_Research/Photo-z-Stress-Test/Photo-z-Stress-Test/data/trained_flow.pkl   --name=test_set   --config=test_pipeline_config.yml   --output=./output_test_set.pq 
Output writing to ./test_set.out

Job test_set has completed successfully!

Executing lsst_error
Command is:
OMP_NUM_THREADS=1   python3 -m ceci rail.creation.degradation.lsst_error_model.LSSTErrorModel   --input=./output_test_set.pq   --name=lsst_error   --config=test_pipeline_config.yml   --output=./output_lsst_error.pq 
Output writing to ./lsst_error.out

Job lsst_error has completed successfully!

Executing train_set
Command is:
OMP_NUM_THREADS=1   python3 -m ceci rail.creation.engines.flowEngine.FlowCreator   --model=/Users/alicec03/Desktop/Summer_Research/Photo-z-Stress-Test/Photo-z-Stress-Test/data/trained_flow.pkl   --name=train_set   --config=test_pipeline_config.yml   -

0