# Example of building a MLDataSet


## Building a Features MLDataSet from a Table

In [1]:
from PrimalCore.heterogeneous_table.table import Table
from ElementsKernel.Path import getPathFromEnvVariable

In [2]:
# Look up the test FITS catalog through getPathFromEnvVariable, passing the
# file's relative path and the name of the ELEMENTS_AUX_PATH variable.
ph_catalog = getPathFromEnvVariable(
    'PrimalCore/test_table.fits',
    'ELEMENTS_AUX_PATH',
)

In [3]:
# Read extension 0 of the FITS file into a Table; the log printed below
# reports the resulting size (1000 rows, 124 columns).
catalog=Table.from_fits_file(ph_catalog,fits_ext=0)

| input data built
| data Rows,Cols 1000 124


In [4]:
# Restrict the catalog to the flux columns plus a few flag/label columns.
# regex=True is passed so that entries such as 'FLUX*' act as patterns.
columns_to_keep = [
    'FLUX*',
    'reliable_S15',
    'STAR',
    'AGN',
    'MASKED',
    'FLAG_PHOT',
]
catalog.keep_columns(columns_to_keep, regex=True)

First we import the classes and the functions we need

In [5]:
from PrimalCore.homogeneous_table.dataset import MLDataSet

In [6]:
# Build the features dataset from the Table whose columns were filtered above;
# the log printed below reports 1000 rows and 50 feature columns.
dataset=MLDataSet.new_from_table(catalog)

| building features
| features built
| Rows,Cols 1000 50


In [7]:
# List the names of the features held by the dataset.
# The parenthesised print works under both Python 2 and Python 3.
print(dataset.features_names)

['FLUX_G_1', 'FLUX_G_2', 'FLUX_G_3', 'FLUX_R_1', 'FLUX_R_2', 'FLUX_R_3', 'FLUX_I_1', 'FLUX_I_2', 'FLUX_I_3', 'FLUX_VIS', 'FLUX_Z_1', 'FLUX_Z_2', 'FLUX_Z_3', 'FLUX_Y_1', 'FLUX_Y_2', 'FLUX_Y_3', 'FLUX_J_1', 'FLUX_J_2', 'FLUX_J_3', 'FLUX_H_1', 'FLUX_H_2', 'FLUX_H_3', 'FLUXERR_G_1', 'FLUXERR_G_2', 'FLUXERR_G_3', 'FLUXERR_R_1', 'FLUXERR_R_2', 'FLUXERR_R_3', 'FLUXERR_I_1', 'FLUXERR_I_2', 'FLUXERR_I_3', 'FLUXERR_VIS', 'FLUXERR_Z_1', 'FLUXERR_Z_2', 'FLUXERR_Z_3', 'FLUXERR_Y_1', 'FLUXERR_Y_2', 'FLUXERR_Y_3', 'FLUXERR_J_1', 'FLUXERR_J_2', 'FLUXERR_J_3', 'FLUXERR_H_1', 'FLUXERR_H_2', 'FLUXERR_H_3', 'FLAG_PHOT', 'FLUX_RADIUS_DETECT', 'MASKED', 'reliable_S15', 'STAR', 'AGN']


In [8]:
# Show the original entry IDs for positions 1..9 (the slice starts at index 1,
# so the very first entry is not displayed).
# The parenthesised print works under both Python 2 and Python 3.
print(dataset.features_original_entry_ID[1:10])

[1 2 3 4 5 6 7 8 9]


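The original entry ID of each row is stored in `features_original_entry_ID`, separately from the feature columns, and in this way it **safely** cannot be used as a feature.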


## Building a Features MLDataSet from a FITS file

In [9]:
# Build the dataset directly from the FITS file: the factory reads extension 0
# and uses only the columns matching the given names/patterns (regex=True).
dataset_from_file = MLDataSet.new_from_fits_file(
    ph_catalog,
    fits_ext=0,
    use_col_names_list=['FLUX*', 'reliable_S15', 'STAR', 'AGN', 'MASKED', 'FLAG_PHOT'],
    regex=True,
)

| input data built
| data Rows,Cols 1000 124
| building features
| features built
| Rows,Cols 1000 50


In [10]:
# List the feature names of the dataset built straight from the FITS file.
# The parenthesised print works under both Python 2 and Python 3.
print(dataset_from_file.features_names)

['FLUX_G_1', 'FLUX_G_2', 'FLUX_G_3', 'FLUX_R_1', 'FLUX_R_2', 'FLUX_R_3', 'FLUX_I_1', 'FLUX_I_2', 'FLUX_I_3', 'FLUX_VIS', 'FLUX_Z_1', 'FLUX_Z_2', 'FLUX_Z_3', 'FLUX_Y_1', 'FLUX_Y_2', 'FLUX_Y_3', 'FLUX_J_1', 'FLUX_J_2', 'FLUX_J_3', 'FLUX_H_1', 'FLUX_H_2', 'FLUX_H_3', 'FLUXERR_G_1', 'FLUXERR_G_2', 'FLUXERR_G_3', 'FLUXERR_R_1', 'FLUXERR_R_2', 'FLUXERR_R_3', 'FLUXERR_I_1', 'FLUXERR_I_2', 'FLUXERR_I_3', 'FLUXERR_VIS', 'FLUXERR_Z_1', 'FLUXERR_Z_2', 'FLUXERR_Z_3', 'FLUXERR_Y_1', 'FLUXERR_Y_2', 'FLUXERR_Y_3', 'FLUXERR_J_1', 'FLUXERR_J_2', 'FLUXERR_J_3', 'FLUXERR_H_1', 'FLUXERR_H_2', 'FLUXERR_H_3', 'FLUX_RADIUS_DETECT', 'reliable_S15', 'STAR', 'AGN', 'MASKED', 'FLAG_PHOT']


## Columns selection
### using  `use_col_names_list` in the factories

In [11]:
# Select the columns at construction time: the factory receives the list of
# names/patterns to use, with regex=True enabling pattern matching.
dataset = MLDataSet.new_from_table(
    catalog,
    use_col_names_list=['FLUX*', 'reliable_S15', 'STAR', 'AGN', 'MASKED', 'FLAG_PHOT'],
    regex=True,
)

| building features
| features built
| Rows,Cols 1000 50


In [12]:
# List the feature names obtained with the use_col_names_list selection.
# The parenthesised print works under both Python 2 and Python 3.
print(dataset.features_names)

['FLUX_G_1', 'FLUX_G_2', 'FLUX_G_3', 'FLUX_R_1', 'FLUX_R_2', 'FLUX_R_3', 'FLUX_I_1', 'FLUX_I_2', 'FLUX_I_3', 'FLUX_VIS', 'FLUX_Z_1', 'FLUX_Z_2', 'FLUX_Z_3', 'FLUX_Y_1', 'FLUX_Y_2', 'FLUX_Y_3', 'FLUX_J_1', 'FLUX_J_2', 'FLUX_J_3', 'FLUX_H_1', 'FLUX_H_2', 'FLUX_H_3', 'FLUXERR_G_1', 'FLUXERR_G_2', 'FLUXERR_G_3', 'FLUXERR_R_1', 'FLUXERR_R_2', 'FLUXERR_R_3', 'FLUXERR_I_1', 'FLUXERR_I_2', 'FLUXERR_I_3', 'FLUXERR_VIS', 'FLUXERR_Z_1', 'FLUXERR_Z_2', 'FLUXERR_Z_3', 'FLUXERR_Y_1', 'FLUXERR_Y_2', 'FLUXERR_Y_3', 'FLUXERR_J_1', 'FLUXERR_J_2', 'FLUXERR_J_3', 'FLUXERR_H_1', 'FLUXERR_H_2', 'FLUXERR_H_3', 'FLUX_RADIUS_DETECT', 'reliable_S15', 'STAR', 'AGN', 'MASKED', 'FLAG_PHOT']


### using dataset_handler functions

In [13]:
from PrimalCore.homogeneous_table.dataset_handler import drop_features
from PrimalCore.homogeneous_table.dataset_handler import keep_features

In [14]:
# Remove from the dataset (in place) the features selected by the pattern
# below, then display the names of the features that are left.
patterns_to_drop = ['FLUX*1*']
drop_features(dataset, patterns_to_drop)
dataset.features_names

| features initial Rows,Cols= 1000 50
| removing features ['FLUX_G_1', 'FLUX_R_1', 'FLUX_I_1', 'FLUX_Z_1', 'FLUX_Y_1', 'FLUX_J_1', 'FLUX_H_1', 'FLUXERR_G_1', 'FLUXERR_R_1', 'FLUXERR_I_1', 'FLUXERR_Z_1', 'FLUXERR_Y_1', 'FLUXERR_J_1', 'FLUXERR_H_1']
| features final Rows,Cols= 1000 36



['FLUX_G_2',
 'FLUX_G_3',
 'FLUX_R_2',
 'FLUX_R_3',
 'FLUX_I_2',
 'FLUX_I_3',
 'FLUX_VIS',
 'FLUX_Z_2',
 'FLUX_Z_3',
 'FLUX_Y_2',
 'FLUX_Y_3',
 'FLUX_J_2',
 'FLUX_J_3',
 'FLUX_H_2',
 'FLUX_H_3',
 'FLUXERR_G_2',
 'FLUXERR_G_3',
 'FLUXERR_R_2',
 'FLUXERR_R_3',
 'FLUXERR_I_2',
 'FLUXERR_I_3',
 'FLUXERR_VIS',
 'FLUXERR_Z_2',
 'FLUXERR_Z_3',
 'FLUXERR_Y_2',
 'FLUXERR_Y_3',
 'FLUXERR_J_2',
 'FLUXERR_J_3',
 'FLUXERR_H_2',
 'FLUXERR_H_3',
 'FLUX_RADIUS_DETECT',
 'reliable_S15',
 'STAR',
 'AGN',
 'MASKED',
 'FLAG_PHOT']

In [15]:
# Keep only the features matching 'FLUX*2*' (pattern matching enabled with
# regex=True); every other feature is removed from the dataset in place.
keep_features(dataset, ['FLUX*2*'], regex=True)
# The parenthesised print works under both Python 2 and Python 3.
print(dataset.features_names)

| features initial Rows,Cols= 1000 36
| removing features ['FLUX_G_3', 'FLUX_R_3', 'FLUX_I_3', 'FLUX_VIS', 'FLUX_Z_3', 'FLUX_Y_3', 'FLUX_J_3', 'FLUX_H_3', 'FLUXERR_G_3', 'FLUXERR_R_3', 'FLUXERR_I_3', 'FLUXERR_VIS', 'FLUXERR_Z_3', 'FLUXERR_Y_3', 'FLUXERR_J_3', 'FLUXERR_H_3', 'FLUX_RADIUS_DETECT', 'reliable_S15', 'STAR', 'AGN', 'MASKED', 'FLAG_PHOT']
| features final Rows,Cols= 1000 14

['FLUX_G_2', 'FLUX_R_2', 'FLUX_I_2', 'FLUX_Z_2', 'FLUX_Y_2', 'FLUX_J_2', 'FLUX_H_2', 'FLUXERR_G_2', 'FLUXERR_R_2', 'FLUXERR_I_2', 'FLUXERR_Z_2', 'FLUXERR_Y_2', 'FLUXERR_J_2', 'FLUXERR_H_2']


## Adding features

In [16]:
from PrimalCore.homogeneous_table.dataset_handler import add_features

# Derive a new feature as the square of the existing 'FLUXERR_H_2' feature,
# append it to the dataset under the name 'test', and show the updated names.
flux_err_h2 = dataset.get_feature_by_name('FLUXERR_H_2')
test_feature = flux_err_h2 ** 2
add_features(dataset, 'test', test_feature)
dataset.features_names

['FLUX_G_2',
 'FLUX_R_2',
 'FLUX_I_2',
 'FLUX_Z_2',
 'FLUX_Y_2',
 'FLUX_J_2',
 'FLUX_H_2',
 'FLUXERR_G_2',
 'FLUXERR_R_2',
 'FLUXERR_I_2',
 'FLUXERR_Z_2',
 'FLUXERR_Y_2',
 'FLUXERR_J_2',
 'FLUXERR_H_2',
 'test']

Or we can add a 2-dimensional array of features, with one column per feature name


In [17]:
# numpy is not imported by any earlier cell of this notebook, so import it here;
# without it this cell raises NameError on a fresh kernel.
import numpy as np

# Five all-zero columns, with as many rows as the dataset has entries.
test_feature_2dim = np.zeros((dataset.features_N_rows, 5))
# One name per column of the array above.
test_feature_2dim_names = ['a', 'b', 'c', 'd', 'e']
add_features(dataset, test_feature_2dim_names, test_feature_2dim)
dataset.features_names

['FLUX_G_2',
 'FLUX_R_2',
 'FLUX_I_2',
 'FLUX_Z_2',
 'FLUX_Y_2',
 'FLUX_J_2',
 'FLUX_H_2',
 'FLUXERR_G_2',
 'FLUXERR_R_2',
 'FLUXERR_I_2',
 'FLUXERR_Z_2',
 'FLUXERR_Y_2',
 'FLUXERR_J_2',
 'FLUXERR_H_2',
 'test',
 'a',
 'b',
 'c',
 'd',
 'e']

We can think of a more meaningful example: suppose we want to add flux ratios. Let's start by defining the lists of
contiguous bands used for the flux ratio evaluation

In [18]:
# Band pairs for the flux ratios, written as (list_1 entry, list_2 entry).
# The first six pairs are neighbouring bands; the last three pair FLUX_VIS
# with the Y, J and H bands.
band_pairs = [
    ('FLUX_R_2', 'FLUX_G_2'),
    ('FLUX_I_2', 'FLUX_R_2'),
    ('FLUX_Z_2', 'FLUX_I_2'),
    ('FLUX_Y_2', 'FLUX_Z_2'),
    ('FLUX_J_2', 'FLUX_Y_2'),
    ('FLUX_H_2', 'FLUX_J_2'),
    ('FLUX_Y_2', 'FLUX_VIS'),
    ('FLUX_J_2', 'FLUX_VIS'),
    ('FLUX_H_2', 'FLUX_VIS'),
]
# Split the pairs back into the two parallel lists used by the cells below.
flux_bands_list_1, flux_bands_list_2 = map(list, zip(*band_pairs))

In [19]:
from PrimalCore.phz_tools.photometry import FluxRatio

In [20]:
# For every band pair whose two features are both in the dataset, build a
# FluxRatio named 'F_<list_2 band>-<list_1 band>' and append its values to
# the dataset as a new feature.
for band_1, band_2 in zip(flux_bands_list_1, flux_bands_list_2):
    # The band label is the second '_'-separated token, e.g. 'FLUX_G_2' -> 'G'.
    label_1 = band_1.split('_')[1]
    label_2 = band_2.split('_')[1]
    # Skip pairs involving a feature that is no longer in the dataset
    # (FLUX_VIS was removed by the earlier keep_features call).
    if band_1 not in dataset.features_names or band_2 not in dataset.features_names:
        continue
    ratio = FluxRatio('F_%s' % (label_2 + '-' + label_1), band_1, band_2, features=dataset)
    add_features(dataset, ratio.name, ratio.values)

  return features.get_feature_by_name(band_2)/features.get_feature_by_name(band_1)
  return features.get_feature_by_name(band_2)/features.get_feature_by_name(band_1)


In [21]:
# Display the feature names again: the flux ratios added by the loop above
# show up at the end of the list as 'F_<band>-<band>' entries.
dataset.features_names

['FLUX_G_2',
 'FLUX_R_2',
 'FLUX_I_2',
 'FLUX_Z_2',
 'FLUX_Y_2',
 'FLUX_J_2',
 'FLUX_H_2',
 'FLUXERR_G_2',
 'FLUXERR_R_2',
 'FLUXERR_I_2',
 'FLUXERR_Z_2',
 'FLUXERR_Y_2',
 'FLUXERR_J_2',
 'FLUXERR_H_2',
 'test',
 'a',
 'b',
 'c',
 'd',
 'e',
 'F_G-R',
 'F_R-I',
 'F_I-Z',
 'F_Z-Y',
 'F_Y-J',
 'F_J-H']

## Operations on rows
### filtering NaN/Inf with dataset_preprocessing functions

In [22]:
from PrimalCore.preprocessing.dataset_preprocessing import drop_nan_inf

In [23]:
# Clean the dataset of NaN/Inf values; the log printed below reports which
# columns and how many rows were removed (here no columns and 468 rows).
drop_nan_inf(dataset)

| features cleaning for nan/inf
| features initial Rows,Cols= 1000 26
| features initial Rows,Cols= 1000 26
| removing features []
| features final Rows,Cols= 1000 26

|removed columns []
|removed rows 468
| features cleaned Rows,Cols= 532 26

