### Analyzing CMF pipeline metadata for the Exa.TrkX (`exatrkx`) pipeline (stored with the CMF tracking layer and queried with the CMF query layer).

In [29]:
import pandas as pd
from cmflib import cmfquery

##### Initialize the library and get all the stages in the pipeline
Point the library to the metadata file. <br>
The `get_pipeline_stages` call returns the names of the stages in the given pipeline.

In [30]:
# Point the query layer at the metadata file, then list the stages
# recorded for the "exatrkx" pipeline.
metadata_path = "./mlmd_old"
pipeline_name = "exatrkx"

query = cmfquery.CmfQuery(metadata_path)
stages = query.get_pipeline_stages(pipeline_name)
print(stages)

['1. Train Metric Learning', '2. Metric Learning Inference', '3. Train GNN', '4. GNN Inference', '5. Build Track Candidates', '6. Evaluate Track Candidates']


##### Query the Executions for **1. Train Metric Learning**

In [31]:
# Raise pandas' maximum displayed column width to 220 characters, then print
# every execution recorded for the metric-learning training stage.
pd.set_option("display.max_colwidth", 220)

stage_name = "1. Train Metric Learning"
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID Context_Type       Execution Git_End_Commit  \
0           2       Train1  ['run_cmf.py']                  
1           2       Train1  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   
1  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  \
0  1e842843f55b010a1feeb1f8829641d1b0df1b4f       exatrkx            1   
1  0eb6db20770600353549ae0a4297a4763aeef769       exatrkx            1   

  activation  cell_channels  ...  r_test  r_train  r_val  randomisation  \
0       Tanh              9  ...       0        0      0              2   
1       Tanh              9  ...       0        0      0              2   

                  regime  spatial_channels   train_split  \
0  ['rp', 'hnm', 'norm']                 3  [80, 10, 10]   
1  ['rp', 'hnm', 'norm']                 3  [80, 10, 10]   

           

##### Query the Executions in **2. Metric Learning Inference**

In [33]:
# Keep the displayed column width at 220 characters and print every
# execution recorded for the metric-learning inference stage.
pd.set_option("display.max_colwidth", 220)

stage_name = "2. Metric Learning Inference"
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID Context_Type       Execution Git_End_Commit  \
0           3       Infer1  ['run_cmf.py']                  
1           3       Infer1  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   
1  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  \
0  583423f068fae4843f39fcc8753834aedab7bc44       exatrkx            1   
1  02c8f5ce2c03a88450ca65490155cc60f3cefa56       exatrkx            1   

  activation  cell_channels  ...  r_test  r_train  r_val  randomisation  \
0       Tanh              9  ...       0        0      0              2   
1       Tanh              9  ...       0        0      0              2   

                  regime  spatial_channels   train_split  \
0  ['rp', 'hnm', 'norm']                 3  [80, 10, 10]   
1  ['rp', 'hnm', 'norm']                 3  [80, 10, 10]   

           

##### Query the Executions in **3. Train GNN**

In [43]:
# Print every execution recorded for the GNN training stage.
stage_name = "3. Train GNN"
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID Context_Type       Execution Git_End_Commit  \
0           4     TrainGNN  ['run_cmf.py']                  
1           4     TrainGNN  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   
1  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  \
0  9ed673874415bdd485ffee7e8607c0bfe7cf6ba5       exatrkx            1   
1  1017d2c851e88b2c8128dd45a7e89375e9a3a279       exatrkx            1   

  aggregation  cell_channels  ... noise                         output_dir  \
0     sum_max              0  ...     0  datasets/quickstart_gnn_processed   
1     sum_max              0  ...     0  datasets/quickstart_gnn_processed   

   patience  pt_background_min  pt_signal_min   regime  spatial_channels  \
0         8                  0              0  ['pid']                 3   
1         8                  0 

##### Query the Executions in **4. GNN Inference**

In [44]:
# Print every execution recorded for the GNN inference stage.
stage_name = "4. GNN Inference"
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID Context_Type       Execution Git_End_Commit  \
0           5     InferGNN  ['run_cmf.py']                  
1           5     InferGNN  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   
1  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  \
0  fcb36b975401a6c1c8069e6df7b5f15119abedd9       exatrkx            1   
1  f38ebb71d80b87add556b52f2f74b1815499b98c       exatrkx            1   

  aggregation  cell_channels  ... noise                         output_dir  \
0     sum_max              0  ...     0  datasets/quickstart_gnn_processed   
1     sum_max              0  ...     0  datasets/quickstart_gnn_processed   

   patience  pt_background_min  pt_signal_min   regime  spatial_channels  \
0         8                  0              0  ['pid']                 3   
1         8                  0 

##### Query the Executions in **5. Build Track Candidates**

In [45]:
# Print every execution recorded for the track-candidate building stage.
stage_name = "5. Build Track Candidates"
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID     Context_Type       Execution Git_End_Commit  \
0           6  TrackCandidates  ['run_cmf.py']                  
1           6  TrackCandidates  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   
1  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  id  \
0  5934238ce4a1daa1c380427b9599c96b82eb8a98       exatrkx            1  10   
1  7391e66236f06aab6ac8e2dc562f23370688dcf7       exatrkx            1   5   

                                     output_dir  score_cut  
0  datasets/quickstart_track_building_processed          0  
1  datasets/quickstart_track_building_processed          0  


##### Query the Executions in **6. Evaluate Track Candidates**

In [46]:
# Print every execution recorded for the track-candidate evaluation stage.
stage_name = "6. Evaluate Track Candidates"
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID         Context_Type       Execution Git_End_Commit  \
0           7  EvalTrackCandidates  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  id  \
0  683d2c1accda5f402567411c97a0c04111d6a224       exatrkx            1  11   

   matching_fraction matching_style  max_eta  min_particle_length  min_pt  \
0                  0          ATLAS        4                    3       1   

   min_track_length                      output_dir  
0                 3  datasets/quickstart_evaluation  


##### Get all the artifacts of an execution.
<b>input parameter - execution_id</b><br>
<b>output parameter - artifacts</b><br>


In [50]:
# Print every artifact linked to the execution whose id is 1.
execution_id = 1
artifacts = query.get_all_artifacts_for_execution(execution_id)
print(artifacts)

                                            Commit  create_time_since_epoch  \
0  commit 5e901658ed19e7ef0bc061ac56315a171bdb025d            1663100205652   
1  commit dd92fe06becd11da4a03b4a3cf93a19e7a9a8097            1663100208307   
2  commit 02c8f5ce2c03a88450ca65490155cc60f3cefa56            1663100210546   

    event                                             git_repo  id  \
0   INPUT  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   1   
1  OUTPUT  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   2   
2  OUTPUT                                                  NaN   3   

   last_update_time_since_epoch   model_framework          model_name  \
0                 1663100205652               NaN                 NaN   
1                 1663100208307               NaN                 NaN   
2                 1663100210546  PyTorchLightning  LayerlessEmbedding   

      model_type  \
0            NaN   
1            NaN   
2  MLP Embedding   

                            

In [51]:
# Limit the displayed column width to 120 characters, re-query the artifacts
# of execution 1, and print only the "name" and "event" columns.
pd.set_option("display.max_colwidth", 120)
artifacts = query.get_all_artifacts_for_execution(1)

for column in ("name", "event"):
    print(artifacts[column])

0                     datasets/quickstart_example_1GeV:48d329424dac06431b09486e194a5ae9.dir
1        datasets/quickstart_metric_learning_processed:162349f1df2916e321b506159e634695.dir
2    artifacts/metric_learning/trackml_quickstart_2.ckpt:f90837df06e9649f5a949e8e1a562265:1
Name: name, dtype: object
0     INPUT
1    OUTPUT
2    OUTPUT
Name: event, dtype: object


#### Get all executions for an artifact (pass the artifact's full name as the input parameter)

In [52]:
# Look up the executions linked to an artifact; the artifact's full name is
# the lookup key.
artifact_name = "datasets/quickstart_metric_learning_processed:162349f1df2916e321b506159e634695.dir"
linked = query.get_all_executions_for_artifact(artifact_name)
print(linked)

     Type  execution_id execution_name pipeline                     stage
0  OUTPUT             1                 exatrkx  1. Train Metric Learning


#### Get all the parent artifacts of an artifact. Provides the lineage chain in the upstream direction

In [None]:
# Print all parent artifacts of the given artifact.
# NOTE(review): this artifact name does not appear in the outputs visible
# earlier in the notebook — confirm it exists in the store being queried.
artifact_name = "artifacts/features/test.pkl"
linked = query.get_all_parent_artifacts(artifact_name)
print(linked)

#### Get all the child artifacts of an artifact. Provides the lineage chain in the downstream direction

In [None]:
# List all child artifacts of the given artifact.
# Each column is printed under its own header line: prefixing a label to the
# multi-line result of Series.to_string() would attach it to the first row
# only, which is hard to read when several artifacts are returned.
# NOTE(review): this artifact name does not appear in the outputs visible
# earlier in the notebook — confirm it exists in the store being queried.
artifact_name = "artifacts/features/train.pkl"
linked = query.get_all_child_artifacts(artifact_name)

for label, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(label)
    print(linked[column].to_string(index=False, header=False))

#### Get all the parent artifacts of an artifact. Provides the lineage chain in the upstream direction

In [None]:
# Print the name, type and URI columns of all parent artifacts, each under
# its own header line.
# NOTE(review): this artifact name does not appear in the outputs visible
# earlier in the notebook — confirm it exists in the store being queried.
artifact_name = "artifacts/model/model.pkl"
linked = query.get_all_parent_artifacts(artifact_name)

for label, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(label)
    print(linked[column].to_string(index=False, header=False))

In [None]:
# List all parent artifacts of the given artifact.
# Each column is printed under its own header line: prefixing a label to the
# multi-line result of Series.to_string() would attach it to the first row
# only, which is hard to read when several artifacts are returned.
# NOTE(review): this artifact name does not appear in the outputs visible
# earlier in the notebook — confirm it exists in the store being queried.
artifact_name = "artifacts/parsed/test.tsv"
linked = query.get_all_parent_artifacts(artifact_name)

for label, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(label)
    print(linked[column].to_string(index=False, header=False))

In [None]:
# Print the name, type and URI columns of all child artifacts, each under
# its own header line.
artifact_name = "artifacts/parsed/test.tsv"
linked = query.get_all_child_artifacts(artifact_name)

for label, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(label)
    print(linked[column].to_string(index=False, header=False))

#### Get the immediate (one-hop) child artifacts of an artifact.

In [None]:
# Print the name, type and URI columns of the one-hop child artifacts, each
# under its own header line.
# NOTE(review): this artifact name does not appear in the outputs visible
# earlier in the notebook — confirm it exists in the store being queried.
artifact_name = "artifacts/data.xml.gz"
linked = query.get_one_hop_child_artifacts(artifact_name)

for label, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(label)
    print(linked[column].to_string(index=False, header=False))

In [None]:
# Print the name, type and URI columns of all child artifacts, each under
# its own header line.
artifact_name = "artifacts/data.xml.gz"
linked = query.get_all_child_artifacts(artifact_name)

for label, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(label)
    print(linked[column].to_string(index=False, header=False))

In [None]:
# Print the name, type and URI columns of the artifacts linked to the
# execution whose id is 4, each under its own header line.
execution_id = 4
linked = query.get_all_artifacts_for_execution(execution_id)

for label, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(label)
    print(linked[column].to_string(index=False, header=False))

### Replace the metrics name in the `get_artifact` call with the metrics name from the output of the previous cell

In [None]:
# Fetch a single artifact by its full metrics name and print it.
metrics_name = "metrics:aaae534e-915d-11ec-b106-89841b9859cd:4"
artifacts = query.get_artifact(metrics_name)
print(artifacts)


In [None]:
# Load the parquet data stored at ./slice-a into a DataFrame and print it.
parquet_path = "./slice-a"
new_parquet_df = pd.read_parquet(parquet_path)
print(new_parquet_df)