### Analyzing CMF pipeline metadata for the Exa.TrkX pipeline (stored with the CMF tracking layer and queried with the CMF query layer)

In [1]:
import pandas as pd
from cmflib import cmfquery

##### Initialize the library and get all the stages in the pipeline
Point the library to the metadata file. <br>
The `get_pipeline_stages` call returns the names of the stages in the pipeline.

In [13]:
# Location of the metadata file and the pipeline to inspect.
MLMD_PATH = "./mlmd"
PIPELINE_NAME = "exatrkx"

# Open the metadata file through the CMF query layer; later cells reuse `query`.
query = cmfquery.CmfQuery(MLMD_PATH)

# Ask for the stages recorded under the pipeline and show them.
stages = query.get_pipeline_stages(PIPELINE_NAME)
print(stages)

['Train', '1. Train Metric Learning', '2. Metric Learning Inference', '3. Train GNN', '4. GNN Inference', '5. Build Track Candidates', '6. Evaluate Track Candidates']


##### Query the Executions for **1. Train Metric Learning**

In [14]:
# Raise the per-column display width so long cell values are printed in full.
pd.set_option("display.max_colwidth", 220)

# Fetch and show every execution recorded for this stage.
stage_name = '1. Train Metric Learning'
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID Context_Type       Execution Git_End_Commit  \
0           3       Train1  ['run_cmf.py']                  
1           3       Train1  ['run_cmf.py']                  
2           3       Train1  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   
1  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   
2  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  \
0  a7d81214292d9a3233d749a8aaa1f489695135ef       exatrkx            1   
1  a7d81214292d9a3233d749a8aaa1f489695135ef       exatrkx            1   
2  9217c09fa9c9626dbb6809880b70aa34f87e4707       exatrkx            1   

  activation  cell_channels  ...  r_test  r_train r_val randomisation  \
0       Tanh            9.0  ...     0.1      0.1   0.1           2.0   
1        NaN            NaN  ...     NaN      NaN   NaN           NaN   
2     

##### Query the Executions in **2. Metric Learning Inference**

In [4]:
# Raise the per-column display width so long cell values are printed in full.
pd.set_option("display.max_colwidth", 220)

# Fetch and show every execution recorded for this stage.
stage_name = '2. Metric Learning Inference'
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID Context_Type       Execution Git_End_Commit  \
0           4       Infer1  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  \
0  919137466dcd982728e2d222983e96c2113c1a86       exatrkx            1   

  activation  cell_channels  ...  r_test  r_train  r_val  randomisation  \
0       Tanh              9  ...       0        0      0              2   

                  regime  spatial_channels   train_split  \
0  ['rp', 'hnm', 'norm']                 3  [80, 10, 10]   

              true_edges  warmup  weight  
0  modulewise_true_edges       8       2  

[1 rows x 35 columns]


##### Query the Executions in **3. Train GNN**

In [5]:
# Fetch and show every execution recorded for the GNN training stage.
stage_name = '3. Train GNN'
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID Context_Type       Execution Git_End_Commit  \
0           5     TrainGNN  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  \
0  df811c34f7574c81bf685e261bb926e233eea59a       exatrkx            1   

  aggregation  cell_channels  ... noise                         output_dir  \
0     sum_max              0  ...     0  datasets/quickstart_gnn_processed   

   patience  pt_background_min  pt_signal_min   regime  spatial_channels  \
0         8                  0              0  ['pid']                 3   

    truth_key  warmup  weight  
0  pid_signal      10       2  

[1 rows x 35 columns]


##### Query the Executions in **4. GNN Inference**

In [6]:
# Fetch and show every execution recorded for the GNN inference stage.
stage_name = '4. GNN Inference'
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID Context_Type       Execution Git_End_Commit  \
0           6     InferGNN  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  \
0  dc837ed8c527804f8e06c9c9a04998b234f9f203       exatrkx            1   

  aggregation  cell_channels  ... noise                         output_dir  \
0     sum_max              0  ...     0  datasets/quickstart_gnn_processed   

   patience  pt_background_min  pt_signal_min   regime  spatial_channels  \
0         8                  0              0  ['pid']                 3   

    truth_key  warmup  weight  
0  pid_signal      10       2  

[1 rows x 35 columns]


##### Query the Executions in **5. Build Track Candidates**

In [7]:
# Fetch and show every execution recorded for the track-building stage.
stage_name = '5. Build Track Candidates'
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID     Context_Type       Execution Git_End_Commit  \
0           7  TrackCandidates  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  id  \
0  9f2969ce8a2969bb0d5165a431669ca29bef1596       exatrkx            1   9   

                                     output_dir  score_cut  
0  datasets/quickstart_track_building_processed          0  


##### Query the Executions in **6. Evaluate Track Candidates**

In [8]:
# Fetch and show every execution recorded for the evaluation stage.
stage_name = '6. Evaluate Track Candidates'
executions = query.get_all_executions_in_stage(stage_name)
print(executions)

   Context_ID         Context_Type       Execution Git_End_Commit  \
0           8  EvalTrackCandidates  ['run_cmf.py']                  

                                              Git_Repo  \
0  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git   

                           Git_Start_Commit Pipeline_Type  Pipeline_id  id  \
0  a7d81214292d9a3233d749a8aaa1f489695135ef       exatrkx            1  10   

   matching_fraction matching_style  max_eta  min_particle_length  min_pt  \
0                  0          ATLAS        4                    3       1   

   min_track_length                      output_dir  
0                 3  datasets/quickstart_evaluation  


##### Get all the artifacts of an execution.
<b>input parameter - execution_id</b><br>
<b>output parameter - artifacts</b><br>


In [12]:
# Execution whose artifacts should be listed.
execution_id = 10

# Look up the artifacts linked to that execution and show them.
artifacts = query.get_all_artifacts_for_execution(execution_id)
print(artifacts)

                                            Commit  create_time_since_epoch  \
0  commit a7d81214292d9a3233d749a8aaa1f489695135ef            1663622654091   
1  commit 1304578491ac94056a0d31b642058ab1ef7bba2f            1663622665523   

    event                                             git_repo   id  \
0   INPUT  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git  166   
1  OUTPUT  git@github.com:atripathy86/Tracking-ML-Exa.TrkX.git  167   

   last_update_time_since_epoch  \
0                 1663622654091   
1                 1663622665523   

                                                                                name  \
0  datasets/quickstart_track_building_processed:69fcf465bc7f8c409bab2a04b40bd48c.dir   
1                datasets/quickstart_evaluation:d751713988987e9331980363e24189ce.dir   

      type                                   uri  
0  Dataset  69fcf465bc7f8c409bab2a04b40bd48c.dir  
1  Dataset  d751713988987e9331980363e24189ce.dir  


In [11]:
# Set the per-column display width to 120 characters for the output below.
pd.set_option("display.max_colwidth", 120)

# Look up the artifacts linked to execution 6.
artifacts = query.get_all_artifacts_for_execution(6)

# Show only the "name" and "event" columns rather than the whole frame.
for column in ("name", "event"):
    print(artifacts[column])

0     training_metrics_6:4cb39f604b258be19bd78397258f3bb4:6:59fd9f42-385e-11ed-8c49-d4c9efcecfde
1     training_metrics_6:f2eec4f8dd0ef9b39a16cb81952581e6:6:5b9f5818-385e-11ed-8c49-d4c9efcecfde
2     training_metrics_6:6ed8013819d8ffd0bb6a85141375b802:6:70b834c2-385e-11ed-8c49-d4c9efcecfde
3     training_metrics_6:f9ce4f946dd37dde6cd656578f9f9722:6:7291134a-385e-11ed-8c49-d4c9efcecfde
4     training_metrics_6:84ae984e7d50511825be63d0195d98aa:6:86fca1fa-385e-11ed-8c49-d4c9efcecfde
                                                 ...                                            
67    training_metrics_6:093b0824afdc04f9d72d1745e172a36e:6:31a241bc-3861-11ed-8c49-d4c9efcecfde
68    training_metrics_6:1583bb048f3800b33ed261aa2dd5ad54:6:470acce0-3861-11ed-8c49-d4c9efcecfde
69    training_metrics_6:1583bb048f3800b33ed261aa2dd5ad54:6:48a105a6-3861-11ed-8c49-d4c9efcecfde
70    training_metrics_6:3921b9edfc07a750c0a26b771880be63:6:4a3706a4-3861-11ed-8c49-d4c9efcecfde
71    training_metrics_6:3921b

#### Get all executions for an artifact (pass the artifact's full name as the input parameter)

In [52]:
# Full artifact name to look up.
artifact_name = "datasets/quickstart_metric_learning_processed:162349f1df2916e321b506159e634695.dir"

# Find the executions linked to that artifact and show them.
linked = query.get_all_executions_for_artifact(artifact_name)
print(linked)

     Type  execution_id execution_name pipeline                     stage
0  OUTPUT             1                 exatrkx  1. Train Metric Learning


#### Get all the parent artifacts of an artifact. Provides the artifact lineage chain

In [None]:
# Artifact whose parent artifacts should be listed.
artifact_name = "artifacts/features/test.pkl"

# Query the parent artifacts and show the full result.
linked = query.get_all_parent_artifacts(artifact_name)
print(linked)

#### Get all the child artifacts of an artifact. Provides the lineage chain in the downstream direction

In [None]:
# Query the child artifacts of the training feature file.
linked = query.get_all_child_artifacts("artifacts/features/train.pkl")

# Print each column of interest behind its label, without index or header.
for prefix, column in (("Name : ", "name"), ("Type : ", "type"), ("URI : ", "uri")):
    print(prefix + linked[column].to_string(index=False, header=False))

#### Get all the parent artifacts of an artifact. Provides the artifact lineage chain

In [None]:
# Query the parent artifacts of the model file.
linked = query.get_all_parent_artifacts("artifacts/model/model.pkl")

# Print a heading followed by the matching column, without index or header.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

In [None]:
# Query the parent artifacts of the parsed test file.
linked = query.get_all_parent_artifacts("artifacts/parsed/test.tsv")

# Print each column of interest behind its label, without index or header.
for prefix, column in (("Name : ", "name"), ("Type : ", "type"), ("URI : ", "uri")):
    print(prefix + linked[column].to_string(index=False, header=False))

In [None]:
# Query the child artifacts of the parsed test file.
linked = query.get_all_child_artifacts("artifacts/parsed/test.tsv")

# Print a heading followed by the matching column, without index or header.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

#### Get immediate child artifacts of an artifact. 

In [None]:
# Query only the immediate (one-hop) child artifacts of the raw data file.
linked = query.get_one_hop_child_artifacts("artifacts/data.xml.gz")

# Print a heading followed by the matching column, without index or header.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

In [None]:
# Query all child artifacts of the raw data file.
linked = query.get_all_child_artifacts("artifacts/data.xml.gz")

# Optional: to view the full frame ordered by creation time, print
# linked.sort_values('create_time_since_epoch', ascending=True) instead.

# Print a heading followed by the matching column, without index or header.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

In [None]:
# Look up the artifacts linked to execution 4.
linked = query.get_all_artifacts_for_execution(4)

# Print a heading followed by the matching column, without index or header.
for heading, column in (("NAME", "name"), ("TYPE", "type"), ("URI", "uri")):
    print(heading)
    print(linked[column].to_string(index=False, header=False))

### Replace the metrics name in the `get_artifact` call with a metrics name from the output of the previous cell

In [None]:
# Name of the metrics artifact to fetch.
metrics_name = "metrics:aaae534e-915d-11ec-b106-89841b9859cd:4"

# Retrieve that artifact through the query layer and show it.
artifacts = query.get_artifact(metrics_name)
print(artifacts)


In [None]:
# Path of the parquet data to load.
parquet_path = "./slice-a"

# Read it into a DataFrame and show the contents.
new_parquet_df = pd.read_parquet(parquet_path)
print(new_parquet_df)