In [1]:
import os
import sys
import logging

import pandas as pd

In [2]:
# Make the parent directory importable so the local `dependencynet` package resolves.
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if "../" not in sys.path:
    sys.path.append("../")

In [3]:
from dependencynet.datasets_utils import extract_hierarchy, write_dataset
from dependencynet.trees_utils import build_tree, pretty_print_tree

In [4]:
# Package logger: records at INFO and above are let through.
logger = logging.getLogger('dependencynet')
logger.setLevel(logging.INFO)

# Default root configuration, so the records above actually get printed.
logging.basicConfig()

In [5]:
# Toy input: one row per (process, step, task) triple, stored column-wise.
data = {
    'process': ['X', 'Y', 'Y', 'Y', 'Z', 'Z'],
    'step': ['load data', 'load data', 'load data', 'aggregate', 'load data', 'aggregate'],
    'task': ['read file', 'read file', 'parse file', 'average', 'read csv', 'sum'],
}

# Column order follows the insertion order of the dict keys: process, step, task.
df = pd.DataFrame(data, columns=list(data))

# Render the input frame as this cell's rich output.
display(df)

Unnamed: 0,process,step,task
0,X,load data,read file
1,Y,load data,read file
2,Y,load data,parse file
3,Y,aggregate,average
4,Z,load data,read csv
5,Z,aggregate,sum


In [6]:
# Hierarchy levels, outermost first, and the one-letter mark passed for each level
# (presumably the letter used as that level's id prefix -- confirm in extract_hierarchy).
keys = ['process', 'step', 'task']
marks = "PST"

# `dfs` is indexed by level position: dfs[i] is the frame for keys[i].
dfs = extract_hierarchy(df, keys, marks)

# isEnabledFor() honours inherited levels and logging.disable(); comparing against
# logger.level directly would treat an unset level (0 / NOTSET) as "INFO enabled".
if logger.isEnabledFor(logging.INFO):
    logger.info('main resultats')
    # One frame per level in `keys`, instead of three hard-coded indices.
    for position in range(len(keys)):
        display(dfs[position])

INFO:dependencynet.datasets_utils:extract_items_root keys=['process'] id_pattern=P{id:02d} => shape=(3, 4)
INFO:dependencynet.datasets_utils:extract_items_root keys=['process', 'step'] id_pattern={id_parent}S{id:02d} => shape=(5, 6)
INFO:dependencynet.datasets_utils:extract_items_root keys=['process', 'step', 'task'] id_pattern={id_parent}T{id:02d} => shape=(6, 7)
INFO:dependencynet:main resultats


Unnamed: 0,process,pos,id,label
0,X,1,P01,P01 X
1,Y,2,P02,P02 Y
4,Z,3,P03,P03 Z


Unnamed: 0,process,step,pos,id_parent,id,label
0,X,load data,1,P01,P01S01,P01S01 load data
1,Y,load data,1,P02,P02S01,P02S01 load data
2,Y,aggregate,2,P02,P02S02,P02S02 aggregate
3,Z,load data,1,P03,P03S01,P03S01 load data
4,Z,aggregate,2,P03,P03S02,P03S02 aggregate


Unnamed: 0,process,step,task,pos,id_parent,id,label
0,X,load data,read file,1,P01S01,P01S01T01,P01S01T01 read file
1,Y,load data,read file,1,P02S01,P02S01T01,P02S01T01 read file
2,Y,load data,parse file,2,P02S01,P02S01T02,P02S01T02 parse file
3,Y,aggregate,average,1,P02S02,P02S02T01,P02S02T01 average
4,Z,load data,read csv,1,P03S01,P03S01T01,P03S01T01 read csv
5,Z,aggregate,sum,1,P03S02,P03S02T01,P03S02T01 sum


In [7]:
# Destination directory, given relative to the current working directory.
folder_name = os.path.join('output', 'datasets')

# Save the frame of every level under its key name, using ';' as separator.
# A plain loop is used because the calls matter only for their side effect;
# a list comprehension would build a throwaway list of return values.
for position, key in enumerate(keys):
    write_dataset(dfs[position], folder_name, key, sep=";")

INFO:dependencynet.datasets_utils:dateset saved under name output/datasets/process.csv
INFO:dependencynet.datasets_utils:dateset saved under name output/datasets/step.csv
INFO:dependencynet.datasets_utils:dateset saved under name output/datasets/task.csv


In [8]:
# Combine the per-level frames into one structure; `tree` is reused by later cells.
tree = build_tree(dfs, keys)
# Show the raw structure for inspection.
display(tree)

defaultdict(dict,
            {'process_dict': {'P01': {'process': 'X',
               'pos': 1,
               'id': 'P01',
               'label': 'P01 X',
               'step_dict': {'P01S01': {'process': 'X',
                 'step': 'load data',
                 'pos': 1,
                 'id_parent': 'P01',
                 'id': 'P01S01',
                 'label': 'P01S01 load data',
                 'task_dict': {'P01S01T01': {'process': 'X',
                   'step': 'load data',
                   'task': 'read file',
                   'pos': 1,
                   'id_parent': 'P01S01',
                   'id': 'P01S01T01',
                   'label': 'P01S01T01 read file'}}}}},
              'P02': {'process': 'Y',
               'pos': 2,
               'id': 'P02',
               'label': 'P02 Y',
               'step_dict': {'P02S01': {'process': 'Y',
                 'step': 'load data',
                 'pos': 1,
                 'id_parent': 'P02',
                 

In [10]:
# Last expression of the cell: whatever pretty_print_tree returns is shown as the cell output.
pretty_print_tree(tree, keys)

['there are 3 process(s)',
 '  P01, P02, P03',
 '  process P01: X',
 '    has 1 step(s)',
 '      P01S01',
 '      step P01S01: load data',
 '        has 1 task(s)',
 '          P01S01T01',
 '          task P01S01T01: read file',
 '  process P02: Y',
 '    has 2 step(s)',
 '      P02S01, P02S02',
 '      step P02S01: load data',
 '        has 2 task(s)',
 '          P02S01T01, P02S01T02',
 '          task P02S01T01: read file',
 '          task P02S01T02: parse file',
 '      step P02S02: aggregate',
 '        has 1 task(s)',
 '          P02S02T01',
 '          task P02S02T01: average',
 '  process P03: Z',
 '    has 2 step(s)',
 '      P03S01, P03S02',
 '      step P03S01: load data',
 '        has 1 task(s)',
 '          P03S01T01',
 '          task P03S01T01: read csv',
 '      step P03S02: aggregate',
 '        has 1 task(s)',
 '          P03S02T01',
 '          task P03S02T01: sum']