In [1]:
from pycelonis.pql import PQLColumn, PQLFilter
import pycelonis
from src.celonis_data_integration import execute_PQL_query, get_connection, check_invalid_table_in_celonis, \
    get_celonis_info


def get_execution_time_per_res_per_act(data_mode, table_name, case_column, activity_column, resource_column,
                                       timestamp_column):
    columns = [PQLColumn(name="case_id", query=f'SOURCE("{table_name}"."{case_column}")'),
               PQLColumn(name="source_act", query=f'SOURCE("{table_name}"."{activity_column}")'),
               PQLColumn(name="source_resource", query=f'SOURCE("{table_name}"."{resource_column}")'),
               PQLColumn(name="target_act", query=f'TARGET("{table_name}"."{activity_column}")'),
               PQLColumn(name="source_timestamp", query=f'SOURCE("{table_name}"."{timestamp_column}")'),
               PQLColumn(name="target_timestamp", query=f'TARGET("{table_name}"."{timestamp_column}")'),
               PQLColumn(name="time_between",
                         query=f'minutes_between(SOURCE("{table_name}"."{timestamp_column}"), TARGET("{table_name}"."{timestamp_column}"))')]
    res = execute_PQL_query(data_mode, columns)
    return res


def get_unique_activity(df, act_column_name):
    return df[act_column_name].unique()


def get_unique_resource(df, res_column_name):
    return df[res_column_name].unique()


def get_res_act_relation(df, activities):
    res_dict = {}
    for a in activities:
        g1 = df.groupby(["activity"]).get_group(a)
        keys = list(g1.groupby(["resource"]).groups.keys())
        res_dict[a] = keys
    return res_dict


def get_target_activity_with_start_end_timestamp(data_mode, table_name, case_column, activity_column, resource_column,
                                                 timestamp_column):
    columns = [PQLColumn(name="case_id", query=f'SOURCE("{table_name}"."{case_column}")'),
               PQLColumn(name="activity", query=f'SOURCE("{table_name}"."{activity_column}")'),
               PQLColumn(name="resource", query=f'SOURCE("{table_name}"."{resource_column}")'),
               PQLColumn(name="start_at", query=f'SOURCE("{table_name}"."{timestamp_column}")'),
               PQLColumn(name="end_at", query=f'TARGET("{table_name}"."{timestamp_column}")')]
    res = execute_PQL_query(data_mode, columns)
    return res


def get_task_duration_time_distance(data_pool, data_model, table_name, case_column, activity_column, time_column,
                                    lifecycle_column):
    """
    Get task duration for a certain activity and
    time distance between different/multiple-tries activities that act as souce and target on DFG
    :param data_pool:
    :param data_model:
    :param table_name:
    :param case_column:
    :param activity_column:
    :param time_column:
    :param lifecycle_column:
    :return: Two data frame: One for task duration (only for tasks that have start and complete life status);
    the other one for time distance (between activities act as source and target on DFG)
    """

    columns_dur = [PQLColumn(name="case_id", query=f'SOURCE("{table_name}"."{case_column}")'),
                   PQLColumn(name="start_activity", query=f'SOURCE("{table_name}"."{activity_column}")'),
                   PQLColumn(name="end_activity", query=f'TARGET("{table_name}"."{activity_column}")'),
                   PQLColumn(name="start_life", query=f'SOURCE("{table_name}"."{lifecycle_column}")'),
                   PQLColumn(name="end_life", query=f'TARGET("{table_name}"."{lifecycle_column}")'),
                   PQLColumn(name="task_duration(min)",
                             query=f'minutes_between(SOURCE("{table_name}"."{time_column}"), TARGET("{table_name}"."{time_column}"))')
                   ]
    filter_dur = [
        PQLFilter(
            query=f'FILTER SOURCE("{table_name}"."{activity_column}") = TARGET("{table_name}"."{activity_column}");'),
        PQLFilter(
            query=f'FILTER SOURCE("{table_name}"."{lifecycle_column}") = \'start\' AND TARGET("{table_name}"."{lifecycle_column}") = \'complete\';')
    ]
    res_task_duration = execute_PQL_query(data_model, columns_dur, filters=filter_dur)
    
    if not res_task_duration.empty:
        try:
            data_pool_table = data_pool.get_tables().find(f'{table_name}_task_duration')
        except PyCelonisNotFoundError:
            data_pool.create_table(df=res_task_duration, table_name=f'{table_name}_task_duration',
                                   drop_if_exists=False)
            data_model.add_table(name=f'{table_name}_task_duration', alias=f'{table_name}_task_duration')
        else:
            data_pool_table.upsert(res_task_duration, keys=["case_id", "start_activity", "end_activity"])
        data_model.reload()
    else:
        imsi_dur=[PQLColumn(name="case_id", query=f'"{table_name}"."{case_column}"'),
                   PQLColumn(name="start_activity", query=f'"{table_name}"."{activity_column}"'),
                   PQLColumn(name="end_activity", query=f'"{table_name}"."{activity_column}"'),
                   PQLColumn(name="start_life", query=f'"{table_name}"."{lifecycle_column}"'),
                   PQLColumn(name="end_life", query=f'"{table_name}"."{lifecycle_column}"'),
                   PQLColumn(name="task_duration(min)",
                             query=f'0')
                   ]
        res_task_duration=execute_PQL_query(data_model,imsi_dur)
        try:
            data_pool_table = data_pool.get_tables().find(f'{table_name}_task_duration')
        except PyCelonisNotFoundError:
            data_pool.create_table(df=res_task_duration, table_name=f'{table_name}_task_duration',
                                   drop_if_exists=False)
            data_model.add_table(name=f'{table_name}_task_duration', alias=f'{table_name}_task_duration')
        else:
            data_pool_table.upsert(res_task_duration, keys=["case_id", "start_activity", "end_activity"])
        data_model.reload()
        
    columns_dis = [PQLColumn(name="case_id", query=f'SOURCE("{table_name}"."{case_column}")'),
                   PQLColumn(name="start_activity", query=f'SOURCE("{table_name}"."{activity_column}")'),
                   PQLColumn(name="end_activity", query=f'TARGET("{table_name}"."{activity_column}")'),
                   PQLColumn(name="start_life", query=f'SOURCE("{table_name}"."{lifecycle_column}")'),
                   PQLColumn(name="end_life", query=f'TARGET("{table_name}"."{lifecycle_column}")'),
                   PQLColumn(name="start_timestamp", query=f'SOURCE("{table_name}"."{time_column}")'),
                   PQLColumn(name="end_timestamp", query=f'TARGET("{table_name}"."{time_column}")'),
                   PQLColumn(name="time_distance(min)",
                             query=f'minutes_between(SOURCE("{table_name}"."{time_column}"), TARGET("{table_name}"."{time_column}"))')
                   ]
    filter_dis = [
        PQLFilter(query=f'FILTER SOURCE("{table_name}"."{lifecycle_column}") != \'start\';')
    ]
    res_time_distance = execute_PQL_query(data_model, columns_dis, filters=filter_dis)
    if not res_time_distance.empty:
        try:
            data_pool_table = data_pool.get_tables().find(f'{table_name}_time_distance')
        except PyCelonisNotFoundError:
            data_pool.create_table(df=res_task_duration, table_name=f'{table_name}_time_distance',
                                   drop_if_exists=False)
            data_model.add_table(name=f'{table_name}_task_duration', alias=f'{table_name}_time_distance')
        else:
            data_pool_table.upsert(res_time_distance, keys=["case_id", "start_activity", "end_activity"])
        data_model.reload()

    return res_task_duration, res_time_distance


def calculate_temporal_profile(data_model, table_name, types, case_column, mainstream_case_id=None):
    """
    Calculate the temporal profile for task duration and time distance
    :param case_column:
    :param mainstream_case_id:
    :param data_model:
    :param table_name: 
    :param types: ['overall', 'mainstream', 'new']
    :return: 
    """
    s = "("
    if mainstream_case_id:
        for id in mainstream_case_id:
            s += f"\'{id}\',"
        s = s[:-1]
        s += ")"
    if types == "mainstream":
        filters = [PQLFilter(query=f'FILTER "{table_name}_task_duration"."{case_column}" IN {s};')]
    elif types == "new":
        filters = [PQLFilter(query=f'FILTER "{table_name}_task_duration"."{case_column}" NOT IN {s};')]
    else:
        filters = []

    cols_dur = [PQLColumn(name="Activity", query=f'"{table_name}_task_duration"."start_activity"'),
                PQLColumn(name="max_task_duration(min)",
                          query=f'MAX("{table_name}_task_duration"."task_duration(min)")'),
                PQLColumn(name="min_task_duration(min)",
                          query=f'MIN("{table_name}_task_duration"."task_duration(min)")'),
                PQLColumn(name="mean_task_duration(min)",
                          query=f'ROUND(AVG("{table_name}_task_duration"."task_duration(min)"), 2)'),
                PQLColumn(name="stdev_task_duration(min)",
                          query=f'ROUND(STDEV("{table_name}_task_duration"."task_duration(min)"), 2)'),
                PQLColumn(name="var_task_duration(min)",
                          query=f'ROUND(VAR("{table_name}_task_duration"."task_duration(min)"), 2)'),
                ]
    res_dur = execute_PQL_query(data_model, cols_dur, filters=filters)

    cols_dis = [PQLColumn(name="Start_activity", query=f'"{table_name}_time_distance"."start_activity"'),
                PQLColumn(name="End_activity", query=f'"{table_name}_time_distance"."end_activity"'),
                PQLColumn(name="max_time_distance(min)",
                          query=f'MAX("{table_name}_time_distance"."time_distance(min)")'),
                PQLColumn(name="min_time_distance(min)",
                          query=f'MIN("{table_name}_time_distance"."time_distance(min)")'),
                PQLColumn(name="mean_time_distance(min)",
                          query=f'ROUND(AVG("{table_name}_time_distance"."time_distance(min)"), 2)'),
                PQLColumn(name="stdev_time_distance(min)",
                          query=f'ROUND(STDEV("{table_name}_time_distance"."time_distance(min)"), 2)'),
                PQLColumn(name="var_time_distance(min)",
                          query=f'ROUND(VAR("{table_name}_time_distance"."time_distance(min)"), 2)'),
                ]
    res_dis = execute_PQL_query(data_model, cols_dis, filters=filters)
    return res_dur, res_dis



def trace_cluster(data_model, table_name, case_column, activity_column, resource_column, lifecycle_column):


    columns = [PQLColumn(name="case_id", query=f'("{table_name}"."{case_column}")'),
               PQLColumn(name="activity_trace", query=f'VARIANT("{table_name}"."{activity_column}")'),
               PQLColumn(name="Cluster", query=f'CLUSTER_VARIANTS ( VARIANT ( "{table_name}"."{activity_column}" ) , 2 , 2 )'),
              ]

    res = execute_PQL_query(data_model, columns, distinct=True)

    # Group by 'activity_trace', then aggregate 'case_id' and count
    new_df = res.groupby('activity_trace').agg({'case_id': list}).reset_index()

    # Create a new column, counting the number of 'case_id' for each 'activity_trace'
    new_df['case_count'] = new_df['case_id'].apply(len)

    # Rename column names
    new_df.columns = ['activity_trace', 'case_id_list', 'case_count']

    # Sort by the values in the 'case_count' column in descending order
    new_df = new_df.sort_values('case_count', ascending=False).reset_index(drop=True)
    
    return new_df



def split_df(df, p=0.2):
    # Calculate the row for the cutoff
    cutoff = int(len(df) * p)
    
    # Get the 'activity_trace' of the first p percent of rows
    first_p = [[trace] for trace in df['activity_trace'].iloc[:cutoff]]
    
    # Get the 'activity_trace' of the rest of the rows
    rest = [[trace] for trace in df['activity_trace'].iloc[cutoff:]]
    
    return first_p, rest



In [2]:
import pycelonis
import yaml
from pycelonis.pql import PQL, PQLColumn, PQLFilter, OrderByColumn
from pycelonis_core.utils.errors import PyCelonisNotFoundError


def get_connection():
    """
    Connect to the Celonis platform
    :return: Celonis object
    """
    #try:
    file = open("./config.yaml")
    config = yaml.safe_load(file)
    #except FileNotFoundError:
        #return "The configuration file is empty."
    celonis_url = config["celonis"]["base_url"]
    celonis_api_token = config["celonis"]["api_token"]

    try:
        celonis = pycelonis.get_celonis(base_url=celonis_url, api_token=celonis_api_token,
                                        key_type="APP_KEY", permissions=False)
    except:
        return f"The base_url {celonis_url} or the api token {celonis_api_token} is invalid."

    return celonis


def get_celonis_info(celonis):
    """
    Get the settings of Celonis
    :param celonis: conncted celonis object
    :return: data model and data pool of our project
    """
    #try:
    file = open("./config.yaml")
    config = yaml.safe_load(file)
    #except FileNotFoundError:
        #return "The configuration file is empty."

    pool_name = config["data_pool"]

    model_name = config["data_model"]

    case_column_name, act_column_name, time_column_name, res_column_name, lifecycle_column_name = config["case_column_name"], config[
        "activity_column_name"], \
        config["timestamp_column_name"], config["resource_column_name"], config["lifecycle_column_name"]

    try:
        data_pool = celonis.data_integration.get_data_pools().find(pool_name)

    except PyCelonisNotFoundError:
        return f"Data pool: {pool_name} does not exist."

    try:
        data_model = data_pool.get_data_models().find(model_name)
    except PyCelonisNotFoundError:
        return f"Data model: {model_name} does not exist in data pool {pool_name}."

    return data_pool, data_model, pool_name, model_name, case_column_name, act_column_name, time_column_name, res_column_name, lifecycle_column_name


def create_pool_and_model(celonis, pool_name, model_name):
    """
    Create a data pool and a data model using the given names
    :param celonis: connected celonis object
    :param pool_name: name of the data pool
    :param model_name: name of the data model
    :return: created data model and data pool
    """
    data_pool = celonis.data_integration.create_data_pool(pool_name)
    data_model = data_pool.create_data_model(model_name)
    return data_pool, data_model


def check_invalid_table_in_celonis(data_model, table):
    """
    Check if the given table not in the data pool/model
    :param celonis: the connection
    :param table: table name
    :return: Return False if the table exists (valid table) otherwise retunr the error message
    """
    try:
        data_model.get_tables().find(table)
    except PyCelonisNotFoundError:
        return f"Table: \"{table}\" does not exist in data model"

    return False


def execute_PQL_query(data_model, columns=None, filters=None, order_by_columns=None, distinct=False, limit=None,
                      offset=None):
    """
    Get dataframe executing PQL query
    :param data_model:
    :param columns: list of PQLColumn
    :param filters: list of PQLFilter
    :param order_by_columns: list of OrderByColumnOrderByColumn
    :param distinct: True/False
    :param limit: limit parameter
    :param offset: offfset parameter
    :return: dataframe with the result of the query
    """
    query = PQL(distinct=distinct, limit=limit, offset=offset)
    if columns:
        for c in columns:
            query += c
    if filters:
        for f in filters:
            query += f
    if order_by_columns:
        for o in order_by_columns:
            query += o

    res_df = data_model.export_data_frame(query)
    return res_df


In [3]:
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import pandas as pd

def find_deviations(df, threshold):
    # Determine the "usual" activities for each resource
    usual_activities = df.groupby('resource')['activity'].value_counts()
    
    # Filter the usual activities based on a threshold
    usual_activities = usual_activities[usual_activities > threshold].reset_index(name='count').groupby('resource')['activity'].apply(set)

    # Track the activities of each resource in the log
    for case in df['case_id'].unique():
        case_df = df[df['case_id'] == case]
        for idx, row in case_df.iterrows():
            resource = row['resource']
            activity = row['activity']

            # If a resource executes an activity that is not in their usual set, flag it
            if activity not in usual_activities.get(resource, set()):
                print(f'Deviation found: Resource {resource} performed unusual activity {activity} in case {case}')



In [4]:
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


def find_high_rework_resources(data, rework_threshold=1, count_threshold=4):
    data['activity'] = data['activity'] + "_" + data['transition']

    # Count unique resources for each case-activity combination
    task_counts = data.groupby(['case_id', 'activity'])['resource'].nunique()

    # Filter tasks where count > 1, meaning they are performed by more than one resource within the case
    repeated_tasks = task_counts[task_counts > rework_threshold]

    # Filter the original data to only include tasks identified as repeated
    high_rework = data[data.set_index(['case_id', 'activity']).index.isin(repeated_tasks.index)]

    # Collect all timestamps for each case-activity-resource combination
    rework_resources = high_rework.groupby(['case_id', 'activity', 'resource']).size().reset_index(name='Count')
    df_more_than_3 = rework_resources[rework_resources['Count'] > count_threshold]
    resource_list = df_more_than_3['resource'].unique().tolist()

    return resource_list



In [5]:
from collections import defaultdict
import pandas as pd
from flask import render_template, Flask, Blueprint
import warnings
from src.get_data import get_res_act_relation

resource_based_analysis_app = Blueprint('resource_based_analysis_app', __name__)


@resource_based_analysis_app.route('/resource_performance')
def resource_performance(df):
    """
    Find the most efficient resource and the least efficient resource for each activity
    :param df: dataframe with activity, resource, and execution time
    :return:
    two dataframes: 1. the least efficient resource for each activity and their average execution time 2. the most
    efficient resource for each activity and their average execution time
    """
    # calculate the total execution time and times of each activity executed by each resource
    avg_time_df = df.groupby(["source_act", "source_resource"]).agg({"time_between": ["sum", "count"]}).reset_index()
    avg_time_df.columns = ["source_act", "source_resource", "total_time", "count"]
    # calculate the average execution time for each resource of each activity
    avg_time_df["avg_time"] = avg_time_df["total_time"] / avg_time_df["count"]
    # the least efficient
    least_efficient = avg_time_df[["source_act", "source_resource", "avg_time"]].groupby(["source_act"],
                                                                                         as_index=False).max()
    least_efficient.columns = ["activity", "the least efficient resource", "avg_execution_time(min)"]
    # the most efficient
    most_efficient = avg_time_df[["source_act", "source_resource", "avg_time"]].groupby(["source_act"],
                                                                                        as_index=False).min()
    most_efficient.columns = ["activity", "the most efficient resource", "avg_execution_time(min)"]

    return least_efficient, most_efficient


@resource_based_analysis_app.route('/batch_identification')
def batch_identification(df, activities):
    """
    Identify all possible batches for al available combination of resources and activities
    :param df: data frame with case_id, activity, resource, start_at, end_at
    :param activities: list of all kinds of activities in df
    :return: batches
    """

    warnings.filterwarnings('ignore')
    res_act_dict = get_res_act_relation(df, activities)
    batches = defaultdict(dict)
    group_res_act = df.groupby(["resource", "activity"])

    for a in activities:
        for r in res_act_dict[a]:

            data_res_act = group_res_act.get_group((r, a))
            data_res_act["start_at"] = data_res_act["start_at"].round("T")
            data_res_act["end_at"] = data_res_act["end_at"].round("T")
            data_res_act = data_res_act.sort_values(by=["start_at", "end_at"])

            sim = []
            seq = []
            con = []
            invalid = []

            data = data_res_act.values

            for row1 in data:
                for row2 in data:
                    if sum(row1 != row2) == 0:
                        continue
                    if (row1[3] == row2[3]) and (row1[4] == row2[4]):
                        sim.append(row1)
                    elif (row1[3] == row2[4]) or (row1[4] == row2[3]):
                        seq.append(row1)
                    elif (row1[4] < row2[3]) or (row2[4] < row1[3]):
                        invalid.append(row1)
                    else:
                        con.append(row2)
            df_sim = pd.DataFrame(sim,
                                  columns=["case_id", "activity", "resource", "start_at", "end_at"]).drop_duplicates()
            df_seq = pd.DataFrame(seq,
                                  columns=["case_id", "activity", "resource", "start_at", "end_at"]).drop_duplicates()
            df_con = pd.DataFrame(con,
                                  columns=["case_id", "activity", "resource", "start_at", "end_at"]).drop_duplicates()
            if not df_sim.empty:
                batches[(a, r)]["Simultaneous"] = df_sim
            if not df_seq.empty:
                batches[(a, r)]["Sequential"] = df_seq
            if not df_con.empty:
                batches[(a, r)]["Concurrent"] = df_con

    return batches


In [6]:
from pycelonis.pql import PQLColumn

from src.celonis_data_integration import execute_PQL_query, get_connection, check_invalid_table_in_celonis,get_celonis_info

def get_caseid_activity_lifecycle_resource(data_model, table_name, case_column, activity_column, resource_column,
                                       lifecycle_column):
    columns = [PQLColumn(name="case_id", query=f'SOURCE("{table_name}"."{case_column}")'),
               PQLColumn(name="activity", query=f'SOURCE("{table_name}"."{activity_column}")'),
               PQLColumn(name="transition", query=f'SOURCE("{table_name}"."{lifecycle_column}")'),
               PQLColumn(name="resource", query=f'SOURCE("{table_name}"."{resource_column}")')
              ]
    res = execute_PQL_query(data_model, columns)
    return res


In [7]:
def z_score(x,m,s):
    """
    z_score: A function that returns z_score.
    """
    if x==m:
        return 0
    else:
        return abs((x-m)/s)

def temporal_deviation_cost_function(x,w,phi,k,a,b,c):
    """
    temporal_deviation_cost_function: A function which calculates temporal deviation cost function.
    """

    z=z_score(x,a,b)
    if c | z<k:
        return 0
    else:
        return z*w*phi

def get_z_score(task_duration, time_distance, temp_profile_dur, temp_profile_dis, k):
    """
    get_z_score
        Args:
            task_duration: A dataframe which contains case_id, activity, task_duration. Return value of a function 'get_task_duration_time_distance'.
            time_distance: A dataframe which contains case_id, activity, time_distance. Return value of a function 'get_task_duration_time_distance'.
            temp_profile_dur: A dataframe which contains max, min, mean, stdev, var of a task durations. Return value of a function 'calculate_temporal_profile'.
            temp_profile_dis: A dataframe which contains max, min, mean, stdev, var of a time_distances. Return value of a function 'calculate_temporal_profile'.
            k: An integer which determines the threshold of a z-score.
        Returns:
            normal_dur: A list with task durations which are not anomaly.
            anomaly_dur: A list with task durations which are anomaly.
            normal_dis: A list with time_distances which are not anomaly.
            anomaly_dis: A list with time_distances which are anomaly.
    """

    normal_dur=list()
    anomaly_dur=list()
    normal_dis=list()
    anomaly_dis=list()
    
    #find anomalies in task_duration
    for index,line in task_duration.iterrows():

        x=line['task_duration(min)']
        y=temp_profile_dur.loc[temp_profile_dur['Activity']==line['start_activity']]
        #print(line)
        #print('----')
        #print(type(y))
        #print(y)
        #print((y['mean_task_duration(min)']))
        
        #dd=y.iloc[0,3]
        #print(dd)

        if z_score(x,y.iloc[0,3],y.iloc[0,4])>k:
            anomaly_dur.append(line)
        else:
            normal_dur.append(line)
    
    #find anomalies in time_distance
    for index,line in time_distance.iterrows():

        x=line['time_distance(min)']
        y=temp_profile_dis.loc[(temp_profile_dis['Start_activity']==line['start_activity']) & (temp_profile_dis['End_activity']==line['end_activity'])]
        #print(y)
        #print(y.iloc[0,4])
        
        if z_score(x,y.iloc[0,4],y.iloc[0,5])>k:
            anomaly_dis.append(line)
        else:
            normal_dis.append(line)
        
    return normal_dur,anomaly_dur,normal_dis,anomaly_dis

In [8]:
from src.celonis_data_integration import get_connection, get_celonis_info, check_invalid_table_in_celonis
from src.get_data import get_execution_time_per_res_per_act
from src.resource_based_analysis import resource_performance

# connect to Celonis
#celonis = get_connection()
file = open("./config.yaml")
config = yaml.safe_load(file)
celonis_url = config["celonis"]["base_url"]
celonis_api_token = config["celonis"]["api_token"]
celonis = pycelonis.get_celonis(base_url=celonis_url, api_token=celonis_api_token,
                                        key_type="APP_KEY", permissions=False)

print(celonis)
# get the data pool and data model of our project
#data_pool, data_model, pool_name, model_name, case_column_name, act_column_name, time_column_name, res_column_name ,
#x = get_celonis_info(celonis=celonis)
pool_name = config["data_pool"]

model_name = config["data_model"]

case_column_name, act_column_name, time_column_name, res_column_name, lifecycle_column_name = config["case_column_name"], config[
        "activity_column_name"], \
        config["timestamp_column_name"], config["resource_column_name"], config["lifecycle_column_name"]
data_pool = celonis.data_integration.get_data_pools().find(pool_name)
data_model = data_pool.get_data_models().find(model_name)
# check if one table is invalid (does not exist in our data pool/model)
#flag = check_invalid_table_in_celonis(celonis, table="receipt")
flag=0
# One example of getting data of table receipt from our data pool
if not flag:
    df = get_execution_time_per_res_per_act(data_model, "reviewing", case_column_name, act_column_name, res_column_name, time_column_name)
    
# One example: get the least and the most efficient resource of above data
least, most = resource_performance(df)

print(least)
print(most)



Your PyCelonis Version 2.2.0 is outdated (Newest Version: 2.3.0). Please upgrade the package via: pip install --extra-index-url=https://pypi.celonis.cloud/ pycelonis pycelonis_core --upgrade


<pycelonis.celonis.Celonis object at 0x7f8788ce3be0>


0it [00:00, ?it/s]

                      activity the least efficient resource  \
0                       accept                         Mike   
1              collect reviews                         Mike   
2                       decide                          Wil   
3                 get review 1                         Sara   
4                 get review 2                         Sara   
5                 get review 3                         Sara   
6                 get review X                         Sara   
7   invite additional reviewer                         Mike   
8             invite reviewers                         Mike   
9                       reject                         Mike   
10                  time-out 1                  __INVALID__   
11                  time-out 2                  __INVALID__   
12                  time-out 3                  __INVALID__   
13                  time-out X                  __INVALID__   

    avg_execution_time(min)  
0               1285.714

In [9]:
res_task_duration, res_time_distance = get_task_duration_time_distance(data_pool, data_model, "reviewing", case_column_name, act_column_name, time_column_name,lifecycle_column_name)
print(res_task_duration)
print(res_time_distance)

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

     case_id              start_activity                end_activity  \
0          1            invite reviewers            invite reviewers   
1          1                      decide                      decide   
2          1  invite additional reviewer  invite additional reviewer   
3          1                      decide                      decide   
4          1  invite additional reviewer  invite additional reviewer   
...      ...                         ...                         ...   
1077      99  invite additional reviewer  invite additional reviewer   
1078      99                      decide                      decide   
1079      99  invite additional reviewer  invite additional reviewer   
1080      99                      decide                      decide   
1081      99                      accept                      accept   

     start_life  end_life  task_duration(min)  
0         start  complete              7200.0  
1         start  complete              

In [10]:
print(res_task_duration.empty)
data_pool_table = data_pool.get_tables().find(f'reviewing_time_distance')
print(data_pool_table)

False
name='reviewing_time_distance' loader_source=None available=False data_source_id=None data_source_name=None columns=[] type_=<PropertyType.TABLE: 'TABLE'> schema_name='6292220c-f2c1-4f8c-a6b3-bd77585b8e7a' client=<pycelonis_core.client.client.Client object at 0x7f8788c93a00> data_pool_id='6292220c-f2c1-4f8c-a6b3-bd77585b8e7a'


In [11]:
res_dur, res_dis=calculate_temporal_profile(data_model, "reviewing", "overall", case_column_name, mainstream_case_id=None)
print(res_dur)
print(res_dis)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

                     Activity  max_task_duration(min)  min_task_duration(min)  \
0                      accept                 14400.0                  2880.0   
1             collect reviews                 17280.0                  1440.0   
2                      decide                 15840.0                  1440.0   
3  invite additional reviewer                 11520.0                     0.0   
4            invite reviewers                  7200.0                  1440.0   
5                      reject                  7200.0                  1440.0   

   mean_task_duration(min)  stdev_task_duration(min)  var_task_duration(min)  
0                  5920.00                   3411.04             11635200.00  
1                  4695.65                   2586.02              6687492.58  
2                  4504.46                   2111.06              4456556.44  
3                  4097.39                   1999.28              3997108.92  
4                  3748.57           

In [12]:
normal_dur,anomaly_dur,normal_dis,anomaly_dis=get_z_score(res_task_duration,res_time_distance,res_dur,res_dis,2)
#print(normal_dur)
#print(normal_dis)
print(anomaly_dur)

print(len(anomaly_dur))

print(anomaly_dis)
print('len')
print(len(anomaly_dis))

[case_id                      3
start_activity          accept
end_activity            accept
start_life               start
end_life              complete
task_duration(min)     14400.0
Name: 271, dtype: object, case_id                                       38
start_activity        invite additional reviewer
end_activity          invite additional reviewer
start_life                                 start
end_life                                complete
task_duration(min)                       11520.0
Name: 360, dtype: object, case_id                     53
start_activity          decide
end_activity            decide
start_life               start
end_life              complete
task_duration(min)     14400.0
Name: 512, dtype: object, case_id                     75
start_activity          decide
end_activity            decide
start_life               start
end_life              complete
task_duration(min)     15840.0
Name: 770, dtype: object, case_id                            86
start