# Analyse clusters of international declarations

In [1]:
# Imports
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load log
# Add the parent directory to the module search path before importing the
# project-local `src` package (the import below relies on this ordering).
import sys
sys.path.insert(1, '../')
from src.io import INT_DEC, read_log

# Parse the log referenced by INT_DEC; later cells iterate over it one
# declaration (trace) at a time.
# NOTE(review): presumably INT_DEC identifies the international-declarations
# log file — confirm in src.io.
id_log = read_log(INT_DEC)

HBox(children=(FloatProgress(value=0.0, description='parsing log, completed traces :: ', max=6323.0, style=Pro…




In [3]:
def analyze_declaration(declaration):
    """Derive trip and process statistics from the events of one declaration.

    Parameters
    ----------
    declaration : iterable of mapping
        The events of a single declaration. Every event must provide
        ``"concept:name"`` (the activity label, matched case-insensitively).
        "start trip" and "end trip" events must additionally provide
        ``"time:timestamp"``.

    Returns
    -------
    tuple
        ``(trip_length, no_submissions, reimbursed)`` where

        * ``trip_length`` is the ``.days`` component of
          ``trip_end - trip_start``, or ``np.nan`` when either event is
          missing. If an activity occurs several times, the last occurrence
          in iteration order is used.
        * ``no_submissions`` counts "declaration submitted by employee"
          events.
        * ``reimbursed`` is ``True`` when a "payment handled" event occurs.
    """
    trip_start = None
    trip_end = None
    no_submissions = 0
    reimbursed = False

    for event in declaration:
        activity_type = event["concept:name"].lower()
        if activity_type == "start trip":
            trip_start = event["time:timestamp"]
        elif activity_type == "end trip":
            trip_end = event["time:timestamp"]
        elif activity_type == "declaration submitted by employee":
            no_submissions += 1
        elif activity_type == "payment handled":
            reimbursed = True

    # Test for presence with `is not None` rather than truthiness, so that a
    # valid but falsy timestamp value is not mistaken for a missing one.
    if trip_start is not None and trip_end is not None:
        trip_length = (trip_end - trip_start).days
    else:
        trip_length = np.nan

    return (trip_length, no_submissions, reimbursed)


def extract_info_from_declaration(declaration):
    """Flatten one declaration into a single record (dict).

    The trip statistics come from ``analyze_declaration``; all other fields
    are read from the first event of the declaration (``declaration[0]``).
    The two monetary fields (``amount`` and ``requested_budget``) are
    converted with ``float``; every other attribute is passed through as-is.

    Returns a dict with the keys, in order: id, amount, budget,
    permit_budget, activity, org, project, requested_budget, task, permit,
    travel_time, submissions, reimbursed.
    """
    travel_time, submissions, was_reimbursed = analyze_declaration(declaration)

    first_event = declaration[0]

    def case_value(key, convert=None):
        # Read one attribute from the first event, optionally converting it.
        raw = first_event[key]
        return raw if convert is None else convert(raw)

    # Fields are filled in the same order the resulting columns should have.
    record = {}
    record['id'] = case_value("(case)_id")
    record['amount'] = case_value("(case)_Amount", float)
    record['budget'] = case_value("(case)_BudgetNumber")
    record['permit_budget'] = case_value("(case)_Permit_BudgetNumber")
    record['activity'] = case_value("(case)_Permit_ActivityNumber")
    record['org'] = case_value("(case)_Permit_OrganizationalEntity")
    record['project'] = case_value("(case)_Permit_ProjectNumber")
    record['requested_budget'] = case_value("(case)_Permit_RequestedBudget", float)
    record['task'] = case_value("(case)_Permit_TaskNumber")
    record['permit'] = case_value("(case)_Permit_id2")
    record['travel_time'] = travel_time
    record['submissions'] = submissions
    record['reimbursed'] = was_reimbursed
    return record

In [4]:
# Build one flat record per declaration in the log and collect them in a frame.
# Record keys (= DataFrame columns): id, amount, budget, permit_budget, activity,
# org, project, requested_budget, task, permit, travel_time, submissions, reimbursed
declarations = [extract_info_from_declaration(trace) for trace in id_log]

id_dataframe = pd.DataFrame(declarations)
# Display only: set_index returns a new frame and the result is not assigned,
# so id_dataframe itself keeps its default integer index.
id_dataframe.set_index('id', drop=False)


Unnamed: 0_level_0,id,amount,budget,permit_budget,activity,org,project,requested_budget,task,permit,travel_time,submissions,reimbursed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
declaration 72590,declaration 72590,14.569044,budget 143921,budget 2233,UNKNOWN,organizational unit 65455,UNKNOWN,245.060103,UNKNOWN,travel permit 72588,2,2,True
declaration 143637,declaration 143637,709.704557,budget 143641,budget 425,UNKNOWN,organizational unit 65454,project 426,923.081315,task 427,travel permit 423,3,1,True
declaration 74628,declaration 74628,0.000000,budget 143485,budget 635,UNKNOWN,organizational unit 65458,project 3407,0.000000,task 427,travel permit 74626,6,0,False
declaration 143644,declaration 143644,399.691001,budget 143648,budget 425,UNKNOWN,organizational unit 65454,project 426,923.081315,task 427,travel permit 423,3,3,True
declaration 72817,declaration 72817,614.080529,budget 143506,budget 974,UNKNOWN,organizational unit 65458,project 54409,1015.978607,task 427,travel permit 72815,3,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
declaration 34470,declaration 34470,385.909918,budget 147444,budget 698,UNKNOWN,organizational unit 65458,project 12435,409.670826,task 427,travel permit 34468,1,1,True
declaration 23807,declaration 23807,268.754590,budget 147442,budget 980,UNKNOWN,organizational unit 65456,project 981,258.261951,task 427,travel permit 23805,1,1,True
declaration 15963,declaration 15963,389.275259,budget 147381,budget 922,UNKNOWN,organizational unit 65458,project 12152,798.420236,task 427,travel permit 15961,2,1,True
declaration 24958,declaration 24958,96.357465,budget 147442,budget 864,UNKNOWN,organizational unit 65454,UNKNOWN,55.271331,UNKNOWN,travel permit 24956,0,1,True


In [5]:
# Print the frame's index type, per-column dtypes and non-null counts.
id_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6323 entries, 0 to 6322
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                6323 non-null   object 
 1   amount            6323 non-null   float64
 2   budget            6323 non-null   object 
 3   permit_budget     6323 non-null   object 
 4   activity          6323 non-null   object 
 5   org               6323 non-null   object 
 6   project           6323 non-null   object 
 7   requested_budget  6323 non-null   float64
 8   task              6323 non-null   object 
 9   permit            6323 non-null   object 
 10  travel_time       6323 non-null   int64  
 11  submissions       6323 non-null   int64  
 12  reimbursed        6323 non-null   bool   
dtypes: bool(1), float64(2), int64(2), object(8)
memory usage: 599.1+ KB


In [6]:
# Summary statistics per organisational unit, largest units first.
org_aggregations = {
    'amount': 'mean',
    'travel_time': ['mean', 'median', 'max', 'min'],
    'submissions': 'mean',
    'reimbursed': ['mean', 'size']
}
org_group = id_dataframe.groupby(['org'])
org_summary = org_group.agg(org_aggregations)
# ('reimbursed', 'size') is the number of rows in each group.
org_summary.sort_values(by=('reimbursed', 'size'), ascending=False)

Unnamed: 0_level_0,amount,travel_time,travel_time,travel_time,travel_time,submissions,reimbursed,reimbursed
Unnamed: 0_level_1,mean,mean,median,max,min,mean,mean,size
org,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
organizational unit 65458,818.656975,17.186047,5.0,364,0,1.224101,0.978858,1419
organizational unit 65455,594.881918,10.071361,4.0,364,0,1.128449,0.970504,1051
organizational unit 65454,820.58335,4.237592,3.0,120,0,1.269271,0.952482,947
organizational unit 65456,801.302783,6.122402,4.0,364,0,1.34873,0.944573,866
organizational unit 65459,892.717422,8.973361,5.0,124,0,1.344262,0.940574,488
organizational unit 65460,912.633768,6.496788,4.0,228,0,1.229122,0.96788,467
organizational unit 65457,678.705707,7.094595,4.0,274,0,1.219595,0.952703,296
organizational unit 65464,674.776625,5.438356,4.0,176,0,1.359589,0.958904,292
organizational unit 65466,872.319099,13.63981,5.0,307,0,1.436019,0.947867,211
organizational unit 65461,409.996261,1.961039,2.0,6,0,1.220779,0.974026,77


In [7]:
# Summary statistics per project, largest projects first.
project_aggregations = {
    'amount': 'mean',
    'travel_time': ['mean', 'median', 'max', 'min'],
    'submissions': 'mean',
    'reimbursed': ['mean', 'size']
}
project_group = id_dataframe.groupby(['project'])
project_summary = project_group.agg(project_aggregations)
# ('reimbursed', 'size') is the number of rows in each group.
project_summary.sort_values(by=('reimbursed', 'size'), ascending=False)

Unnamed: 0_level_0,amount,travel_time,travel_time,travel_time,travel_time,submissions,reimbursed,reimbursed
Unnamed: 0_level_1,mean,mean,median,max,min,mean,mean,size
project,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
UNKNOWN,710.190527,12.799913,4.0,364,0,1.280732,0.959024,2294
project 426,869.242355,2.976190,3.0,4,1,1.166667,0.952381,462
project 3442,682.669724,7.460000,5.0,66,0,1.120000,1.000000,50
project 647,511.960255,8.717949,7.0,32,1,1.051282,0.974359,39
project 636,1060.979131,8.473684,6.0,62,0,1.078947,0.947368,38
...,...,...,...,...,...,...,...,...
project 66881,579.425628,7.000000,7.0,7,7,2.000000,1.000000,1
project 46796,373.366016,5.000000,5.0,5,5,1.000000,1.000000,1
project 26728,352.556652,3.000000,3.0,3,3,1.000000,1.000000,1
project 18190,969.441524,7.000000,7.0,7,7,1.000000,1.000000,1
