# Analyse clusters of requests for payment

In [1]:
# Imports
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load log
import sys
# Add the parent directory to the module search path so that the `src`
# package imported on the next line can be resolved from this notebook.
sys.path.insert(1, '../')
from src.io import PRE, read_log

# NOTE(review): PRE presumably identifies which log file to read — confirm in src/io.
ptc_log = read_log(PRE)

HBox(children=(FloatProgress(value=0.0, description='parsing log, completed traces :: ', max=2007.0, style=Pro…




In [3]:
def analyze_request(request):
    """Summarise the events of a single request trace.

    Args:
        request: iterable of events; each event must support
            ``event["concept:name"]`` and yield a string.

    Returns:
        A ``(no_submissions, reimbursed)`` tuple: the number of events named
        "request for payment submitted by employee" and whether any event is
        named "payment handled". Names are compared case-insensitively.
    """
    # Normalise every activity name once so both checks share the same list.
    activity_names = [event["concept:name"].lower() for event in request]

    submission_count = activity_names.count("request for payment submitted by employee")
    was_reimbursed = any(name == "payment handled" for name in activity_names)

    return (submission_count, was_reimbursed)


def extract_info_from_request(request):
    """Flatten one request trace into a plain dict (one future dataframe row).

    The event-derived fields ('submissions', 'reimbursed') come from
    ``analyze_request``; all other fields are read from the first event of
    the trace, which carries the "(case)_*" attributes.

    Args:
        request: indexable, iterable sequence of events (must be non-empty,
            since ``request[0]`` is read).

    Returns:
        dict with keys 'id', 'amount', 'permit_budget', 'org', 'permit_org',
        'project', 'permit_project', 'requested_budget', 'permit',
        'submissions', 'reimbursed'. 'amount' and 'requested_budget' are
        converted to float.
    """
    submission_count, was_reimbursed = analyze_request(request)

    # Case-level attributes are taken from the first event only.
    first_event = request[0]

    return {
        'id': first_event["(case)_Rfp_id"],
        'amount': float(first_event["(case)_RequestedAmount"]),
        'permit_budget': first_event["(case)_Permit_BudgetNumber"],
        'org': first_event["(case)_OrganizationalEntity"],
        'permit_org': first_event["(case)_Permit_OrganizationalEntity"],
        'project': first_event["(case)_Project"],
        'permit_project': first_event["(case)_Permit_ProjectNumber"],
        'requested_budget': float(first_event["(case)_Permit_RequestedBudget"]),
        'permit': first_event["(case)_Permit_id"],
        'submissions': submission_count,
        'reimbursed': was_reimbursed,
    }

In [4]:
# Build one row per request trace.
# Schema: [{'id', 'amount', 'permit_budget', 'org', 'permit_org', 'project', 'permit_project', 'requested_budget', 'permit', 'submissions', 'reimbursed'}]
requests = [extract_info_from_request(request) for request in ptc_log]

ptc_dataframe = pd.DataFrame(requests)
# set_index returns a new frame and the result is not assigned: it is only
# displayed here, keyed by id. ptc_dataframe itself keeps its default index.
ptc_dataframe.set_index('id', drop=False)


Unnamed: 0_level_0,id,amount,permit_budget,org,permit_org,project,permit_project,requested_budget,permit,submissions,reimbursed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
request for payment 73550,request for payment 73550,854.579838,budget 6198,organizational unit 65463,organizational unit 65455,project 503,UNKNOWN,1979.272104,travel permit 73549,1,False
request for payment 73552,request for payment 73552,854.579838,budget 6198,organizational unit 65463,organizational unit 65455,project 503,UNKNOWN,1979.272104,travel permit 73549,1,True
request for payment 76316,request for payment 76316,1173.957795,budget 2233,organizational unit 65463,organizational unit 65455,project 503,UNKNOWN,3553.601973,travel permit 76314,1,True
request for payment 73536,request for payment 73536,790.552073,budget 899,organizational unit 65463,organizational unit 65455,project 503,project 2260,3627.364438,travel permit 73535,1,True
request for payment 76195,request for payment 76195,181.978003,budget 4164,organizational unit 65463,organizational unit 65455,project 503,UNKNOWN,265.661318,travel permit 76193,1,True
...,...,...,...,...,...,...,...,...,...,...,...
request for payment 186317,request for payment 186317,251.008620,UNKNOWN,organizational unit 65456,UNKNOWN,project 151278,UNKNOWN,0.000000,UNKNOWN,1,True
request for payment 186241,request for payment 186241,261.459608,UNKNOWN,organizational unit 65468,UNKNOWN,project 147556,UNKNOWN,0.000000,UNKNOWN,1,True
request for payment 48748,request for payment 48748,1511.596873,budget 1755,organizational unit 65469,organizational unit 65454,project 503,project 48747,1827.310143,travel permit 48745,1,False
request for payment 186247,request for payment 186247,20.202607,UNKNOWN,organizational unit 65463,UNKNOWN,project 147531,UNKNOWN,0.000000,UNKNOWN,1,True


In [5]:
# Fraction of requests whose trace contains a "payment handled" event
# (mean of the boolean 'reimbursed' flag).
ptc_dataframe.loc[:, 'reimbursed'].mean()

0.9511709018435476

In [6]:

# Per organisational unit: mean requested amount, mean number of submissions,
# reimbursement rate and number of requests, largest groups first.
org_group = ptc_dataframe.groupby(['org'])
(
    org_group
    .agg({'amount': 'mean', 'submissions': 'mean', 'reimbursed': ['mean', 'size']})
    .sort_values(by=('reimbursed', 'size'), ascending=False)
)

Unnamed: 0_level_0,amount,submissions,reimbursed,reimbursed
Unnamed: 0_level_1,mean,mean,mean,size
org,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
organizational unit 65461,686.843526,1.125382,0.957187,327
organizational unit 65463,761.4625,1.020202,0.942761,297
organizational unit 65458,856.013544,1.027778,0.958333,288
organizational unit 65454,621.400451,1.104478,0.973881,268
organizational unit 65469,749.675184,1.106481,0.953704,216
organizational unit 65462,835.325322,1.111111,0.9375,144
organizational unit 65456,514.093513,1.117117,0.981982,111
organizational unit 65465,611.221295,1.04902,0.901961,102
organizational unit 65468,18526.409722,1.073171,0.926829,82
organizational unit 65457,787.445716,1.189655,0.965517,58


In [7]:
# Per project: mean requested amount, mean number of submissions,
# reimbursement rate and number of requests, largest groups first.
project_group = ptc_dataframe.groupby(['project'])
(
    project_group
    .agg({'amount': 'mean', 'submissions': 'mean', 'reimbursed': ['mean', 'size']})
    .sort_values(by=('reimbursed', 'size'), ascending=False)
)

Unnamed: 0_level_0,amount,submissions,reimbursed,reimbursed
Unnamed: 0_level_1,mean,mean,mean,size
project,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
project 503,1520.142604,1.092593,0.962963,1890
UNKNOWN,591.306033,0.814815,0.0,27
project 147556,257.043428,1.066667,1.0,15
project 147546,136.153187,1.0,1.0,8
project 148052,1004.78868,1.125,1.0,8
project 147620,410.579192,1.125,1.0,8
project 147531,306.597268,1.0,1.0,7
project 151278,393.610495,1.0,1.0,6
project 147582,117.894452,1.0,0.8,5
project 147649,102.763809,1.0,1.0,4
