In [1]:
cd ..

/home/avani.gupta/bpirl2


In [2]:
import pandas as pd
import numpy as np
import random
import utils

In [3]:
def get_trace_len(df):
    """
    Summarise the trace lengths (number of events per case) of an event log.

    Counts how many rows share each value of the 'CaseID' column and returns
    the tuple (max, min, mean) taken over those per-case counts.
    """
    events_per_case = {}
    for case_id in df['CaseID']:
        events_per_case[case_id] = events_per_case.get(case_id, 0) + 1
    lengths = np.array(list(events_per_case.values()))
    return lengths.max(), lengths.min(), lengths.mean()

In [4]:
def analyse_dataset(df):
    """
    Print summary statistics for an event-log dataframe.

    Expects df to have the columns [CaseID, ActivityID, CompleteTimestamp],
    with one row per event:
    CaseID: id of the process instance the event belongs to (unique per instance)
    ActivityID: id of the activity (unique per activity)
    CompleteTimestamp: time of completion of the event (not read by this function)

    Prints, in order: the number of events, the number of cases, the number of
    unique activities, and the max / min / mean trace length.
    """
    event_count = len(df)
    print("num events:", event_count)

    case_count = len(df["CaseID"].unique())
    print("num cases:", case_count)

    # The activity list is the sorted distinct ids with an extra 0 placed in
    # front, so the printed count is len(distinct ids) + 1.
    # NOTE(review): presumably 0 is a reserved/placeholder id - confirm.
    activity_ids = [0]
    activity_ids.extend(sorted(df['ActivityID'].unique()))
    print("num unique activities:", len(activity_ids))

    longest, shortest, average = get_trace_len(df)
    print("max {} min {} mean {}".format(longest, shortest, average))
    
    


In [5]:
# Print the summary statistics of every raw event log stored under dataset/.
path = "dataset/"
# "traffic" is the original traffic dataset; "traffic_ss" is a 10% sampled version of it.
datasets = ["helpdesk", "bpi_12_w", "traffic_ss"]
for dataset in datasets:
    csv_file = "{}{}.csv".format(path, dataset)
    df = pd.read_csv(csv_file)
    print(dataset)
    analyse_dataset(df)
    print("\n")

helpdesk
num events: 13710
num cases: 3804
num unique activities: 10
max 14 min 1 mean 3.6041009463722395


bpi_12_w
num events: 72413
num cases: 9658
num unique activities: 7
max 74 min 1 mean 7.497722095671982


traffic_ss
num events: 56388
num cases: 15037
num unique activities: 12
max 17 min 2 mean 3.7499501230298597




In [6]:
# Compare the summary statistics of the two helpdesk CSV files.
path = "dataset/"
datasets = ["helpdesk_converted","helpdesk"]
for dataset in datasets:
    # The two files are loaded identically except for the delimiter:
    # helpdesk_converted.csv is read with ';', helpdesk.csv with the default ','.
    # Choosing the delimiter up front avoids duplicating the load/print/analyse steps.
    df = pd.read_csv(path+dataset+".csv", sep=';' if dataset == "helpdesk_converted" else ',')
    print(dataset)
    analyse_dataset(df)

helpdesk_converted
num events: 21348
num cases: 4580
num unique activities: 15
max 15 min 2 mean 4.661135371179039
helpdesk
num events: 13710
num cases: 3804
num unique activities: 10
max 14 min 1 mean 3.6041009463722395


In [7]:
# Load the raw helpdesk event log on its own; `path` is whatever an earlier
# cell assigned, so this depends on the cells being run in order.
datasets = ["helpdesk"]

helpdesk_csv = "{}{}.csv".format(path, datasets[0])
df2 = pd.read_csv(helpdesk_csv)

In [8]:
# Display the helpdesk event log loaded into df2 by the previous cell.
df2

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp
0,2,1,2012-04-03 16:55:38
1,2,8,2012-04-03 16:55:53
2,2,6,2012-04-05 17:15:52
3,3,1,2010-10-29 18:14:06
4,3,8,2010-11-04 01:16:11
...,...,...,...
13705,4579,8,2010-07-26 21:31:59
13706,4579,6,2010-07-26 21:32:11
13707,4580,8,2012-01-03 18:33:43
13708,4580,9,2012-01-11 00:30:11


In [18]:
# For each preprocessed test split, print its row count, its name, and how often
# each 'class' value occurs among the last three events of every case.
# Note: this rebinds `path`, so cells that read `path` afterwards see the new value.
path = "dataset/preprocessed/"
# "traffic" is the original traffic dataset; "traffic_ss" is a 10% sampled version of it.
datasets = ["helpdesk", "bpi_12_w", "traffic_ss"]
for dataset in datasets:
    pickle_file = "{}{}_d2_test_RL.pkl".format(path, dataset)
    df = pd.read_pickle(pickle_file)
    print(len(df))
    print(dataset)
    # class value -> number of occurrences within the final three rows of each case
    lastk = {}
    for case_id, case_rows in df.groupby('CaseID'):
        for label in case_rows['class'].tail(3):
            lastk[label] = lastk.get(label, 0) + 1
    print(lastk)

2759
helpdesk
{1: 604, 8: 788, 6: 790, 9: 170, 2: 5, 5: 1}
16251
bpi_12_w
{3: 1280, 4: 708, 6: 1008, 5: 1700, 1: 125}
10815
traffic_ss
{3: 1419, 4: 1480, 5: 1422, 1: 1659, 2: 501, 6: 1161, 7: 57, 8: 59, 11: 3, 9: 5, 10: 4}


In [10]:
# Display `df`, i.e. whatever the most recently executed loading cell assigned.
# NOTE(review): the recorded output has feature columns 0-9 only, while the loop
# above finishes on traffic_ss, whose tally contains class 11 - the output looks
# stale; re-run the notebook top to bottom to confirm.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,duration_time,remaining_time,class,CaseID
20,0,0,0,1,0,0,0,0,0,0,0.000000,18.106516,3,9
21,0,1,0,0,0,0,0,0,0,0,0.000150,18.106366,1,9
22,0,0,0,0,0,0,0,0,1,0,16.891597,1.214769,8,9
23,0,0,0,0,0,0,1,0,0,0,1.214769,0.000000,6,9
24,0,1,0,0,0,0,0,0,0,0,0.000000,36.997836,1,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13676,0,0,0,0,0,0,0,0,1,0,22.169537,0.000104,8,4569
13677,0,0,0,0,0,0,1,0,0,0,0.000104,0.000000,6,4569
13699,0,1,0,0,0,0,0,0,0,0,0.000000,19.709271,1,4577
13700,0,0,0,0,0,0,0,0,1,0,19.709213,0.000058,8,4577


In [12]:
# Display the class tally left in `lastk` by the most recent run of the loop above.
# NOTE(review): the recorded output matches none of the three tallies printed by
# that loop, so it was probably produced from different kernel state - re-run to confirm.
lastk

{1: 405, 8: 787, 6: 789, 9: 360, 4: 4, 2: 16, 5: 2, 7: 1}

In [13]:
from utils import get_avg_duration

In [14]:
# Print, for every dataset, the value returned by utils.get_avg_duration.
datasets = ["helpdesk", "bpi_12_w", "traffic_ss"]
for dataset in datasets:
    avg_duration = get_avg_duration(dataset)
    print(dataset, avg_duration)

helpdesk 8.490949396798573
bpi_12_w 15.718907135529657
traffic_ss 342.4085792697478
