### Setup and Import Section ###

In [None]:
# replicateExp.ipynb
import os
import sys # For robust Colab detection

# --- Setup Section ---
# This part of the code takes care of preparing the environment, both on Colab and locally.

# Check if we are on Google Colab
# A more robust way to check if the code is running in Google Colab
IS_COLAB = 'google.colab' in sys.modules

if IS_COLAB:
    print("Running on Google Colab. Setting up environment...")
    # Name of your GitHub repository - IMPORTANT: MAKE SURE THIS MATCHES YOUR REPO NAME
    repo_name = "REMEMBER" # This should be the actual name of your repository folder
    # Ensure the URL is correct for your repository
    repo_url = f"https://github.com/bistrulli/{repo_name}.git"

    # Clone the repository if the directory doesn't already exist
    if not os.path.exists(repo_name):
        print(f"Cloning repository '{repo_name}' from {repo_url}...")
        !git clone {repo_url}
        if os.path.exists(repo_name):
            print(f"Repository '{repo_name}' cloned successfully.")
        else:
            print(f"Error: Cloning failed or repository directory not found as '{repo_name}'.")
            # Attempt to list current directory contents to help debug
            print("Current directory contents:")
            !ls -a
    else:
        print(f"Repository directory '{repo_name}' already exists. Skipping clone.")

    # Change to the repository directory if it exists
    if os.path.isdir(repo_name):
        os.chdir(repo_name)
        print(f"Changed directory to: {os.getcwd()}")
    else:
        print(f"Warning: Could not change to directory '{repo_name}'. It does not exist or is not a directory.")
        print(f"Current directory remains: {os.getcwd()}")
else:
    import argparse
    print("Not running on Google Colab (or Colab detection failed). Assuming local environment.")
    # You might want to add any local-specific setup here if needed

# --- End Setup Section ---

print("Installing required packages...")
!pip install -r requirements.txt
print("Package installation attempt complete.")

# Attempt to import pm4py to verify installation, especially on Colab
try:
    import pm4py
    print("pm4py imported successfully.")
except ImportError:
    print("Error: pm4py (or one of its dependencies) could not be imported after installation.")


In [1]:

from vlmcProcessMining import *
import numpy as np
import os
import subprocess
import sys
import json
import re
from scipy.io import savemat
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
import pathlib
import shutil
import tqdm
import time
import glob
import warnings
import xml.parsers.expat
import warnings
from pm4py.objects.log.importer.xes import importer as xes_importer
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.petri_net.obj import PetriNet, Marking
from pm4py.objects.petri_net.importer import importer as pnml_importer
from pm4py.algo.simulation.playout.petri_net import algorithm as simulator
from pm4py.statistics.variants.log import get as variants_module
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.util import dataframe_utils
import gzip
import tempfile
import atexit

import matplotlib.pyplot as plt
from scipy import stats


# --- Directories anchored to the current working directory ---
noisydir=Path.cwd().joinpath("data","noisy_data")
ldir=Path.cwd().joinpath("likelyhood")          # per-log likelihood / uEMSC outputs
enjoySilentDir=Path.cwd().joinpath("data","enjoythesilent")

# --- Directories of the sldpn-reproducibility experiments (relative paths) ---
dataDir=Path("./data/sldpn-reproducibility/experiments/")
trainLogDir=dataDir.joinpath("2-splitlogs")
testLogDir=dataDir.joinpath("2-splitlogstest")
meausreDir=dataDir.joinpath("4-measures")
discoveredStochModel=dataDir.joinpath("3-discoveredstochasticmodels")

# Event logs considered by the experiments; a log file is processed only when
# the part of its file name before the first "." appears in this list.
lognames=[
    'bpic12-a',
    'BPI_Challenge_2013_incidents',
    'BPI_Challenge_2013_open_problems',
    'BPI_Challenge_2013_closed_problems',
    'BPI Challenge 2017 - Offer log',
    'bpic2020-DomesticDeclarations',
    'bpic2020-InternationalDeclarations',
    'bpic2020-PrepaidTravelCost',
    'bpic2020-RequestForPayment',
    'Sepsis',
    'Road_Traffic_Fine_Management_Process',
]

def getVLMCName(inputFile):
    """Return the model name derived from the file name of an event log.

    For a gzip-compressed file (suffix ``.gz``) both the ``.gz`` suffix and the
    extension before it are dropped, e.g. ``Sepsis.xes.gz0.xes.gz`` ->
    ``Sepsis.xes.gz0``. For any other file only the last suffix is dropped.

    Args:
        inputFile: path of the log, as ``str`` or ``Path``.

    Returns:
        The name (``str``) without directory and without extension(s).
    """
    inputFile=Path(inputFile)
    if inputFile.suffix != '.gz':
        return inputFile.stem
    # Path.stem already removed ".gz"; strip the inner extension as well.
    # rsplit keeps the whole stem when there is no inner extension ("foo.gz"
    # -> "foo"), where joining an empty split would yield an empty name.
    return inputFile.stem.rsplit('.',1)[0]
    

### Mine The VLMC From an Event Log ###

In [2]:
def createVLMC(inputFile):
    """Mine a VLMC from an event log and return the time spent mining it.

    The log is first passed to ``readInputFile``; ``mineProcess`` is then
    called on the files named after the log (see ``getVLMCName``) under
    ``<cwd>/data/converted`` and ``<cwd>/data/VLMC``.
    NOTE(review): presumably ``readInputFile`` is what produces the converted
    ``.txt`` file consumed by ``mineProcess`` - confirm in vlmcProcessMining.

    Args:
        inputFile: path of the event log to learn the model from.

    Returns:
        Wall-clock seconds taken by ``mineProcess`` alone (the conversion done
        by ``readInputFile`` is not included).
    """
    readInputFile(inputFile)
    modelName=getVLMCName(inputFile)

    started=time.time()
    miningArgs=dict(
        ecfFile=f"{os.getcwd()}/data/VLMC/{modelName}.ecf",
        infile=f"{os.getcwd()}/data/converted/{modelName}.txt",
        vlmcfile=f"{os.getcwd()}/data/VLMC/{modelName}.vlmc",
        nsim="1",
        ntime="1",
        alfa="1",
    )
    mineProcess(**miningArgs)
    elapsed=time.time()-started

    print(f"VLMC Mined In {elapsed}")
    return elapsed

### Compute Likelihood With VLMC ###

In [3]:
def getVLMCLikelyhood(vlmcName=None,traceFile=None):
    """Run ``getLikelyhood`` for a mined VLMC on a set of traces.

    The working directory handed to ``getLikelyhood`` is
    ``ldir/<part of traceFile's name before the first ".">`` and is created
    when missing.

    Args:
        vlmcName: name of the model; selects ``<vlmcName>.ecf``,
            ``<vlmcName>.vlmc`` and ``<vlmcName>2.vlmc`` under
            ``<cwd>/data/VLMC`` and ``<vlmcName>.txt`` under
            ``<cwd>/data/converted``.
        traceFile: ``Path`` of the traces whose likelihood is computed
            (its ``.name`` attribute is read, so a ``Path`` is required).

    Returns:
        Wall-clock seconds elapsed for the whole call.
    """
    started=time.time()

    def _underCwd(relative):
        # Absolute path of `relative`, resolved against the current directory.
        return pathlib.Path(f"{os.getcwd()}/{relative}").absolute()

    ecfPath=_underCwd(f"data/VLMC/{vlmcName}.ecf")
    learnedFrom=_underCwd(f"data/converted/{vlmcName}.txt")
    outputVlmc=_underCwd(f"data/VLMC/{vlmcName}2.vlmc")
    inputVlmc=_underCwd(f"data/VLMC/{vlmcName}.vlmc")

    workDir=ldir/traceFile.name.split(".")[0]
    workDir.mkdir(parents=True, exist_ok=True)

    getLikelyhood(
        ecfFile=str(ecfPath),
        infile=str(learnedFrom),     # dataset used to learn the VLMC
        vlmc=str(inputVlmc),         # input VLMC to use
        vlmcfile=str(outputVlmc),    # output VLMC
        traces=str(traceFile),       # traces to compute the likelihood of
        cwd=str(workDir),
        outFile="out.mat",
        nsim="1",
        ntime="1",
        alfa="1",
    )
    return time.time()-started

### Preprocess Test Data ###

In [4]:
def prepareTestDate(inputTrace=None):
    """Return the test log in ``testLogDir`` that corresponds to ``inputTrace``.

    The test file name is the name of ``inputTrace`` with ``_test`` inserted
    before its last two dot-separated components
    (``a.xes.gz0.xes.gz`` -> ``a.xes.gz0_test.xes.gz``). When that file does
    not exist yet, the file in ``testLogDir`` carrying the original name is
    renamed to it.

    Args:
        inputTrace: ``Path`` of the training log (only its ``.name`` is used).

    Returns:
        ``Path`` of the test log inside ``testLogDir``.
    """
    originalName=inputTrace.name
    pieces=originalName.split('.')
    head='.'.join(pieces[0:-2])
    tail='.'.join(pieces[-2:])
    renamedTarget=testLogDir/f"{head}_test.{tail}"

    if renamedTarget.is_file():
        return renamedTarget

    # First use of this split: give the test log its "_test" name.
    (testLogDir/originalName).rename(renamedTarget)
    return renamedTarget

### Compute uEMSC With VLMC Likelihood ###

In [5]:
def uEMSCVLMC(inputLan=None,vlmcName=None):
    """Compute the uEMSC from the likelihood files of a log and of its VLMC.

    Both files are looked up in ``ldir/<part of inputLan's name before the
    first ".">`` (created when missing): ``<inputLan name without its last
    extension>.lik`` for the traces and ``<vlmcName>.vlmc.lik`` for the model.

    Args:
        inputLan: ``Path`` of the trace language file (only ``.name`` is used).
        vlmcName: name of the VLMC model.

    Returns:
        ``(uEMSC, seconds)``: the value returned by ``computeMuEMSC`` and the
        wall-clock time of the whole call.
    """
    started=time.time()

    lanName=inputLan.name
    workDir=ldir/lanName.split(".")[0]
    workDir.mkdir(parents=True, exist_ok=True)

    traceLikFile=workDir/(".".join(lanName.split(".")[0:-1])+".lik")
    modelLikFile=workDir/(vlmcName+".vlmc.lik")

    score=computeMuEMSC(traceLik=traceLikFile,modelLik=modelLikFile)
    return score,time.time()-started

### Road Traffic Fines Management Example

In [6]:
def reproduceVLMVExp():
    """Run the VLMC experiment on every training split found in ``trainLogDir``.

    For each file matching ``*.xes.gz[0-9].xes.gz`` whose log name (the part of
    the file name before the first ".") is listed in ``lognames``:

    1. mine the VLMC from the training split (``createVLMC``);
    2. locate the matching test split (``prepareTestDate``) and pass it to
       ``readInputFile``;
    3. compute the likelihood of the test language (``getVLMCLikelyhood``);
    4. compute the uEMSC (``uEMSCVLMC``).

    Results are written to ``ldir/<log name>/``: ``<model>.uemsc`` holds the
    uEMSC and ``<model>.time`` holds mining, likelihood and uEMSC times.
    Files whose log name is not in ``lognames`` are reported and skipped.
    """
    matching_logs = list(trainLogDir.rglob("*.xes.gz[0-9].xes.gz"))
    for inputTrace in matching_logs:
        logName=inputTrace.name.split('.')[0]
        if logName not in lognames:
            print(f"skipped {inputTrace.name}")
            continue

        # Create the model from the training log.
        mctime=createVLMC(inputFile=inputTrace)

        # Process the test traces.
        testTrace=prepareTestDate(inputTrace=inputTrace)
        readInputFile(inputFile=testTrace)

        # Same name createVLMC used for the model files, so they are found again.
        vlmcName=getVLMCName(inputTrace)
        outDir=ldir/logName

        # Likelihood of the test language.
        # NOTE(review): the .lan file is presumably produced by the
        # readInputFile call above - confirm in vlmcProcessMining.
        testLan=outDir/f"{vlmcName}_test_trace.lan"
        liktime=getVLMCLikelyhood(vlmcName=vlmcName,traceFile=testLan)

        # uEMSC of the model on the test language.
        uEMSC,uemsctime=uEMSCVLMC(inputLan=testLan,vlmcName=vlmcName)

        np.savetxt(str(outDir/f"{vlmcName}.uemsc"),[uEMSC])
        np.savetxt(str(outDir/f"{vlmcName}.time"),[mctime,liktime,uemsctime])


### Collect VLMC Results

In [8]:
def collectVLMCRes():
    """Gather every ``*.uemsc`` file found under ``ldir`` into a DataFrame.

    The log name is the part of the file name before the first "."; the split
    index is the number between ``gz`` and ``.uemsc`` in the file name.

    Returns:
        DataFrame with columns ``logname``, ``split`` and ``uemsc``
        (one row per file).
    """
    rows=[]
    for scoreFile in list(ldir.rglob("*.uemsc")):
        scoreFile=Path(scoreFile)
        fileName=scoreFile.name
        logLabel=fileName.split(".")[0]
        splitNumber=int(re.findall(r'gz(\d+)\.uemsc',fileName)[0])
        score=np.loadtxt(scoreFile)
        rows.append([logLabel,splitNumber,score])
    return pd.DataFrame(rows,columns=["logname","split","uemsc"])

In [9]:
def collectVLMCTime():
    """Gather every ``*.time`` file found under ``ldir`` into a DataFrame.

    Each file is expected to hold at least three values, read in order as the
    mining time, the likelihood time and the uEMSC time. The log name is the
    part of the file name before the first "."; the split index is the number
    between ``gz`` and ``.time`` in the file name.

    Returns:
        DataFrame with columns ``logname``, ``split``, ``mctime``,
        ``liktime`` and ``uemsctime`` (one row per file).
    """
    rows=[]
    for timeFile in list(ldir.rglob("*.time")):
        timeFile=Path(timeFile)
        fileName=timeFile.name
        logLabel=fileName.split(".")[0]
        splitNumber=int(re.findall(r'gz(\d+)\.time',fileName)[0])
        # Named `timings` so the `time` module is not shadowed in this scope.
        timings=np.loadtxt(timeFile)
        rows.append([logLabel,splitNumber,timings[0],timings[1],timings[2]])
    return pd.DataFrame(rows,columns=["logname","split","mctime","liktime","uemsctime"])

### Reproduce the Paper's Plots ###

In [None]:
# One box plot per log: the zEMSC values of the baseline techniques listed in
# `btech` for that log, next to the uEMSC values obtained with the VLMC.
# Every figure is saved as ./plots/<logname>.pdf.
# NOTE(review): `btech`, `baselineRes` and `vlmcRes` are not defined in the
# visible cells; they must already exist in the kernel - confirm where they
# are built (e.g. vlmcRes = collectVLMCRes()).
plt.rcParams.update({'font.size': 18})
# Hide the median line (zero width) and draw the mean as a solid orange line.
medianprops = dict(linestyle=None, linewidth=0, color='firebrick')
meanprops = dict(linestyle="-", linewidth=1.5, color='orange')

# savefig does not create missing directories.
os.makedirs('./plots', exist_ok=True)

for lname in lognames:
    bestMethods=btech[(btech["logname"]==lname)]
    x=[]
    names=[]
    # Rows are read by position: bm[0] is matched against "logname",
    # bm[1] against "dis-tech" and bm[2] against "stoch-tech".
    for bm in bestMethods.to_numpy():
        names.append(f"{bm[1]}_{bm[2]}")
        values=baselineRes[(baselineRes["dis-tech"]==bm[1]) & (baselineRes["stoch-tech"]==bm[2]) & (baselineRes["logname"]==bm[0])]["zemsc"].values
        x.append(values.tolist())

    # The VLMC scores always go last.
    x.append(vlmcRes[vlmcRes["logname"]==lname]["uemsc"].values)
    names.append("VLMC")

    # Two-sample Kolmogorov-Smirnov test of every baseline against the VLMC.
    # Emptiness is checked BEFORE calling ks_2samp, so the test is never run
    # on an empty sample.
    # The outcome is currently not shown in the plot: the "-r" suffix that
    # would be appended to names[nidx] is disabled.
    for nidx in range(len(names)-1):
        differs=(len(x[nidx])==0 or len(x[-1])==0
                 or stats.ks_2samp(x[nidx], x[-1]).pvalue<0.05)
        if differs:
            pass

    fig=plt.figure(figsize=(10, 4))
    # `tick_labels` is the Matplotlib >= 3.9 name of the former `labels` argument.
    plt.boxplot(x,tick_labels=names,showmeans=True,meanline=True,medianprops=medianprops,meanprops=meanprops)
    plt.ylabel("uEMSC")
    plt.grid()
    plt.xticks(rotation=30)
    plt.savefig(f'./plots/{lname}.pdf',bbox_inches='tight', pad_inches=0)
    # Render the figure now, then release it so figures do not pile up in memory.
    plt.show()
    plt.close(fig)