# Process Mining for the visualization in Celonis

In [2]:
import pandas as pd
import re 
import os
import glob
# Root directory of the parquet data; input files are read from f"{PATH}/combined/*.parquet".
PATH = '../data/data_parquet'


In [4]:
# Sort the matched paths: glob.glob returns them in arbitrary, OS-dependent order,
# which would make the row order of the concatenated frame vary between runs.
parquet_files = sorted(glob.glob(f"{PATH}/combined/*.parquet"))
if not parquet_files:
    # pd.concat([]) only reports "No objects to concatenate"; name the real cause instead.
    raise FileNotFoundError(f"No parquet files found in {PATH}/combined")
data = pd.concat([pd.read_parquet(f) for f in parquet_files], ignore_index=True)

In [5]:
# Columns to retain; every other column of the loaded frame is dropped.
KEEP_COLUMNS = [
    "id",
    "creation_time",
    "formatted_creation_time",
    "airline_code",
    "flight_number",
    "action_name",
    "header_line",
    "flight_id",
]
# Collect the columns that are not listed above, then remove them in a single call.
unwanted_columns = [name for name in data.columns if name not in KEEP_COLUMNS]
data = data.drop(columns=unwanted_columns)

In [6]:
# Preview the first five rows of the frame.
data.head(n=5)

Unnamed: 0,creation_time,formatted_creation_time,airline_code,flight_number,action_name,header_line,flight_id
0,2024-05-07 05:52:54,07-05-2024 05:52:54,MN,1061,StorePaxDataAction,"2024-05-07 05:52:54,127 INFO [ea380e340254f09...",MN-1061-2024-8-5-DUB
1,2024-05-07 05:52:58,07-05-2024 05:52:58,MN,1119,StorePaxDataAction,"2024-05-07 05:52:58,084 INFO [c426c6873162f9f...",MN-1119-2024-8-5-DUB
2,2024-05-07 05:52:58,07-05-2024 05:52:58,MN,1119,StorePaxDataAction,"2024-05-07 05:52:58,087 INFO [c426c6873162f9f...",MN-1119-2024-8-5-DUB
3,2024-05-07 05:53:02,07-05-2024 05:53:02,MN,1162,StorePaxDataAction,"2024-05-07 05:53:02,194 INFO [22be48812e5c854...",MN-1162-2024-8-5-DUB
4,2024-05-07 05:53:02,07-05-2024 05:53:02,MN,1162,StorePaxDataAction,"2024-05-07 05:53:02,195 INFO [22be48812e5c854...",MN-1162-2024-8-5-DUB


### Extract the timestamp from header_line

In [7]:
# Timestamp layout "YYYY-MM-DD HH:MM:SS,mmm"; compiled once instead of on every call.
_TIMESTAMP_PATTERN = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}')


def extract_time(time):
    """Return the first ``YYYY-MM-DD HH:MM:SS,mmm`` timestamp found in a string.

    Parameters
    ----------
    time : str or None
        Text to search (applied to ``header_line`` values in this notebook).

    Returns
    -------
    str or None
        The matched timestamp text, or ``None`` when there is no match or the
        input is not a string (e.g. ``None``/``NaN`` standing in for a missing value).
    """
    # Missing values reach .apply() as None/NaN; re.search would raise TypeError on them.
    if not isinstance(time, str):
        return None
    match = _TIMESTAMP_PATTERN.search(time)
    return match.group(0) if match else None

# Pull the "YYYY-MM-DD HH:MM:SS,mmm" timestamp out of each raw header line.
data['extracted_time'] = data['header_line'].apply(extract_time)

# Drop the raw header line and the two creation-time columns now that extracted_time exists.
DROP2_COLUMNS = ['header_line', 'creation_time', 'formatted_creation_time']
data = data.drop(columns=DROP2_COLUMNS)
print(data.head())

# Create the target directory first: to_csv raises OSError when it does not exist.
final_csv_path = '../data/data_process/data_process_final/final_data.csv'
os.makedirs(os.path.dirname(final_csv_path), exist_ok=True)
data.to_csv(final_csv_path, index=False)

  airline_code  flight_number         action_name             flight_id  \
0           MN           1061  StorePaxDataAction  MN-1061-2024-8-5-DUB   
1           MN           1119  StorePaxDataAction  MN-1119-2024-8-5-DUB   
2           MN           1119  StorePaxDataAction  MN-1119-2024-8-5-DUB   
3           MN           1162  StorePaxDataAction  MN-1162-2024-8-5-DUB   
4           MN           1162  StorePaxDataAction  MN-1162-2024-8-5-DUB   

            extracted_time  
0  2024-05-07 05:52:54,127  
1  2024-05-07 05:52:58,084  
2  2024-05-07 05:52:58,087  
3  2024-05-07 05:53:02,194  
4  2024-05-07 05:53:02,195  


### Split the data by airline for Celonis

In [9]:
# Skip missing codes: NaN never compares equal to itself, so a NaN code would only
# produce an empty "combined_data_nan.csv".
airline_codes = data['airline_code'].dropna().unique()

# Create the directory the files are actually written to, and build every path from it
# so the created directory and the write target cannot drift apart.
output_directory = '../data/data_process/data_process_final'
os.makedirs(output_directory, exist_ok=True)

# Write one CSV per airline code, named combined_data_<code>.csv.
for code in airline_codes:
    filtered_df = data[data['airline_code'] == code]
    new_filename = f"combined_data_{code}.csv"
    filtered_df.to_csv(os.path.join(output_directory, new_filename), index=False)