# Process Mining for the visualization in Celonis

In [3]:
import pandas as pd
import re 

### Concatenate the datasets

In [4]:
import pandas as pd

# NOTE: `continents` is defined here but the loading code below does not read it.
continents = ["europe", "asia", "america"]

# One Parquet file per continent, listed in the order they are concatenated
parquet_paths = [
    '../data/data_parquet/europe_data.parquet',
    '../data/data_parquet/asia_data.parquet',
    '../data/data_parquet/america_data.parquet',
]
europe, asia, america = map(pd.read_parquet, parquet_paths)

# Stack the three frames row-wise; each frame keeps its own index labels
data = pd.concat([europe, asia, america])



### Convert the time and add milliseconds

In [5]:
# Parse 'creation_time' into datetimes; values that cannot be parsed become NaT
parsed_creation_time = pd.to_datetime(data['creation_time'], errors='coerce')
data['creation_time'] = parsed_creation_time

# Render as day-month-year plus time, trimming the 6-digit microsecond field
# to its first three digits (milliseconds)
formatted_creation_time = (
    data['creation_time']
    .dt.strftime('%d-%m-%Y %H:%M:%S.%f')
    .str[:-3]
)

# Place the formatted column directly to the right of 'creation_time'
insert_position = data.columns.get_loc('creation_time') + 1
data.insert(insert_position, 'formatted_creation_time', formatted_creation_time)



### Add new flight_id

In [6]:
# Composite key: <airline_code>-<flight_number>-<year>-<flight_date>-<month>,
# where year and month are taken from 'creation_time'.
#
# 'creation_time' is parsed with errors='coerce', so it may contain NaT. A single
# NaT turns .dt.year / .dt.month into float64, and astype(str) would then render
# every row as "2024.0" / "4.0". Casting through the nullable 'Int64' dtype keeps
# the parts integral; rows without a timestamp get "<NA>" in those positions.
creation_year = data['creation_time'].dt.year.astype('Int64').astype(str)
creation_month = data['creation_time'].dt.month.astype('Int64').astype(str)

data['flight_id'] = (
    data["airline_code"].astype(str)
    + "-" + data['flight_number'].astype(str)
    + "-" + creation_year
    + "-" + data["flight_date"].astype(str)
    + "-" + creation_month
)

### Drop unnecessary columns

In [7]:
# Columns removed from the combined frame before it is saved
DROP_COLUMNS = [
    'entry_details',
    'user_name',
    'flight_date',
    'departure_airport',
]
data = data.drop(labels=DROP_COLUMNS, axis="columns")

In [8]:
# Preview the first five rows of the combined frame
data.head(n=5)

Unnamed: 0,id,creation_time,formatted_creation_time,airline_code,flight_number,action_name,header_line,flight_id
0,137524484,2024-04-30 04:01:47,30-04-2024 04:01:47.000,MN,1262,AssignLoadplanAction,"2024-04-30 04:01:47,383 INFO [a277234c22fa2e5...",MN-1262-2024-30-4
1,137524940,2024-04-30 04:01:50,30-04-2024 04:01:50.000,MN,1262,AssignLoadplanAction,"2024-04-30 04:01:50,188 INFO [3b152cbdf5b057e...",MN-1262-2024-30-4
2,137524943,2024-04-30 04:01:50,30-04-2024 04:01:50.000,MN,1262,AssignLoadplanAction,"2024-04-30 04:01:50,193 INFO [3b152cbdf5b057e...",MN-1262-2024-30-4
3,137524964,2024-04-30 04:05:32,30-04-2024 04:05:32.000,MN,1630,AssignLoadplanAction,"2024-04-30 04:05:32,214 INFO [52735a0dd84d57d...",MN-1630-2024-30-4
4,137525021,2024-04-30 04:02:12,30-04-2024 04:02:12.000,MN,1202,AssignLoadplanAction,"2024-04-30 04:02:12,081 INFO [8d65801e1dbb10e...",MN-1202-2024-30-4


### Save the file

In [9]:
# Output locations for the combined dataset
combined_parquet_path = '../process/data/data_parquet/combined_data.parquet'
combined_csv_path = '../process/data/data_csv/combined_data.csv'

# Write the Parquet copy
data.to_parquet(combined_parquet_path)

# Write the CSV copy, leaving out the index column
data.to_csv(combined_csv_path, index=False)

### Extract the timestamp from header_line

In [10]:
import pandas as pd
import re

# Load the combined CSV file into a DataFrame
file_path = '../process/data/data_csv/combined_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Timestamp of the form "YYYY-MM-DD HH:MM:SS,mmm"; compiled once and reused for every row.
_HEADER_TIMESTAMP_RE = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}')


def extract_time(lol):
    """Return the first ``YYYY-MM-DD HH:MM:SS,mmm`` timestamp found in a header line.

    Args:
        lol: The header-line text to scan. Non-string values are accepted and
            treated as "no timestamp"; this covers the NaN that ``read_csv``
            produces for empty cells, which would otherwise make the regex
            search raise ``TypeError``.

    Returns:
        The matched timestamp as a string, or ``None`` when ``lol`` is not a
        string or contains no timestamp in that format.
    """
    if not isinstance(lol, str):
        return None
    match = _HEADER_TIMESTAMP_RE.search(lol)
    return match.group(0) if match else None

# Add a column holding the timestamp extracted from each header line
df['extracted_time'] = df['header_line'].map(extract_time)

# Write the result back to the same CSV file
df.to_csv(file_path, index=False)

# Status message
print("Zeitinformationen wurden extrahiert und in eine neue Spalte eingefügt.")

Zeitinformationen wurden extrahiert und in eine neue Spalte eingefügt.


In [12]:
# List the column labels of the reloaded frame, including the new 'extracted_time'
df.columns

Index(['id', 'creation_time', 'formatted_creation_time', 'airline_code',
       'flight_number', 'action_name', 'header_line', 'flight_id',
       'extracted_time'],
      dtype='object')

In [13]:
# Preview the first five rows, including the new 'extracted_time' column
df.head(n=5)

Unnamed: 0,id,creation_time,formatted_creation_time,airline_code,flight_number,action_name,header_line,flight_id,extracted_time
0,137524484,2024-04-30 04:01:47,30-04-2024 04:01:47.000,MN,1262,AssignLoadplanAction,"2024-04-30 04:01:47,383 INFO [a277234c22fa2e5...",MN-1262-2024-30-4,"2024-04-30 04:01:47,383"
1,137524940,2024-04-30 04:01:50,30-04-2024 04:01:50.000,MN,1262,AssignLoadplanAction,"2024-04-30 04:01:50,188 INFO [3b152cbdf5b057e...",MN-1262-2024-30-4,"2024-04-30 04:01:50,188"
2,137524943,2024-04-30 04:01:50,30-04-2024 04:01:50.000,MN,1262,AssignLoadplanAction,"2024-04-30 04:01:50,193 INFO [3b152cbdf5b057e...",MN-1262-2024-30-4,"2024-04-30 04:01:50,193"
3,137524964,2024-04-30 04:05:32,30-04-2024 04:05:32.000,MN,1630,AssignLoadplanAction,"2024-04-30 04:05:32,214 INFO [52735a0dd84d57d...",MN-1630-2024-30-4,"2024-04-30 04:05:32,214"
4,137525021,2024-04-30 04:02:12,30-04-2024 04:02:12.000,MN,1202,AssignLoadplanAction,"2024-04-30 04:02:12,081 INFO [8d65801e1dbb10e...",MN-1202-2024-30-4,"2024-04-30 04:02:12,081"
