In [15]:
# import necessary libraries
import pandas as pd
import re
from airportsdata import load


In [16]:
# list of continents; each name selects one '<continent>_data.parquet' input file
continents = ["europe", "asia", "america"]

# load one DataFrame per continent, driven by the list above so that adding a
# continent is a one-line change instead of another copy-pasted read
continent_frames = {
    continent: pd.read_parquet(f'../data/data_parquet/{continent}_data.parquet')
    for continent in continents
}

# keep the per-continent names available for interactive inspection
europe = continent_frames["europe"]
asia = continent_frames["asia"]
america = continent_frames["america"]

# concatenate in list order; the original row labels are kept (no ignore_index),
# so labels are not guaranteed to be unique across continents
data = pd.concat([continent_frames[continent] for continent in continents])

In [17]:
# parse 'creation_time' into datetimes; errors='coerce' turns unparseable values into NaT
data['creation_time'] = pd.to_datetime(data['creation_time'], errors='coerce')

# render the timestamps as day-month-year strings and place that column
# directly to the right of 'creation_time'
formatted_times = data['creation_time'].dt.strftime('%d-%m-%Y %H:%M:%S')
insert_position = data.columns.get_loc('creation_time') + 1
data.insert(insert_position, 'formatted_creation_time', formatted_times)

In [18]:
# Build 'flight_id' as
#   <airline_code>-<flight_number>-<creation year>-<flight_date>-<creation month>
# 'creation_time' is parsed with errors='coerce', so it can contain NaT. With a
# NaT present, .dt.year/.dt.month become float columns and every id would be
# rendered like "2024.0" (or "nan"). Casting to the nullable Int64 dtype first
# keeps valid rows as plain integers ("2024") and renders missing parts as "<NA>".
creation_year = data['creation_time'].dt.year.astype('Int64').astype(str)
creation_month = data['creation_time'].dt.month.astype('Int64').astype(str)

data['flight_id'] = (
    data["airline_code"].astype(str)
    + "-" + data['flight_number'].astype(str)
    + "-" + creation_year
    + "-" + data["flight_date"].astype(str)
    + "-" + creation_month
)

In [19]:
# columns removed from the working frame; 'flight_date' has already been
# folded into 'flight_id' at this point
DROP_COLUMNS = ["flight_date", "id"]
data = data.drop(labels=DROP_COLUMNS, axis="columns")

In [20]:
# Load the airport lookup table from airportsdata. The result is used below as
# a mapping from airport code to a record exposing 'city', 'country', 'lat'
# and 'lon' (presumably keyed by IATA code, given the 'IATA' argument).
airport_data = load('IATA')

# look up city, country, lat and lon for an airport code via airportsdata
def get_airport_info(airport_code):
    """Return ``(city, country, lat, lon)`` for ``airport_code``.

    The code is looked up in the module-level ``airport_data`` mapping.
    Latitude and longitude are rounded to 5 decimal places. A code that is
    not present in the mapping yields ``(None, None, None, None)``.
    """
    # guard clause: unknown codes produce an all-None tuple of the same width
    if airport_code not in airport_data:
        return None, None, None, None

    record = airport_data[airport_code]
    city = record['city']
    country = record['country']
    latitude = round(record['lat'], 5)
    longitude = round(record['lon'], 5)
    return city, country, latitude, longitude

# look up (city, country, lat, lon) for every departure_airport value
DEPARTURE_INFO_COLUMNS = ['departure_city', 'departure_country', 'departure_lat', 'departure_lon']
airport_lookups = list(data['departure_airport'].apply(get_airport_info))

# transpose the per-row tuples into one sequence per output column; an empty
# frame has nothing to transpose, so fall back to four empty columns instead
# of failing on tuple unpacking
if airport_lookups:
    departure_info = list(zip(*airport_lookups))
else:
    departure_info = [()] * len(DEPARTURE_INFO_COLUMNS)

# re-running the cell replaces the results of an earlier run instead of
# failing because the columns already exist
for column_name in DEPARTURE_INFO_COLUMNS:
    if column_name in data.columns:
        data.pop(column_name)

# insert the new columns directly after departure_airport, in the listed order
departure_position = data.columns.get_loc('departure_airport')
for offset, (column_name, column_values) in enumerate(zip(DEPARTURE_INFO_COLUMNS, departure_info), start=1):
    data.insert(departure_position + offset, column_name, column_values)


In [21]:
# extract information from the header_line column

# keywords are tested in this order; the first one contained in the header wins
ACTION_MODE_KEYWORDS = ("Received", "Saved", "Sent")
STEP_ID_PATTERN = re.compile(r"\[(.*?)\]")
LOG_LEVEL_PATTERN = re.compile(r'INFO|DEBUG|ERROR|WARNING')


def _parse_header_line(header):
    """Split one header line into ``(stepID, action_mode, log_level)``.

    * stepID      - text inside the first ``[...]`` pair, or None if absent
    * action_mode - first of "Received", "Saved", "Sent" that occurs in the
                    header, or None if none of them occurs
    * log_level   - first occurrence of INFO, DEBUG, ERROR or WARNING, or None

    A header that is not a string (for example a missing value) yields
    ``(None, None, None)`` instead of raising a TypeError.
    """
    if not isinstance(header, str):
        return None, None, None

    step_match = STEP_ID_PATTERN.search(header)
    step_id = step_match.group(1) if step_match else None

    action_mode = next((keyword for keyword in ACTION_MODE_KEYWORDS if keyword in header), None)

    level_match = LOG_LEVEL_PATTERN.search(header)
    log_level = level_match.group(0) if level_match else None

    return step_id, action_mode, log_level


header_lines = data["header_line"].values
parsed_headers = [_parse_header_line(header) for header in header_lines]

stepIDs = [parsed[0] for parsed in parsed_headers]
action_modes = [parsed[1] for parsed in parsed_headers]
log_levels = [parsed[2] for parsed in parsed_headers]

# insert the extracted columns directly after header_line
header_position = data.columns.get_loc('header_line')
data.insert(header_position + 1, 'stepID', stepIDs)
data.insert(header_position + 2, 'action_mode', action_modes)
data.insert(header_position + 3, 'log_level', log_levels)

# drop the header_line column now that everything needed has been extracted
data = data.drop(columns=['header_line'])


In [22]:
# Remove the rows that are both action_mode 'Received' and action_name
# 'CreateZFWMessageAction'. The explicit .copy() makes the result an
# independent frame, so the column assignments performed on `data` afterwards
# do not act on a slice of the unfiltered frame (SettingWithCopyWarning).
is_received_zfw = (data["action_mode"] == "Received") & (data["action_name"] == "CreateZFWMessageAction")
data = data[~is_received_zfw].copy()

In [23]:
# function to extract the arrival_airport value
def extract_arrival_airport(entry_details):
    """Return the text of the first ``<arrivalStation>`` tag in ``entry_details``.

    Parameters
    ----------
    entry_details : str or any
        Text that may contain ``<arrivalStation>...</arrivalStation>``.

    Returns
    -------
    str or None
        The content of the first tag, or None when the tag is absent or when
        ``entry_details`` is not a string (e.g. None or NaN for a missing value).
    """
    # missing details (None/NaN) or other non-text values cannot contain the
    # tag; returning None here avoids a TypeError from re.search
    if not isinstance(entry_details, str):
        return None

    # non-greedy match so that only the first tag's content is captured
    match = re.search(r'<arrivalStation>(.*?)</arrivalStation>', entry_details)
    return match.group(1) if match else None

# extract 'arrival_airport' only for 'CreateZFWMessageAction' and 'Sent'
# (a boolean mask plus a plain loop over two columns replaces the row-wise
# DataFrame.apply, which is slow and does not return a column for an empty frame)
is_sent_zfw = (data['action_name'] == 'CreateZFWMessageAction') & (data['action_mode'] == 'Sent')
data['arrival_airport'] = [
    extract_arrival_airport(details) if is_sent else None
    for details, is_sent in zip(data['entry_details'], is_sent_zfw)
]

# map (flight_number, departure_airport) -> arrival_airport using only the rows
# where an arrival airport was found; when a key occurs more than once, the
# last row in frame order wins
arrival_airport_dict = {
    (flight_number, departure_airport): arrival_airport
    for flight_number, departure_airport, arrival_airport in zip(
        data['flight_number'], data['departure_airport'], data['arrival_airport']
    )
    if pd.notna(arrival_airport)
}

# resolve a row's arrival airport through the mapping dictionary
def get_arrival_airport(row):
    """Return the arrival airport stored for the row's flight and departure airport.

    The lookup key is ``(row['flight_number'], row['departure_airport'])`` in
    the module-level ``arrival_airport_dict``; None is returned when the key
    is not in the dictionary.
    """
    lookup_key = row['flight_number'], row['departure_airport']
    return arrival_airport_dict.get(lookup_key)

# fill 'arrival_airport' for every row from the (flight_number, departure_airport) lookup
data['arrival_airport'] = data.apply(get_arrival_airport, axis="columns")

# check if the 'arrival_airport' values were correctly extracted:
# one line per distinct flight / departure / arrival combination
route_overview = data[['flight_number', 'departure_airport', 'arrival_airport']].drop_duplicates()
print(route_overview)

# display the unique values of the 'arrival_airport' column
unique_arrival_airports = data['arrival_airport'].unique()
print("Unique values in 'arrival_airport':", unique_arrival_airports, sep="\n")

In [None]:
# preview the first rows of the processed frame
data.head()

Unnamed: 0,creation_time,formatted_creation_time,airline_code,flight_number,departure_airport,departure_city,departure_country,departure_lat,departure_lon,user_name,action_name,stepID,action_mode,log_level,entry_details,flight_id,arrival_airport
459794,2024-04-30 04:05:04,30-04-2024 04:05:04,MN,1045,MAN,Manchester,GB,53.3537,-2.27495,service-acco,CreateZFWMessageAction,ba653ba04189284d,Sent,INFO,Receiver queue : loadsystem/master/queue/...,MN-1045-2024-29-4,JFK
459796,2024-04-30 04:05:04,30-04-2024 04:05:04,MN,1045,MAN,Manchester,GB,53.3537,-2.27495,service-acco,CreateZFWMessageAction,ba653ba04189284d,Sent,INFO,Receiver queue : loadsystem/master/queue/...,MN-1045-2024-29-4,JFK
459798,2024-04-30 04:05:09,30-04-2024 04:05:09,MN,1592,DUB,Dublin,IE,53.4213,-6.27007,service-acco,CreateZFWMessageAction,1eb7ca10727068bb,Sent,INFO,Receiver queue : loadsystem/master/queue/...,MN-1592-2024-30-4,MAD
459800,2024-04-30 04:05:09,30-04-2024 04:05:09,MN,1592,DUB,Dublin,IE,53.4213,-6.27007,service-acco,CreateZFWMessageAction,1eb7ca10727068bb,Sent,INFO,Receiver queue : loadsystem/master/queue/...,MN-1592-2024-30-4,MAD
459802,2024-04-30 04:02:55,30-04-2024 04:02:55,MN,1520,DUB,Dublin,IE,53.4213,-6.27007,human,CreateZFWMessageAction,b207bbfe3e4f9f10,Sent,INFO,Receiver queue : loadsystem/master/queue/...,MN-1520-2024-30-4,CDG


In [None]:
# report the frame's (rows, columns) shape and the dtype of every column,
# separated by a ruler line
print(data.shape, "_________________________________________", data.dtypes, sep="\n")

(29501, 17)
_________________________________________
creation_time              datetime64[ns]
formatted_creation_time            object
airline_code                       object
flight_number                       int64
departure_airport                  object
departure_city                     object
departure_country                  object
departure_lat                     float64
departure_lon                     float64
user_name                          object
action_name                        object
stepID                             object
action_mode                        object
log_level                          object
entry_details                      object
flight_id                          object
arrival_airport                    object
dtype: object


In [None]:
# write the processed frame to parquet; index=False leaves the row index out of the file
data.to_parquet('../data/data_parquet/processed_data_combined.parquet', index=False)

In [None]:
# show the unique action_name values
unique_actions = data['action_name'].unique()
unique_actions

array(['CreateZFWMessageAction'], dtype=object)

In [None]:
processed_data_path = '../data/data_parquet/processed_data_combined.parquet'

# read the processed parquet file back in to check what was persisted
data1 = pd.read_parquet(processed_data_path)

# filter the data for the action CreateZFWMessageAction once and reuse the
# result for both reports below
filtered_data = data1[data1['action_name'] == 'CreateZFWMessageAction']

# count the number of rows
count = filtered_data.shape[0]

print(f"Number of rows with action 'CreateZFWMessageAction': {count}")

# count the number of rows for each action_mode
counts = filtered_data['action_mode'].value_counts()

print("Number of rows for each action_mode in 'CreateZFWMessageAction':")
print(counts)


Number of rows with action 'CreateZFWMessageAction': 29501
Number of rows for each action_mode in 'CreateZFWMessageAction':
action_mode
Sent     21176
Saved     8325
Name: count, dtype: int64
