In [22]:
import pandas as pd
import json
def load_config(path):
    """Load a JSON configuration file and return its parsed content.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The deserialized JSON document (a dict for an object at the top level).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's locale
    # encoding, which is what open() falls back to when no encoding is given.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Parquet settings are read from a JSON file addressed relative to the
# notebook's working directory.
config_path = './pandas_config.json'
config = load_config(config_path)
parquet_config = config['parquet']

# Notebook-wide constants: the parquet engine and compression setting taken
# from the config, followed by the continent names and the parquet data directory.
ENGINE = parquet_config['engine']
COMPRESSION = parquet_config['compression']
CONTINENTS = ["asia", "america", "europe"]
FILE_PATH = "./data/data_parquet"

In [23]:
# Read one parquet file per continent and stack them into a single frame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies every previously loaded row on each iteration.
continent_frames = [
    pd.read_parquet(f"{FILE_PATH}/{continent}_data.parquet", engine=ENGINE)
    for continent in CONTINENTS
]
# ignore_index=True drops each file's own index in favour of one fresh
# 0..n-1 range over the combined rows.
df = pd.concat(continent_frames, ignore_index=True)


In [24]:
# Keep only the rows whose action is the ZFW message creation action.
filtered_actions = df[df['action_name'] == 'CreateZFWMessageAction']
# Export the entry_details column on its own, without the index. The column is
# selected directly instead of first building a three-column frame
# (id, entry_details, flight_number) that was immediately narrowed back to one.
filtered_actions["entry_details"].to_csv("entry_details.csv", index=False)

In [None]:
# Distinct flight numbers present among the filtered ZFW actions.
flight_number_column = filtered_actions["flight_number"]
flight_numbers = flight_number_column.unique()

In [25]:
import re

# Both patterns are compiled once up front; they are applied to every entry
# of every flight, so there is no reason to rebuild them per iteration.
# Estimated value: the entry must START with "ZFW <digits> KG".
ESTIMATED_ZFW_PATTERN = re.compile(r"^ZFW (\d+) KG")
# Actual value: an <actualZFW> element anywhere inside the entry.
ACTUAL_ZFW_PATTERN = re.compile(r"<actualZFW>\s*(\d+)\s*</actualZFW>")


def extract_zfw_values(entries):
    """Collect the estimated and actual ZFW figures found in a sequence of entries.

    Args:
        entries: Iterable of entry_details values, in the order they should be
            scanned. Values that are not strings (e.g. NaN for a missing
            entry) are skipped instead of raising TypeError in the regex call.

    Returns:
        dict with two lists of digit strings (not converted to int):
        "estimated_zfw" for entries that begin with "ZFW <digits> KG", and
        "actual_zfw" for entries that contain an <actualZFW> element. An entry
        can contribute to both lists; each list keeps the input order.
    """
    estimated_values = []
    actual_values = []
    for entry in entries:
        if not isinstance(entry, str):
            continue
        # Each pattern is evaluated once per entry and the match object reused.
        estimated_match = ESTIMATED_ZFW_PATTERN.match(entry)
        if estimated_match:
            estimated_values.append(estimated_match.group(1))
        actual_match = ACTUAL_ZFW_PATTERN.search(entry)
        if actual_match:
            actual_values.append(actual_match.group(1))
    return {"estimated_zfw": estimated_values, "actual_zfw": actual_values}


# Maps each flight number to its ZFW figures, ordered by creation_time.
flights = {}
for flight in flight_numbers:
    # sort_values returns a new frame; sorting the filtered slice in place
    # would trigger pandas' SettingWithCopyWarning.
    flight_df = (
        filtered_actions[filtered_actions["flight_number"] == flight]
        .sort_values(by="creation_time")
    )
    flights[flight] = extract_zfw_values(flight_df["entry_details"].values)