In [2]:
# import necessary libraries
import pandas as pd
import re
from airportsdata import load


In [3]:
# list of continents
continents = ["europe", "asia", "america"]

# load and concatenate data from different continents
europe = pd.read_parquet('../data/data_parquet/europe_data.parquet')
asia = pd.read_parquet('../data/data_parquet/asia_data.parquet')
america = pd.read_parquet('../data/data_parquet/america_data.parquet')


data = pd.concat([europe, asia, america])

In [4]:
# convert 'creation_time' to datetime and correct the format
data['creation_time'] = pd.to_datetime(data['creation_time'], errors='coerce')

# format date and insert next to 'creation_time'
data.insert(data.columns.get_loc('creation_time') + 1, 'formatted_creation_time', data['creation_time'].dt.strftime('%d-%m-%Y %H:%M:%S'))

In [5]:
# get airport information from airportsdata
airport_data = load('IATA')

# function to get city, country, lat, and lon from airport code using airportsdata
def get_airport_info(airport_code):
    if airport_code in airport_data:
        airport_info = airport_data[airport_code]
        return airport_info['city'], airport_info['country'], round(airport_info['lat'], 5), round(airport_info['lon'], 5)
    else:
        return None, None, None, None

# apply the function to the departure_airport column
data['departure_city'], data['departure_country'], data['departure_lat'], data['departure_lon'] = zip(*data['departure_airport'].apply(get_airport_info))

# insert after departure_airport
data.insert(data.columns.get_loc('departure_airport') + 1, 'departure_city', data.pop('departure_city'))
data.insert(data.columns.get_loc('departure_airport') + 2, 'departure_country', data.pop('departure_country'))
data.insert(data.columns.get_loc('departure_airport') + 3, 'departure_lat', data.pop('departure_lat'))
data.insert(data.columns.get_loc('departure_airport') + 4, 'departure_lon', data.pop('departure_lon'))


In [6]:
# extract information from the header_line column
stepIDs = []
action_modes = []
log_levels = []
header_lines = data["header_line"].values

for header in header_lines:
    # extract stepID
    match = re.search(r"\[(.*?)\]", header)
    if match:
        stepID = match.group(1)
        stepIDs.append(stepID)
    else:
        stepIDs.append(None)
    
    # determine action_mode
    if "Received" in header:
        action_modes.append("Received")
    elif "Saved" in header:
        action_modes.append("Saved")
    elif "Sent" in header:
        action_modes.append("Sent")
    else:
        action_modes.append(None)
    
    # extract log level
    log_level_match = re.search(r'INFO|DEBUG|ERROR|WARNING', header)
    log_levels.append(log_level_match.group(0) if log_level_match else None)
    
    # extract flight information
    flight_info_match = re.search(r'\[(.*?)\]', header)
    flight_info = flight_info_match.group(1) if flight_info_match else None

data["stepID"] = stepIDs
data["action_mode"] = action_modes

# insert after header_line
data.insert(data.columns.get_loc('header_line') + 1, 'stepID', data.pop('stepID'))
data.insert(data.columns.get_loc('header_line') + 2, 'action_mode', data.pop('action_mode'))

# add the extracted information to the dataframe
data['log_level'] = log_levels

# insert additional columns after header_line
data.insert(data.columns.get_loc('header_line') + 3, 'log_level', data.pop('log_level'))
# drop the header_line column
data.drop(columns=['header_line'], inplace=True)


In [7]:
data.head()

Unnamed: 0,id,creation_time,formatted_creation_time,airline_code,flight_number,flight_date,departure_airport,departure_city,departure_country,departure_lat,departure_lon,user_name,action_name,stepID,action_mode,log_level,entry_details
0,137524484,2024-04-30 04:01:47,30-04-2024 04:01:47,MN,1262,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,a277234c22fa2e5d,Received,INFO,com.systemone.lc2.loadplan.dto.PositionAssignm...
1,137524940,2024-04-30 04:01:50,30-04-2024 04:01:50,MN,1262,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,3b152cbdf5b057ed,Received,INFO,com.systemone.lc2.loadplan.dto.PositionAssignm...
2,137524943,2024-04-30 04:01:50,30-04-2024 04:01:50,MN,1262,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,3b152cbdf5b057ed,Saved,INFO,com.systemone.lc2.manualloadplanning.dto.LoadD...
3,137524964,2024-04-30 04:05:32,30-04-2024 04:05:32,MN,1630,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,52735a0dd84d57d0,Received,INFO,com.systemone.lc2.loadplan.dto.PositionAssignm...
4,137525021,2024-04-30 04:02:12,30-04-2024 04:02:12,MN,1202,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,8d65801e1dbb10e7,Received,INFO,com.systemone.lc2.loadplan.dto.PositionAssignm...


In [22]:
import pandas as pd
import xml.etree.ElementTree as ET

def extract_arrival_station(xml_string):
    try:
        # Finde den Anfang des XML-Dokuments und extrahiere es
        xml_start = xml_string.find('<?xml')
        if xml_start == -1:
            return None
        xml_string = xml_string[xml_start:]
        
        # Parse das XML
        root = ET.fromstring(xml_string)
        arrival_station = root.find('.//arrivalStation').text
        return arrival_station
    except ET.ParseError as e:
        print(f"ParseError: {e}")
        return None
    except AttributeError as e:
        print(f"AttributeError: {e}")
        return None

# Extrahiere 'arrival_airport' nur für 'CreateZFWMessageAction' und 'sent'
data['arrival_airport'] = data.apply(lambda row: extract_arrival_station(row['entry_details']) if (row['action_name'] == 'CreateZFWMessageAction' and row['action_mode'] == 'Sent') else None, axis=1)

# Erstelle ein Dictionary, um die Zuordnungen zu speichern
arrival_airport_dict = {}

# Fülle das Dictionary mit Zuordnungen von 'flight_number' und 'departure_airport' zu 'arrival_airport'
for idx, row in data.iterrows():
    if pd.notna(row['arrival_airport']):
        key = (row['flight_number'], row['departure_airport'])
        arrival_airport_dict[key] = row['arrival_airport']

# Wende die Zuordnungen aus dem Dictionary auf den DataFrame an
def get_arrival_airport(row):
    key = (row['flight_number'], row['departure_airport'])
    return arrival_airport_dict.get(key, None)

data['arrival_airport'] = data.apply(get_arrival_airport, axis=1)

# Überprüfe, ob die 'arrival_airport' Werte korrekt extrahiert wurden
print(data[['flight_number', 'departure_airport', 'arrival_airport']].drop_duplicates())

# Zeige die eindeutigen Werte der Spalte 'arrival_airport'
unique_arrival_airports = data['arrival_airport'].unique()
print("Unique values in 'arrival_airport':")
print(unique_arrival_airports)

# Speichern des aktualisierten DataFrames
data.to_csv('path_to_save_updated_data.csv', index=False)

print("Processing completed and data saved.")


ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 37, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column 0
ParseError: junk after document element: line 42, column

In [15]:
data.head(1000)

Unnamed: 0,id,creation_time,formatted_creation_time,airline_code,flight_number,flight_date,departure_airport,departure_city,departure_country,departure_lat,departure_lon,user_name,action_name,stepID,action_mode,log_level,entry_details,arrival_airport
0,137524484,2024-04-30 04:01:47,30-04-2024 04:01:47,MN,1262,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,a277234c22fa2e5d,Received,INFO,com.systemone.lc2.loadplan.dto.PositionAssignm...,
1,137524940,2024-04-30 04:01:50,30-04-2024 04:01:50,MN,1262,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,3b152cbdf5b057ed,Received,INFO,com.systemone.lc2.loadplan.dto.PositionAssignm...,
2,137524943,2024-04-30 04:01:50,30-04-2024 04:01:50,MN,1262,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,3b152cbdf5b057ed,Saved,INFO,com.systemone.lc2.manualloadplanning.dto.LoadD...,
3,137524964,2024-04-30 04:05:32,30-04-2024 04:05:32,MN,1630,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,52735a0dd84d57d0,Received,INFO,com.systemone.lc2.loadplan.dto.PositionAssignm...,
4,137525021,2024-04-30 04:02:12,30-04-2024 04:02:12,MN,1202,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,8d65801e1dbb10e7,Received,INFO,com.systemone.lc2.loadplan.dto.PositionAssignm...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,137562308,2024-04-30 08:33:09,30-04-2024 08:33:09,MN,1061,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,be0bf83de7015271,Received,INFO,com.systemone.lc2.loadplan.dto.PositionAssignm...,
996,137562311,2024-04-30 08:33:09,30-04-2024 08:33:09,MN,1061,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,be0bf83de7015271,Saved,INFO,com.systemone.lc2.manualloadplanning.dto.LoadD...,
997,137562379,2024-04-30 08:34:37,30-04-2024 08:34:37,MN,1496,1,DUB,Dublin,IE,53.4213,-6.27007,service-acco,ASMMsgProcessor,72855f71a061931e,Received,INFO,"<?xml version=""1.0"" encoding=""UTF-8""?><ns2:OSS...",
998,137562381,2024-04-30 08:34:37,30-04-2024 08:34:37,MN,1496,1,DUB,Dublin,IE,53.4213,-6.27007,service-acco,AssignLCCAction,72855f71a061931e,Received,INFO,,


In [7]:
print(data.shape)
print("_________________________________________")
print(data.dtypes)

(2248299, 17)
_________________________________________
id                                  int64
creation_time              datetime64[ns]
formatted_creation_time            object
airline_code                       object
flight_number                       int64
flight_date                         int64
departure_airport                  object
departure_city                     object
departure_country                  object
departure_lat                     float64
departure_lon                     float64
user_name                          object
action_name                        object
stepID                             object
action_mode                        object
log_level                          object
entry_details                      object
dtype: object


In [8]:
data.to_parquet('../data/data_parquet/processed_data_combined.parquet', index=False)

In [9]:
# anzeigen der eindeutigen action_names
unique_actions = data['action_name'].unique()
unique_actions

array(['AssignLoadplanAction', 'AssignLCCAction', 'ASMMsgProcessor',
       'AutoLoadBulkAction', 'AutoLoadULDAction', 'CargoFinalActionTDM',
       'AssignUnassignViewAction', 'AutomaticNotificationAction',
       'CargoFinalAction', 'CalculateWeightAndTrimAction',
       'CreateLoadingInstructionAction', 'ChatConfirmMessageAction',
       'ChatSendMessageAction', 'CreateAndSendFuelOrderAction',
       'CreateBaggageLoadItemsAction', 'CreateLoadsheetAction',
       'ChangeFlightLegStateAction', 'CloseLegAction',
       'CreatePostDepartureMessagesAction',
       'CreateAndSendUldOrdMessageAction', 'ClearFlightsAction',
       'CreateZFWMessageAction', 'CrewMsgProcessor',
       'PAXBOOKINGINMsgProcessor', 'FlightPlanFiguresInMsgProcessor',
       'EstimateStorePaxDataAction', 'RampFinalAction',
       'InternalCreateLoadingInstructionAct',
       'InternalCreateLoadsheetAction', 'ResetLoadingListRecordsAction',
       'GetCabinConfigurationsAction', 'ReopenLegAction',
       'FuelData

In [7]:
processed_data_path = '../data/data_parquet/processed_data_combined.parquet'

data1 = pd.read_parquet(processed_data_path)

filtered_data = data1[data1['action_name'] == 'CreateZFWMessageAction']
# count the number of rows
count = filtered_data.shape[0]

print(f"Number of rows with action 'CreateZFWMessageAction': {count}")

# filter the data for the action CreateZFWMessageAction
filtered_data = data1[data1['action_name'] == 'CreateZFWMessageAction']

# count the number of rows for each action_mode
counts = filtered_data['action_mode'].value_counts()

print("Number of rows for each action_mode in 'CreateZFWMessageAction':")
print(counts)


Number of rows with action 'CreateZFWMessageAction': 50677
Number of rows for each action_mode in 'CreateZFWMessageAction':
action_mode
Received    21176
Sent        21176
Saved        8325
Name: count, dtype: int64


In [23]:
import xml.etree.ElementTree as ET

xml_string = '''
<ns19:ZFWFigures xmlns:ns2="http://www.lsb.de/iocc/passengerBookingFigures" xmlns:ns4="http://www.systemone.com/iocc/lss/CateringLsAcknowledge" xmlns:ns3="http://www.lsb.de/iocc/BaggageUnitMessage" xmlns:ns6="http://www.lsb.de/iocc/LoadInstructionXML" xmlns:ns20="http://www.lsb.de/iocc/PilotNameINT" xmlns:ns5="http://www.lsb.de/iocc/Email" xmlns:ns8="http://www.lsb.de/iocc/PassengerCheckIn" xmlns:ns7="http://www.lsb.de/iocc/LoadSheetInXml" xmlns:ns13="http://www.lsb.de/iocc/AircraftPositionFigures" xmlns:ns9="http://www.lsb.de/iocc/CkiStatusInformation" xmlns:ns12="http://www.systemone.com/iocc/lss/AcarsLsAcknowledge" xmlns:ns11="http://www.lsb.de/iocc/AcarsInXml" xmlns:ns10="http://www.iata.org/IATA/2007/00" xmlns:ns21="http://www.lsb.de/iocc/IncomingNTMMessage" xmlns:ns17="http://www.lsb.de/iocc/RampFuelFigures" xmlns:ns16="http://www.lsb.de/iocc/FlightPlanFigures" xmlns:ns15="http://www.lsb.de/iocc/CrewFiguresMessageINT" xmlns:ns14="http://www.lsb.de/iocc/CargoForecastMessage" xmlns:ns19="http://www.lsb.de/iocc/zeroFuelWeightMessage" xmlns:ns18="http://www.lsb.de/iocc/RequestMessageFigures">
    <loadsystemHeader>
        <airline>MN</airline>
        <flightNumber>1592</flightNumber>
        <legNumber>1</legNumber>
        <arrivalStation>MAD</arrivalStation>
        <departureStation>DUB</departureStation>
        <flightDateLocal>2024-04-30</flightDateLocal>
        <flightDateUTC>2024-04-30</flightDateUTC>
        <std>300520</std>
    </loadsystemHeader>
    <timestamp>2024-04-30T04:05:09Z</timestamp>
    <planningStatus>2</planningStatus>
    <revisionNumber>43</revisionNumber>
    <weightUnit>K</weightUnit>
    <dryOperatingWeight>43294</dryOperatingWeight>
    <estimTotalTrafficLoad>14931</estimTotalTrafficLoad>
    <baggageIndicator>true</baggageIndicator>
    <mailIndicator>true</mailIndicator>
    <transitLoadIndicator>true</transitLoadIndicator>
    <tailTankIndicator>false</tailTankIndicator>
    <actualZFW>58225</actualZFW>
    <numberPassenger>160</numberPassenger>
    <cargoWeight>898</cargoWeight>
    <mailWeight>0</mailWeight>
    <baggageWeight>1356</baggageWeight>
    <paxWeight>12677</paxWeight>
    <basicWeight>41889</basicWeight>
    <basicIndex>+51.50</basicIndex>
    <userID>service-account-lss</userID>
    <systemZFWMessageIndicator>true</systemZFWMessageIndicator>
    <manualCalculationIndicator>false</manualCalculationIndicator>
    <ns19:ZFWPaxPerClass>
        <classCode>J</classCode>
        <classCapacity>4</classCapacity>
        <numberOfPax>0</numberOfPax>
    </ns19:ZFWPaxPerClass>
    <ns19:ZFWPaxPerClass>
        <classCode>Y</classCode>
        <classCapacity>168</classCapacity>
        <numberOfPax>160</numberOfPax>
    </ns19:ZFWPaxPerClass>
</ns19:ZFWFigures>
'''

# Parse the XML
root = ET.fromstring(xml_string)

# Define the namespaces
namespace = {
    'ns19': 'http://www.lsb.de/iocc/zeroFuelWeightMessage'
}

# Find the arrivalStation element
arrival_station = root.find('.//ns19:loadsystemHeader/ns19:arrivalStation', namespace)

# Print the text content of the element
if arrival_station is not None:
    print(arrival_station.text)
else:
    print("arrivalStation not found")


arrivalStation not found
