In [29]:
# Import necessary libraries
import pandas as pd
import re
from airportsdata import load


In [30]:
# Continents whose extracts are combined; each one is read from
# '../aircraft_load/data/<continent>_data.parquet'
continents = ["europe", "asia", "america"]

# Read one frame per continent, driven by the list above so that adding a
# continent is a one-line change instead of another copy-pasted read
continent_frames = {
    continent: pd.read_parquet(f'../aircraft_load/data/{continent}_data.parquet')
    for continent in continents
}

# Keep the per-continent names bound for ad-hoc inspection
europe = continent_frames["europe"]
asia = continent_frames["asia"]
america = continent_frames["america"]

# Stack the frames in list order; ignore_index=True renumbers rows 0..n-1 so
# row labels stay unique even if each file carries its own index
data = pd.concat(list(continent_frames.values()), ignore_index=True)

In [31]:
# Parse 'creation_time' into datetimes; values that cannot be parsed become NaT
data['creation_time'] = pd.to_datetime(data['creation_time'], errors='coerce')

# Render the timestamps as day-month-year text and place that column
# immediately to the right of 'creation_time'
formatted_times = data['creation_time'].dt.strftime('%d-%m-%Y %H:%M:%S')
formatted_position = data.columns.get_loc('creation_time') + 1
data.insert(formatted_position, 'formatted_creation_time', formatted_times)

In [32]:
# Load the airportsdata table, asking for it to be keyed by 'IATA'
AIRPORT_KEY_TYPE = 'IATA'
airport_data = load(AIRPORT_KEY_TYPE)

# function to get city, country, lat, and lon from airport code using airportsdata
def get_airport_info(airport_code, airports=None):
    """Return ``(city, country, lat, lon)`` for an airport code.

    Parameters
    ----------
    airport_code
        Key to look up in ``airports``.
    airports : mapping, optional
        Mapping from airport code to a record exposing the keys ``'city'``,
        ``'country'``, ``'lat'`` and ``'lon'``. When omitted, the module-level
        ``airport_data`` table is used, so existing single-argument calls
        (e.g. via ``Series.apply``) keep working.

    Returns
    -------
    tuple
        ``(city, country, lat, lon)`` with ``lat``/``lon`` rounded to 5
        decimal places, or ``(None, None, None, None)`` when the code is not
        a key of ``airports``.
    """
    # `is None` rather than truthiness, so an explicitly passed empty mapping is respected
    if airports is None:
        airports = airport_data

    if airport_code not in airports:
        return None, None, None, None

    airport_info = airports[airport_code]
    return (
        airport_info['city'],
        airport_info['country'],
        round(airport_info['lat'], 5),
        round(airport_info['lon'], 5),
    )

# Resolve every departure airport to (city, country, lat, lon) and place the
# four resulting columns directly after 'departure_airport'
airport_columns = ['departure_city', 'departure_country', 'departure_lat', 'departure_lon']

# Collect the per-row tuples into a frame. Unlike unpacking zip(*...), this
# also works when `data` has no rows (zip yields nothing to unpack then).
airport_details = pd.DataFrame(
    data['departure_airport'].apply(get_airport_info).tolist(),
    columns=airport_columns,
)

# Remove columns left over from a previous run so the cell can be re-executed
for column in airport_columns:
    if column in data.columns:
        data.pop(column)

# Insert straight into the final positions instead of appending and moving
insert_at = data.columns.get_loc('departure_airport') + 1
for offset, column in enumerate(airport_columns):
    # .to_numpy() assigns by position, so no index alignment is involved
    data.insert(insert_at + offset, column, airport_details[column].to_numpy())


In [33]:
# Extract stepID and action_mode from the header_line column and place both
# columns directly after 'header_line'
header_lines = data["header_line"]

# stepID: text inside the first [...] pair of the header. Headers without
# brackets, and missing headers, yield a missing value instead of raising.
stepIDs = header_lines.str.extract(r"\[(.*?)\]", expand=False)

# action_mode: first keyword found, with priority Received > Saved > Sent.
# Keywords are applied from lowest to highest priority so that a
# higher-priority keyword overwrites a lower one; headers containing none of
# them (or missing headers) stay None.
action_modes = pd.Series([None] * len(header_lines), index=header_lines.index, dtype=object)
for mode in ("Sent", "Saved", "Received"):
    has_mode = header_lines.str.contains(mode, regex=False, na=False).to_numpy()
    action_modes = action_modes.mask(has_mode, mode)

# Remove columns left over from a previous run so the cell can be re-executed
for column in ("stepID", "action_mode"):
    if column in data.columns:
        data.pop(column)

# insert after header_line (.to_numpy() assigns by position, no index alignment)
insert_at = data.columns.get_loc('header_line') + 1
data.insert(insert_at, 'stepID', stepIDs.to_numpy())
data.insert(insert_at + 1, 'action_mode', action_modes.to_numpy())


In [34]:
# Preview the first five rows of the enriched frame
data.head(n=5)


Unnamed: 0,id,creation_time,formatted_creation_time,airline_code,flight_number,flight_date,departure_airport,departure_city,departure_country,departure_lat,departure_lon,user_name,action_name,header_line,stepID,action_mode,entry_details
0,137524484,2024-04-30 04:01:47,30-04-2024 04:01:47,MN,1262,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,"2024-04-30 04:01:47,383 INFO [a277234c22fa2e5...",a277234c22fa2e5d,Received,com.systemone.lc2.loadplan.dto.PositionAssignm...
1,137524940,2024-04-30 04:01:50,30-04-2024 04:01:50,MN,1262,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,"2024-04-30 04:01:50,188 INFO [3b152cbdf5b057e...",3b152cbdf5b057ed,Received,com.systemone.lc2.loadplan.dto.PositionAssignm...
2,137524943,2024-04-30 04:01:50,30-04-2024 04:01:50,MN,1262,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,"2024-04-30 04:01:50,193 INFO [3b152cbdf5b057e...",3b152cbdf5b057ed,Saved,com.systemone.lc2.manualloadplanning.dto.LoadD...
3,137524964,2024-04-30 04:05:32,30-04-2024 04:05:32,MN,1630,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,"2024-04-30 04:05:32,214 INFO [52735a0dd84d57d...",52735a0dd84d57d0,Received,com.systemone.lc2.loadplan.dto.PositionAssignm...
4,137525021,2024-04-30 04:02:12,30-04-2024 04:02:12,MN,1202,30,DUB,Dublin,IE,53.4213,-6.27007,human,AssignLoadplanAction,"2024-04-30 04:02:12,081 INFO [8d65801e1dbb10e...",8d65801e1dbb10e7,Received,com.systemone.lc2.loadplan.dto.PositionAssignm...


In [35]:
# Report the frame's dimensions and per-column dtypes, separated by a rule
print(data.shape, "____________________", data.dtypes, sep="\n")

(2248299, 17)
____________________
id                                  int64
creation_time              datetime64[ns]
formatted_creation_time            object
airline_code                       object
flight_number                       int64
flight_date                         int64
departure_airport                  object
departure_city                     object
departure_country                  object
departure_lat                     float64
departure_lon                     float64
user_name                          object
action_name                        object
header_line                        object
stepID                             object
action_mode                        object
entry_details                      object
dtype: object


In [36]:
# Persist the enriched frame as parquet; the row index is not written
processed_path = 'data/data_parquet/processed_data_combined.parquet'
data.to_parquet(processed_path, index=False)
