In [8]:
import pandas as pd
import re
from IPython.display import display

# Load the raw entries CSV into a DataFrame.
# The rest of this cell reads the 'entry_details', 'creation_time', 'airline_code',
# 'flight_number', 'flight_date' and 'departure_airport' columns from it.
# NOTE(review): this is an absolute, machine-specific path — it must be edited before
# the notebook can run anywhere else.
file_path = '/Users/nikyakovlev/Documents/GitHub/aircraft_load/data_engineering/niks_data_mata/RampFinalAction/RampFinalAction_entries_MNOP.csv'  # Update this path to your CSV file
df = pd.read_csv(file_path)

# Regex for one baggage record: "<Type> <Destination> <pieces> <weight> KG",
# e.g. "LOADTABLE HAM 85 1105.00 KG". The weight group keeps its " KG" suffix.
pattern = re.compile(
    r'(?P<Type>\w+)\s+(?P<Destination>\w+)\s+(?P<Bag_pieces>\d+)\s+(?P<Bag_weight>[\d\.]+\sKG)'
)


def extract_and_categorize_baggage_data(text):
    """Extract baggage records from free text, keyed by lower-cased record type.

    Every occurrence matched by ``pattern`` contributes three entries to the
    result: ``<type>_destination`` (str), ``<type>_bag_pieces`` (int) and
    ``<type>_bag_weight`` (str with the unit suffix kept, e.g. ``'1105.00 KG'``),
    where ``<type>`` is the matched Type token lower-cased. If the same type
    occurs more than once in ``text``, the last occurrence overwrites earlier ones.

    Parameters
    ----------
    text : str or any
        Cell value to scan. Non-string values (for example the float NaN that
        pandas produces for empty CSV cells, or None) are treated as "no data"
        instead of raising ``TypeError`` inside the regex engine.

    Returns
    -------
    dict
        Flat mapping as described above; empty when nothing matches or when
        ``text`` is not a string.
    """
    # Guard: re cannot scan floats/None, and one bad cell would otherwise
    # abort the whole Series.apply() call.
    if not isinstance(text, str):
        return {}

    result = {}
    for match in pattern.finditer(text):
        data = match.groupdict()
        type_key = data['Type'].lower()  # e.g., loadtable, cki, sum
        result[f"{type_key}_destination"] = data['Destination']
        result[f"{type_key}_bag_pieces"] = int(data['Bag_pieces'])
        result[f"{type_key}_bag_weight"] = data['Bag_weight']
    return result

# Run the extractor over every 'entry_details' value; each row yields a dict of fields
extracted_data = df['entry_details'].apply(extract_and_categorize_baggage_data)

# Expand the per-row dicts into a DataFrame (one column per distinct extracted key)
baggage_data_df = pd.DataFrame(extracted_data.tolist())

# Keep the original creation time under 'timestamp' before 'creation_time' is overwritten
df['timestamp'] = df['creation_time']

# Reduce 'creation_time' to a 'YYYY-MM' string, then rewrite '-04' as '-05'.
# NOTE(review): the '-04' -> '-05' rewrite presumably folds April entries into May — confirm intent.
creation_month = pd.to_datetime(df['creation_time']).dt.to_period('M').astype(str)
df['creation_time'] = creation_month.str.replace('-04', '-05')

# Build the grouping key: <month>_<airline>_<flight number>_<flight date>
flight_number_text = df['flight_number'].astype(str)
flight_date_text = df['flight_date'].astype(str)
df['combined'] = (
    df['creation_time'] + '_' + df['airline_code'] + '_' + flight_number_text + '_' + flight_date_text
)

# Carry the context columns over to the extracted frame (aligned on the row index)
for carried_column in ('departure_airport', 'combined', 'timestamp'):
    baggage_data_df[carried_column] = df[carried_column]

# Show the result in the notebook
display(baggage_data_df)

# Persist the result without the index column
baggage_data_df.to_csv('extracted_baggage_data.csv', index=False)


Unnamed: 0,cki_destination,cki_bag_pieces,cki_bag_weight,loadtable_destination,loadtable_bag_pieces,loadtable_bag_weight,departure_airport,combined,timestamp
0,,,,,,,DUB,2024-05_MN_1680.0_30.0,2024-04-30 04:46:19
1,,,,,,,DUB,2024-05_MN_1680.0_30.0,2024-04-30 04:46:17
2,,,,,,,DUB,2024-05_MN_1680.0_30.0,2024-04-30 04:46:17
3,,,,,,,DUB,2024-05_MN_1680.0_30.0,2024-04-30 04:46:19
4,,,,,,,DUB,2024-05_MN_1602.0_30.0,2024-04-30 04:54:12
...,...,...,...,...,...,...,...,...,...
5607,,,,,,,DUB,2024-05_MN_1392.0_7.0,2024-05-07 08:58:12
5608,,,,,,,DUB,2024-05_MN_1392.0_7.0,2024-05-07 08:58:09
5609,HAM,93.0,1209.00 KG,HAM,85.0,1105.00 KG,DUB,2024-05_MN_1392.0_7.0,2024-05-07 08:58:09
5610,,,,,,,DUB,2024-05_MN_1392.0_7.0,2024-05-07 08:58:09


In [10]:
import pandas as pd
from IPython.display import display

# Read back the extracted baggage data written by the extraction step
file_path = '/Users/nikyakovlev/Documents/GitHub/aircraft_load/data_engineering/niks_data_mata/RampFinalAction/extracted_baggage_data.csv'  # Update this path to your CSV file
df = pd.read_csv(file_path)

# Keep only rows in which every CKI field is present
cki_columns = ['cki_destination', 'cki_bag_pieces', 'cki_bag_weight']
filtered_df = df.dropna(subset=cki_columns, how='any')

# Show the surviving rows
display(filtered_df)

# Write the surviving rows to a new CSV file, without the index column
filtered_df.to_csv('filtered_cki_baggage_data_MNOP.csv', index=False)


Unnamed: 0,cki_destination,cki_bag_pieces,cki_bag_weight,loadtable_destination,loadtable_bag_pieces,loadtable_bag_weight,departure_airport,combined,timestamp
10,FCO,79.0,1027.00 KG,,,,DUB,2024-05_MN_1402.0_30.0,2024-04-30 04:56:36
17,NAP,125.0,1625.00 KG,NAP,123.0,1599.00 KG,DUB,2024-05_MN_1450.0_30.0,2024-04-30 04:57:06
22,MAD,77.0,1001.00 KG,MAD,79.0,1027.00 KG,DUB,2024-05_MN_1592.0_30.0,2024-04-30 04:58:29
27,LHR,88.0,1144.00 KG,LHR,86.0,1118.00 KG,DUB,2024-05_MN_1152.0_30.0,2024-04-30 05:08:00
30,FCO,79.0,1027.00 KG,,,,DUB,2024-05_MN_1402.0_30.0,2024-04-30 05:10:57
...,...,...,...,...,...,...,...,...,...
5588,CDG,133.0,1729.00 KG,CDG,117.0,1521.00 KG,DUB,2024-05_MN_1524.0_7.0,2024-05-07 08:16:26
5591,LHR,104.0,1352.00 KG,LHR,105.0,1365.00 KG,DUB,2024-05_MN_1158.0_7.0,2024-05-07 08:31:55
5593,CDG,133.0,1729.00 KG,CDG,127.0,1651.00 KG,DUB,2024-05_MN_1524.0_7.0,2024-05-07 08:36:59
5603,AMS,111.0,1443.00 KG,AMS,109.0,1417.00 KG,DUB,2024-05_MN_1604.0_7.0,2024-05-07 08:48:52


In [11]:
import pandas as pd
from IPython.display import display

# Source: the CKI-filtered baggage data
file_path = '/Users/nikyakovlev/Documents/GitHub/aircraft_load/data_engineering/niks_data_mata/RampFinalAction/filtered_cki_baggage_data_MNOP.csv'  # Update this path to your CSV file

# Pipeline:
#   1. load the CSV
#   2. parse 'timestamp' as datetime so ordering is chronological
#   3. order by 'combined' ascending, newest 'timestamp' first within each key
#   4. keep only the first (i.e. most recent) row per 'combined' key
df = (
    pd.read_csv(file_path)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .sort_values(by=['combined', 'timestamp'], ascending=[True, False])
    .drop_duplicates(subset='combined', keep='first')
)

# Show the de-duplicated rows
display(df)

# Write the de-duplicated rows to a new CSV file, without the index column
df.to_csv('latest_entries_baggage_data_MNOP.csv', index=False)


Unnamed: 0,cki_destination,cki_bag_pieces,cki_bag_weight,loadtable_destination,loadtable_bag_pieces,loadtable_bag_weight,departure_airport,combined,timestamp
158,MAN,231.0,3465.00 KG,MAN,229.0,3435.00 KG,BGI,2024-05_MN_1030.0_1.0,2024-05-01 21:47:16
372,MAN,195.0,2925.00 KG,MAN,202.0,10301030.00 KG,BGI,2024-05_MN_1030.0_3.0,2024-05-03 21:51:51
603,MAN,226.0,3390.00 KG,MAN,224.0,3360.00 KG,BGI,2024-05_MN_1030.0_5.0,2024-05-05 21:38:23
244,MAN,161.0,2415.00 KG,MAN,168.0,2520.00 KG,MCO,2024-05_MN_1034.0_2.0,2024-05-02 21:15:59
67,MAN,207.0,3105.00 KG,MAN,210.0,3150.00 KG,MCO,2024-05_MN_1034.0_30.0,2024-04-30 21:20:06
...,...,...,...,...,...,...,...,...,...
672,LPA,154.0,2002.00 KG,LPA,155.0,2015.00 KG,DUB,2024-05_MN_1782.0_6.0,2024-05-06 13:42:03
446,DLM,168.0,2184.00 KG,DLM,167.0,2171.00 KG,DUB,2024-05_MN_1790.0_4.0,2024-05-04 12:12:24
233,ADB,150.0,1950.00 KG,ADB,145.0,1885.00 KG,DUB,2024-05_MN_1798.0_2.0,2024-05-02 17:07:58
462,ADB,154.0,2002.00 KG,ADB,151.0,1963.00 KG,DUB,2024-05_MN_1798.0_4.0,2024-05-04 14:09:29


In [12]:
import pandas as pd
from IPython.display import display

# Read back the latest-entry-per-key baggage data
file_path = '/Users/nikyakovlev/Documents/GitHub/aircraft_load/data_engineering/niks_data_mata/RampFinalAction/latest_entries_baggage_data_MNOP.csv'  # Update this path to your CSV file
df = pd.read_csv(file_path)

# Keep only rows in which every LOADTABLE field is present
loadtable_columns = ['loadtable_destination', 'loadtable_bag_pieces', 'loadtable_bag_weight']
filtered_df = df.dropna(subset=loadtable_columns, how='any')

# Show the surviving rows
display(filtered_df)

# Write the surviving rows to a new CSV file, without the index column
filtered_df.to_csv('filtered_loadtable_baggage_data_MNOP_new.csv', index=False)


Unnamed: 0,cki_destination,cki_bag_pieces,cki_bag_weight,loadtable_destination,loadtable_bag_pieces,loadtable_bag_weight,departure_airport,combined,timestamp
0,MAN,231.0,3465.00 KG,MAN,229.0,3435.00 KG,BGI,2024-05_MN_1030.0_1.0,2024-05-01 21:47:16
1,MAN,195.0,2925.00 KG,MAN,202.0,10301030.00 KG,BGI,2024-05_MN_1030.0_3.0,2024-05-03 21:51:51
2,MAN,226.0,3390.00 KG,MAN,224.0,3360.00 KG,BGI,2024-05_MN_1030.0_5.0,2024-05-05 21:38:23
3,MAN,161.0,2415.00 KG,MAN,168.0,2520.00 KG,MCO,2024-05_MN_1034.0_2.0,2024-05-02 21:15:59
4,MAN,207.0,3105.00 KG,MAN,210.0,3150.00 KG,MCO,2024-05_MN_1034.0_30.0,2024-04-30 21:20:06
...,...,...,...,...,...,...,...,...,...
574,LPA,154.0,2002.00 KG,LPA,155.0,2015.00 KG,DUB,2024-05_MN_1782.0_6.0,2024-05-06 13:42:03
575,DLM,168.0,2184.00 KG,DLM,167.0,2171.00 KG,DUB,2024-05_MN_1790.0_4.0,2024-05-04 12:12:24
576,ADB,150.0,1950.00 KG,ADB,145.0,1885.00 KG,DUB,2024-05_MN_1798.0_2.0,2024-05-02 17:07:58
577,ADB,154.0,2002.00 KG,ADB,151.0,1963.00 KG,DUB,2024-05_MN_1798.0_4.0,2024-05-04 14:09:29
