# Import Libraries

In [1]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Load Data

In [2]:
# Directory prefix for the Excel files this notebook reads and writes
# (it is concatenated directly with file names, so it must end with a slash).
data_path = '../data/'

In [3]:
# Import relevant tables
# Both workbooks live under data_path; they are read in the order listed
# (package table first, then the TBL_MVA table).
export_packstuecke_mva, export_tbl_mva = (
    pd.read_excel(data_path + workbook)
    for workbook in ('Export_Packstuecke_MVA.xlsx', 'Export_TBL_MVA.xlsx')
)

In [4]:
# Merge relevant tables
# Outer join: rows from either table are kept even without a partner.
# TBL_MVA's "ID" is matched against the package table's "TBL_MVA_ID".
data = export_tbl_mva.merge(
    export_packstuecke_mva,
    how='outer',
    left_on='ID',
    right_on='TBL_MVA_ID',
)

# Data Cleaning and Translating

In [5]:
# Rename column names
# German source headers -> English snake_case names used by the rest of the notebook.
# 'ID_x' / 'ID_y' are the suffixed "ID" columns produced by the merge above.
data = data.rename(
    columns={
        'ID_x': 'id',
        'ID_y': 'packstuecke_mva_id',
        'TBL_MVA_ID': 'tbl_mva_id',
        'PACKSTUECK_ID': 'package_id',
        'LAENGE_IN_CM': 'length_in_cm',
        'BREITE_IN_CM': 'width_in_cm',
        'HOEHE_IN_CM': 'height_in_cm',
        'GEWICHT_IN_KG': 'weight_in_kg',
        'PACKSTUECKART': 'package_type',
        'AUFTRAGSNUMMER': 'order_number',
        'EINGANGSDATUM_UHRZEIT': 'receipt_date_time',
        'LIEFERSCHEINNUMMER': 'delivery_receipt_number',
        'STATUS': 'status',
        'VERPACKT_DATUM_UHRZEIT': 'packed_date_time',
        'SONDERFAHRT': 'special_trip',
        'LIEFERSCHEIN_DATUM_UHRZEIT': 'delivery_number_date_time',
        'BEREITGESTELLT_DATUM_UHRZEIT': 'provided_date_time',
        'TA_DATUM_UHRZEIT': 'transport_order_date_time',
        'NOTIZ': 'note',
        'AUFTRAGANNAHME_DATUM_UHRZEIT': 'order_acceptance_date_time',
        'DISPLAYNAME': 'display_name',
        'LAND': 'country',
        'GELOESCHT': 'deleted',
        'BEMERKUNG_GELOESCHT': 'note_deleted',
        'ARCHIV': 'archive',
        'GELOESCHT_DATUM_UHRZEIT': 'deleted_date_time',
        'TRACKING_NUMMER': 'tracking_number',
        'DIENSTLEISTER': 'service_provider',
        'KLAERUNG_LS': 'clarification_delivery_number',
        'KLAERUNG_TA': 'clarification_transport_order',
        'GEFAHRGUT': 'dangerous_goods',
        'PO_NUMMER': 'purchase_order_number',
        'ANGEKUENDIGT': 'announced',
        'PRIO': 'priority',
        'VERSENDER': 'sender',
    }
)

In [6]:
# Remove rows where "deleted" == 1
# Every row whose flag is not exactly 1 is kept, including rows with a missing flag.
data = data.loc[~(data['deleted'] == 1)]

In [7]:
# Replace status from German to English
# Whole-value replacement; statuses not listed here are left unchanged.
data['status'] = data['status'].replace({
    'angeliefert': 'delivered',
    'verpackt': 'packed',
    'LS erstellt': 'delivery number created',
    'bereitgestellt': 'provided',
    'TA erstellt': 'transport order created',
    'in Klärung LS': 'in clarification delivery number',
    'in Klärung TA': 'in clarification transport order',
    'in Bearbeitung': 'in provision of the shipment',
})

In [8]:
# Replace package type from German to English
# Whole-value replacement of the package-type codes; other values are left unchanged.
data['package_type'] = data['package_type'].replace({
    'CAR': 'carton',
    'BEH': 'container',
    'PAL': 'pallet',
    'GBP': 'grid box',
})

In [9]:
# Clean country values
# Each key is matched against the whole cell value (not as a substring or regex)
# and rewritten to the country code on the right.
data['country'] = data['country'].replace({
    'de': 'DE',
    'IN - Bidadi': 'IN',
    'DE - FCA Brucker': 'DE',
    'Tr': 'TR',
    'DE ': 'DE',
    ' DE': 'DE',
    ' AT ': 'AT',
    '^DE': 'DE',
    'DE - DAP | Schenker': 'DE',
    'Österreich': 'AT',
    'DE - Schenker/UPS': 'DE',
    'USA': 'US',
    'de- Koller': 'DE',
    'dE': 'DE',
    'De': 'DE',
    'D': 'DE',
    'at': 'AT',
    'AUT': 'AT',
})

# Remove rows where "country" has weird values
data = data[~data['country'].isin(['TNT | 070275454 | FCA', 'DER', 'FCA'])]

In [10]:
# Filtering Rows

# Columns that must all be populated for a row to be kept
required_columns = [
    'id',
    'special_trip',
    'receipt_date_time',
    'length_in_cm',
    'width_in_cm',
    'height_in_cm',
    'weight_in_kg',
    'package_type',
    'packed_date_time',
    'country',
    'order_acceptance_date_time',
    'delivery_number_date_time',
    'provided_date_time',
    'transport_order_date_time',
]
# Keep only the rows that have a non-null value in every required column
data = data[data[required_columns].notna().all(axis=1)]

In [11]:
# Remove rows that have an empty value in any column
data = data.dropna(axis=0, how='any')

In [12]:
# Remove empty values in country
data = data[data['country'].notna()]

In [13]:
# Export table
# Write the cleaned frame next to the inputs, without the index column.
data.to_excel(excel_writer=data_path + 'data.xlsx', index=False)

# Feature Engineering

In [14]:
# Work on an independent (deep) copy so feature engineering does not modify `data`
processed_data = data.copy(deep=True)

In [15]:
# Feature extraction from timestamps

# Parse all six timestamp columns to datetimes
for timestamp_column in (
    'receipt_date_time',
    'packed_date_time',
    'order_acceptance_date_time',
    'delivery_number_date_time',
    'provided_date_time',
    'transport_order_date_time',
):
    processed_data[timestamp_column] = pd.to_datetime(processed_data[timestamp_column])

# Timestamp column -> prefix of the feature columns derived from it.
# 'transport_order_date_time' is parsed above but gets no derived features.
name_mappings = {
    'receipt_date_time': 'receipt',
    'packed_date_time': 'packed',
    'order_acceptance_date_time': 'order_acceptance',
    'delivery_number_date_time': 'delivery_number',
    'provided_date_time': 'provided',
}
time_columns = list(name_mappings)

for col, base_name in name_mappings.items():
    stamp = processed_data[col].dt

    # Plain calendar/clock components (weekday: Monday=0 ... Sunday=6)
    for part in ('day', 'month', 'year', 'hour', 'minute', 'second', 'weekday'):
        processed_data[f'{base_name}_{part}'] = getattr(stamp, part)

    # Six-hour buckets, left-closed: [0,6) night, [6,12) morning,
    # [12,18) afternoon, [18,24) evening
    processed_data[f'{base_name}_time_of_day'] = pd.cut(
        stamp.hour,
        bins=[0, 6, 12, 18, 24],
        labels=['night', 'morning', 'afternoon', 'evening'],
        right=False
    )
    processed_data[f'{base_name}_week_of_year'] = stamp.isocalendar().week
    # Saturday (5) and Sunday (6) count as weekend
    processed_data[f'{base_name}_is_weekend'] = (stamp.weekday >= 5).astype(int)
    processed_data[f'{base_name}_quarter'] = stamp.quarter
    # Hours 9 through 17, both ends inclusive
    processed_data[f'{base_name}_is_business_hour'] = stamp.hour.between(9, 17).astype(int)

# Elapsed seconds between consecutive process steps (later minus earlier)
for diff_column, earlier, later in (
    ('time_diff_receipt_to_packed', 'receipt_date_time', 'packed_date_time'),
    ('time_diff_packed_to_acceptance', 'packed_date_time', 'order_acceptance_date_time'),
    ('time_diff_acceptance_to_delivery', 'order_acceptance_date_time', 'delivery_number_date_time'),
    ('time_diff_delivery_to_provided', 'delivery_number_date_time', 'provided_date_time'),
):
    processed_data[diff_column] = (processed_data[later] - processed_data[earlier]).dt.total_seconds()

In [16]:
# Feature extraction from length, width, height and weight

# Cast the physical measurements to float before doing arithmetic on them
for measurement in ('length_in_cm', 'width_in_cm', 'height_in_cm', 'weight_in_kg'):
    processed_data[measurement] = processed_data[measurement].astype(float)

# Calculate volume => Volume = length × width × height
# (cubic centimetres, going by the "_in_cm" column names)
processed_data['volume'] = processed_data['length_in_cm'] * processed_data['width_in_cm'] * processed_data['height_in_cm']

# Calculate density => Density = weight / volume
# (kg per cubic centimetre, going by the column names).
# A zero volume is turned into NaN first, so such rows get a missing density
# instead of an infinite one.
processed_data['density'] = processed_data['weight_in_kg'] / processed_data['volume'].replace(0, np.nan)

In [17]:
# Select specific columns
# The order below is the column order of the resulting frame.
selected_columns = [
    # identifiers / flags
    'id', 'special_trip',
    # receipt timestamp features
    'receipt_day', 'receipt_month', 'receipt_year',
    'receipt_hour', 'receipt_minute', 'receipt_second',
    'receipt_weekday', 'receipt_week_of_year', 'receipt_time_of_day',
    'receipt_is_weekend', 'receipt_quarter', 'receipt_is_business_hour',
    'receipt_date_time',
    # physical measurements and derived features
    'length_in_cm', 'width_in_cm', 'height_in_cm', 'weight_in_kg',
    'volume', 'density', 'package_type',
    # packed timestamp features
    'time_diff_receipt_to_packed',
    'packed_day', 'packed_month', 'packed_year',
    'packed_hour', 'packed_minute', 'packed_second',
    'packed_weekday', 'packed_time_of_day', 'packed_week_of_year',
    'packed_is_weekend', 'packed_quarter', 'packed_is_business_hour',
    'packed_date_time',
    'country',
    # order-acceptance timestamp features
    'time_diff_packed_to_acceptance',
    'order_acceptance_day', 'order_acceptance_month', 'order_acceptance_year',
    'order_acceptance_hour', 'order_acceptance_minute', 'order_acceptance_second',
    'order_acceptance_weekday', 'order_acceptance_time_of_day', 'order_acceptance_week_of_year',
    'order_acceptance_is_weekend', 'order_acceptance_quarter', 'order_acceptance_is_business_hour',
    'order_acceptance_date_time',
    # delivery-number timestamp features
    'time_diff_acceptance_to_delivery',
    'delivery_number_day', 'delivery_number_month', 'delivery_number_year',
    'delivery_number_hour', 'delivery_number_minute', 'delivery_number_second',
    'delivery_number_weekday', 'delivery_number_time_of_day', 'delivery_number_week_of_year',
    'delivery_number_is_weekend', 'delivery_number_quarter', 'delivery_number_is_business_hour',
    'delivery_number_date_time',
    # provided timestamp features
    'time_diff_delivery_to_provided',
    'provided_day', 'provided_month', 'provided_year',
    'provided_hour', 'provided_minute', 'provided_second',
    'provided_weekday', 'provided_time_of_day', 'provided_week_of_year',
    'provided_is_weekend', 'provided_quarter', 'provided_is_business_hour',
    'provided_date_time',
    # transport-order timestamp (kept as a raw datetime; no derived features)
    'transport_order_date_time',
]

# Keep only the selected columns, in the order listed above
processed_data = processed_data[selected_columns]

In [18]:
# Lift the column display limit so wide frames are shown in full
pd.set_option('display.max_columns', None)

# Preview the first five rows
processed_data.head(n=5)

Unnamed: 0,id,special_trip,receipt_day,receipt_month,receipt_year,receipt_hour,receipt_minute,receipt_second,receipt_weekday,receipt_week_of_year,receipt_time_of_day,receipt_is_weekend,receipt_quarter,receipt_is_business_hour,receipt_date_time,length_in_cm,width_in_cm,height_in_cm,weight_in_kg,volume,density,package_type,time_diff_receipt_to_packed,packed_day,packed_month,packed_year,packed_hour,packed_minute,packed_second,packed_weekday,packed_time_of_day,packed_week_of_year,packed_is_weekend,packed_quarter,packed_is_business_hour,packed_date_time,country,time_diff_packed_to_acceptance,order_acceptance_day,order_acceptance_month,order_acceptance_year,order_acceptance_hour,order_acceptance_minute,order_acceptance_second,order_acceptance_weekday,order_acceptance_time_of_day,order_acceptance_week_of_year,order_acceptance_is_weekend,order_acceptance_quarter,order_acceptance_is_business_hour,order_acceptance_date_time,time_diff_acceptance_to_delivery,delivery_number_day,delivery_number_month,delivery_number_year,delivery_number_hour,delivery_number_minute,delivery_number_second,delivery_number_weekday,delivery_number_time_of_day,delivery_number_week_of_year,delivery_number_is_weekend,delivery_number_quarter,delivery_number_is_business_hour,delivery_number_date_time,time_diff_delivery_to_provided,provided_day,provided_month,provided_year,provided_hour,provided_minute,provided_second,provided_weekday,provided_time_of_day,provided_week_of_year,provided_is_weekend,provided_quarter,provided_is_business_hour,provided_date_time,transport_order_date_time
1897,3095,0,15,9,2023,10,49,16,4,37,morning,0,3,1,2023-09-15 10:49:16,40.0,30.0,14.0,1.78,16800.0,0.000106,carton,78.0,15,9,2023,10,50,34,4,morning,37,0,3,1,2023-09-15 10:50:34,BR,242832.0,18,9,2023,6,17,46,0,morning,38,0,3,0,2023-09-18 06:17:46,489.0,18,9,2023,6,25,55,0,morning,38,0,3,0,2023-09-18 06:25:55,79873.0,19,9,2023,4,37,8,1,night,38,0,3,0,2023-09-19 04:37:08,2024-02-19 11:47:54
1898,3096,0,15,9,2023,10,57,41,4,37,morning,0,3,1,2023-09-15 10:57:41,40.0,30.0,14.0,2.36,16800.0,0.00014,carton,40.0,15,9,2023,10,58,21,4,morning,37,0,3,1,2023-09-15 10:58:21,BR,242919.0,18,9,2023,6,27,0,0,morning,38,0,3,0,2023-09-18 06:27:00,211.0,18,9,2023,6,30,31,0,morning,38,0,3,0,2023-09-18 06:30:31,80327.0,19,9,2023,4,49,18,1,night,38,0,3,0,2023-09-19 04:49:18,2024-02-19 11:42:50
1899,3097,0,15,9,2023,11,1,46,4,37,morning,0,3,1,2023-09-15 11:01:46,40.0,30.0,14.0,2.4,16800.0,0.000143,carton,32.0,15,9,2023,11,2,18,4,morning,37,0,3,1,2023-09-15 11:02:18,BR,242908.0,18,9,2023,6,30,46,0,morning,38,0,3,0,2023-09-18 06:30:46,222.0,18,9,2023,6,34,28,0,morning,38,0,3,0,2023-09-18 06:34:28,80813.0,19,9,2023,5,1,21,1,night,38,0,3,0,2023-09-19 05:01:21,2024-02-19 11:43:02
1936,3163,0,19,9,2023,11,33,28,1,38,morning,0,3,1,2023-09-19 11:33:28,40.0,30.0,20.0,2.66,24000.0,0.000111,carton,38.0,19,9,2023,11,34,6,1,morning,38,0,3,1,2023-09-19 11:34:06,BR,82858.0,20,9,2023,10,35,4,2,morning,38,0,3,1,2023-09-20 10:35:04,1134.0,20,9,2023,10,53,58,2,morning,38,0,3,1,2023-09-20 10:53:58,746.0,20,9,2023,11,6,24,2,morning,38,0,3,1,2023-09-20 11:06:24,2024-02-19 11:43:13
1937,3164,0,19,9,2023,11,42,18,1,38,morning,0,3,1,2023-09-19 11:42:18,40.0,30.0,20.0,8.2,24000.0,0.000342,carton,20.0,19,9,2023,11,42,38,1,morning,38,0,3,1,2023-09-19 11:42:38,BR,87016.0,20,9,2023,11,52,54,2,morning,38,0,3,1,2023-09-20 11:52:54,369.0,20,9,2023,11,59,3,2,morning,38,0,3,1,2023-09-20 11:59:03,69846.0,21,9,2023,7,23,9,3,morning,38,0,3,0,2023-09-21 07:23:09,2024-02-19 11:43:23


In [19]:
# Take the first row as a sample of every column and its value
first_row = processed_data.iloc[0]

# Column name -> value in the first row
column_value_pairs = {column: first_row[column] for column in processed_data.columns}

# Print one "column: value" line per column
for column, value in column_value_pairs.items():
    print(f"{column}: {value}")


id: 3095
special_trip: 0
receipt_day: 15
receipt_month: 9
receipt_year: 2023
receipt_hour: 10
receipt_minute: 49
receipt_second: 16
receipt_weekday: 4
receipt_week_of_year: 37
receipt_time_of_day: morning
receipt_is_weekend: 0
receipt_quarter: 3
receipt_is_business_hour: 1
receipt_date_time: 2023-09-15 10:49:16
length_in_cm: 40.0
width_in_cm: 30.0
height_in_cm: 14.0
weight_in_kg: 1.78
volume: 16800.0
density: 0.00010595238095238096
package_type: carton
time_diff_receipt_to_packed: 78.0
packed_day: 15
packed_month: 9
packed_year: 2023
packed_hour: 10
packed_minute: 50
packed_second: 34
packed_weekday: 4
packed_time_of_day: morning
packed_week_of_year: 37
packed_is_weekend: 0
packed_quarter: 3
packed_is_business_hour: 1
packed_date_time: 2023-09-15 10:50:34
country: BR
time_diff_packed_to_acceptance: 242832.0
order_acceptance_day: 18
order_acceptance_month: 9
order_acceptance_year: 2023
order_acceptance_hour: 6
order_acceptance_minute: 17
order_acceptance_second: 46
order_acceptance_week

In [20]:
# Export table
# Write the engineered feature table next to the inputs, without the index column.
processed_data.to_excel(excel_writer=data_path + 'processed_data.xlsx', index=False)