# Import Libraries

In [21]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pickle
from sklearn.preprocessing import StandardScaler

# Load and Process Data

In [22]:
# Relative path prefix (with trailing slash) that file names are appended to.
directory = "../data/"

In [23]:
# Load the workbook `processed_data.xlsx` from the data directory.
excel_path = directory + 'processed_data.xlsx'
data = pd.read_excel(excel_path)

In [24]:
# Discard every row that has a missing value in any column.
data = data.dropna()

In [25]:
# Columns to keep from the loaded sheet, in the order they should appear in
# the frame. The comments only group the names by their prefix.
selected_columns = [
    'special_trip',
    # receipt_*
    'receipt_day', 'receipt_month', 'receipt_year',
    'receipt_hour', 'receipt_minute', 'receipt_second',
    'receipt_weekday', 'receipt_week_of_year', 'receipt_time_of_day',
    'receipt_is_weekend', 'receipt_quarter', 'receipt_is_business_hour',
    'receipt_date_time',
    # package measurements
    'length_in_cm', 'width_in_cm', 'height_in_cm', 'weight_in_kg',
    'volume', 'density', 'package_type',
    # packed_*
    'time_diff_receipt_to_packed',
    'packed_day', 'packed_month', 'packed_year',
    'packed_hour', 'packed_minute', 'packed_second',
    'packed_weekday', 'packed_time_of_day', 'packed_week_of_year',
    'packed_is_weekend', 'packed_quarter', 'packed_is_business_hour',
    'packed_date_time',
    'country',
    # order_acceptance_*
    'time_diff_packed_to_acceptance',
    'order_acceptance_day', 'order_acceptance_month', 'order_acceptance_year',
    'order_acceptance_hour', 'order_acceptance_minute', 'order_acceptance_second',
    'order_acceptance_weekday', 'order_acceptance_time_of_day',
    'order_acceptance_week_of_year', 'order_acceptance_is_weekend',
    'order_acceptance_quarter', 'order_acceptance_is_business_hour',
    'order_acceptance_date_time',
    # delivery_number_*
    'time_diff_acceptance_to_delivery',
    'delivery_number_day', 'delivery_number_month', 'delivery_number_year',
    'delivery_number_hour', 'delivery_number_minute', 'delivery_number_second',
    'delivery_number_weekday', 'delivery_number_time_of_day',
    'delivery_number_week_of_year', 'delivery_number_is_weekend',
    'delivery_number_quarter', 'delivery_number_is_business_hour',
    'delivery_number_date_time',
    # provided_*
    'time_diff_delivery_to_provided',
    'provided_day', 'provided_month', 'provided_year',
    'provided_hour', 'provided_minute', 'provided_second',
    'provided_weekday', 'provided_time_of_day', 'provided_week_of_year',
    'provided_is_weekend', 'provided_quarter', 'provided_is_business_hour',
    'provided_date_time',
    # used as the prediction target when the data is split
    'transport_order_date_time',
]

# Restrict the frame to exactly these columns (raises KeyError if one is absent).
data = data[selected_columns]

In [26]:
# Count the rows whose `country` value is missing and print that count.
country_is_missing = data['country'].isnull()
num_empty_rows = country_is_missing.sum()
print(num_empty_rows)

0


In [27]:
# Keep only the rows that carry a `country` value (defaults spelled out).
data = data.dropna(axis='index', how='any', subset=['country'])

In [28]:
# Re-count the rows with a missing `country` value after the drop and print it.
country_is_missing = data['country'].isnull()
num_empty_rows = country_is_missing.sum()
print(num_empty_rows)

0


In [29]:
# Print the first record as one "column: value" line per column.
first_row = data.iloc[0]
column_value_pairs = {column: first_row[column] for column in data.columns}

for column, value in column_value_pairs.items():
    print(f"{column}: {value}")

special_trip: 0
receipt_day: 12
receipt_month: 5
receipt_year: 2023
receipt_hour: 4
receipt_minute: 57
receipt_second: 57
receipt_weekday: 4
receipt_week_of_year: 19
receipt_time_of_day: night
receipt_is_weekend: 0
receipt_quarter: 2
receipt_is_business_hour: 0
receipt_date_time: 2023-05-12 04:57:57
length_in_cm: 25
width_in_cm: 20
height_in_cm: 16
weight_in_kg: 1.0
volume: 8000
density: 0.000125
package_type: carton
time_diff_receipt_to_packed: 26
packed_day: 12
packed_month: 5
packed_year: 2023
packed_hour: 4
packed_minute: 58
packed_second: 23
packed_weekday: 4
packed_time_of_day: night
packed_week_of_year: 19
packed_is_weekend: 0
packed_quarter: 2
packed_is_business_hour: 0
packed_date_time: 2023-05-12 04:58:23
country: DE
time_diff_packed_to_acceptance: 255824
order_acceptance_day: 15
order_acceptance_month: 5
order_acceptance_year: 2023
order_acceptance_hour: 4
order_acceptance_minute: 2
order_acceptance_second: 7
order_acceptance_weekday: 0
order_acceptance_time_of_day: night
or

In [30]:
# Remove the cap on displayed columns so the preview shows every column,
# then render the first five rows.
pd.options.display.max_columns = None
data.head(5)

Unnamed: 0,special_trip,receipt_day,receipt_month,receipt_year,receipt_hour,receipt_minute,receipt_second,receipt_weekday,receipt_week_of_year,receipt_time_of_day,receipt_is_weekend,receipt_quarter,receipt_is_business_hour,receipt_date_time,length_in_cm,width_in_cm,height_in_cm,weight_in_kg,volume,density,package_type,time_diff_receipt_to_packed,packed_day,packed_month,packed_year,packed_hour,packed_minute,packed_second,packed_weekday,packed_time_of_day,packed_week_of_year,packed_is_weekend,packed_quarter,packed_is_business_hour,packed_date_time,country,time_diff_packed_to_acceptance,order_acceptance_day,order_acceptance_month,order_acceptance_year,order_acceptance_hour,order_acceptance_minute,order_acceptance_second,order_acceptance_weekday,order_acceptance_time_of_day,order_acceptance_week_of_year,order_acceptance_is_weekend,order_acceptance_quarter,order_acceptance_is_business_hour,order_acceptance_date_time,time_diff_acceptance_to_delivery,delivery_number_day,delivery_number_month,delivery_number_year,delivery_number_hour,delivery_number_minute,delivery_number_second,delivery_number_weekday,delivery_number_time_of_day,delivery_number_week_of_year,delivery_number_is_weekend,delivery_number_quarter,delivery_number_is_business_hour,delivery_number_date_time,time_diff_delivery_to_provided,provided_day,provided_month,provided_year,provided_hour,provided_minute,provided_second,provided_weekday,provided_time_of_day,provided_week_of_year,provided_is_weekend,provided_quarter,provided_is_business_hour,provided_date_time,transport_order_date_time
0,0,12,5,2023,4,57,57,4,19,night,0,2,0,2023-05-12 04:57:57,25,20,16,1.0,8000,0.000125,carton,26,12,5,2023,4,58,23,4,night,19,0,2,0,2023-05-12 04:58:23,DE,255824,15,5,2023,4,2,7,0,night,20,0,2,0,2023-05-15 04:02:07,500,15,5,2023,4,10,27,0,night,20,0,2,0,2023-05-15 04:10:27,91320,16,5,2023,5,32,27,1,night,20,0,2,0,2023-05-16 05:32:27,2023-05-16 09:20:43
1,0,12,5,2023,4,59,24,4,19,night,0,2,0,2023-05-12 04:59:24,25,20,16,2.0,8000,0.00025,carton,52,12,5,2023,5,0,16,4,night,19,0,2,0,2023-05-12 05:00:16,DE,256214,15,5,2023,4,10,30,0,night,20,0,2,0,2023-05-15 04:10:30,359,15,5,2023,4,16,29,0,night,20,0,2,0,2023-05-15 04:16:29,91096,16,5,2023,5,34,45,1,night,20,0,2,0,2023-05-16 05:34:45,2023-05-16 09:29:22
2,0,12,5,2023,5,2,54,4,19,night,0,2,0,2023-05-12 05:02:54,25,20,16,11.0,8000,0.001375,carton,17,12,5,2023,5,3,11,4,night,19,0,2,0,2023-05-12 05:03:11,DE,256400,15,5,2023,4,16,31,0,night,20,0,2,0,2023-05-15 04:16:31,315,15,5,2023,4,21,46,0,night,20,0,2,0,2023-05-15 04:21:46,90914,16,5,2023,5,37,0,1,night,20,0,2,0,2023-05-16 05:37:00,2023-05-16 09:37:26
3,0,12,5,2023,5,5,38,4,19,night,0,2,0,2023-05-12 05:05:38,37,28,16,2.0,16576,0.000121,carton,39,12,5,2023,5,6,17,4,night,19,0,2,0,2023-05-12 05:06:17,DE,256576,15,5,2023,4,22,33,0,night,20,0,2,0,2023-05-15 04:22:33,348,15,5,2023,4,28,21,0,night,20,0,2,0,2023-05-15 04:28:21,90677,16,5,2023,5,39,38,1,night,20,0,2,0,2023-05-16 05:39:38,2023-05-16 10:28:49
4,0,12,5,2023,5,8,49,4,19,night,0,2,0,2023-05-12 05:08:49,25,20,16,1.0,8000,0.000125,carton,69,12,5,2023,5,9,58,4,night,19,0,2,0,2023-05-12 05:09:58,DE,256706,15,5,2023,4,28,24,0,night,20,0,2,0,2023-05-15 04:28:24,329,15,5,2023,4,33,53,0,night,20,0,2,0,2023-05-15 04:33:53,90502,16,5,2023,5,42,15,1,night,20,0,2,0,2023-05-16 05:42:15,2023-05-16 10:28:52


In [31]:
# Timestamp columns that are replaced by whole seconds since the Unix epoch,
# so that later cells can treat them as ordinary numeric columns.
datetime_features = [
    'receipt_date_time', 'packed_date_time', 'order_acceptance_date_time',
    'delivery_number_date_time', 'provided_date_time', 'transport_order_date_time'
]

# Subtracting the epoch and floor-dividing by one second gives epoch seconds
# regardless of the resolution the datetimes are stored in. Casting to int64
# and dividing by 10**9 is only correct for nanosecond-resolution columns.
unix_epoch = pd.Timestamp('1970-01-01')
one_second = pd.Timedelta(seconds=1)

for feature in datetime_features:
    data[feature] = (pd.to_datetime(data[feature]) - unix_epoch) // one_second

In [32]:


# Columns that get standardised. The comments only group names by prefix; the
# order is significant because it is the column order the scaler is fitted on.
# `receipt_date_time` is not listed here and therefore stays unscaled.
numerical_features = [
    # receipt_*
    'receipt_day', 'receipt_month', 'receipt_year',
    'receipt_hour', 'receipt_minute', 'receipt_second',
    'receipt_weekday', 'receipt_week_of_year', 'receipt_quarter',
    # package measurements
    'length_in_cm', 'width_in_cm', 'height_in_cm',
    'weight_in_kg', 'volume', 'density',
    # packed_*
    'time_diff_receipt_to_packed',
    'packed_day', 'packed_month', 'packed_year',
    'packed_hour', 'packed_minute', 'packed_second',
    'packed_weekday', 'packed_week_of_year', 'packed_quarter',
    'packed_date_time',
    # order_acceptance_*
    'time_diff_packed_to_acceptance',
    'order_acceptance_day', 'order_acceptance_month', 'order_acceptance_year',
    'order_acceptance_hour', 'order_acceptance_minute', 'order_acceptance_second',
    'order_acceptance_weekday', 'order_acceptance_week_of_year',
    'order_acceptance_quarter', 'order_acceptance_date_time',
    # delivery_number_*
    'time_diff_acceptance_to_delivery',
    'delivery_number_day', 'delivery_number_month', 'delivery_number_year',
    'delivery_number_hour', 'delivery_number_minute', 'delivery_number_second',
    'delivery_number_weekday', 'delivery_number_week_of_year',
    'delivery_number_quarter', 'delivery_number_date_time',
    # provided_*
    'time_diff_delivery_to_provided',
    'provided_day', 'provided_month', 'provided_year',
    'provided_hour', 'provided_minute', 'provided_second',
    'provided_weekday', 'provided_week_of_year',
    'provided_quarter', 'provided_date_time',
    # later used as the prediction target
    'transport_order_date_time',
]

# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed further down, and it also rescales `transport_order_date_time`,
# which is later used as the target. Test-set statistics therefore leak into
# the features, and predictions come out in scaled units. Confirm this is
# intended; the fitted scaler is not saved in the cells visible here.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numerical_features])
data[numerical_features] = scaled_values

In [33]:
# Text-valued columns that are expanded into indicator columns.
categorical_features = [
    'package_type',
    'country',
    'receipt_time_of_day',
    'packed_time_of_day',
    'order_acceptance_time_of_day',
    'delivery_number_time_of_day',
    'provided_time_of_day',
]

# One-hot encode; drop_first=True omits the first level of each column.
encoded = pd.get_dummies(data, columns=categorical_features, drop_first=True)

# Convert any boolean columns to integers.
boolean_columns = encoded.select_dtypes(include='bool').columns
encoded[boolean_columns] = encoded[boolean_columns].astype(int)

data = encoded

In [34]:
# Final column subset that is carried into the train/test split. The last
# entry, `transport_order_date_time`, is the column later used as the target.
selected_columns = [
    # order_acceptance_*
    'time_diff_packed_to_acceptance',
    'order_acceptance_day', 'order_acceptance_month', 'order_acceptance_year',
    'order_acceptance_week_of_year', 'order_acceptance_date_time',
    # delivery_number_*
    'time_diff_acceptance_to_delivery',
    'delivery_number_month', 'delivery_number_year',
    'delivery_number_week_of_year', 'delivery_number_date_time',
    # provided_*
    'time_diff_delivery_to_provided',
    'provided_year', 'provided_week_of_year', 'provided_date_time',
    # target
    'transport_order_date_time',
]

# Keep only these columns; every other column is dropped here.
data = data[selected_columns]

In [35]:
# Print the first record of the reduced frame, one "column: value" line each.
first_row = data.iloc[0]
column_value_pairs = {column: first_row[column] for column in data.columns}

for column, value in column_value_pairs.items():
    print(f"{column}: {value}")

time_diff_packed_to_acceptance: 0.06456652563415152
order_acceptance_day: -0.054036962954983614
order_acceptance_month: -0.4420379221561615
order_acceptance_year: 0.022298757502824774
order_acceptance_week_of_year: -0.4432018071612445
order_acceptance_date_time: 0.005591541392269052
time_diff_acceptance_to_delivery: -0.024915147448683866
delivery_number_month: -0.4458021863061159
delivery_number_year: 0.003594156408422552
delivery_number_week_of_year: -0.44632973583552976
delivery_number_date_time: -0.01642905374756736
time_diff_delivery_to_provided: 0.011897354796569013
provided_year: 0.010584355733558723
provided_week_of_year: -0.45154582295080037
provided_date_time: -0.008059934273702586
transport_order_date_time: 0.044109371656956976


# Split and Store Data

In [36]:
# Target is `transport_order_date_time`; every other remaining column is a feature.
y = data['transport_order_date_time']
X = data.drop('transport_order_date_time', axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [37]:
# Persist the four split parts as one pickled tuple, in the order
# (X_train, X_test, y_train, y_test), inside the data directory.
split_bundle = (X_train, X_test, y_train, y_test)
with open(directory + 'data_split.pkl', 'wb') as split_file:
    pickle.dump(split_bundle, split_file)