# Import Libraries

In [14]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pickle
from sklearn.preprocessing import StandardScaler

# Load and Process Data

In [9]:
# Relative path prefix of the data directory. File names are appended to it by
# plain string concatenation, so the trailing slash is required.
data_path = '../data/'

In [10]:
# Read the processed dataset from its Excel workbook into a DataFrame
workbook_path = data_path + 'processed_data.xlsx'
data = pd.read_excel(workbook_path)

In [11]:
# Keep only the columns used by the rest of the notebook.
# The labels are grouped by processing stage; their order is the column order
# of the resulting frame.
selected_columns = [
    'special_trip',

    # Receipt timestamp parts
    'receipt_day', 'receipt_month', 'receipt_year',
    'receipt_hour', 'receipt_minute', 'receipt_second',
    'receipt_weekday', 'receipt_week_of_year', 'receipt_time_of_day',
    'receipt_is_weekend', 'receipt_quarter', 'receipt_is_business_hour',
    'receipt_date_time',

    # Package description
    'length_in_cm', 'width_in_cm', 'height_in_cm', 'weight_in_kg',
    'volume', 'density', 'package_type',

    # Packed timestamp parts
    'time_diff_receipt_to_packed',
    'packed_day', 'packed_month', 'packed_year',
    'packed_hour', 'packed_minute', 'packed_second',
    'packed_weekday', 'packed_time_of_day', 'packed_week_of_year',
    'packed_is_weekend', 'packed_quarter', 'packed_is_business_hour',
    'packed_date_time',

    'country',

    # Order acceptance timestamp parts
    'time_diff_packed_to_acceptance',
    'order_acceptance_day', 'order_acceptance_month', 'order_acceptance_year',
    'order_acceptance_hour', 'order_acceptance_minute', 'order_acceptance_second',
    'order_acceptance_weekday', 'order_acceptance_time_of_day', 'order_acceptance_week_of_year',
    'order_acceptance_is_weekend', 'order_acceptance_quarter', 'order_acceptance_is_business_hour',
    'order_acceptance_date_time',

    # Delivery number timestamp parts
    'time_diff_acceptance_to_delivery',
    'delivery_number_day', 'delivery_number_month', 'delivery_number_year',
    'delivery_number_hour', 'delivery_number_minute', 'delivery_number_second',
    'delivery_number_weekday', 'delivery_number_time_of_day', 'delivery_number_week_of_year',
    'delivery_number_is_weekend', 'delivery_number_quarter', 'delivery_number_is_business_hour',
    'delivery_number_date_time',

    # Provided timestamp parts
    'time_diff_delivery_to_provided',
    'provided_day', 'provided_month', 'provided_year',
    'provided_hour', 'provided_minute', 'provided_second',
    'provided_weekday', 'provided_time_of_day', 'provided_week_of_year',
    'provided_is_weekend', 'provided_quarter', 'provided_is_business_hour',
    'provided_date_time',

    'transport_order_date_time',
]

data = data.loc[:, selected_columns]

In [15]:
# Deal with date time features. Convert them to Unix timestamps
# (whole seconds elapsed since 1970-01-01 00:00:00 UTC).

datetime_features = [
    'receipt_date_time', 'packed_date_time', 'order_acceptance_date_time',
    'delivery_number_date_time', 'provided_date_time', 'transport_order_date_time'
]


def _to_unix_seconds(values):
    """Return `values` as whole seconds since the Unix epoch.

    The result is computed by dividing the offset from the epoch by one
    second, so it does not depend on the storage unit of the datetime column
    (casting to int64 and dividing by 10**9 is only correct for nanosecond
    storage). Missing timestamps (NaT) come out as NaN instead of being cast
    to an integer, so they stay recognisable as missing.

    Parameters
    ----------
    values : pandas.Series
        Anything `pd.to_datetime` can parse into a datetime Series.

    Returns
    -------
    pandas.Series
        Seconds since the epoch, rounded down. Integer when every value is
        present, float with NaN when some are missing.
    """
    stamps = pd.to_datetime(values)
    if stamps.dt.tz is not None:
        # Express tz-aware values in UTC and drop the zone so they can be
        # compared with the naive epoch below.
        stamps = stamps.dt.tz_convert('UTC').dt.tz_localize(None)
    return (stamps - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)


# NOTE: this cell is not idempotent. Re-running it would parse the already
# converted numbers as timestamps again; re-load the data first.
for feature in datetime_features:
    data[feature] = _to_unix_seconds(data[feature])

In [16]:
# Deal with numerical features. Do feature scaling (zero mean, unit variance).

numerical_features = [
    'receipt_day', 'receipt_month', 'receipt_year', 'receipt_hour', 'receipt_minute', 'receipt_second',
    'receipt_weekday', 'receipt_week_of_year', 'receipt_quarter',
    'length_in_cm', 'width_in_cm', 'height_in_cm', 'weight_in_kg', 'volume', 'density',
    'time_diff_receipt_to_packed', 'packed_day', 'packed_month', 'packed_year', 'packed_hour', 'packed_minute',
    'packed_second', 'packed_weekday', 'packed_week_of_year', 'packed_quarter',
    'packed_date_time', 'time_diff_packed_to_acceptance', 'order_acceptance_day',
    'order_acceptance_month', 'order_acceptance_year', 'order_acceptance_hour', 'order_acceptance_minute',
    'order_acceptance_second', 'order_acceptance_weekday', 'order_acceptance_week_of_year',
    'order_acceptance_quarter', 'order_acceptance_date_time',
    'time_diff_acceptance_to_delivery', 'delivery_number_day', 'delivery_number_month', 'delivery_number_year',
    'delivery_number_hour', 'delivery_number_minute', 'delivery_number_second', 'delivery_number_weekday',
    'delivery_number_week_of_year', 'delivery_number_quarter',
    'delivery_number_date_time', 'time_diff_delivery_to_provided', 'provided_day', 'provided_month', 'provided_year',
    'provided_hour', 'provided_minute', 'provided_second', 'provided_weekday', 'provided_week_of_year',
    'provided_quarter', 'provided_date_time', 'transport_order_date_time'
]

# Fit the scaler on the training rows only. Fitting on the full frame lets the
# mean/variance of the held-out rows leak into the training features.
# The row positions are obtained with the same test_size and random_state as
# the train/test split performed later in this notebook; for an equal number
# of rows that yields the same partition, so keep the two calls in sync.
train_positions, _ = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42
)

scaler = StandardScaler()
scaler.fit(data.iloc[train_positions][numerical_features])

# Apply the training-set statistics to every row (train and test alike).
# NOTE: 'transport_order_date_time' (used as the target later on) is scaled as
# well, so predictions are in scaled units; `scaler` is needed to map them
# back to seconds.
data[numerical_features] = scaler.transform(data[numerical_features])

In [17]:
# Deal with categorical data: one-hot encode it.

categorical_features = [
    'package_type',
    'country',
    'receipt_time_of_day',
    'packed_time_of_day',
    'order_acceptance_time_of_day',
    'delivery_number_time_of_day',
    'provided_time_of_day',
]

# drop_first=True leaves out the first level of every feature to avoid multicollinearity
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

# Cast every boolean column of the frame to integers
boolean_columns = data.select_dtypes(include='bool').columns
data = data.astype(dict.fromkeys(boolean_columns, int))

In [18]:
# Narrow the frame down to the final model columns
# (the last entry, 'transport_order_date_time', is the column split off as y below).
selected_columns = [
    # packed -> acceptance
    'time_diff_packed_to_acceptance',
    'order_acceptance_day', 'order_acceptance_month', 'order_acceptance_year',
    'order_acceptance_week_of_year', 'order_acceptance_date_time',
    # acceptance -> delivery
    'time_diff_acceptance_to_delivery',
    'delivery_number_month', 'delivery_number_year',
    'delivery_number_week_of_year', 'delivery_number_date_time',
    # delivery -> provided
    'time_diff_delivery_to_provided',
    'provided_year', 'provided_week_of_year', 'provided_date_time',
    # transport order
    'transport_order_date_time',
]

data = data.loc[:, selected_columns]

# Split and Store Data

In [19]:
# Split the frame into the target (y) and the predictors (X),
# then hold out 20% of the rows as a test set with a fixed seed.
y = data.loc[:, 'transport_order_date_time']
X = data.drop('transport_order_date_time', axis='columns')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Persist the four split parts as a single pickled tuple,
# in the order (X_train, X_test, y_train, y_test)
split_file = data_path + 'data_split.pkl'
with open(split_file, mode='wb') as handle:
    pickle.dump((X_train, X_test, y_train, y_test), handle)