# Import libraries

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')


# Load data

In [2]:
print("Loading CSV files...")
orders = pd.read_csv('orders.csv')
customers = pd.read_csv('customers.csv')
order_items = pd.read_csv('order_items.csv')
geolocation = pd.read_csv('geolocation.csv')

Loading CSV files...


# Prepare data

In [3]:

print("Preparing data...")
orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])
orders['order_delivered_customer_date'] = pd.to_datetime(orders['order_delivered_customer_date'])
orders['order_estimated_delivery_date'] = pd.to_datetime(orders['order_estimated_delivery_date'])
orders['order_delivered_carrier_date'] = pd.to_datetime(orders['order_delivered_carrier_date'])

Preparing data...


In [4]:
orders.head(5)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26


# Target: delivery_delay

In [5]:
orders['delivery_time'] = (orders['order_delivered_customer_date'] - 
                          orders['order_purchase_timestamp']).dt.total_seconds() / (24 * 3600)
orders['delivery_delay'] = np.where(orders['order_delivered_customer_date'] > 
                                   orders['order_estimated_delivery_date'], 1, 0)

# Features

In [6]:
features = orders[['order_id', 'customer_id', 'order_purchase_timestamp', 'delivery_time', 
                  'delivery_delay', 'order_delivered_carrier_date']].copy()
features = features.merge(customers[['customer_id', 'customer_state']], on='customer_id', how='left')
freight = order_items.groupby('order_id')['freight_value'].mean().reset_index()
features = features.merge(freight, on='order_id', how='left')
geo = geolocation.drop_duplicates(subset=['geolocation_zip_code_prefix']).copy()
customer_geo = customers.merge(geo[['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']], 
                              left_on='customer_zip_code_prefix', right_on='geolocation_zip_code_prefix', how='left')
features = features.merge(customer_geo[['customer_id', 'geolocation_lat', 'geolocation_lng']], 
                         on='customer_id', how='left')

# Preprocessing

In [7]:
print("Preprocessing data...")
# Drop rows where target or delivery_time is missing (can't impute these)
features = features.dropna(subset=['delivery_time', 'delivery_delay'])

Preprocessing data...


# Encode categorical feature

In [8]:
le = LabelEncoder()
features['customer_state_encoded'] = le.fit_transform(features['customer_state'])

In [9]:
# Extract hour
features['purchase_hour'] = features['order_purchase_timestamp'].dt.hour

# Add shipping_lead_time
features['shipping_lead_time'] = (features['order_delivered_carrier_date'] - 
                                 features['order_purchase_timestamp']).dt.total_seconds() / (24 * 3600)

# Check for NaNs in features
print("Checking for NaNs in features:")
print(features[['purchase_hour', 'customer_state_encoded', 'freight_value', 
                'geolocation_lat', 'geolocation_lng', 'shipping_lead_time']].isna().sum())

# Impute missing values
imputer = SimpleImputer(strategy='median')
X = features[['purchase_hour', 'customer_state_encoded', 'freight_value', 
             'geolocation_lat', 'geolocation_lng', 'shipping_lead_time']]
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Target
y = features['delivery_delay']

# Verify no NaNs remain
print("NaNs after imputation:")
print(X_imputed.isna().sum())

Checking for NaNs in features:
purchase_hour               0
customer_state_encoded      0
freight_value               0
geolocation_lat           264
geolocation_lng           264
shipping_lead_time          1
dtype: int64
NaNs after imputation:
purchase_hour             0
customer_state_encoded    0
freight_value             0
geolocation_lat           0
geolocation_lng           0
shipping_lead_time        0
dtype: int64


# Apply SMOTE

In [10]:
print("Applying SMOTE to balance classes...")
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_imputed, y)

Applying SMOTE to balance classes...


# Split data & Train Random Forest

In [11]:

print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)


print("Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)

Splitting data...
Training Random Forest...


# Evaluate

In [12]:

print("Evaluating model...")
rf_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, rf_pred)
precision = precision_score(y_test, rf_pred)
recall = recall_score(y_test, rf_pred)

print("\nRandom Forest Results (Improved):")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print("Classification Report:")
print(classification_report(y_test, rf_pred))

Evaluating model...

Random Forest Results (Improved):
Accuracy: 0.91
Precision: 0.91
Recall: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.91     17683
           1       0.91      0.92      0.91     17777

    accuracy                           0.91     35460
   macro avg       0.91      0.91      0.91     35460
weighted avg       0.91      0.91      0.91     35460



# Save predictions

In [13]:
print("Saving predictions...")
# Predict on original (non-balanced) data
features['rf_delay_pred'] = rf_model.predict(X_imputed)
output = features[['order_id', 'delivery_time', 'delivery_delay', 'rf_delay_pred']]
output.to_csv('delay_predictions_improved.csv', index=False)
print("Predictions saved to 'delay_predictions_improved.csv'")

# Feature importance
print("\nFeature Importance:")
importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(importance)

Saving predictions...
Predictions saved to 'delay_predictions_improved.csv'

Feature Importance:
                  Feature  Importance
5      shipping_lead_time    0.238259
4         geolocation_lng    0.190559
2           freight_value    0.189178
3         geolocation_lat    0.178751
0           purchase_hour    0.161159
1  customer_state_encoded    0.042093
