# 02 - Delivery Delay Prediction Model

## 📌 Objective
Build and evaluate machine learning models to predict whether an order will be delayed (`Is_Delayed`).

---


In [50]:
import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

# --------------------------
# LOAD DATA
# --------------------------
# Cleaned delivery dataset; the path is machine-specific and must exist locally.
df = pd.read_csv(r"C:/Users/HP/Downloads/Data_analyst_learning/amazon_project/data/cleaned_amazon_delivery.csv")

# Quick sanity check of the schema and the first few rows.
print("Columns:", df.columns.tolist())
print(df.head())

# --------------------------
# FEATURES AND TARGET
# --------------------------
# Only these columns are fed to the model; every other column in the CSV is
# left out of the feature matrix.
numeric_cols = ["Agent_Age", "Agent_Rating"]
categorical_cols = ["Weather", "Traffic", "Vehicle", "Area", "Category"]

X = df[numeric_cols + categorical_cols]
y = df["Is_Delayed"]

# --------------------------
# TRAIN-TEST SPLIT
# --------------------------
# 80/20 split, stratified on the target so both parts keep the same class
# balance; the fixed seed makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------
# PREPROCESSOR
# --------------------------
# One-hot encode the categorical features and standardise the numeric ones.
# handle_unknown="ignore" keeps prediction from failing on categories that
# were not present in the training data.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", StandardScaler(), numeric_cols)
    ]
)

# --------------------------
# PIPELINE (FINAL)
# --------------------------
# Preprocessing and the classifier are bundled into one object, so the saved
# artifact accepts raw feature columns directly.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss"
    ))
])

# --------------------------
# TRAIN MODEL
# --------------------------
pipeline.fit(X_train, Y_train)

print("Training complete!")

# --------------------------
# EVALUATE ON HELD-OUT DATA
# --------------------------
# The test split was previously created but never used. Pipeline.score applies
# the fitted preprocessing and delegates to the classifier's score method
# (mean accuracy for scikit-learn style classifiers).
# NOTE(review): if Is_Delayed is imbalanced, accuracy alone can be misleading;
# consider precision/recall as well.
test_accuracy = pipeline.score(X_test, Y_test)
print(f"Test accuracy: {test_accuracy:.4f}")

# --------------------------
# SAVE FINAL MODEL
# --------------------------
SAVE_PATH = "C:/Users/HP/Downloads/Data_analyst_learning/amazon_project/models/DELAY_MODEL_FINAL.pkl"

# Create the target directory first so the dump does not fail (and discard the
# trained model) just because the folder is missing.
os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)
joblib.dump(pipeline, SAVE_PATH)

print("MODEL SAVED AT:", SAVE_PATH)


Columns: ['Order_ID', 'Agent_Age', 'Agent_Rating', 'Store_Latitude', 'Store_Longitude', 'Drop_Latitude', 'Drop_Longitude', 'Order_Date', 'Order_Time', 'Pickup_Time', 'Weather', 'Traffic', 'Vehicle', 'Area', 'Delivery_Time', 'Category', 'Delivery_Duration_Minutes', 'Is_Delayed', 'Churned']
        Order_ID  Agent_Age  Agent_Rating  Store_Latitude  Store_Longitude  \
0  ialx566343618         37           4.9       22.745049        75.892471   
1  akqg208421122         34           4.5       12.913041        77.683237   
2  njpu434582536         23           4.4       12.914264        77.678400   
3  rjto796129700         38           4.7       11.003669        76.976494   
4  zguw716275638         32           4.6       12.972793        80.249982   

   Drop_Latitude  Drop_Longitude  Order_Date Order_Time          Pickup_Time  \
0      22.765049       75.912471  2022-03-19   11:30:00  2022-03-19 11:45:00   
1      13.043041       77.813237  2022-03-25   19:45:00  2022-03-25 19:50:00   
2