# Freight Logistics - On Time / Delayed Project

## Import Necessary Dependencies

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report
import pickle

In [2]:
df = pd.read_csv('/kaggle/input/arogoai/AI ML Internship Training Data.xlsx - freight_delivery_realistic_data.csv')
df.head()

Unnamed: 0,Shipment ID,Origin,Destination,Shipment Date,Planned Delivery Date,Actual Delivery Date,Vehicle Type,Distance (km),Weather Conditions,Traffic Conditions,Delayed
0,SHIP000000,Jaipur,Mumbai,2023-04-26,2023-05-01,2023-05-02,Trailer,1603,Rain,Light,Yes
1,SHIP000001,Bangalore,Delhi,2023-02-09,2023-02-13,2023-02-17,Trailer,1237,Storm,Moderate,Yes
2,SHIP000002,Mumbai,Chennai,2023-09-19,2023-09-25,2023-09-25,Truck,1863,Clear,Light,No
3,SHIP000003,Hyderabad,Ahmedabad,2023-04-01,2023-04-05,2023-04-05,Container,1374,Clear,Light,No
4,SHIP000004,Chennai,Kolkata,2023-11-24,2023-11-26,2023-11-28,Container,676,Clear,Heavy,Yes


## Data Cleaning

In [3]:
# Identify Missing Values for each feature
df.isnull().sum()

Shipment ID                0
Origin                     0
Destination                0
Shipment Date              0
Planned Delivery Date      0
Actual Delivery Date       0
Vehicle Type             597
Distance (km)              0
Weather Conditions         0
Traffic Conditions         0
Delayed                    0
dtype: int64

In [4]:
# statistics 
df.describe()

Unnamed: 0,Distance (km)
count,20000.0
mean,1101.66375
std,520.717873
min,200.0
25%,649.75
50%,1102.0
75%,1551.0
max,2000.0


In [5]:
# get the unique values in vehicle type
df['Vehicle Type'].unique()

array(['Trailer', 'Truck', 'Container', 'Lorry', nan], dtype=object)

In [6]:
# get the total number of training samples in training set
df.shape

(20000, 11)

Grouping the ranges of Distance into "Short", "Med", "long" and "Very Long" based on its statitics

In [7]:
bins = [199, 650, 1102, 1551, 2000]
labels = ['Short', 'Medium', 'Long', 'Very_long']
df['Distance_Group'] = pd.cut(df['Distance (km)'], bins=bins, labels=labels, right=True)
df.head()

Unnamed: 0,Shipment ID,Origin,Destination,Shipment Date,Planned Delivery Date,Actual Delivery Date,Vehicle Type,Distance (km),Weather Conditions,Traffic Conditions,Delayed,Distance_Group
0,SHIP000000,Jaipur,Mumbai,2023-04-26,2023-05-01,2023-05-02,Trailer,1603,Rain,Light,Yes,Very_long
1,SHIP000001,Bangalore,Delhi,2023-02-09,2023-02-13,2023-02-17,Trailer,1237,Storm,Moderate,Yes,Long
2,SHIP000002,Mumbai,Chennai,2023-09-19,2023-09-25,2023-09-25,Truck,1863,Clear,Light,No,Very_long
3,SHIP000003,Hyderabad,Ahmedabad,2023-04-01,2023-04-05,2023-04-05,Container,1374,Clear,Light,No,Long
4,SHIP000004,Chennai,Kolkata,2023-11-24,2023-11-26,2023-11-28,Container,676,Clear,Heavy,Yes,Medium


## Handling Missing Values

In [8]:
# Assuming Vehicle were majorly assigned based on "Distances"
# Get the most frequent Vehicle Type in each Distance Group
vehicle_type_by_distance = df.groupby('Distance_Group')['Vehicle Type'].agg(lambda x: x.mode()[0])
print(vehicle_type_by_distance)

Distance_Group
Short          Lorry
Medium         Lorry
Long           Truck
Very_long    Trailer
Name: Vehicle Type, dtype: object


  vehicle_type_by_distance = df.groupby('Distance_Group')['Vehicle Type'].agg(lambda x: x.mode()[0])


In [9]:
# Fill missing 'Vehicle Type' based on the 'Distance_Group'
df['Vehicle Type'] = df.apply(
    lambda row: vehicle_type_by_distance[row['Distance_Group']] 
    if pd.isna(row['Vehicle Type']) else row['Vehicle Type'], axis=1)

In [10]:
df.isnull().sum()

Shipment ID              0
Origin                   0
Destination              0
Shipment Date            0
Planned Delivery Date    0
Actual Delivery Date     0
Vehicle Type             0
Distance (km)            0
Weather Conditions       0
Traffic Conditions       0
Delayed                  0
Distance_Group           0
dtype: int64

## Feature Engineering

In [11]:
df.head()

Unnamed: 0,Shipment ID,Origin,Destination,Shipment Date,Planned Delivery Date,Actual Delivery Date,Vehicle Type,Distance (km),Weather Conditions,Traffic Conditions,Delayed,Distance_Group
0,SHIP000000,Jaipur,Mumbai,2023-04-26,2023-05-01,2023-05-02,Trailer,1603,Rain,Light,Yes,Very_long
1,SHIP000001,Bangalore,Delhi,2023-02-09,2023-02-13,2023-02-17,Trailer,1237,Storm,Moderate,Yes,Long
2,SHIP000002,Mumbai,Chennai,2023-09-19,2023-09-25,2023-09-25,Truck,1863,Clear,Light,No,Very_long
3,SHIP000003,Hyderabad,Ahmedabad,2023-04-01,2023-04-05,2023-04-05,Container,1374,Clear,Light,No,Long
4,SHIP000004,Chennai,Kolkata,2023-11-24,2023-11-26,2023-11-28,Container,676,Clear,Heavy,Yes,Medium


**Splits out datetime feature into day, month, delay**

In [12]:
# ensure that datetime feature are in proper format
df['Shipment Date'] = pd.to_datetime(df['Shipment Date'])
df['Planned Delivery Date'] = pd.to_datetime(df['Planned Delivery Date'])
df['Actual Delivery Date'] = pd.to_datetime(df['Actual Delivery Date'])

In [13]:
df['Shipment Day'] = df['Shipment Date'].dt.day
df['Shipment Month'] = df['Shipment Date'].dt.month
df['Planned Delivery Day'] = df['Planned Delivery Date'].dt.day
df['Planned Delivery Month'] = df['Planned Delivery Date'].dt.month
df['Actual Delivery Day'] = df['Actual Delivery Date'].dt.day
df['Actual Delivery Month'] = df['Actual Delivery Date'].dt.month
df.head()

Unnamed: 0,Shipment ID,Origin,Destination,Shipment Date,Planned Delivery Date,Actual Delivery Date,Vehicle Type,Distance (km),Weather Conditions,Traffic Conditions,Delayed,Distance_Group,Shipment Day,Shipment Month,Planned Delivery Day,Planned Delivery Month,Actual Delivery Day,Actual Delivery Month
0,SHIP000000,Jaipur,Mumbai,2023-04-26,2023-05-01,2023-05-02,Trailer,1603,Rain,Light,Yes,Very_long,26,4,1,5,2,5
1,SHIP000001,Bangalore,Delhi,2023-02-09,2023-02-13,2023-02-17,Trailer,1237,Storm,Moderate,Yes,Long,9,2,13,2,17,2
2,SHIP000002,Mumbai,Chennai,2023-09-19,2023-09-25,2023-09-25,Truck,1863,Clear,Light,No,Very_long,19,9,25,9,25,9
3,SHIP000003,Hyderabad,Ahmedabad,2023-04-01,2023-04-05,2023-04-05,Container,1374,Clear,Light,No,Long,1,4,5,4,5,4
4,SHIP000004,Chennai,Kolkata,2023-11-24,2023-11-26,2023-11-28,Container,676,Clear,Heavy,Yes,Medium,24,11,26,11,28,11


## Data Preprocessing 

**Encode Categorical Feature: "Origin" into Numerical using label encoding**

In [14]:
origin_city_counts = df['Origin'].value_counts()
origin_rank_city = { city : i for i,city in enumerate(origin_city_counts.index)}

df['Origin_rank'] = df['Origin'].map(origin_rank_city)

In [15]:
des_city_counts = df['Destination'].value_counts()
des_city_rank = { city : i for i,city in enumerate(des_city_counts.index)}

df['Destination_rank'] = df['Destination'].map(des_city_rank)

In [16]:
df.head()

Unnamed: 0,Shipment ID,Origin,Destination,Shipment Date,Planned Delivery Date,Actual Delivery Date,Vehicle Type,Distance (km),Weather Conditions,Traffic Conditions,Delayed,Distance_Group,Shipment Day,Shipment Month,Planned Delivery Day,Planned Delivery Month,Actual Delivery Day,Actual Delivery Month,Origin_rank,Destination_rank
0,SHIP000000,Jaipur,Mumbai,2023-04-26,2023-05-01,2023-05-02,Trailer,1603,Rain,Light,Yes,Very_long,26,4,1,5,2,5,9,7
1,SHIP000001,Bangalore,Delhi,2023-02-09,2023-02-13,2023-02-17,Trailer,1237,Storm,Moderate,Yes,Long,9,2,13,2,17,2,3,6
2,SHIP000002,Mumbai,Chennai,2023-09-19,2023-09-25,2023-09-25,Truck,1863,Clear,Light,No,Very_long,19,9,25,9,25,9,1,1
3,SHIP000003,Hyderabad,Ahmedabad,2023-04-01,2023-04-05,2023-04-05,Container,1374,Clear,Light,No,Long,1,4,5,4,5,4,5,4
4,SHIP000004,Chennai,Kolkata,2023-11-24,2023-11-26,2023-11-28,Container,676,Clear,Heavy,Yes,Medium,24,11,26,11,28,11,4,9


In [17]:
LE = LabelEncoder()
df['Vehicle Type'] = LE.fit_transform(df['Vehicle Type'])
df['Weather Conditions'] = LE.fit_transform(df['Weather Conditions'])
df['Traffic Conditions'] = LE.fit_transform(df['Traffic Conditions'])
df['Distance Group'] = LE.fit_transform(df['Distance_Group'])
df['Delayed'] = LE.fit_transform(df['Delayed'])

In [18]:
df.head()

Unnamed: 0,Shipment ID,Origin,Destination,Shipment Date,Planned Delivery Date,Actual Delivery Date,Vehicle Type,Distance (km),Weather Conditions,Traffic Conditions,...,Distance_Group,Shipment Day,Shipment Month,Planned Delivery Day,Planned Delivery Month,Actual Delivery Day,Actual Delivery Month,Origin_rank,Destination_rank,Distance Group
0,SHIP000000,Jaipur,Mumbai,2023-04-26,2023-05-01,2023-05-02,2,1603,2,1,...,Very_long,26,4,1,5,2,5,9,7,3
1,SHIP000001,Bangalore,Delhi,2023-02-09,2023-02-13,2023-02-17,2,1237,3,2,...,Long,9,2,13,2,17,2,3,6,0
2,SHIP000002,Mumbai,Chennai,2023-09-19,2023-09-25,2023-09-25,3,1863,0,1,...,Very_long,19,9,25,9,25,9,1,1,3
3,SHIP000003,Hyderabad,Ahmedabad,2023-04-01,2023-04-05,2023-04-05,0,1374,0,1,...,Long,1,4,5,4,5,4,5,4,0
4,SHIP000004,Chennai,Kolkata,2023-11-24,2023-11-26,2023-11-28,0,676,0,0,...,Medium,24,11,26,11,28,11,4,9,1


In [19]:
# Feature removal
df = df.drop(columns=['Origin', 'Destination','Shipment ID','Shipment Date', 'Actual Delivery Date', 'Planned Delivery Date', 'Distance (km)', 'Distance_Group'])
df.head()

Unnamed: 0,Vehicle Type,Weather Conditions,Traffic Conditions,Delayed,Shipment Day,Shipment Month,Planned Delivery Day,Planned Delivery Month,Actual Delivery Day,Actual Delivery Month,Origin_rank,Destination_rank,Distance Group
0,2,2,1,1,26,4,1,5,2,5,9,7,3
1,2,3,2,1,9,2,13,2,17,2,3,6,0
2,3,0,1,0,19,9,25,9,25,9,1,1,3
3,0,0,1,0,1,4,5,4,5,4,5,4,0
4,0,0,0,1,24,11,26,11,28,11,4,9,1


## Split the DataFrame into Feature-Target

In [20]:
Y = df['Delayed']
X = df.drop(columns=['Delayed'])

In [21]:
X.head()

Unnamed: 0,Vehicle Type,Weather Conditions,Traffic Conditions,Shipment Day,Shipment Month,Planned Delivery Day,Planned Delivery Month,Actual Delivery Day,Actual Delivery Month,Origin_rank,Destination_rank,Distance Group
0,2,2,1,26,4,1,5,2,5,9,7,3
1,2,3,2,9,2,13,2,17,2,3,6,0
2,3,0,1,19,9,25,9,25,9,1,1,3
3,0,0,1,1,4,5,4,5,4,5,4,0
4,0,0,0,24,11,26,11,28,11,4,9,1


In [22]:
Y.head()

0    1
1    1
2    0
3    0
4    1
Name: Delayed, dtype: int64

## Splits the DataFrame into Training and Testing Dataset


In [23]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state = 42)

In [24]:
print(f"Shape of X1_train: {x_train.shape}")
print(f"Shape of Y1_train: {y_train.shape}\n")

print(f"Shape of X1_test: {x_test.shape}")
print(f"Shape of Y1_test: {y_test.shape}\n")

Shape of X1_train: (16000, 12)
Shape of Y1_train: (16000,)

Shape of X1_test: (4000, 12)
Shape of Y1_test: (4000,)



## Model Selection

In [25]:
## Model 1 : Logistic Regression on DataFrame with Datetime
LR_model = LogisticRegression()
LR_model.fit(x_train, y_train)
ypred_LR = LR_model.predict(x_test)

accuracy_LR = accuracy_score(y_test, ypred_LR)
print(f'Accuracy: {accuracy_LR}\n')

print(classification_report(y_test, ypred_LR))

Accuracy: 0.62675

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1059
           1       0.70      0.85      0.77      2941

    accuracy                           0.63      4000
   macro avg       0.35      0.43      0.39      4000
weighted avg       0.52      0.63      0.57      4000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
rf_model = RandomForestClassifier(n_estimators = 100, random_state=42)
rf_model.fit(x_train, y_train)
ypred_rf = rf_model.score(x_test, y_test)
print(f"Random Forest Accuracy: {ypred_rf}")

Random Forest Accuracy: 0.9675


In [27]:
xg_model = XGBClassifier(random_state = 42)
xg_model.fit(x_train, y_train)
ypred_xg = xg_model.score(x_test, y_test)
print(f" XGBoost Classifier Accuracy: {ypred_xg}")

 XGBoost Classifier Accuracy: 0.998


## Download Model

In [28]:
with open('xgb_model.pkl', 'wb') as file:
    pickle.dump(xg_model, file)

## Check - Load the model

In [29]:
with open('xgb_model.pkl', 'rb') as file:
    model = pickle.load(file)
print('Sucessfully Loaded!!')

Sucessfully Loaded!!


In [30]:
def prediction(origin, destination, shipment_date, planned_delivery_date, actual_delivery_date,
               vehicle_type, distance, weather_conditions, traffic_conditions, df):
    # Feature Engineering
    shipment_day, shipment_month, shipment_year = shipment_date.split('-')
    planned_day, planned_month, planned_year = planned_delivery_date.split('-')
    actual_day, actual_month, actual_year = actual_delivery_date.split('-')

    # Label Encoding
    origin_counts = df['Origin'].value_counts()
    origin_rank = {city: i for i, city in enumerate(origin_counts.index)}
    
    destination_counts = df['Destination'].value_counts()
    destination_rank = {city: i for i, city in enumerate(destination_counts.index)}
    
    origin_encoded = origin_rank.get(origin, -1)
    destination_encoded = destination_rank.get(destination, -1)

    # Distance Binning
    bins = [650, 1102, 1551, 2000]
    labels = [1, 2, 0, 3]
    distance = labels[-1] if distance >= bins[-1] else next(labels[i] for i, bin in enumerate(bins) if distance < bin)

    # Categorical Rankings
    traffic_rank = {'heavy': 0, 'light': 1, 'moderate': 2}
    weather_rank = {'clear': 0, 'fog': 1, 'rain': 2, 'storm': 3}
    vehicle_type_rank = {'container': 0, 'lorry': 1, 'trailer': 2, 'truck': 3}

    vehicle_type_encoded = vehicle_type_rank.get(vehicle_type.lower(), -1)
    weather_condition_encoded = weather_rank.get(weather_conditions.lower(), -1)
    traffic_condition_encoded = traffic_rank.get(traffic_conditions.lower(), -1)

    # Prepare Feature List
    feature_names = [
    'Vehicle Type', 'Weather Conditions', 'Traffic Conditions',
    'Shipment Day', 'Shipment Month',
    'Planned Delivery Day', 'Planned Delivery Month',
    'Actual Delivery Day', 'Actual Delivery Month',
    'Origin_rank', 'Destination_rank', 'Distance Group'
    ]
    features = [vehicle_type_encoded, weather_condition_encoded, traffic_condition_encoded, 
                 int(shipment_day), int(shipment_month), int(planned_day), int(planned_month), 
                 int(actual_day), int(actual_month), origin_encoded, destination_encoded, distance]

    df_features = pd.DataFrame([features],columns=feature_names)

    # Load Model
    with open('xgb_model.pkl', 'rb') as file:
        model = pickle.load(file)

    # Predict Delay
    delayed = model.predict(df_features)[0]
    if delayed == 1:
        print('Prediction: Delayed')
    else:
        print('Prediction: On Time!')


In [31]:
df_test = pd.read_csv('/kaggle/input/arogoai/AI ML Internship Training Data.xlsx - freight_delivery_realistic_data.csv')

In [32]:
origin = "Hyderabad"
destination = "Ahmedabad"
shipment_date = "2023-04-01"
planned_delivery_date = "2023-04-05"
actual_delivery_date = "2023-04-05"
vehicle_type = "Container"
distance = 1374
weather_conditions = "Clear"
traffic_conditions = "Light"

prediction(origin, destination, shipment_date, planned_delivery_date, actual_delivery_date,
               vehicle_type, distance, weather_conditions, traffic_conditions, df_test)

Prediction: On Time!
