<a href="https://colab.research.google.com/github/amalsalilan/DataInsight_Sales_Data_Analysis_and_Visualization_Infosys_Internship_Dec2024/blob/K-Pavitra/Shipping_delay_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [5]:
# Load the Walmart retail workbook into a DataFrame and display it.
df = pd.read_excel(io="/content/walmart Retail Data.xlsx")
df


Unnamed: 0,City,Customer Age,Customer Name,Customer Segment,Discount,Number of Records,Order Date,Order ID,Order Priority,Order Quantity,...,Profit,Region,Row ID,Sales,Ship Date,Ship Mode,Shipping Cost,State,Unit Price,Zip Code
0,McKeesport,,Jessica Myrick,Small Business,0.10,1,2012-01-01,28774,High,32,...,-111.80,East,4031,180.36,2012-01-02,Regular Air,4.69,Pennsylvania,5.98,15131
1,Bowie,,Matt Collister,Home Office,0.08,1,2012-01-01,13729,Not Specified,9,...,-342.91,East,1914,872.48,2012-01-03,Express Air,35.00,Maryland,95.99,20715
2,Napa,,Alan Schoenberger,Corporate,0.00,1,2012-01-02,37537,Low,4,...,-193.08,West,5272,1239.06,2012-01-02,Delivery Truck,48.80,California,291.73,94559
3,Montebello,,Elizabeth Moffitt,Consumer,0.08,1,2012-01-02,44069,Critical,43,...,247.79,West,6225,614.80,2012-01-02,Regular Air,1.97,California,15.04,90640
4,Napa,,Alan Schoenberger,Corporate,0.07,1,2012-01-02,37537,Low,43,...,-1049.85,West,5273,4083.19,2012-01-04,Delivery Truck,45.00,California,100.98,94559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8394,Fairfield,95.0,Tony Molinari,Corporate,0.10,1,2015-12-30,50950,Not Specified,35,...,-15.07,West,7142,448.10,2015-12-30,Express Air,4.51,California,13.48,94533
8395,Harker Heights,95.0,Matt Hagelstein,Home Office,0.09,1,2015-12-30,25542,Low,37,...,-18.66,Central,3583,257.46,2015-12-30,Express Air,4.23,Texas,7.28,76543
8396,Riverview,95.0,Theresa Swint,Consumer,0.10,1,2015-12-30,45127,Medium,10,...,-1.29,South,6361,14.15,2015-12-30,Regular Air,0.70,Florida,1.48,33569
8397,Nicholasville,95.0,Maribeth Yedwab,Home Office,0.09,1,2015-12-30,49344,Low,1,...,-745.20,South,6916,803.33,2015-12-30,Regular Air,24.49,Kentucky,832.81,40356


In [6]:
# Shipping delay: whole days elapsed between the order date and the ship date.
df['Shipping Delay'] = df['Ship Date'].sub(df['Order Date']).dt.days
df['Shipping Delay']

Unnamed: 0,Shipping Delay
0,1
1,2
2,0
3,0
4,2
...,...
8394,0
8395,0
8396,0
8397,0


In [7]:
# Collect the distinct ship modes (in order of first appearance) and print them;
# these are the keys the per-mode delay thresholds need to cover.
unique_values = pd.unique(df['Ship Mode'])
print(unique_values)

['Regular Air' 'Express Air' 'Delivery Truck']


In [8]:
# Allowed shipping lag, in days, for each ship mode. An order whose actual
# shipping delay exceeds its mode's allowance is treated as delayed.
delay_thresholds = dict(
    [
        ("Regular Air", 0),
        ("Express Air", 1),
        ("Delivery Truck", 2),
    ]
)



In [9]:
# Look up the allowed delay (in days) for each order's ship mode.
df['Expected Delay'] = df['Ship Mode'].map(delay_thresholds)

# A missing date gives a NaN 'Shipping Delay' and an unmapped ship mode gives a NaN
# 'Expected Delay'. Any comparison against NaN is False, so such rows would be silently
# labelled "on time". Dropping on the label column cannot catch them (an int-cast
# boolean is never NaN), so incomplete rows are removed *before* the label is computed.
df = df.dropna(subset=['Shipping Delay', 'Expected Delay']).copy()

# Binary target: 1 -> shipped later than the mode's allowance (delayed), 0 -> on time.
df['Target_Classification'] = (df['Shipping Delay'] > df['Expected Delay']).astype(int)

In [10]:
# Inspect the per-order delay allowance derived from the ship mode.
df.loc[:, 'Expected Delay']

Unnamed: 0,Expected Delay
0,0
1,1
2,2
3,0
4,2
...,...
8394,1
8395,1
8396,0
8397,0


In [11]:
# Candidate predictors for the classifier; each may or may not actually affect
# the shipping delay.
features = [
    'Order Priority', 'Ship Mode', 'Region',
    'Shipping Cost', 'Order Quantity', 'Discount', 'Profit',
]
X = df.loc[:, features]
y = df['Target_Classification']

# Class balance of the target: 1 -> delayed, 0 -> on time.
y.value_counts()

Unnamed: 0_level_0,count
Target_Classification,Unnamed: 1_level_1
1,6303
0,2096


In [12]:
# One-hot encode the categorical predictors and append the numeric ones unchanged.
# Resulting column layout: [one-hot block | Shipping Cost, Order Quantity, Discount, Profit].
# Categories unseen at fit time are encoded as all zeros (handle_unknown='ignore').
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = np.concatenate(
    [
        encoder.fit_transform(X[['Order Priority', 'Ship Mode', 'Region']]),
        X[['Shipping Cost', 'Order Quantity', 'Discount', 'Profit']].to_numpy(),
    ],
    axis=1,
)

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    random_state=42,
    test_size=0.2,
)

In [13]:
# Fit a 100-tree random forest on the training split (seeded for reproducibility).
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X=X_train, y=y_train)

In [14]:
# Score the held-out split: overall accuracy, then per-class precision/recall/F1.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.8285714285714286
              precision    recall  f1-score   support

           0       0.71      0.52      0.60       417
           1       0.85      0.93      0.89      1263

    accuracy                           0.83      1680
   macro avg       0.78      0.72      0.75      1680
weighted avg       0.82      0.83      0.82      1680



In [15]:
def predict_delay_status(data):
    """Classify a single order as "Delayed" or "On-Time".

    Uses the notebook-level fitted ``encoder`` and ``clf``.

    Args:
        data: Mapping with the keys 'Order Priority', 'Ship Mode', 'Region',
            'Shipping Cost', 'Order Quantity', 'Discount' and 'Profit'.

    Returns:
        "Delayed" when the classifier predicts class 1, otherwise "On-Time".
    """
    # Wrap the record in a one-row frame so columns can be selected by name.
    order = pd.DataFrame([data])

    categorical_part = encoder.transform(order[['Order Priority', 'Ship Mode', 'Region']])
    numeric_part = order[['Shipping Cost', 'Order Quantity', 'Discount', 'Profit']].to_numpy()

    # Same column layout as at training time: one-hot block first, numeric columns after.
    model_input = np.concatenate((categorical_part, numeric_part), axis=1)

    if clf.predict(model_input)[0] == 1:
        return "Delayed"
    return "On-Time"

In [16]:
# Display the frame again to inspect the added 'Shipping Delay', 'Expected Delay'
# and 'Target_Classification' columns.
df

Unnamed: 0,City,Customer Age,Customer Name,Customer Segment,Discount,Number of Records,Order Date,Order ID,Order Priority,Order Quantity,...,Sales,Ship Date,Ship Mode,Shipping Cost,State,Unit Price,Zip Code,Shipping Delay,Expected Delay,Target_Classification
0,McKeesport,,Jessica Myrick,Small Business,0.10,1,2012-01-01,28774,High,32,...,180.36,2012-01-02,Regular Air,4.69,Pennsylvania,5.98,15131,1,0,1
1,Bowie,,Matt Collister,Home Office,0.08,1,2012-01-01,13729,Not Specified,9,...,872.48,2012-01-03,Express Air,35.00,Maryland,95.99,20715,2,1,1
2,Napa,,Alan Schoenberger,Corporate,0.00,1,2012-01-02,37537,Low,4,...,1239.06,2012-01-02,Delivery Truck,48.80,California,291.73,94559,0,2,0
3,Montebello,,Elizabeth Moffitt,Consumer,0.08,1,2012-01-02,44069,Critical,43,...,614.80,2012-01-02,Regular Air,1.97,California,15.04,90640,0,0,0
4,Napa,,Alan Schoenberger,Corporate,0.07,1,2012-01-02,37537,Low,43,...,4083.19,2012-01-04,Delivery Truck,45.00,California,100.98,94559,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8394,Fairfield,95.0,Tony Molinari,Corporate,0.10,1,2015-12-30,50950,Not Specified,35,...,448.10,2015-12-30,Express Air,4.51,California,13.48,94533,0,1,0
8395,Harker Heights,95.0,Matt Hagelstein,Home Office,0.09,1,2015-12-30,25542,Low,37,...,257.46,2015-12-30,Express Air,4.23,Texas,7.28,76543,0,1,0
8396,Riverview,95.0,Theresa Swint,Consumer,0.10,1,2015-12-30,45127,Medium,10,...,14.15,2015-12-30,Regular Air,0.70,Florida,1.48,33569,0,0,0
8397,Nicholasville,95.0,Maribeth Yedwab,Home Office,0.09,1,2015-12-30,49344,Low,1,...,803.33,2015-12-30,Regular Air,24.49,Kentucky,832.81,40356,0,0,0


In [17]:
# Example usage: classify one hand-written order.
# NOTE(review): these values match the displayed dataset row with index 2, except that
# row's Profit is -193.08 while the sample uses 193.08 - TODO confirm the sign is intended.
sample_input = {
    'Order Priority': 'Low',
    'Ship Mode': 'Delivery Truck',
    'Region': 'West',
    'Shipping Cost': 48.8,
    'Order Quantity': 4,
    'Discount': 0,
    'Profit': 193.08,
}

predicted_status = predict_delay_status(data=sample_input)
print(f"Predicted Shipping Status: {predicted_status}")

Predicted Shipping Status: On-Time


In [18]:
import joblib

# The classifier only understands the one-hot column layout produced by the fitted
# `encoder`, so the encoder is persisted next to the model. Without it the saved model
# cannot be used for inference from a fresh session.
encoder_filename = 'shipping_delay_encoder.joblib'
joblib.dump(encoder, encoder_filename)

# Save the model (kept last so the cell still displays the model file path).
filename = 'shipping_delay_model.joblib'
joblib.dump(clf, filename)

['shipping_delay_model.joblib']