<a href="https://colab.research.google.com/github/ankita1120/almabetter/blob/publicBranch/FedEx_Logistics_Performance_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


## Upload Dataset


In [None]:
# Upload Dataset
# Location of the delivery-history CSV inside the Colab runtime. Kept in one
# named constant so the notebook can be pointed at another copy of the file
# without touching the loading code.
DATA_PATH = '/content/SCMS_Delivery_History_Dataset.csv'
df = pd.read_csv(DATA_PATH)

# A bare last expression is rendered by the notebook as a rich table, which
# stays readable for wide frames (print() wraps the columns as plain text).
df.head()

## Display basic information

In [None]:
# Display basic information: dimensions, column dtypes / non-null counts,
# summary statistics and the number of missing values per column.
print("DataSet Shape:", df.shape)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be passed to print(): doing so emitted the report first and then a
# misleading "DataSet Info: None" line.
print("DataSet Info:")
df.info()

# sep="\n" starts each multi-line table on its own line so the column header
# stays aligned with the rows beneath it.
print("DataSet Describe:", df.describe(), sep="\n")
print("\nMissing values:", df.isnull().sum(), sep="\n")

## Handle missing values and clean the data

In [None]:
# Clean the shipment data in four steps:
#   1. coerce 'Freight Cost (USD)' to numeric and impute gaps with the mean,
#   2. rename and parse the order / delivery date columns,
#   3. derive Delivery_Duration (whole days between the two dates),
#   4. drop exact duplicate rows.
# Every step assigns its result back explicitly instead of using inplace=True,
# so the cell does not depend on pandas mutating an intermediate object.
FREIGHT_COL = 'Freight Cost (USD)'

# Handle missing values for Freight Cost
if FREIGHT_COL in df.columns:
    # errors='coerce' turns any entry that is not a number into NaN so that it
    # is imputed below together with the values that were already missing.
    freight_cost = pd.to_numeric(df[FREIGHT_COL], errors='coerce')

    if freight_cost.notna().any():
        # Fill on the Series and write it back to the frame afterwards. The
        # previous df[col].fillna(..., inplace=True) form is chained
        # assignment: it raises a FutureWarning on pandas 2.x and leaves df
        # unchanged once Copy-on-Write is active.
        freight_cost = freight_cost.fillna(freight_cost.mean())
    else:
        print("Warning: 'Freight Cost (USD)' column contains only NaN values.")

    df[FREIGHT_COL] = freight_cost
else:
    print("Warning: 'Freight Cost (USD)' column not found in DataFrame.")

# Rename columns for consistency
df = df.rename(columns={'PO Sent to Vendor Date': 'Order_Date',
                        'Delivered to Client Date': 'Delivery_Date'})

# Convert date columns safely: values that cannot be parsed become NaT
date_columns = ['Order_Date', 'Delivery_Date']
for col in date_columns:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    else:
        print(f"Warning: '{col}' column not found in DataFrame.")

# Ensure both Order_Date and Delivery_Date exist before calculating duration
if 'Order_Date' in df.columns and 'Delivery_Date' in df.columns:
    delivery_duration = (df["Delivery_Date"] - df["Order_Date"]).dt.days
    # Rows where either date is NaT have no duration; they are marked with the
    # sentinel -1. Any statistic computed on this column has to exclude the
    # sentinel, otherwise it is biased downwards.
    df["Delivery_Duration"] = delivery_duration.fillna(-1)
else:
    print("Warning: One or both date columns are missing.")

# Remove duplicates
df = df.drop_duplicates()

# Basic Statistics
print("\nBasic Statistics:\n", df.describe())


# EDA
## Shipment Mode Analysis


In [None]:
# Shipment Mode Analysis: number of shipment rows per shipment mode.
mode_fig, mode_ax = plt.subplots(figsize=(10, 5))

# seaborn 0.13 deprecates passing `palette` without `hue` (to be removed in
# 0.14). Mapping the x variable to `hue` keeps one colour per category;
# dodge=False keeps a single full-width bar per category on older seaborn
# releases as well.
sns.countplot(data=df, x="Shipment Mode", hue="Shipment Mode",
              palette="coolwarm", dodge=False, ax=mode_ax)

# A legend would only repeat the x tick labels, so remove it if one was drawn.
mode_legend = mode_ax.get_legend()
if mode_legend is not None:
    mode_legend.remove()

mode_ax.set_title("Shipment Mode Distribution")
mode_ax.tick_params(axis="x", labelrotation=45)
plt.show()

# Freight Cost Distribution

In [None]:
# Freight Cost Distribution: spread of freight cost within each shipment mode.
cost_fig, cost_ax = plt.subplots(figsize=(10, 5))

# seaborn 0.13 deprecates passing `palette` without `hue` (to be removed in
# 0.14). Mapping the x variable to `hue` keeps one colour per category;
# dodge=False keeps each box centred on its tick on older seaborn releases.
sns.boxplot(data=df, x="Shipment Mode", y="Freight Cost (USD)",
            hue="Shipment Mode", palette="viridis", dodge=False, ax=cost_ax)

# A legend would only repeat the x tick labels, so remove it if one was drawn.
cost_legend = cost_ax.get_legend()
if cost_legend is not None:
    cost_legend.remove()

# The title now names the plotted column ("Shipment Mode"); it previously
# said "Shipment Method", which matches no column in the frame.
cost_ax.set_title("Freight Cost by Shipment Mode")
cost_ax.tick_params(axis="x", labelrotation=45)
plt.show()

# Interactive Geo-Map of Shipments


In [None]:
# Create interactive shipment distribution map
# Each row of df becomes one marker: placed by the name in "Country"
# (no lat/lon needed), coloured by shipment mode and sized by freight cost.
geo_map_options = {
    "locations": "Country",
    "locationmode": "country names",
    "color": "Shipment Mode",
    "size": "Freight Cost (USD)",
    "hover_name": "Country",
    "title": "Shipment Distribution Map",
}
fig = px.scatter_geo(df, **geo_map_options)

fig.show()

# Correlation Matrix


In [None]:
# Correlation heatmap: pairwise correlations between the numeric columns of df.
numeric_df = df.select_dtypes(include=["number"])  # non-numeric columns are left out
correlation_matrix = numeric_df.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,          # write each coefficient inside its cell
    cmap="coolwarm",
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Matrix of Numerical Features")
plt.show()


In [None]:
# Insights: headline figures taken from the cleaned frame.

# The cleaning step fills Delivery_Duration with -1 wherever a duration could
# not be computed (`fillna(-1)`). Those placeholder rows are excluded here so
# they do not drag the average down.
INVALID_DURATION = -1
valid_durations = df.loc[df["Delivery_Duration"] != INVALID_DURATION,
                         "Delivery_Duration"]

print("\nKey Insights:")
print("- The most frequently used shipment mode is:", df["Shipment Mode"].mode()[0])
# Average over rows with a real duration only (NaN if there are none).
print("- Average delivery duration:", valid_durations.mean(), "days")
print("- Highest freight cost recorded:", df["Freight Cost (USD)"].max())