In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("SupplyChain_BigData_Analytics") \
    .getOrCreate()

spark


## LOAD THE DATASET

In [4]:
supply = spark.read.csv(
    "DataCoSupplyChainDataset.csv",
    header = True,
    inferSchema=True
)
supply.show(5)
supply.printSchema()

+------------------------+-----------------------------+-----------------+------------------+----------------+------------------+-----------+--------------+-------------+-----------------------+
|Days for shipping (real)|Days for shipment (scheduled)|Benefit per order|Sales per customer| Delivery Status|Late_delivery_risk|Category Id| Category Name|Customer City|order date (DateOrders)|
+------------------------+-----------------------------+-----------------+------------------+----------------+------------------+-----------+--------------+-------------+-----------------------+
|                       3|                            4|            91.25|       314.6400146|Advance shipping|                 0|         73|Sporting Goods|       Caguas|        1/31/2018 22:56|
|                       5|                            4|     -249.0899963|       311.3599854|   Late delivery|                 1|         73|Sporting Goods|       Caguas|        1/13/2018 12:27|
|                       4

In [None]:
## DATA MANAGEMENT & CLEANING
## Reanme columns

In [None]:
from pyspark.sql.functions import col

supply_clean = supply.select(
    col("Days for shipping (real)").alias("real_shipping_days"),
    col("Days for shipment (scheduled)").alias("scheduled_shipping_days"),
    col("Benefit per order").alias("benefit_per_order"),
    col("Sales per customer").alias("sales_per_customer"),
    col("Delivery Status").alias("delivery_status"),
    col("Late_delivery_risk").alias("late_delivery_risk"),
    col("Category Id").alias("category_id"),
    col("Category Name").alias("category_name"),
    col("Customer City").alias("customer_city"),
    col("order date (DateOrders)").alias("order_date")
)



In [8]:
supply_clean.printSchema()
supply_clean.show(5)

root
 |-- real_shipping_days: integer (nullable = true)
 |-- scheduled_shipping_days: integer (nullable = true)
 |-- benefit_per_order: double (nullable = true)
 |-- sales_per_customer: double (nullable = true)
 |-- delivery_status: string (nullable = true)
 |-- late_delivery_risk: integer (nullable = true)
 |-- category_id: integer (nullable = true)
 |-- category_name: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- order_date: string (nullable = true)

+------------------+-----------------------+-----------------+------------------+----------------+------------------+-----------+--------------+-------------+---------------+
|real_shipping_days|scheduled_shipping_days|benefit_per_order|sales_per_customer| delivery_status|late_delivery_risk|category_id| category_name|customer_city|     order_date|
+------------------+-----------------------+-----------------+------------------+----------------+------------------+-----------+--------------+-------------+------

## CREATE SQL VIEW

In [11]:
supply_clean.createOrReplaceTempView("supply_chain")

## DELIVERY DELAY ANALYSIS

### Calculate Delay Difference

In [16]:
delay_analysis = spark.sql("""
SELECT
    real_shipping_days,
    scheduled_shipping_days,
    (real_shipping_days - scheduled_shipping_days) AS delay_days,
    delivery_status,
    late_delivery_risk
FROM supply_chain
""")

delay_analysis.show(5)


+------------------+-----------------------+----------+----------------+------------------+
|real_shipping_days|scheduled_shipping_days|delay_days| delivery_status|late_delivery_risk|
+------------------+-----------------------+----------+----------------+------------------+
|                 3|                      4|        -1|Advance shipping|                 0|
|                 5|                      4|         1|   Late delivery|                 1|
|                 4|                      4|         0|Shipping on time|                 0|
|                 3|                      4|        -1|Advance shipping|                 0|
|                 2|                      4|        -2|Advance shipping|                 0|
+------------------+-----------------------+----------+----------------+------------------+
only showing top 5 rows



### ON-TIME vs LATE DELIVERY ANALYSIS

In [18]:
delivery_perf = spark.sql("""
SELECT
     delivery_status,
     COUNT(*) AS total_orders
FROM supply_chain
GROUP BY delivery_status
""")

delivery_perf.show()

+-----------------+------------+
|  delivery_status|total_orders|
+-----------------+------------+
| Shipping on time|       32196|
| Advance shipping|       41592|
|Shipping canceled|        7754|
|    Late delivery|       98977|
+-----------------+------------+



## LATE DELIVERY RISK ANALYSIS

In [22]:
late_delivery_risk_analysis = spark.sql("""
SELECT
      late_delivery_risk,
      COUNT(*) AS total_orders
FROM supply_chain
GROUP BY late_delivery_risk
""")

late_delivery_risk_analysis.show()

+------------------+------------+
|late_delivery_risk|total_orders|
+------------------+------------+
|                 1|       98977|
|                 0|       81542|
+------------------+------------+



## CATEGORY-WISE PERFORMANCE ANALYSIS
### Sales by Category

In [28]:
category_sales = spark.sql("""
SELECT
    category_name,
    ROUND(SUM(sales_per_customer),2) AS total_sales
FROM supply_chain
GROUP BY category_name
ORDER BY total_sales DESc
""")

category_sales.show()

+--------------------+-----------+
|       category_name|total_sales|
+--------------------+-----------+
|             Fishing| 6226935.29|
|              Cleats|  3982856.5|
|    Camping & Hiking| 3700783.63|
|    Cardio Equipment|  3320250.7|
|     Women's Apparel|  2828708.5|
|        Water Sports| 2798044.18|
|      Men's Footwear| 2598494.42|
|Indoor/Outdoor Games| 2596454.02|
|       Shop By Sport| 1177185.63|
|           Computers|   595395.0|
|         Electronics|  333327.26|
|            Cameras |  240496.68|
|              Garden|  231765.46|
| Children's Clothing|  209268.38|
|              Crafts|  200704.87|
|      Girls' Apparel|  136206.83|
|    Women's Clothing|  126006.93|
|         Accessories|  119712.55|
|      Sporting Goods|  105063.61|
|         Golf Gloves|   104787.4|
+--------------------+-----------+
only showing top 20 rows



###  Profit by category

In [27]:
category_profit = spark.sql("""
SELECT 
     category_name,
     ROUND(SUM(benefit_per_order), 2) AS total_profit
FROM supply_chain
GROUP BY category_name
ORDER BY total_profit DESC
""")

category_profit.show(10)

+--------------------+------------+
|       category_name|total_profit|
+--------------------+------------+
|             Fishing|   756220.77|
|              Cleats|   494636.92|
|    Camping & Hiking|   427455.57|
|    Cardio Equipment|    383011.1|
|     Women's Apparel|   350421.03|
|        Water Sports|   325146.96|
|Indoor/Outdoor Games|   318451.43|
|      Men's Footwear|   311902.82|
|       Shop By Sport|   129813.96|
|           Computers|    69656.81|
+--------------------+------------+
only showing top 10 rows



## DEMAND ANALYSIS
### (TIME-BASED)

In [33]:
demand_trend = spark.sql("""
SELECT
     order_date,
     ROUND(SUM(sales_per_customer),2) AS daily_sales
FROM supply_chain
GROUP BY order_date
ORDER BY order_date
""")

demand_trend.show(10)
         
    

+----------------+-----------+
|      order_date|daily_sales|
+----------------+-----------+
|01-01-2015 00:00|     239.98|
|01-01-2015 00:21|     529.38|
|01-01-2015 01:03|     620.87|
|01-01-2015 01:24|     987.07|
|01-01-2015 02:06|     525.52|
|01-01-2015 02:27|     587.77|
|01-01-2015 02:48|     587.96|
|01-01-2015 03:09|     525.77|
|01-01-2015 03:30|      846.8|
|01-01-2015 03:51|    1196.38|
+----------------+-----------+
only showing top 10 rows



In [36]:
category_sales.toPandas().to_csv(
    "category_sales_summary.csv", index=False
)

delivery_perf.toPandas().to_csv(
    "delivery_performance_summary.csv", index=False
)

demand_trend.toPandas().to_csv(
    "demand_trend_summary.csv", index=False
)

