In [0]:
from pyspark.sql.functions import *

In [0]:
# Detach the container only when it is currently mounted: dbutils.fs.unmount
# raises on a path that is not mounted, which would stop a fresh "Run All".
mount_point = "/mnt/globmartdata"
if any(m.mountPoint == mount_point for m in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)


In [0]:
# Azure Blob Storage location of the dataset and where it is mounted in DBFS
container_name = "globmart-data"
account_name = "adbpractice01"
mount_point = "/mnt/globmartdata"

# The account key is fetched from a Databricks secret scope instead of being
# embedded in the notebook, where it leaks through version control and exports.
# NOTE(review): the scope/key names are placeholders - create them (or point at
# the existing ones) and rotate the key that was previously stored in this cell.
storage_account_key = dbutils.secrets.get(
    scope="globmart",
    key="adbpractice01-storage-account-key",
)

# Mount the Blob Storage container; skip when the mount point already exists,
# because dbutils.fs.mount raises on an already-mounted path.
if any(m.mountPoint == mount_point for m in dbutils.fs.mounts()):
    print(f"{mount_point} is already mounted; skipping mount.")
else:
    dbutils.fs.mount(
        source=f"wasbs://{container_name}@{account_name}.blob.core.windows.net",
        mount_point=mount_point,
        extra_configs={
            f"fs.azure.account.key.{account_name}.blob.core.windows.net": storage_account_key
        },
    )
    print("✅ Mount successful!")


In [0]:
# List the files available in the mounted dataset folder
dataset_files = dbutils.fs.ls("/mnt/globmartdata/globmart-data_part01")
display(dataset_files)


In [0]:
# Read orders.csv from the mounted container, using the header row for column
# names and letting Spark infer the column types.
orders_path = 'dbfs:/mnt/globmartdata/globmart-data_part01/orders.csv'
orders_reader = spark.read.options(header="true", inferSchema="true")
orders_df = orders_reader.csv(orders_path)

In [0]:
# Render the orders DataFrame as an interactive table
display(orders_df)

In [0]:
# Base folder path
base_path = "/mnt/globmartdata/globmart-data_part01/"


def load_csv(file_name):
    """Read one CSV file from ``base_path`` into a DataFrame.

    The first row is used as the header and column types are inferred,
    exactly as every table in this notebook is loaded.

    Args:
        file_name: File name relative to ``base_path`` (e.g. ``"orders.csv"``).

    Returns:
        The DataFrame produced by ``spark.read`` for that file.
    """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(base_path + file_name)
    )


# Load each CSV into a separate DataFrame
addresses_df = load_csv("addresses.csv")
customers_df = load_csv("customers.csv")
orders_df = load_csv("orders.csv")
order_items_df = load_csv("orders_items.csv")  # the file is named orders_items, the variable order_items
payment_methods_df = load_csv("payment_methods.csv")
payments_df = load_csv("payments.csv")
products_df = load_csv("products.csv")
returns_df = load_csv("returns.csv")
shipping_tier_df = load_csv("shipping_tier.csv")
suppliers_df = load_csv("suppliers.csv")

# Example: display orders
display(orders_df)


In [0]:
# Preview the addresses table
display(addresses_df)

In [0]:
# Preview the customers table
display(customers_df)


In [0]:
# Preview the order items table
display(order_items_df)


In [0]:
# Preview the products table
display(products_df)


In [0]:
# Preview the payments table
display(payments_df)


In [0]:
# Preview the suppliers table
suppliers_df.display()

# PySpark

### 1. Analyze Customer Payment Method Preferences

**To understand which payment methods are preferred by customers, guiding decisions on payment gateway partnerships and offers.**


In [0]:

# Attach the method name to every payment, count payments per method,
# and list the most frequently used method first.
payments_with_method = payments_df.join(payment_methods_df, "PaymentMethodID")
payments_per_method = payments_with_method.groupBy("MethodName").count()
payment_method_preference = payments_per_method.orderBy(desc("count"))

display(payment_method_preference)

In [0]:
# Orders whose combined gift-card + coupon amount exceeds this value are
# treated as "high value" (example threshold).
high_value_threshold = 500

# Step 1: Calculate the Total Order Value
# A missing amount is treated as 0 (as the top-customers SQL query does);
# otherwise a NULL in either column would turn the whole sum into NULL and the
# payment would silently fall out of the filter below.
order_values = payments_df.withColumn(
    "TotalOrderValue",
    coalesce(col("GiftCardAmount"), lit(0)) + coalesce(col("CouponAmount"), lit(0)),
)

# Step 2: Join with Orders to Analyze the Most Common Channels for High-Value Orders
high_value_orders_by_channel = (
    order_values
    .join(orders_df, "OrderID")
    .filter(col("TotalOrderValue") > high_value_threshold)
    .groupBy(orders_df.OrderChannel)
    .count()
    .orderBy(col("count").desc())
)

high_value_orders_by_channel.display()

In [0]:
# Reload the order items CSV (header row, inferred types) from the base folder
order_items_path = base_path + "orders_items.csv"
order_items_df = (
    spark.read
    .options(header="true", inferSchema="true")
    .format("csv")
    .load(order_items_path)
)


#### Local Temporary View

In [0]:
# Create the temporary view from your DataFrame
# Register every DataFrame as a session-scoped (local) temporary view.
# The %sql cells further down query customers, orders, payments, products,
# returns, shipping_tier and suppliers by their bare names; with only
# order_items registered those cells fail with "table or view not found".
local_views = {
    "addresses": addresses_df,
    "customers": customers_df,
    "orders": orders_df,
    "order_items": order_items_df,
    "payment_methods": payment_methods_df,
    "payments": payments_df,
    "products": products_df,
    "returns": returns_df,
    "shipping_tier": shipping_tier_df,
    "suppliers": suppliers_df,
}
for view_name, frame in local_views.items():
    frame.createOrReplaceTempView(view_name)

In [0]:
%sql
-- Preview every column of the order_items temporary view
SELECT
    oi.*
FROM
    order_items AS oi;

In [0]:
%sql
-- Units sold per product, best sellers first
SELECT
    oi.ProductID,
    SUM(oi.Quantity) AS TotalQuantitySold
FROM order_items AS oi
GROUP BY oi.ProductID
ORDER BY TotalQuantitySold DESC;

#### Global Temporary view

In [0]:
# Publish the DataFrames as global temporary views; SQL cells reach them
# through the global_temp schema (e.g. global_temp.orders).
global_views = (
    ("orders", orders_df),
    ("order_items", order_items_df),
    ("payments", payments_df),
)
for view_name, frame in global_views:
    frame.createOrReplaceGlobalTempView(view_name)

#### Count of Orders by Payment Method and Order Channel

In [0]:
%sql
-- Count joined order/payment rows for each (order channel, payment method)
-- pair, most frequent combination first
SELECT
    ord.OrderChannel,
    pay.PaymentMethodID,
    COUNT(ord.OrderID) AS NumberOfOrders
FROM global_temp.orders AS ord
INNER JOIN global_temp.payments AS pay
    ON ord.OrderID = pay.OrderID
GROUP BY ord.OrderChannel, pay.PaymentMethodID
ORDER BY NumberOfOrders DESC;

#### Analyzing the Impact of Product Variety on Total Sales and Delivery Delays Across Order Channels

In [0]:
%sql
-- For each order channel and basket variety (distinct products per order),
-- report the average sales amount and the average delivery delay.
WITH ProductVariety AS (
    -- Number of distinct products in each order
    SELECT 
        oi.OrderID,
        COUNT(DISTINCT oi.ProductID) AS NumberOfProducts
    FROM 
        global_temp.order_items oi
    GROUP BY 
        oi.OrderID
),

OrderSales AS (
    -- Gift-card + coupon amount per order. A NULL amount counts as 0 (as in
    -- the top-customers query); without COALESCE a NULL in either column
    -- would discard the other column's amount for that payment row.
    SELECT 
        p.OrderID,
        SUM(COALESCE(p.GiftCardAmount, 0) + COALESCE(p.CouponAmount, 0)) AS TotalSalesAmount
    FROM 
        global_temp.payments p
    GROUP BY 
        p.OrderID
),

DeliveryDelays AS (
    -- Days between expected and actual delivery (positive = delivered late)
    SELECT 
        o.OrderID,
        DATEDIFF(o.ActualDeliveryDate, o.ExpectedDeliveryDate) AS DeliveryDelayDays,
        o.OrderChannel
    FROM 
        global_temp.orders o
)

SELECT 
    dv.OrderChannel,
    pv.NumberOfProducts,
    AVG(os.TotalSalesAmount) AS AvgTotalSalesAmount,
    AVG(dv.DeliveryDelayDays) AS AvgDeliveryDelay
FROM 
    ProductVariety pv
JOIN 
    OrderSales os ON pv.OrderID = os.OrderID
JOIN 
    DeliveryDelays dv ON pv.OrderID = dv.OrderID
GROUP BY 
    dv.OrderChannel, pv.NumberOfProducts
ORDER BY 
    dv.OrderChannel, pv.NumberOfProducts DESC;

# Batch_Processing_Part_01-SecondPart


### Data Ingestion and Transformation in Databricks


**Initial data exploration**

**Goals:**

- Create DataFrames by ingesting the data into Databricks.
- Gain a comprehensive understanding of the provided data.
- Perform thorough exploration to uncover any issues or patterns.
- Perform data wrangling to answer business queries.

**Outcomes:**

- Explore the data and share your insights.
- Answer the business problems asked below.

In [0]:
# Preview the customers table before answering the business questions
customers_df.display()

In [0]:
# Preview the orders table before answering the business questions
orders_df.display()

In [0]:
# Print the inferred schema of the three tables used by the spend queries,
# in the order payments, orders, customers.
for frame in (payments_df, orders_df, customers_df):
    frame.printSchema()


### Write a query to identify the top 10 customers by total spend.



In [0]:
%sql
-- Ten customers with the highest gift-card + coupon spend across their orders;
-- NULL amounts count as 0.
SELECT
    cust.CustomerID,
    cust.FirstName,
    cust.LastName,
    COUNT(DISTINCT ord.OrderID) AS NumberOfOrders,
    SUM(COALESCE(pay.GiftCardAmount, 0) + COALESCE(pay.CouponAmount, 0)) AS TotalSpend
FROM customers AS cust
INNER JOIN orders AS ord
    ON cust.CustomerID = ord.CustomerID
INNER JOIN payments AS pay
    ON ord.OrderID = pay.OrderID
GROUP BY cust.CustomerID, cust.FirstName, cust.LastName
ORDER BY TotalSpend DESC
LIMIT 10;


### Write a query to determine the most popular shipping tier for orders.



In [0]:
%sql
-- The single shipping tier used by the largest number of orders
SELECT
    tier.TierName AS Shipping_Tier,
    COUNT(ord.OrderID) AS Total_Orders
FROM
    orders AS ord
INNER JOIN
    shipping_tier AS tier
ON
    ord.ShippingTierID = tier.ShippingTierID
GROUP BY
    tier.TierName
ORDER BY
    Total_Orders DESC
LIMIT 1;


### Calculate the running total of sales for each product.



In [0]:
%sql
-- Running total of sales for each product, accumulated in order-date order.
-- The earlier form grouped by Product_Name and then windowed over
-- PARTITION BY Product_Name, so every partition held exactly one row and the
-- "running" total always equalled Total_Sales. It also joined products twice.
WITH daily_product_sales AS (
    -- Sales per product per order date
    SELECT
        p.Product_Name,
        CAST(o.OrderDate AS DATE) AS OrderDate,
        SUM(oi.Quantity * p.Discounted_Price) AS Total_Sales
    FROM
        order_items oi
    JOIN
        products p ON oi.ProductID = p.Product_ID
    JOIN
        orders o ON oi.OrderID = o.OrderID
    GROUP BY
        p.Product_Name,
        CAST(o.OrderDate AS DATE)
)
SELECT
    Product_Name,
    OrderDate,
    Total_Sales,
    -- Cumulative sales of the product up to and including this order date
    SUM(Total_Sales) OVER (
        PARTITION BY Product_Name
        ORDER BY OrderDate
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS Running_Total
FROM
    daily_product_sales
ORDER BY
    Product_Name,
    OrderDate;


In [0]:
%sql
-- Customer-level summary report (one row per customer)
WITH returned_orders AS (
    -- One row per returned order. Joining the raw returns table would repeat
    -- an order's lines once per return row and inflate the quantity and value
    -- sums below.
    SELECT DISTINCT
        OrderId
    FROM
        returns
)
SELECT
    CONCAT(c.FirstName, ' ', c.LastName) AS customer_name,
    
    -- Total number of orders
    COUNT(DISTINCT o.OrderID) AS tot_orders,
    
    -- Total number of returned orders
    COUNT(DISTINCT r.OrderId) AS tot_returns,
    
    -- Total order value
    ROUND(SUM(oi.Quantity * p.Discounted_Price), 2) AS order_value,
    
    -- Average basket size (total units / total orders), rounded down
    FLOOR(SUM(oi.Quantity) / COUNT(DISTINCT o.OrderID)) AS avg_basket_size,
    
    -- Average basket value (total order value / total orders), rounded to 2 decimals
    ROUND(SUM(oi.Quantity * p.Discounted_Price) / COUNT(DISTINCT o.OrderID), 2) AS avg_basket_value,
    
    -- Length of stay in days (difference between first and last order date)
    DATEDIFF(MAX(o.OrderDate), MIN(o.OrderDate)) AS length_of_stay_days,
    
    -- Average purchase frequency (days per order), rounded to nearest integer.
    -- NULLIF yields NULL for single-order customers instead of dividing by
    -- zero (an error when ANSI mode is enabled).
    ROUND(
        DATEDIFF(MAX(o.OrderDate), MIN(o.OrderDate))
        / NULLIF(COUNT(DISTINCT o.OrderID) - 1, 0)
    ) AS order_purchase_frequency

FROM 
    customers c
JOIN 
    orders o 
ON 
    c.CustomerID = o.CustomerID
JOIN 
    order_items oi 
ON 
    o.OrderID = oi.OrderID
JOIN 
    products p 
ON 
    oi.ProductID = p.Product_ID
LEFT JOIN 
    returned_orders r 
ON 
    o.OrderID = r.OrderId

-- CustomerID is part of the key so two customers sharing a name stay separate
GROUP BY 
    c.CustomerID, c.FirstName, c.LastName
ORDER BY 
    customer_name;


In [0]:
%sql
-- Create a global temporary view for number of products per supplier.
-- OR REPLACE keeps the cell re-runnable: a plain CREATE fails once the view
-- exists from an earlier run.
CREATE OR REPLACE GLOBAL TEMP VIEW supplier_product_counts AS
SELECT
    s.SupplierID,
    s.SupplierName,
    COUNT(DISTINCT oi.ProductID) AS NumberOfProducts
FROM
    suppliers s
LEFT JOIN
    orders o
ON
    s.SupplierID = o.SupplierID
LEFT JOIN
    -- The view is registered as order_items; orders_items is only the CSV file name
    order_items oi
ON
    o.OrderID = oi.OrderID
GROUP BY
    s.SupplierID,
    s.SupplierName
ORDER BY
    NumberOfProducts DESC;


In [0]:
%sql
-- Show the per-supplier product counts stored in the global temp view
SELECT
    spc.*
FROM
    global_temp.supplier_product_counts AS spc;


### Write code to identify products with an average rating of 4.5 or higher.

In [0]:
# Keep only the products whose Product_Rating is 4.5 or above
is_high_rated = col("Product_Rating") >= 4.5
high_rated_products_df = products_df.where(is_high_rated)

high_rated_products_df.show()

### Calculate the number of days between the order placement and shipping date for each order.

In [0]:
# Days elapsed from order placement (OrderDate) to shipping (ShippingDate)
days_to_ship = datediff(col("ShippingDate"), col("OrderDate"))
orders_with_days_df = orders_df.withColumn("Days_to_Ship", days_to_ship)

# Show only the columns relevant to the calculation
preview_columns = ["OrderID", "OrderDate", "ShippingDate", "Days_to_Ship"]
orders_with_days_df.select(*preview_columns).show()


### Calculate the month-over-month growth rate in sales.

The month-over-month (MoM) growth rate measures the percentage change in sales from one month to the next: $\text{MoM} = \dfrac{\text{Sales}_t - \text{Sales}_{t-1}}{\text{Sales}_{t-1}} \times 100$.


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Price every order line and attach its order date.
# - Left joins keep lines whose product or order is missing; their SalesAmount
#   or OrderDate is then NULL.
# - The orders DataFrame loaded earlier in the notebook is reused. The previous
#   version re-read it from /mnt/deamzndata, a mount this notebook never
#   creates, and overwrote orders_df for every later cell.
sales_df = (
    order_items_df
    .join(products_df, order_items_df.ProductID == products_df.Product_ID, "left")
    .withColumn("SalesAmount", F.col("Quantity") * F.col("Discounted_Price"))
    .join(orders_df.select("OrderID", "OrderDate"), "OrderID", "left")
)

# Aggregate sales by Year and Month.
# F.sum is used explicitly: `_sum` was never defined (a star import does not
# create that alias), so the previous version raised NameError here.
monthly_sales_df = (
    sales_df
    .groupBy(F.year("OrderDate").alias("Year"), F.month("OrderDate").alias("Month"))
    .agg(F.sum("SalesAmount").alias("TotalSales"))
    .orderBy("Year", "Month")
)

# Previous row's sales in (Year, Month) order. This is the previous month that
# has data, which is not necessarily the previous calendar month if a month is
# missing. The window has no partition, so Spark processes it in a single
# partition; acceptable for one row per month.
month_window = Window.orderBy("Year", "Month")
monthly_sales_df = monthly_sales_df.withColumn(
    "PrevMonthSales", F.lag("TotalSales").over(month_window)
)

# MoM growth rate in percent; NULL for the first month and when the previous
# month's sales are zero (avoids a division by zero).
prev_sales_nonzero = F.when(F.col("PrevMonthSales") != 0, F.col("PrevMonthSales"))
monthly_sales_df = monthly_sales_df.withColumn(
    "MoM_Growth_Rate",
    F.round((F.col("TotalSales") - F.col("PrevMonthSales")) / prev_sales_nonzero * 100, 2),
)

monthly_sales_df.show()


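### GlobalMart wants the following report to be created to analyze its product performance and gain insights:

One row per product with the columns `Product_ID`, `Product_Name`, `Total_Orders`, `Total_Units_Sold`, `Total_Revenue`, `Avg_Price`, `Total_Returns`, and `Return_rate`.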

In [0]:
from pyspark.sql import functions as F

# Step 1: Join orders_items with products to get product details and revenue
product_df = order_items_df.join(
    products_df,
    order_items_df.ProductID == products_df.Product_ID,
    "inner"
).select(
    products_df.Product_ID,
    products_df.Product_Name,
    order_items_df.OrderID,
    order_items_df.Quantity,
    (order_items_df.Quantity * products_df.Discounted_Price).alias("Revenue")
)

# Step 2: Aggregate product-level metrics
product_report_df = product_df.groupBy("Product_ID", "Product_Name").agg(
    F.countDistinct("OrderID").alias("Total_Orders"),
    F.sum("Quantity").alias("Total_Units_Sold"),
    F.sum("Revenue").alias("Total_Revenue"),
    # Both operands must be columns: "Revenue" / F.col("Quantity") divided the
    # string literal "Revenue" by the quantity instead of the Revenue column.
    F.round(F.avg(F.col("Revenue") / F.col("Quantity")), 2).alias("Avg_Price")
)

# Step 3: Calculate total returns per product
# Returns are matched at order level, so Total_Returns is the number of distinct
# returned orders that contain the product.
# NOTE(review): assumes returns are recorded per order - confirm against the
# returns schema if per-product returns are available.
# The previous left join counted the always-present OrderID join key and
# therefore reported every order line as a return.
returned_orders_df = returns_df.select("OrderID").distinct()
product_returns_df = (
    order_items_df
    .join(returned_orders_df, "OrderID", "inner")
    .groupBy("ProductID")
    .agg(F.countDistinct("OrderID").alias("Total_Returns"))
)

# Step 4: Combine product metrics with returns and calculate return rate
final_product_report_df = product_report_df.join(
    product_returns_df,
    product_report_df.Product_ID == product_returns_df.ProductID,
    "left"
).withColumn(
    # Products that were never returned have no match in the left join
    "Total_Returns", F.coalesce(F.col("Total_Returns"), F.lit(0))
).withColumn(
    # Share of the product's orders that were returned, in percent
    "Return_rate", F.round((F.col("Total_Returns") / F.col("Total_Orders")) * 100, 2)
).select(
    "Product_ID",
    "Product_Name",
    "Total_Orders",
    "Total_Units_Sold",
    "Total_Revenue",
    "Avg_Price",
    "Total_Returns",
    "Return_rate"
)

display(final_product_report_df)


In [0]:
# NOTE(review): this cell held only the bare, undefined name `cust`, which
# raises NameError and stops "Run All". Presumably a preview of the customers
# data was intended before the loyalty-tier analysis - confirm, or delete the cell.
display(customers_df)

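### Based on total spending, classify customers into loyalty tiers (Silver, Gold, Platinum). The criteria are as follows:

- **Platinum:** total spend above 1000
- **Gold:** total spend from 500 to 1000 (inclusive)
- **Silver:** total spend below 500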

In [0]:
from pyspark.sql import functions as F

# Step 1: Calculate total spend per customer
# - F.sum is used explicitly: `_sum` was never defined (a star import does not
#   create that alias), so the previous version raised NameError.
# - A missing amount counts as 0; otherwise a NULL in either column would drop
#   the other column's amount for that payment.
payment_amount = (
    F.coalesce(F.col("CouponAmount"), F.lit(0))
    + F.coalesce(F.col("GiftCardAmount"), F.lit(0))
)
customer_spending_df = (
    orders_df.join(payments_df, "OrderID")
    .groupBy("CustomerID")
    .agg(F.sum(payment_amount).alias("TotalSpent"))
)

# Step 2: Classify customers into loyalty tiers
# Platinum: > 1000, Gold: 500..1000 inclusive, Silver: everything else.
# The branches are evaluated in order, so the Gold test only needs the lower bound.
customer_loyalty_df = customer_spending_df.withColumn(
    "LoyaltyTier",
    F.when(F.col("TotalSpent") > 1000, F.lit("Platinum"))
    .when(F.col("TotalSpent") >= 500, F.lit("Gold"))
    .otherwise(F.lit("Silver"))
)

display(customer_loyalty_df)
