In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum, count
from pyspark.sql.types import DateType

In [0]:
spark = SparkSession.builder.appName("CustomerOrdersAnalysis").getOrCreate()
spark

<pyspark.sql.connect.session.SparkSession at 0x7f60a0139010>

In [0]:
customers_df = spark.read.csv("/FileStore/tables/customers-1.csv", header=True, inferSchema=True)
orders_df = spark.read.csv("/FileStore/tables/orders-2.csv", header=True, inferSchema=True)

In [0]:
print("Customers Schema:")
customers_df.printSchema()
print("\nOrders Schema:")
orders_df.printSchema()

Customers Schema:
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)


Orders Schema:
root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: timestamp (nullable = true)



In [0]:
# 3. Add TotalAmount column
orders_df = orders_df.withColumn("TotalAmount", col("Quantity") * col("Price"))
display(orders_df)

OrderID,CustomerID,Product,Quantity,Price,OrderDate,TotalAmount
1001,101,Laptop,1,70000,2024-01-05T00:00:00.000Z,70000
1002,102,Mobile,2,25000,2024-02-10T00:00:00.000Z,50000
1003,103,Desk,1,10000,2024-03-15T00:00:00.000Z,10000
1004,101,Mouse,3,1000,2024-04-01T00:00:00.000Z,3000
1005,104,Monitor,1,12000,2024-04-25T00:00:00.000Z,12000


In [0]:
# 4. Join DataFrames on CustomerID
joined_df = orders_df.join(customers_df, "CustomerID", "inner")
display(joined_df)

CustomerID,OrderID,Product,Quantity,Price,OrderDate,TotalAmount,Name,City,Age
101,1001,Laptop,1,70000,2024-01-05T00:00:00.000Z,70000,Aditi,Mumbai,28
102,1002,Mobile,2,25000,2024-02-10T00:00:00.000Z,50000,Rohan,Delhi,35
103,1003,Desk,1,10000,2024-03-15T00:00:00.000Z,10000,Meena,Bangalore,41
101,1004,Mouse,3,1000,2024-04-01T00:00:00.000Z,3000,Aditi,Mumbai,28
104,1005,Monitor,1,12000,2024-04-25T00:00:00.000Z,12000,Kabir,Hyderabad,30


In [0]:
# 5. Filter orders where TotalAmount > 20000
high_val = joined_df.filter(col("TotalAmount") > 20000)
print("\nOrders with TotalAmount > 20000:")
display(high_val)


Orders with TotalAmount > 20000:


CustomerID,OrderID,Product,Quantity,Price,OrderDate,TotalAmount,Name,City,Age
101,1001,Laptop,1,70000,2024-01-05T00:00:00.000Z,70000,Aditi,Mumbai,28
102,1002,Mobile,2,25000,2024-02-10T00:00:00.000Z,50000,Rohan,Delhi,35


In [0]:
# 6. Show customers who placed more than 1 order
mul_orders = joined_df.groupBy("CustomerID", "Name").agg(count("OrderID").alias("OrderCount")) \
    .filter(col("OrderCount") > 1)
print("\nCustomers with more than 1 order:")
display(mul_orders)


Customers with more than 1 order:


CustomerID,Name,OrderCount
101,Aditi,2


In [0]:
# 7. Group orders by City and get average order value
avg_orde = joined_df.groupBy("City").agg(avg("TotalAmount").alias("AvgOrderValue"))
print("\nAverage order value by city:")
display(avg_orde)


Average order value by city:


City,AvgOrderValue
Bangalore,10000.0
Delhi,50000.0
Mumbai,36500.0
Hyderabad,12000.0


In [0]:
# 8. Sort orders by OrderDate in descending order
sorted_orders = joined_df.orderBy(col("OrderDate").desc())
print("\nOrders sorted by date (descending):")
display(sorted_orders)


Orders sorted by date (descending):


CustomerID,OrderID,Product,Quantity,Price,OrderDate,TotalAmount,Name,City,Age
104,1005,Monitor,1,12000,2024-04-25T00:00:00.000Z,12000,Kabir,Hyderabad,30
101,1004,Mouse,3,1000,2024-04-01T00:00:00.000Z,3000,Aditi,Mumbai,28
103,1003,Desk,1,10000,2024-03-15T00:00:00.000Z,10000,Meena,Bangalore,41
102,1002,Mobile,2,25000,2024-02-10T00:00:00.000Z,50000,Rohan,Delhi,35
101,1001,Laptop,1,70000,2024-01-05T00:00:00.000Z,70000,Aditi,Mumbai,28


In [0]:
# 9. Write final result as Parquet partitioned by City
joined_df.write.partitionBy("City").mode("overwrite").parquet("output/customer_orders_analysis.parquet")

In [0]:
# 10. Create temporary view and run Spark SQL
joined_df.createOrReplaceTempView("customer_orders")
spark.sql("SELECT * FROM customer_orders").show()

+----------+-------+--------+--------+-----+-------------------+-----------+------+---------+---+
|CustomerID|OrderID| Product|Quantity|Price|          OrderDate|TotalAmount|  Name|     City|Age|
+----------+-------+--------+--------+-----+-------------------+-----------+------+---------+---+
|       101|   1001|  Laptop|       1|70000|2024-01-05 00:00:00|      70000| Aditi|   Mumbai| 28|
|       102|   1002|  Mobile|       2|25000|2024-02-10 00:00:00|      50000| Rohan|    Delhi| 35|
|       103|   1003|    Desk|       1|10000|2024-03-15 00:00:00|      10000| Meena|Bangalore| 41|
|       101|   1004|   Mouse|       3| 1000|2024-04-01 00:00:00|       3000| Aditi|   Mumbai| 28|
|       104|   1005| Monitor|       1|12000|2024-04-25 00:00:00|      12000| Kabir|Hyderabad| 30|
+----------+-------+--------+--------+-----+-------------------+-----------+------+---------+---+



In [0]:
# Total sales by customer
print("\nTotal sales by customer:")
spark.sql("""
    select c.customerid, c.name, sum(o.totalamount) as totalspent
    from customer_orders o
    join customer_orders c on o.customerid = c.customerid
    group by c.customerid, c.name
    order by totalspent desc
""").show()

# count of products per city
print("\ncount of products per city:")
spark.sql("""
    select city, count(product) as productcount
    from customer_orders
    group by city
    order by productcount desc
""").show()

# top 2 cities by revenue
print("\ntop 2 cities by revenue:")
spark.sql("""
    select city, sum(totalamount) as totalrevenue
    from customer_orders
    group by city
    order by totalrevenue desc
    limit 2
""").show()


Total sales by customer:
+----------+------+----------+
|customerid|  name|totalspent|
+----------+------+----------+
|       101| Aditi|    146000|
|       102| Rohan|     50000|
|       104| Kabir|     12000|
|       103| Meena|     10000|
+----------+------+----------+


count of products per city:
+---------+------------+
|     city|productcount|
+---------+------------+
|   Mumbai|           2|
|    Delhi|           1|
|Bangalore|           1|
|Hyderabad|           1|
+---------+------------+


top 2 cities by revenue:
+------+------------+
|  city|totalrevenue|
+------+------------+
|Mumbai|       73000|
| Delhi|       50000|
+------+------------+

