In [1]:
from pyspark.sql import SparkSession
# Build the notebook's Spark session; getOrCreate() hands back an already
# running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('June13Assignment1')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [2]:
# 1. Ingest the CSV files into two PySpark DataFrames.
#    The first row of each file is used as the header, and column types are
#    inferred from the data rather than declared up front.
customers_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/customers.csv')
)
customers_df.show()

orders_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/orders.csv')
)
orders_df.show()

# 2. Print the inferred schema of both DataFrames.
customers_df.printSchema()
orders_df.printSchema()

# 3. Add a derived column: TotalAmount = Quantity * Price.
#    withColumn replaces a same-named column, so re-running this cell does
#    not stack extra columns onto orders_df.
orders_df = orders_df.withColumn(
    'TotalAmount',
    orders_df.Quantity * orders_df.Price,
)
orders_df.show()

# 4. Join customers with their orders on CustomerID.
#    Inner join: only customers with at least one matching order are kept.
customer_orders_df = customers_df.join(orders_df, 'CustomerID', 'inner')
customer_orders_df.show()

+----------+-----+---------+---+
|CustomerID| Name|     City|Age|
+----------+-----+---------+---+
|       101|Aditi|   Mumbai| 28|
|       102|Rohan|    Delhi| 35|
|       103|Meena|Bangalore| 41|
|       104|Kabir|Hyderabad| 30|
|       105| Zoya|  Chennai| 25|
+----------+-----+---------+---+

+-------+----------+-------+--------+-----+----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|
+-------+----------+-------+--------+-----+----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|
|   1002|       102| Mobile|       2|25000|2024-02-10|
|   1003|       103|   Desk|       1|10000|2024-03-15|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|
|   1005|       104|Monitor|       1|12000|2024-04-25|
+-------+----------+-------+--------+-----+----------+

root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |--

In [6]:
from pyspark.sql.functions import col

# 5. Keep only the joined rows whose order total exceeds 20000.
customer_orders_df.where(col('TotalAmount') > 20000).show()

# 6. Customers with more than one order: count rows per CustomerID,
#    give the count a descriptive name, then keep counts above 1.
(
    customer_orders_df
    .groupBy('CustomerID')
    .count()
    .withColumnRenamed('count', 'order_count')
    .where(col('order_count') > 1)
    .show()
)

# 7. Average order value (TotalAmount) per City.
(
    customer_orders_df
    .groupBy('City')
    .avg('TotalAmount')
    .show()
)

# 8. Orders sorted by OrderDate, newest first.
customer_orders_df.sort(col('OrderDate').desc()).show()

# 9. Persist the joined result as Parquet, one sub-directory per City.
#    'overwrite' replaces any output left by a previous run of this cell.
customer_orders_df.write.parquet(
    '/content/customer_orders.parquet',
    mode='overwrite',
    partitionBy='City',
)

# 10. Expose the joined DataFrame to Spark SQL as a temporary view.
customer_orders_df.createOrReplaceTempView('customer_orders')

# Total sales by customer
spark.sql(
    'select CustomerID,sum(TotalAmount) as TotalSales from customer_orders group by CustomerID'
).show()

# Count of products per city
# (count(Product) counts rows with a non-null Product, not distinct products)
spark.sql(
    'select City, count(Product) as ProductCount from customer_orders group by City'
).show()

# Top 2 cities by revenue
spark.sql(
    'select City,sum(TotalAmount) as TotalRevenue from customer_orders group by City order by TotalRevenue desc limit 2'
).show()

+----------+-----+------+---+-------+-------+--------+-----+----------+-----------+
|CustomerID| Name|  City|Age|OrderID|Product|Quantity|Price| OrderDate|TotalAmount|
+----------+-----+------+---+-------+-------+--------+-----+----------+-----------+
|       101|Aditi|Mumbai| 28|   1001| Laptop|       1|70000|2024-01-05|      70000|
|       102|Rohan| Delhi| 35|   1002| Mobile|       2|25000|2024-02-10|      50000|
+----------+-----+------+---+-------+-------+--------+-----+----------+-----------+

+----------+-----------+
|CustomerID|order_count|
+----------+-----------+
|       101|          2|
+----------+-----------+

+---------+----------------+
|     City|avg(TotalAmount)|
+---------+----------------+
|Bangalore|         10000.0|
|   Mumbai|         36500.0|
|    Delhi|         50000.0|
|Hyderabad|         12000.0|
+---------+----------------+

+----------+-----+---------+---+-------+-------+--------+-----+----------+-----------+
|CustomerID| Name|     City|Age|OrderID|Product|Q