In [0]:
import pandas as pd
from pyspark.sql.types import StructType,StructField, IntegerType, StringType, DateType 
# Column layout applied to the sales CSV when it is read below.
schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("customer_id", StringType(), True),
    StructField("order_date", DateType(), True),
    StructField("location", StringType(), True),
    StructField("source_order", StringType(), True),
])

# Raw sales records. The schema is supplied explicitly, so the "inferSchema"
# option is intentionally not set: Spark skips inference whenever a
# user-specified schema is given, which made that option a no-op.
df = (
    spark.read.format("csv")
    .schema(schema)
    .load("/FileStore/tables/sales_csv.txt")
)
display(df)


product_id,customer_id,order_date,location,source_order
1,A,2023-01-01,India,Swiggy
2,A,2022-01-01,India,Swiggy
2,A,2023-01-07,India,Swiggy
3,A,2023-01-10,India,Restaurant
3,A,2022-01-11,India,Swiggy
3,A,2023-01-11,India,Restaurant
2,B,2022-02-01,India,Swiggy
2,B,2023-01-02,India,Swiggy
1,B,2023-01-04,India,Restaurant
1,B,2023-02-11,India,Swiggy


In [0]:
from pyspark.sql.functions import year, month, quarter

# Derive calendar parts (year, month, quarter) from order_date so that later
# cells can group sales by them.
sales_df = (
    df
    .withColumn("order_year", year("order_date"))
    .withColumn("order_month", month("order_date"))
    .withColumn("order_quarter", quarter("order_date"))
)
display(sales_df)

product_id,customer_id,order_date,location,source_order,order_year,order_month,order_quarter
1,A,2023-01-01,India,Swiggy,2023,1,1
2,A,2022-01-01,India,Swiggy,2022,1,1
2,A,2023-01-07,India,Swiggy,2023,1,1
3,A,2023-01-10,India,Restaurant,2023,1,1
3,A,2022-01-11,India,Swiggy,2022,1,1
3,A,2023-01-11,India,Restaurant,2023,1,1
2,B,2022-02-01,India,Swiggy,2022,2,1
2,B,2023-01-02,India,Swiggy,2023,1,1
1,B,2023-01-04,India,Restaurant,2023,1,1
1,B,2023-02-11,India,Swiggy,2023,2,1


In [0]:
from pyspark.sql.types import StructType,StructField, IntegerType, StringType, DateType 
# Column layout applied to the menu CSV when it is read below.
# price is declared as an integer (not a string) because later cells sum it;
# a string column would only aggregate through an implicit cast to double.
schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("price", IntegerType(), True),
])

# Menu lookup table. The schema is supplied explicitly, so the "inferSchema"
# option is intentionally not set: Spark skips inference whenever a
# user-specified schema is given, which made that option a no-op.
menu_df = (
    spark.read.format("csv")
    .schema(schema)
    .load("/FileStore/tables/menu_csv-2.txt")
)
display(menu_df)

product_id,product_name,price
1,PIZZA,100
2,Chowmin,150
3,sandwich,120
4,Dosa,110
5,Biryani,80
6,Pasta,180


In [0]:
# Sum of menu prices per customer, listed in customer_id order.
Total_price_Customer = (
    df.join(menu_df, 'product_id')      # attach price to every sale row
    .groupBy('customer_id')
    .agg({'price': 'sum'})
    .orderBy('customer_id')
)
display(Total_price_Customer)

customer_id,sum(price)
A,4260.0
B,4440.0
C,2400.0
D,1200.0
E,2040.0


Databricks visualization. Run in Databricks to view.

In [0]:
# Sum of menu prices per product, listed in product_name order.
Total_price_product = (
    df.join(menu_df, 'product_id')      # attach name and price to every sale row
    .groupBy('product_name')
    .agg({'price': 'sum'})
    .orderBy('product_name')
)
display(Total_price_product)

product_name,sum(price)
Biryani,480.0
Chowmin,3600.0
Dosa,1320.0
PIZZA,2100.0
Pasta,1080.0
sandwich,5760.0


In [0]:
# Sum of menu prices per order month, in ascending month order.
Total_Sales = (
    sales_df.join(menu_df, 'product_id')
    .groupBy('order_month')
    .agg({'price': 'sum'})
    .orderBy('order_month')
)
display(Total_Sales)

# Same aggregation, grouped by order year instead of month.
Total_Sales_year = (
    sales_df.join(menu_df, 'product_id')
    .groupBy('order_year')
    .agg({'price': 'sum'})
    .orderBy('order_year')
)
display(Total_Sales_year)

order_month,sum(price)
1,2960.0
2,2730.0
3,910.0
5,2960.0
6,2960.0
7,910.0
11,910.0


Databricks visualization. Run in Databricks to view.

order_year,sum(price)
2022,4350.0
2023,9990.0


Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import count
# How many times each product was purchased, most-purchased first.
# product_id is part of the grouping key but is dropped from the result,
# leaving product_name and product_count.
Count_Product_Purchased = (
    sales_df.join(menu_df, 'product_id')
    .groupBy('product_id', 'product_name')
    .agg(count('product_id').alias('product_count'))
    .orderBy('product_count', ascending=False)
    .drop('product_id')
)
display(Count_Product_Purchased)

product_name,product_count
sandwich,48
Chowmin,24
PIZZA,21
Dosa,12
Biryani,6
Pasta,6


Databricks visualization. Run in Databricks to view.

In [0]:

# The single most-purchased product: the per-product purchase counts sorted
# in descending order and cut down to the first row.
Count_Product_Purchased = (
    sales_df.join(menu_df, 'product_id')
    .groupBy('product_id', 'product_name')
    .agg(count('product_id').alias('product_count'))
    .orderBy('product_count', ascending=False)
    .drop('product_id')
    .limit(1)
)
display(Count_Product_Purchased)

product_name,product_count
sandwich,48


Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import countDistinct

# Number of orders placed through the 'Restaurant' source, per customer.
# The result gets its own name instead of being assigned to `df`: `df` is the
# raw sales DataFrame that earlier cells read, and overwriting it here would
# break those cells whenever they are re-run after this one.
Restaurant_Visits_Customer = (
    sales_df
    .filter(sales_df.source_order == 'Restaurant')
    .groupBy('customer_id')
    .agg(count('customer_id'))
)

display(Restaurant_Visits_Customer)

customer_id,count(customer_id)
E,6
B,6
D,3
C,3
A,9


Databricks visualization. Run in Databricks to view.

In [0]:
# Sum of menu prices per sales location, in ascending location order.
Total_Sales_Country = (
    sales_df.join(menu_df, 'product_id')
    .groupBy('Location')
    .agg({'price': 'sum'})
    .orderBy('Location', ascending=True)
)
display(Total_Sales_Country)

Location,sum(price)
India,4860.0
UK,7020.0
USA,2460.0


Databricks visualization. Run in Databricks to view.

In [0]:
# Sum of menu prices per order source, in ascending source order.
Total_Sales_Source = (
    sales_df.join(menu_df, 'product_id')
    .groupBy('source_order')
    .agg({'price': 'sum'})
    .orderBy('source_order', ascending=True)
)
display(Total_Sales_Source)

source_order,sum(price)
Restaurant,3090.0
Swiggy,6330.0
zomato,4920.0


Databricks visualization. Run in Databricks to view.