In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [None]:
## #create one dataframe
## convert date into date format
## find the difference between the sales happening for TV and laptop on the same day

In [13]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [14]:
data = [
    ("2023-09-01", "TV", 100),
    ("2023-09-01", "Laptop", 80),
    ("2023-09-02", "TV", 150),
    ("2023-09-02", "Laptop", 75),
    ("2023-09-03", "TV", 200),
    ("2023-09-03", "Laptop", 0),
    ("2023-09-04", "TV", 98),
    ("2023-09-04", "Laptop", 100)
]

In [15]:
schema = StructType([
    StructField("sales_date",StringType()),
    StructField("product_name",StringType()),
    StructField("sales",IntegerType())
])

In [16]:
df = spark.createDataFrame(data,schema)

In [17]:
df.show()

+----------+------------+-----+
|sales_date|product_name|sales|
+----------+------------+-----+
|2023-09-01|          TV|  100|
|2023-09-01|      Laptop|   80|
|2023-09-02|          TV|  150|
|2023-09-02|      Laptop|   75|
|2023-09-03|          TV|  200|
|2023-09-03|      Laptop|    0|
|2023-09-04|          TV|   98|
|2023-09-04|      Laptop|  100|
+----------+------------+-----+



In [18]:
df.printSchema()

root
 |-- sales_date: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- sales: integer (nullable = true)



In [19]:
## Now convert sales_Date to DATE format from string 

In [36]:
changed_df = df.withColumn("new_sales_date",to_date("sales_date",'yyyy-MM-dd')).select("product_name","sales","new_sales_date")

In [37]:
changed_df.printSchema()

root
 |-- product_name: string (nullable = true)
 |-- sales: integer (nullable = true)
 |-- new_sales_date: date (nullable = true)



In [38]:
changed_df.show()

+------------+-----+--------------+
|product_name|sales|new_sales_date|
+------------+-----+--------------+
|          TV|  100|    2023-09-01|
|      Laptop|   80|    2023-09-01|
|          TV|  150|    2023-09-02|
|      Laptop|   75|    2023-09-02|
|          TV|  200|    2023-09-03|
|      Laptop|    0|    2023-09-03|
|          TV|   98|    2023-09-04|
|      Laptop|  100|    2023-09-04|
+------------+-----+--------------+



In [43]:
new_df = changed_df.groupBy("new_sales_date","product_name").agg(sum("sales").alias("sum_of_sales")).orderBy("new_sales_date")

In [44]:
new_df.show()

+--------------+------------+------------+
|new_sales_date|product_name|sum_of_sales|
+--------------+------------+------------+
|    2023-09-01|          TV|         100|
|    2023-09-01|      Laptop|          80|
|    2023-09-02|          TV|         150|
|    2023-09-02|      Laptop|          75|
|    2023-09-03|      Laptop|           0|
|    2023-09-03|          TV|         200|
|    2023-09-04|      Laptop|         100|
|    2023-09-04|          TV|          98|
+--------------+------------+------------+



In [45]:
new_df.createOrReplaceTempView("orders_table_1113")

In [47]:
spark.sql("select * from orders_table_1113").show()

+--------------+------------+------------+
|new_sales_date|product_name|sum_of_sales|
+--------------+------------+------------+
|    2023-09-01|          TV|         100|
|    2023-09-01|      Laptop|          80|
|    2023-09-02|      Laptop|          75|
|    2023-09-02|          TV|         150|
|    2023-09-03|          TV|         200|
|    2023-09-03|      Laptop|           0|
|    2023-09-04|      Laptop|         100|
|    2023-09-04|          TV|          98|
+--------------+------------+------------+



In [51]:
spark.sql("select new_sales_date , sum(case when product_name = 'TV' then sum_of_sales end) - sum(case when product_name = 'Laptop' then sum_of_sales end) as diff from orders_table_1113 group by new_sales_date order by new_sales_date ")

new_sales_date,diff
2023-09-01,20
2023-09-02,75
2023-09-03,200
2023-09-04,-2
