In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import trim, lower, date_format, col,count

# Start (or reuse) the Spark session this notebook works with.
spark = (
    SparkSession.builder
    .appName("SalesAnalysis")
    .getOrCreate()
)

# Sample sales records as (sale_id, product_name, sale_date) tuples.
# Product names vary in letter case and dates are plain "yyyy-MM-dd" strings;
# rows 4 and 7 carry the same product and date.
data = [
    (1, "Toycar1", "2000-01-16"),
    (2, "toYcar2", "2000-01-17"),
    (3, "toycaR3", "2000-02-18"),
    (4, "doll1", "2000-02-19"),
    (5, "doll2", "2000-02-28"),
    (6, "data", "2000-03-31"),
    (7, "doll1", "2000-02-19"),
]

# Build the DataFrame with explicit column names, then preview it.
columns = ["sale_id", "product_name", "sale_date"]
sales_df = spark.createDataFrame(data, columns)
sales_df.show()



+-------+------------+----------+
|sale_id|product_name| sale_date|
+-------+------------+----------+
|      1|     Toycar1|2000-01-16|
|      2|     toYcar2|2000-01-17|
|      3|     toycaR3|2000-02-18|
|      4|       doll1|2000-02-19|
|      5|       doll2|2000-02-28|
|      6|        data|2000-03-31|
|      7|       doll1|2000-02-19|
+-------+------------+----------+



<h3>Transformation</h3>

In [0]:
# Step 1: normalise product names -- lower-case them, then trim surrounding
# whitespace -- so differently-cased spellings compare equal.
name_expr = trim(lower(col("product_name")))
df1 = sales_df.withColumn("product_name", name_expr)
df1.show()

# Step 2: reduce each sale date to its "yyyy-MM" year-month string.
month_expr = date_format(col("sale_date"), "yyyy-MM")
df2 = df1.withColumn("sale_date", month_expr)
df2.show()

# Step 3: count sale_id values per (product_name, sale_date) pair and print
# the counts sorted by the same two columns.
group_keys = ["product_name", "sale_date"]
(
    df2.groupBy(*group_keys)
    .agg(count("sale_id").alias("total"))
    .orderBy(*group_keys)
    .show()
)


+-------+------------+----------+
|sale_id|product_name| sale_date|
+-------+------------+----------+
|      1|     toycar1|2000-01-16|
|      2|     toycar2|2000-01-17|
|      3|     toycar3|2000-02-18|
|      4|       doll1|2000-02-19|
|      5|       doll2|2000-02-28|
|      6|        data|2000-03-31|
|      7|       doll1|2000-02-19|
+-------+------------+----------+

+-------+------------+---------+
|sale_id|product_name|sale_date|
+-------+------------+---------+
|      1|     toycar1|  2000-01|
|      2|     toycar2|  2000-01|
|      3|     toycar3|  2000-02|
|      4|       doll1|  2000-02|
|      5|       doll2|  2000-02|
|      6|        data|  2000-03|
|      7|       doll1|  2000-02|
+-------+------------+---------+

+------------+---------+-----+
|product_name|sale_date|total|
+------------+---------+-----+
|        data|  2000-03|    1|
|       doll1|  2000-02|    2|
|       doll2|  2000-02|    1|
|     toycar1|  2000-01|    1|
|     toycar2|  2000-01|    1|
|     toycar3|  2000-02|    1|
+------------+---------+-----+

In [0]:
# The whole pipeline as one chained expression: normalise the product name,
# bucket the sale date by year-month, then count sale_id values per
# (product_name, sale_date) pair, sorted by those two columns.
clean_name = trim(lower(col("product_name")))
sale_month = date_format(col("sale_date"), "yyyy-MM")

result_df = (
    sales_df
    .withColumn("product_name", clean_name)
    .withColumn("sale_date", sale_month)
    .groupBy("product_name", "sale_date")
    .agg(count("sale_id").alias("total"))
    .orderBy("product_name", "sale_date")
)

# Print the aggregated table.
result_df.show()

+------------+---------+-----+
|product_name|sale_date|total|
+------------+---------+-----+
|        data|  2000-03|    1|
|       doll1|  2000-02|    2|
|       doll2|  2000-02|    1|
|     toycar1|  2000-01|    1|
|     toycar2|  2000-01|    1|
|     toycar3|  2000-02|    1|
+------------+---------+-----+

