In [2]:
!pip install pyspark



In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [4]:
# Entry point to programming with DataFrames and SQL in Spark
spark = SparkSession.builder.appName("Sales Analysis").getOrCreate()

In [5]:
# Data given in the code snippet
data = [
    ("2024-01-15", 1, "Product_A", 10, 1001),
    ("2024-01-20", 2, "Product_B", 5, 1002),
    ("2024-02-11", 1, "Product_A", 7, 1001),
    ("2024-02-13", 3, "Product_C", 8, 1003),
    ("2024-03-14", 4, "Product_D", 20, 1004),
    ("2024-04-15", 2, "Product_B", 15, 1002),
    ("2024-05-16", 1, "Product_A", 9, 1001),
    ("2024-06-17", 3, "Product_C", 6, 1003),
]

In [6]:
# Define schema for the data
schema = StructType([
    StructField("date", StringType(), True),
    StructField("store_id", IntegerType(), True),
    StructField("product", StringType(), True),
    StructField("quantity", IntegerType(), True),
    StructField("customer_id", IntegerType(), True),
])

In [7]:
# Create a DataFrame
df = spark.createDataFrame(data, schema=schema)

In [8]:
# Show the DataFrame
df.show()

+----------+--------+---------+--------+-----------+
|      date|store_id|  product|quantity|customer_id|
+----------+--------+---------+--------+-----------+
|2024-01-15|       1|Product_A|      10|       1001|
|2024-01-20|       2|Product_B|       5|       1002|
|2024-02-11|       1|Product_A|       7|       1001|
|2024-02-13|       3|Product_C|       8|       1003|
|2024-03-14|       4|Product_D|      20|       1004|
|2024-04-15|       2|Product_B|      15|       1002|
|2024-05-16|       1|Product_A|       9|       1001|
|2024-06-17|       3|Product_C|       6|       1003|
+----------+--------+---------+--------+-----------+



In [9]:
# Example Analysis
# 1. Aggregate total quantity sold per product
df.groupBy("product").sum("quantity").alias("total_quantity").show()

+---------+-------------+
|  product|sum(quantity)|
+---------+-------------+
|Product_C|           14|
|Product_A|           26|
|Product_B|           20|
|Product_D|           20|
+---------+-------------+



In [10]:
# 2. Filter data for a specific product (e.g., "Product_A")
df.filter(df["product"] == "Product_A").show()

+----------+--------+---------+--------+-----------+
|      date|store_id|  product|quantity|customer_id|
+----------+--------+---------+--------+-----------+
|2024-01-15|       1|Product_A|      10|       1001|
|2024-02-11|       1|Product_A|       7|       1001|
|2024-05-16|       1|Product_A|       9|       1001|
+----------+--------+---------+--------+-----------+



In [11]:
# 3. Sort data by date
df.orderBy("date").show()

+----------+--------+---------+--------+-----------+
|      date|store_id|  product|quantity|customer_id|
+----------+--------+---------+--------+-----------+
|2024-01-15|       1|Product_A|      10|       1001|
|2024-01-20|       2|Product_B|       5|       1002|
|2024-02-11|       1|Product_A|       7|       1001|
|2024-02-13|       3|Product_C|       8|       1003|
|2024-03-14|       4|Product_D|      20|       1004|
|2024-04-15|       2|Product_B|      15|       1002|
|2024-05-16|       1|Product_A|       9|       1001|
|2024-06-17|       3|Product_C|       6|       1003|
+----------+--------+---------+--------+-----------+



In [12]:
# 4. Calculate total sales per store
df.groupBy("store_id").sum("quantity").alias("total_sales").show()

+--------+-------------+
|store_id|sum(quantity)|
+--------+-------------+
|       1|           26|
|       3|           14|
|       2|           20|
|       4|           20|
+--------+-------------+

