In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import current_date, lit, when, row_number, col
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType


In [None]:
import findspark
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
findspark.init("/home/ayoub/spark/spark-3.0.3-bin-hadoop3.2") 

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

conf = SparkConf().setAppName("ShoppingDataBI") \
                  .set("spark.sql.execution.arrow.pyspark.enabled", "false") \
                  .setMaster("local[*]") \
                  .set("spark.executor.memory", "3g") \
                  .set("spark.executor.cores", "2") \
                  .set("spark.driver.memory", "3g")  

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [3]:
shopping_df = spark.read.option("header", "true").csv("../dataset/customer_shopping_data.csv")
shopping_df.show(5)

+----------+-----------+------+---+--------+--------+-------+--------------+------------+--------------+
|invoice_no|customer_id|gender|age|category|quantity|  price|payment_method|invoice_date| shopping_mall|
+----------+-----------+------+---+--------+--------+-------+--------------+------------+--------------+
|   I138884|    C241288|Female| 28|Clothing|       5| 1500.4|   Credit Card|    5/8/2022|        Kanyon|
|   I317333|    C111565|  Male| 21|   Shoes|       3|1800.51|    Debit Card|  12/12/2021|Forum Istanbul|
|   I127801|    C266599|  Male| 20|Clothing|       1| 300.08|          Cash|   9/11/2021|     Metrocity|
|   I173702|    C988172|Female| 66|   Shoes|       5|3000.85|   Credit Card|  16/05/2021|  Metropol AVM|
|   I337046|    C189076|Female| 53|   Books|       4|   60.6|          Cash|  24/10/2021|        Kanyon|
+----------+-----------+------+---+--------+--------+-------+--------------+------------+--------------+
only showing top 5 rows



In [4]:
data_mart_path = "../datalake/shopping_data_mart"

shopping_df = shopping_df.withColumn("start_date", current_date()) \
                         .withColumn("end_date", lit(None)) \
                         .withColumn("is_active", lit(True))

shopping_df.write.format("delta").mode("overwrite").save(data_mart_path)

                                                                                

In [5]:
window_spec = Window.partitionBy("customer_id").orderBy("invoice_date")

shopping_df_with_row_num = shopping_df.withColumn("row_num", row_number().over(window_spec))

updated_customer_data = shopping_df_with_row_num \
    .withColumn("is_active", 
                when(col("row_num") == 1, lit(True))  
                .otherwise(lit(False))) \
    .withColumn("end_date", 
                when(col("row_num") > 1, current_date())  
                .otherwise(lit(None))) \
    .drop("row_num")


updated_customer_data.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(data_mart_path)

                                                                                

In [6]:
shopping_df.printSchema()
shopping_df.show(5)

root
 |-- invoice_no: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- price: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- invoice_date: string (nullable = true)
 |-- shopping_mall: string (nullable = true)
 |-- start_date: date (nullable = false)
 |-- end_date: null (nullable = true)
 |-- is_active: boolean (nullable = false)

+----------+-----------+------+---+--------+--------+-------+--------------+------------+--------------+----------+--------+---------+
|invoice_no|customer_id|gender|age|category|quantity|  price|payment_method|invoice_date| shopping_mall|start_date|end_date|is_active|
+----------+-----------+------+---+--------+--------+-------+--------------+------------+--------------+----------+--------+---------+
|   I138884|    C241288|Female| 28|Clothing|       5| 150

In [7]:
# Dimension Customers
customer_dim = shopping_df.select("customer_id", "gender", "age").distinct()

# Dimension Categories
category_dim = shopping_df.select("category").distinct()

# Dimension Shopping Mall
shopping_mall_dim = shopping_df.select("shopping_mall").distinct()


# Show
customer_dim.show(5)
category_dim.show(5)
shopping_mall_dim.show(5)

                                                                                

+-----------+------+---+
|customer_id|gender|age|
+-----------+------+---+
|    C270129|Female| 66|
|    C372643|  Male| 27|
|    C167183|Female| 54|
|    C214260|Female| 65|
|    C198868|Female| 57|
+-----------+------+---+
only showing top 5 rows

+---------------+
|       category|
+---------------+
|Food & Beverage|
|       Clothing|
|       Souvenir|
|          Books|
|     Technology|
+---------------+
only showing top 5 rows

+----------------+
|   shopping_mall|
+----------------+
|          Kanyon|
|       Metrocity|
|Mall of Istanbul|
|    Zorlu Center|
|     Cevahir AVM|
+----------------+
only showing top 5 rows



In [8]:
shopping_df_clean = shopping_df.withColumn("quantity", F.col("quantity").cast(IntegerType())) \
                               .withColumn("price", F.col("price").cast(FloatType()))

shopping_df_clean = shopping_df_clean.fillna({'quantity': 0, 'price': 0.0})

fact_table = shopping_df_clean.withColumn("total_sales", F.col("quantity") * F.col("price"))


fact_table.show(5)

+----------+-----------+------+---+--------+--------+-------+--------------+------------+--------------+----------+--------+---------+-----------+
|invoice_no|customer_id|gender|age|category|quantity|  price|payment_method|invoice_date| shopping_mall|start_date|end_date|is_active|total_sales|
+----------+-----------+------+---+--------+--------+-------+--------------+------------+--------------+----------+--------+---------+-----------+
|   I138884|    C241288|Female| 28|Clothing|       5| 1500.4|   Credit Card|    5/8/2022|        Kanyon|2024-11-07|    null|     true|     7502.0|
|   I317333|    C111565|  Male| 21|   Shoes|       3|1800.51|    Debit Card|  12/12/2021|Forum Istanbul|2024-11-07|    null|     true|  5401.5303|
|   I127801|    C266599|  Male| 20|Clothing|       1| 300.08|          Cash|   9/11/2021|     Metrocity|2024-11-07|    null|     true|     300.08|
|   I173702|    C988172|Female| 66|   Shoes|       5|3000.85|   Credit Card|  16/05/2021|  Metropol AVM|2024-11-07|   

In [9]:
full_table = fact_table.join(customer_dim, "customer_id", "left") \
                       .join(category_dim, "category", "left") \
                       .join(shopping_mall_dim, "shopping_mall", "left")

full_table.show(5)

                                                                                

+--------------+--------+-----------+----------+------+---+--------+-------+--------------+------------+----------+--------+---------+-----------+------+---+
| shopping_mall|category|customer_id|invoice_no|gender|age|quantity|  price|payment_method|invoice_date|start_date|end_date|is_active|total_sales|gender|age|
+--------------+--------+-----------+----------+------+---+--------+-------+--------------+------------+----------+--------+---------+-----------+------+---+
|        Kanyon|Clothing|    C241288|   I138884|Female| 28|       5| 1500.4|   Credit Card|    5/8/2022|2024-11-07|    null|     true|     7502.0|Female| 28|
|Forum Istanbul|   Shoes|    C111565|   I317333|  Male| 21|       3|1800.51|    Debit Card|  12/12/2021|2024-11-07|    null|     true|  5401.5303|  Male| 21|
|     Metrocity|Clothing|    C266599|   I127801|  Male| 20|       1| 300.08|          Cash|   9/11/2021|2024-11-07|    null|     true|     300.08|  Male| 20|
|  Metropol AVM|   Shoes|    C988172|   I173702|Fema

In [10]:
shopping_df_clean = shopping_df_clean.withColumn("invoice_date", F.col("invoice_date").cast("string"))

shopping_df_clean = shopping_df_clean.withColumn("day", F.to_date(F.col("invoice_date"), "d/M/yyyy")) \
                                     .withColumn("month", F.month(F.to_date(F.col("invoice_date"), "d/M/yyyy"))) \
                                     .withColumn("quarter", F.quarter(F.to_date(F.col("invoice_date"), "d/M/yyyy")))

shopping_df_clean.createOrReplaceTempView("shopping_data")

query_day = """
SELECT day, SUM(quantity) AS total_quantity
FROM shopping_data
GROUP BY day
ORDER BY day
"""

query_month = """
SELECT month, SUM(quantity) AS total_quantity
FROM shopping_data
GROUP BY month
ORDER BY month
"""

query_quarter = """
SELECT quarter, SUM(quantity) AS total_quantity
FROM shopping_data
GROUP BY quarter
ORDER BY quarter
"""


result_day = spark.sql(query_day)
result_month = spark.sql(query_month)
result_quarter = spark.sql(query_quarter)



result_day.show()
result_month.show()
result_quarter.show()

                                                                                

+----------+--------------+
|       day|total_quantity|
+----------+--------------+
|2021-01-01|           333|
|2021-01-02|           387|
|2021-01-03|           405|
|2021-01-04|           379|
|2021-01-05|           374|
|2021-01-06|           459|
|2021-01-07|           402|
|2021-01-08|           337|
|2021-01-09|           322|
|2021-01-10|           351|
|2021-01-11|           344|
|2021-01-12|           378|
|2021-01-13|           369|
|2021-01-14|           378|
|2021-01-15|           371|
|2021-01-16|           383|
|2021-01-17|           362|
|2021-01-18|           373|
|2021-01-19|           371|
|2021-01-20|           400|
+----------+--------------+
only showing top 20 rows



                                                                                

+-----+--------------+
|month|total_quantity|
+-----+--------------+
|    1|         34763|
|    2|         31343|
|    3|         26224|
|    4|         22558|
|    5|         23217|
|    6|         22674|
|    7|         23699|
|    8|         22921|
|    9|         22098|
|   10|         23477|
|   11|         22586|
|   12|         23152|
+-----+--------------+

+-------+--------------+
|quarter|total_quantity|
+-------+--------------+
|      1|         92330|
|      2|         68449|
|      3|         68718|
|      4|         69215|
+-------+--------------+



In [11]:

shopping_df_clean = shopping_df_clean.withColumn("invoice_date", F.col("invoice_date").cast("string"))

shopping_df_clean = shopping_df_clean.withColumn("day", F.to_date(F.col("invoice_date"), "d/M/yyyy")) \
                                     .withColumn("month", F.month(F.to_date(F.col("invoice_date"), "d/M/yyyy"))) \
                                     .withColumn("quarter", F.quarter(F.to_date(F.col("invoice_date"), "d/M/yyyy")))


shopping_df_clean.createOrReplaceTempView("shopping_data")

# Average Quantity per Category per Day
query_avg_quantity_day = """
SELECT day, category, AVG(quantity) AS avg_quantity
FROM shopping_data
GROUP BY day, category
ORDER BY day
"""

# Average Quantity per Category per Month
query_avg_quantity_month = """
SELECT month, category, AVG(quantity) AS avg_quantity
FROM shopping_data
GROUP BY month, category
ORDER BY month
"""

# Average Quantity per Category per Quarter
query_avg_quantity_quarter = """
SELECT quarter, category, AVG(quantity) AS avg_quantity
FROM shopping_data
GROUP BY quarter, category
ORDER BY quarter
"""

# Average Price per Category per Day
query_avg_price_day = """
SELECT day, category, AVG(price) AS avg_price
FROM shopping_data
GROUP BY day, category
ORDER BY day
"""

# Average Price per Category per Month
query_avg_price_month = """
SELECT month, category, AVG(price) AS avg_price
FROM shopping_data
GROUP BY month, category
ORDER BY month
"""

# Average Price per Category per Quarter
query_avg_price_quarter = """
SELECT quarter, category, AVG(price) AS avg_price
FROM shopping_data
GROUP BY quarter, category
ORDER BY quarter
"""


result_avg_quantity_day = spark.sql(query_avg_quantity_day)
result_avg_quantity_month = spark.sql(query_avg_quantity_month)
result_avg_quantity_quarter = spark.sql(query_avg_quantity_quarter)

result_avg_price_day = spark.sql(query_avg_price_day)
result_avg_price_month = spark.sql(query_avg_price_month)
result_avg_price_quarter = spark.sql(query_avg_price_quarter)


result_avg_quantity_day.show()
result_avg_quantity_month.show()
result_avg_quantity_quarter.show()

result_avg_price_day.show()
result_avg_price_month.show()
result_avg_price_quarter.show()


                                                                                

+----------+---------------+------------------+
|       day|       category|      avg_quantity|
+----------+---------------+------------------+
|2021-01-01|       Souvenir| 3.142857142857143|
|2021-01-01|           Toys|3.3076923076923075|
|2021-01-01|     Technology|               3.0|
|2021-01-01|          Books|               3.5|
|2021-01-01|       Clothing| 2.892857142857143|
|2021-01-01|          Shoes|3.5714285714285716|
|2021-01-01|      Cosmetics|2.8947368421052633|
|2021-01-01|Food & Beverage|               3.5|
|2021-01-02|          Books|3.3333333333333335|
|2021-01-02|       Clothing| 2.877551020408163|
|2021-01-02|          Shoes|               3.0|
|2021-01-02|       Souvenir| 2.142857142857143|
|2021-01-02|           Toys|               3.0|
|2021-01-02|Food & Beverage|2.9583333333333335|
|2021-01-02|     Technology|               2.6|
|2021-01-02|      Cosmetics| 2.857142857142857|
|2021-01-03|          Books|               3.0|
|2021-01-03|     Technology|            

                                                                                

+-------+---------------+------------------+
|quarter|       category|      avg_quantity|
+-------+---------------+------------------+
|      1|       Souvenir| 2.972721636701798|
|      1|          Books|2.9898862199747156|
|      1|          Shoes| 3.001601537475977|
|      1|      Cosmetics|3.0046878329426807|
|      1|       Clothing|2.9952318623784593|
|      1|           Toys|3.0154689010634868|
|      1|Food & Beverage|2.9745762711864407|
|      1|     Technology|3.0130975769482644|
|      2|       Souvenir|2.9331597222222223|
|      2|      Cosmetics|2.9941944847605226|
|      2|Food & Beverage|3.0200515611572616|
|      2|     Technology|3.0227479526842584|
|      2|          Shoes|3.0094623655913977|
|      2|           Toys|3.0008795074758137|
|      2|          Books| 2.992888888888889|
|      2|       Clothing| 3.017825311942959|
|      3|       Souvenir| 3.020924149956408|
|      3|           Toys|2.9833475661827498|
|      3|     Technology|              2.98|
|      3| 

                                                                                

+----------+---------------+------------------+
|       day|       category|         avg_price|
+----------+---------------+------------------+
|2021-01-01|          Shoes|2143.4642944335938|
|2021-01-01|       Souvenir| 36.86571448189871|
|2021-01-01|      Cosmetics|117.70000156603362|
|2021-01-01|       Clothing| 868.0885522024972|
|2021-01-01|           Toys|118.54769134521484|
|2021-01-01|     Technology|            3150.0|
|2021-01-01|          Books| 53.02499961853027|
|2021-01-01|Food & Beverage|18.304999896458217|
|2021-01-02|       Clothing| 863.4954902493224|
|2021-01-02|Food & Beverage|15.472083131472269|
|2021-01-02|          Books|50.499999364217125|
|2021-01-02|           Toys|107.51999867757162|
|2021-01-02|       Souvenir|25.135713849748885|
|2021-01-02|      Cosmetics|116.17142949785504|
|2021-01-02|     Technology|            2730.0|
|2021-01-02|          Shoes|1800.5099836077009|
|2021-01-03|       Clothing| 856.5919855291194|
|2021-01-03|          Books| 45.44999917

In [12]:
shopping_df_clean.printSchema()
shopping_df_clean.show(5)

root
 |-- invoice_no: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = false)
 |-- price: float (nullable = false)
 |-- payment_method: string (nullable = true)
 |-- invoice_date: string (nullable = true)
 |-- shopping_mall: string (nullable = true)
 |-- start_date: date (nullable = false)
 |-- end_date: null (nullable = true)
 |-- is_active: boolean (nullable = false)
 |-- day: date (nullable = true)
 |-- month: integer (nullable = true)
 |-- quarter: integer (nullable = true)

+----------+-----------+------+---+--------+--------+-------+--------------+------------+--------------+----------+--------+---------+----------+-----+-------+
|invoice_no|customer_id|gender|age|category|quantity|  price|payment_method|invoice_date| shopping_mall|start_date|end_date|is_active|       day|month|quarter|
+----------+-----------+-----

In [13]:
shopping_df_clean = shopping_df_clean.withColumn("total_sales", F.col("quantity") * F.col("price"))


shopping_df_clean_cleaned = shopping_df_clean.filter(F.col("total_sales").isNotNull() & (F.col("total_sales") > 0))

categories = shopping_df_clean_cleaned.select("category").distinct()
days = shopping_df_clean_cleaned.select("day").distinct()

all_combinations = categories.crossJoin(days)

avg_sales_per_day = all_combinations.join(
    shopping_df_clean_cleaned.groupBy("category", "day")
    .agg(F.avg("total_sales").alias("avg_sales")),
    on=["category", "day"],
    how="left"
)

avg_sales_per_day.orderBy("category", "day").show()


                                                                                

+--------+----------+------------------+
|category|       day|         avg_sales|
+--------+----------+------------------+
|   Books|2021-01-01| 209.5749994913737|
|   Books|2021-01-02|212.09999783833823|
|   Books|2021-01-03| 166.6499973932902|
|   Books|2021-01-04|215.12999839782714|
|   Books|2021-01-05| 285.6857125418527|
|   Books|2021-01-06|233.30999603271485|
|   Books|2021-01-07| 94.68750079472859|
|   Books|2021-01-08|186.12857491629464|
|   Books|2021-01-09| 212.0999984741211|
|   Books|2021-01-10|163.61999969482423|
|   Books|2021-01-11| 148.9749994277954|
|   Books|2021-01-12|219.67499764760336|
|   Books|2021-01-13|  128.775000333786|
|   Books|2021-01-14|245.76666577657065|
|   Books|2021-01-15| 98.47500228881836|
|   Books|2021-01-16|173.53636273470792|
|   Books|2021-01-17| 106.0499997138977|
|   Books|2021-01-18| 103.0200002670288|
|   Books|2021-01-19| 161.2392850603376|
|   Books|2021-01-20|212.09999783833823|
+--------+----------+------------------+
only showing top

                                                                                

In [14]:

shopping_data = spark.read.format("delta").load(data_mart_path)

shopping_data_pd = shopping_data.toPandas()

shopping_data_pd.head()

                                                                                

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall,start_date,end_date,is_active
0,I314581,C100311,Male,66,Cosmetics,4,162.64,Debit Card,1/9/2022,Kanyon,2024-11-07,,True
1,I210837,C101776,Female,56,Food & Beverage,3,15.69,Credit Card,31/10/2021,Metrocity,2024-11-07,,True
2,I708544,C101845,Female,28,Food & Beverage,2,10.46,Cash,2/1/2022,Kanyon,2024-11-07,,True
3,I540074,C102302,Male,25,Clothing,1,300.08,Credit Card,16/11/2022,Mall of Istanbul,2024-11-07,,True
4,I340411,C102342,Male,30,Clothing,2,600.16,Credit Card,5/3/2023,Kanyon,2024-11-07,,True


In [16]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.layouts import column
import pandas as pd

In [17]:
shopping_data = spark.read.format("delta").load(data_mart_path)

shopping_data_pd = shopping_data.toPandas()

output_notebook()

source = ColumnDataSource(shopping_data_pd)

                                                                                

In [20]:
result_avg_quantity_day_pd = result_avg_quantity_day.toPandas()

source = ColumnDataSource(result_avg_quantity_day_pd)

p = figure(x_axis_type="datetime", title="Average Quantity per Category per Day", plot_height=400, plot_width=800)

for category in result_avg_quantity_day_pd['category'].unique():
    category_data = result_avg_quantity_day_pd[result_avg_quantity_day_pd['category'] == category]
    p.line('day', 'avg_quantity', source=ColumnDataSource(category_data), legend_label=str(category), line_width=2)

p.xaxis.axis_label = 'Day'
p.yaxis.axis_label = 'Average Quantity'
p.legend.title = "Category"
p.legend.location = "top_left"

output_notebook()
show(p)

                                                                                

In [21]:
result_avg_price_month_pd = result_avg_price_month.toPandas()

source = ColumnDataSource(result_avg_price_month_pd)

p2 = figure(x_axis_type="linear", title="Average Price per Category per Month", plot_height=400, plot_width=800)

for category in result_avg_price_month_pd['category'].unique():
    category_data = result_avg_price_month_pd[result_avg_price_month_pd['category'] == category]
    p2.line('month', 'avg_price', source=ColumnDataSource(category_data), legend_label=str(category), line_width=2)

p2.xaxis.axis_label = 'Month'
p2.yaxis.axis_label = 'Average Price'
p2.legend.title = "Category"
p2.legend.location = "top_left"

show(p2)

In [22]:
result_month_pd = result_month.toPandas()

source = ColumnDataSource(result_month_pd)

p3 = figure(x_axis_type="linear", title="Total Quantity per Month", plot_height=400, plot_width=800)

p3.vbar(x='month', top='total_quantity', source=source, width=0.9, legend_field="month", line_color="white", fill_color="blue")

p3.xaxis.axis_label = 'Month'
p3.yaxis.axis_label = 'Total Quantity'
p3.legend.title = "Month"
p3.legend.location = "top_left"

show(p3)

                                                                                

In [23]:
result_quarter_pd = result_quarter.toPandas()

source = ColumnDataSource(result_quarter_pd)

p4 = figure(x_axis_type="linear", title="Total Quantity per Quarter", plot_height=400, plot_width=800)

p4.vbar(x='quarter', top='total_quantity', source=source, width=0.9, legend_field="quarter", line_color="white", fill_color="green")

p4.xaxis.axis_label = 'Quarter'
p4.yaxis.axis_label = 'Total Quantity'
p4.legend.title = "Quarter"
p4.legend.location = "top_left"

show(p4)

In [None]:
spark.stop()