In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, dayofmonth, month, year, quarter, substring_index, split, when, concat_ws, lit,round
from pyspark.sql.types import DateType
from datetime import datetime, timedelta
import os

In [2]:
spark = SparkSession\
    .builder\
    .master("local[4]")\
    .appName("business_requirement")\
    .config("spark.eventLog.logBlockUpdates.enabled", True)\
    .enableHiveSupport() \
    .getOrCreate()

sc = spark.sparkContext

In [4]:
#Get the most selling products

most_selling_product = spark.sql("""
SELECT
    p.product_name,
    SUM(sf.units) AS total_units_sold
FROM
    retail.sales_transactions_Fact sf
JOIN
    retail.product_dim p ON sf.product_id = p.product_id
GROUP BY
    p.product_name
ORDER BY
    total_units_sold DESC
LIMIT 10
""")

most_selling_product

product_name,total_units_sold
Boots,1144
Sandals,1040
Washing Machine,661
Coffee Maker,637
Blouse,628
Blender,616
Toaster,601
Jeans,592
Headphones,586
Hair Straightener,579


In [14]:
#Get the most redeemed offers from customers

most_redeemed_offer_c = spark.sql("""
SELECT
    o.offer_name,
    count(sf.offer_id) AS total_redeemed_offer
FROM
    retail.sales_transactions_Fact sf
JOIN
    retail.offer_Dim o ON sf.offer_id = o.offer_id
GROUP BY
    o.offer_name
ORDER BY
    total_redeemed_offer DESC
LIMIT 5
""")

most_redeemed_offer_c

offer_name,total_redeemed_offer
offer_4,656
offer_3,638
offer_2,581
offer_1,577
offer_5,560


In [15]:
#Get the most redeemed offers by products

most_redeemed_offer_p = spark.sql("""
SELECT
    p.product_name,
    count(sf.offer_id) AS total_redeemed_offer
FROM
    retail.sales_transactions_Fact sf
JOIN
    retail.product_dim p ON sf.product_id = p.product_id
GROUP BY
    p.product_name
ORDER BY
    total_redeemed_offer DESC
LIMIT 5
""")

most_redeemed_offer_p

product_name,total_redeemed_offer
Sandals,192
Boots,185
Hair Straightener,123
Smartphone,121
Coffee Maker,121


In [5]:
#Get the most loweset cities in online sales

lowest_cities = spark.sql("""

SELECT
    city,
    SUM(total_price) AS total_online_sales
FROM
    retail.sales_transactions_Fact sf
JOIN
    retail.location_Dim c ON sf.location_id = c.location_id
WHERE
    sf.payment_id like 'yes%' 
GROUP BY
    city
ORDER BY
    total_online_sales ASC
limit 10

""")

lowest_cities

city,total_online_sales
Fall River,29.99
Grass Valley,29.99
Falmouth,37.981
Revere,39.99
Redlands,47.984
Marlborough,49.99
Fortuna,50.983
Dublin,53.982
Freetown,79.96
Saugus,79.98


In [6]:
# Stop the Spark session
spark.stop()