### Reading Data from Azure Blob Storage

In [None]:
%python
spark.conf.set(f"fs.azure.account.key.cs0storage0acc.blob.core.windows.net", '<accountkey>')


In [None]:
%python
parquet_path = f"wasbs://transformationoutput@cs0storage0acc.blob.core.windows.net/output.parquet"
df = spark.read.parquet(parquet_path)
df.toPandas()

Unnamed: 0,store_id,product_id,date,sales,revenue,stock,price,promo_type_1,day_id,month_id,...,product_width,cluster_id,hierarchy1_id,hierarchy2_id,hierarchy3_id,hierarchy4_id,hierarchy5_id,storetype_id,store_size,city_id
0,S0006,P0004,2017-01-02,0.0,0.00,18.0,4.50,PR14,2,1,...,4.0,cluster_3,H03,H0314,H031405,H03140500,H0314050003,ST03,8,C024
1,S0013,P0005,2017-01-02,0.0,0.00,11.0,33.90,PR14,2,1,...,16.0,cluster_9,H03,H0312,H031211,H03121109,H0312110917,ST04,33,C026
2,S0026,P0005,2017-01-02,0.0,0.00,4.0,33.90,PR14,2,1,...,16.0,cluster_9,H03,H0312,H031211,H03121109,H0312110917,ST04,41,C014
3,S0013,P0001,2017-01-02,2.0,10.59,0.0,6.25,PR14,2,1,...,20.0,cluster_5,H01,H0105,H010501,H01050100,H0105010006,ST04,33,C026
4,S0104,P0005,2017-01-02,0.0,0.00,3.0,33.90,PR14,2,1,...,16.0,cluster_9,H03,H0312,H031211,H03121109,H0312110917,ST04,47,C002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2143515,S0141,P0099,2019-11-08,0.0,0.00,9.0,8.00,PR14,8,11,...,10.0,cluster_0,H03,H0313,H031302,H03130210,H0313021001,ST03,12,C005
2143516,S0020,P0100,2019-11-08,0.0,0.00,9.0,7.90,PR14,8,11,...,21.0,cluster_0,H03,H0313,H031307,H03130700,H0313070009,ST04,36,C014
2143517,S0050,P0078,2019-11-05,0.0,0.00,9.0,6.50,PR14,5,11,...,7.5,cluster_2,H00,H0003,H000312,H00031205,H0003120501,ST04,31,C014
2143518,S0050,P0100,2019-11-08,0.0,0.00,9.0,7.90,PR14,8,11,...,21.0,cluster_0,H03,H0313,H031307,H03130700,H0313070009,ST04,31,C014


### Creating Materialized Views

In [None]:
%python
df.createOrReplaceTempView("overview")

In [None]:
%python
store_df = df.select(
    "store_id",
    "storetype_id",
    "store_size",
    "city_id"
)
store_df.createOrReplaceTempView("stores_view")

In [None]:
%python
sales_view_df = df.select(
    "product_id",
    "store_id",
    "date",
    "stock",
    "sales",
    "revenue",
    "price",
    "day_id",
    "month_id",
    "year_id",
    "day_of_week",
    "business_quarter",
    "promo_type_1"
)

sales_view_df.createOrReplaceTempView("sales_view")

### Total Products

In [None]:
%sql
SELECT 
    COUNT(DISTINCT product_id) AS product_count
FROM sales_view;

product_count
87


Databricks visualization. Run in Databricks to view.

### Total Stores

In [None]:
%sql
SELECT 
    COUNT(DISTINCT store_id) AS distinct_store_count
FROM sales_view;

distinct_store_count
144


Databricks visualization. Run in Databricks to view.

### Total Revenue

In [None]:
%sql
SELECT 
    SUM(revenue) AS total_revenue
FROM sales_view;

total_revenue
2888884.230000195


Databricks visualization. Run in Databricks to view.

### Total Sales

In [None]:
%sql
SELECT 
    COUNT(1) AS total_sales
FROM sales_view WHERE revenue>0;

total_sales
364381


Databricks visualization. Run in Databricks to view.

### Total Available Locations 

In [None]:
%sql
SELECT count(DISTINCT city_id) as locations FROM stores_view

locations
37


Databricks visualization. Run in Databricks to view.

### Top Products from Top Shops

In [None]:
%sql
WITH ShopRevenue AS (
    SELECT 
        store_id,
        SUM(revenue) AS total_revenue
    FROM 
        sales_view
    GROUP BY 
        store_id
),

TopShops AS (
    SELECT 
        store_id,
        total_revenue,
        RANK() OVER (ORDER BY total_revenue DESC) AS shop_rank
    FROM 
        ShopRevenue
),

RankedProducts AS (
    SELECT 
        s.store_id,
        p.product_id,
        SUM(p.sales) AS total_sales,
        SUM(p.revenue) AS total_revenue,
        RANK() OVER (PARTITION BY s.store_id ORDER BY SUM(p.revenue) DESC) AS product_rank
    FROM 
        sales_view p
    JOIN 
        TopShops s ON p.store_id = s.store_id
    WHERE 
        s.shop_rank <= 3
    GROUP BY 
        s.store_id, p.product_id
)

SELECT 
    store_id,
    total_sales,
    total_revenue,
    product_id
FROM 
    RankedProducts
WHERE 
    product_rank = 1
ORDER BY 
    store_id;



store_id,total_sales,total_revenue,product_id
S0026,880.0,10484.360000000022,P0059
S0038,2438.0,9777.97999999997,P0035
S0085,852.0,9916.030000000015,P0059


Databricks visualization. Run in Databricks to view.

### Top Products

In [None]:
%sql
SELECT 
    product_id,
    SUM(sales) AS total_sales
FROM sales_view
GROUP BY product_id
ORDER BY total_sales DESC
LIMIT 3;


product_id,total_sales
P0051,236744.0
P0017,104069.0
P0035,55916.0


Databricks visualization. Run in Databricks to view.

### Revenue Performance over Time

In [None]:
%sql
SELECT 
    make_date(year_id, month_id, 1) AS timeline,
    SUM(revenue) AS total_revenue
FROM sales_view
GROUP BY timeline
HAVING SUM(revenue) > 0
ORDER BY timeline;

timeline,total_revenue
2017-01-01,53138.269999999815
2017-02-01,45948.85999999961
2017-03-01,67973.02999999955
2017-04-01,49404.91999999973
2017-05-01,44461.12999999973
2017-06-01,51762.90999999974
2017-07-01,52311.45999999974
2017-08-01,56155.4999999999
2017-09-01,55403.649999999776
2017-10-01,73642.63999999975


Databricks visualization. Run in Databricks to view.

### Monthly Sales Performance of each year

In [None]:
%sql
SELECT 
    year_id,
    month_id,
    SUM(sales) AS total_sales
FROM sales_view
GROUP BY year_id, month_id HAVING total_sales != 0 
ORDER BY year_id, month_id;


year_id,month_id,total_sales
2017,1,26260.0
2017,2,24234.0
2017,3,29882.0
2017,4,22028.0
2017,5,17304.0
2017,6,19388.0
2017,7,19156.0
2017,8,21538.0
2017,9,20472.0
2017,10,23623.0


Databricks visualization. Run in Databricks to view.

### Sales Performance for each Day of Week

In [None]:
%sql
SELECT 
    day_of_week,year_id,
    SUM(sales) AS total_sales
FROM 
    sales_view
GROUP BY 
    day_of_week,year_id
ORDER BY 
    day_of_week,year_id;


day_of_week,year_id,total_sales
1,2017,46362.0
1,2018,49911.0
1,2019,33711.296
2,2017,34981.0
2,2018,43385.0
2,2019,29317.898
3,2017,35296.0
3,2018,40805.0
3,2019,27339.702
4,2017,34446.0


Databricks visualization. Run in Databricks to view.

### Most Effective Promotion Type

In [None]:
%sql
SELECT 
    promo_type_1,
    SUM(sales) AS total_sales
FROM sales_view
GROUP BY promo_type_1
ORDER BY total_sales DESC;


promo_type_1,total_sales
PR14,620901.1799999999
PR10,42165.0
PR03,35079.0
PR05,30921.0
PR06,13118.0
PR12,9723.0
PR11,6838.0
PR07,6239.0
PR17,5242.0
PR01,2536.0


Databricks visualization. Run in Databricks to view.

### Daily Revenue Performance

In [None]:
%sql
WITH DailyRevenue AS (
    SELECT 
        date,
        SUM(revenue) AS total_revenue
    FROM sales_view
    GROUP BY date
)

SELECT 
    date,
    (total_revenue-LEAD(total_revenue, 1, 0) OVER (ORDER BY date DESC))*100/LEAD(total_revenue, 1, 0) OVER (ORDER BY date DESC)
FROM DailyRevenue WHERE total_revenue != 0
ORDER BY date DESC LIMIT 1;




date,"(((total_revenue - lead(total_revenue, 1, 0) OVER (ORDER BY date DESC NULLS LAST ROWS BETWEEN 1 FOLLOWING AND 1 FOLLOWING)) * 100) / lead(total_revenue, 1, 0) OVER (ORDER BY date DESC NULLS LAST ROWS BETWEEN 1 FOLLOWING AND 1 FOLLOWING))"
2019-10-31,-4.200608130935854


Databricks visualization. Run in Databricks to view.

### Quarterly Sales Performance of each year

In [None]:
%sql
SELECT 
    YEAR(date) AS year,
    QUARTER(date) AS quarter,
    SUM(sales) AS total_sales
FROM sales_view
GROUP BY year, quarter
ORDER BY year, quarter;



year,quarter,total_sales
2017,1,80376.0
2017,2,58720.0
2017,3,61166.0
2017,4,66430.0
2018,1,63349.0
2018,2,94364.0
2018,3,81417.0
2018,4,65046.0
2019,1,71758.53199999999
2019,2,65381.898


Databricks visualization. Run in Databricks to view.

### Distribution of stores across city with their store types

In [None]:
%sql
SELECT 
    city_id,
    storetype_id,
    COUNT(DISTINCT store_id) AS total_stores
FROM 
    stores_view
GROUP BY 
    city_id, storetype_id
ORDER BY 
    total_stores DESC 


city_id,storetype_id,total_stores
C014,ST04,17
C014,ST03,13
C022,ST04,11
C022,ST03,11
C031,ST04,6
C031,ST03,5
C024,ST04,4
C008,ST04,3
C030,ST03,3
C005,ST04,3


Databricks visualization. Run in Databricks to view.

### distribution of stores with their size/Area

In [None]:
%sql
SELECT 
    store_size,storetype_id,COUNT(1)
FROM 
    stores_view 
GROUP BY
     store_size,storetype_id

store_size,storetype_id,count(1)
8,ST03,8623
15,ST03,75800
23,ST04,105894
16,ST03,32444
20,ST04,69823
44,ST02,16862
35,ST04,45588
41,ST04,94004
34,ST04,28682
14,ST03,42251


Databricks visualization. Run in Databricks to view.

### Stock Level Information

In [None]:
%sql
SELECT 
    cluster_id,
    storetype_id,
    SUM(sales) AS total_stock
FROM 
    overview
GROUP BY cluster_id,storetype_id


cluster_id,storetype_id,total_stock
cluster_0,ST04,106340.192
cluster_5,ST01,8110.0
cluster_5,ST03,1915.0
cluster_9,ST02,158.0
cluster_8,ST04,2896.0
cluster_5,ST02,433.0
cluster_1,ST03,15403.0
cluster_0,ST02,2576.0
cluster_0,ST01,18541.988
cluster_1,ST04,15514.0


Databricks visualization. Run in Databricks to view.

### Volume of Product sold and Store Area

In [None]:
%sql
SELECT 
    store_size,
    (product_length * product_depth * product_width) AS volume,
    SUM(sales) AS total_sales
FROM 
    overview
GROUP BY 
    store_size, volume

### Sales and Store Type

In [None]:
%sql
SELECT 
    storetype_id,
    SUM(sales) AS total_sales
FROM 
    overview
GROUP BY 
    storetype_id
ORDER BY 
    total_sales DESC;


storetype_id,total_sales
ST04,556175.192
ST03,108036.0
ST01,101409.98799999998
ST02,10020.0


Databricks visualization. Run in Databricks to view.

### Sales And Stock 

In [None]:
%sql
SELECT 
    date,
    AVG(sales) AS total_sales,
    AVG(stock) AS average_stock
FROM 
    sales_view
GROUP BY 
    date
ORDER BY 
    date
