In [1]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window

conf = pyspark.SparkConf()
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/05 11:46:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
### some basic cleaning, exploration, and QA

store_open_close = spark.read.csv('system2_case_study_transaction_data_location_info.csv', header=True, inferSchema=True)
store_sales = spark.read.csv('system2_case_study_transaction_data_monthly_sales_by_locationid.csv', header=True, inferSchema=True)
store_info = spark.read.csv('system2_case_study_webscrape_data_locations.csv', header=True, inferSchema=True)
comp_sales = spark.read.csv('system2_case_study_reported_numbers.csv', header=True, inferSchema=True)

# there are dupes in the store_open_close table
# assuming that we can grab the min and max dates (for this assignment)
store_open_close = store_open_close.groupBy('locationid').agg(F.min(F.col('est_open_date')).alias('est_open_date'),F.max(F.col('est_close_date')).alias('est_close_date'))

# there are dupes in the sales table
# assuming that we can add them together (for this assignment)
store_sales = store_sales.groupBy('locationid','period_start','period_end','label').agg(F.sum(F.col('amount')).alias('amount'))
store_sales = store_sales.withColumn('qt_end', F.date_sub(F.add_months(F.date_trunc('quarter', F.col('period_end')), 3), 1))
store_sales.show()

[Stage 8:>                                                          (0 + 1) / 1]

+----------+------------+----------+---------+------------------+----------+
|locationid|period_start|period_end|    label|            amount|    qt_end|
+----------+------------+----------+---------+------------------+----------+
|      1041|  2019-08-01|2019-08-31|2019-MS08|1337.5012817107845|2019-09-30|
|      1080|  2019-08-01|2019-08-31|2019-MS08|202.40023576342875|2019-09-30|
|       331|  2019-08-01|2019-08-31|2019-MS08| 3784.071590671761|2019-09-30|
|       336|  2019-08-01|2019-08-31|2019-MS08|1168.5250390091855|2019-09-30|
|       474|  2019-08-01|2019-08-31|2019-MS08| 1611.716807033345|2019-09-30|
|       583|  2019-08-01|2019-08-31|2019-MS08| 771.0859404306472|2019-09-30|
|       651|  2019-08-01|2019-08-31|2019-MS08| 3446.550485547101|2019-09-30|
|       754|  2019-08-01|2019-08-31|2019-MS08| 5494.037670844849|2019-09-30|
|       902|  2019-08-01|2019-08-31|2019-MS08| 2908.162819356863|2019-09-30|
|       125|  2019-09-01|2019-09-30|2019-MS09|161.34738341932922|2019-09-30|

                                                                                

In [5]:
# comp sales
comp_sales.show()

+------+--------------------+-------------+------------+
|entity|              metric|period_end_dt|reported_yoy|
+------+--------------------+-------------+------------+
|  PRTY|Brand Comparable ...|      3/31/16|      -1.50%|
|  PRTY|Brand Comparable ...|      6/30/16|       3.80%|
|  PRTY|Brand Comparable ...|      9/30/16|       1.20%|
|  PRTY|Brand Comparable ...|     12/31/16|      -3.50%|
|  PRTY|Brand Comparable ...|      3/31/17|       1.70%|
|  PRTY|Brand Comparable ...|      6/30/17|       0.10%|
|  PRTY|Brand Comparable ...|      9/30/17|      -2.60%|
|  PRTY|Brand Comparable ...|     12/31/17|      -1.40%|
|  PRTY|Brand Comparable ...|      3/31/18|       2.40%|
|  PRTY|Brand Comparable ...|      6/30/18|       0.10%|
|  PRTY|Brand Comparable ...|      9/30/18|      -1.00%|
|  PRTY|Brand Comparable ...|     12/31/18|      -2.90%|
|  PRTY|Brand Comparable ...|      3/31/19|      -1.40%|
|  PRTY|Brand Comparable ...|      6/30/19|      -2.10%|
|  PRTY|Brand Comparable ...|  

In [25]:
# top stores by dollar amount after pandemic
store_sales.where(F.col('period_start') >= '2020-08-01').groupBy('locationid').agg(F.sum('amount').alias('amount')).orderBy(F.col('amount').desc()).show()

+----------+------------------+
|locationid|            amount|
+----------+------------------+
|       410|1810943.4764412066|
|       520| 650131.7460584699|
|       846| 592398.1937818673|
|       117|  542125.375890249|
|       761| 535687.6311212053|
|       754|464942.72885827586|
|       634| 460643.3228629713|
|       710| 443666.2680222966|
|       103| 399132.0899436094|
|       118|387569.39450555603|
|       109| 367129.8510906332|
|       534|353681.77233843994|
|       413| 349595.9001168511|
|       411| 349474.4484120353|
|      5999| 327721.3119189659|
|       320| 318470.6679776114|
|      1441| 312759.7299203606|
|         3| 305194.6331005389|
|       514| 299039.7201772194|
|       601| 298525.9638211416|
+----------+------------------+
only showing top 20 rows



In [49]:
# stores that did not join to a web location
store_sales_agg = store_sales.where(F.col('period_start') >= '2022-01-01').groupBy('locationid').agg(F.sum(F.col('amount')).alias('amount'))

store_sales_agg.select('locationid', 'amount').distinct().join(store_info.withColumn('locationid', F.col('location_id')), 'locationid', 'left').where(F.col('store_services').isNull()).orderBy(F.col('amount').desc()).show(50)

+----------+------------------+-------+----------+----+-----+-----+-----------+--------------+-----+----+-------+--------+--------+---------+
|locationid|            amount|address|as_of_date|city|phone|state|location_id|store_services|title| url|zipcode|postcode|latitude|longitude|
+----------+------------------+-------+----------+----+-----+-----+-----------+--------------+-----+----+-------+--------+--------+---------+
|      5999| 327721.3119189659|   NULL|      NULL|NULL| NULL| NULL|       NULL|          NULL| NULL|NULL|   NULL|    NULL|    NULL|     NULL|
|      1441| 312759.7299203606|   NULL|      NULL|NULL| NULL| NULL|       NULL|          NULL| NULL|NULL|   NULL|    NULL|    NULL|     NULL|
|       110|183206.33633870876|   NULL|      NULL|NULL| NULL| NULL|       NULL|          NULL| NULL|NULL|   NULL|    NULL|    NULL|     NULL|
|       606| 167599.8570311895|   NULL|      NULL|NULL| NULL| NULL|       NULL|          NULL| NULL|NULL|   NULL|    NULL|    NULL|     NULL|
|     

In [98]:
# web location that did not join to sales
store_info.withColumn('locationid', F.col('location_id')) .join(store_sales.select('locationid').distinct(), 'locationid', 'anti').select('location_id','state', 'store_services').show(200, truncate = False)

+-----------+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
|location_id|state         |store_services                                                                                                                                 |
+-----------+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------+
|1138       |Minnesota     |In-Store Shopping | In-Store Pickup | Curbside Pickup | Scheduled or Same Day Delivery | Balloon Delivery | Helium                             |
|1115       |Missouri      |In-Store Shopping | Next Gen | In-Store Pickup | Curbside Pickup | Scheduled or Same Day Delivery | Balloon Delivery | Helium                  |
|1020       |New Jersey    |Flag Ship Store | In-Store Shopping | Next Gen | In-Store Pickup | Curbside Pickup | Scheduled or Same Day 

In [97]:
# web location that did not join to sales by state 

(
store_info
 .withColumn('locationid', F.col('location_id'))
    .join(store_sales.select('locationid').distinct(), 'locationid', 'anti')
    .groupBy('state')
    .agg(
        F.count_distinct('location_id').alias('store_ct')
    )
 .show(50, truncate = False)
)

+--------------+--------+
|state         |store_ct|
+--------------+--------+
|Minnesota     |6       |
|Ohio          |1       |
|Oregon        |1       |
|Arkansas      |3       |
|Texas         |14      |
|North Dakota  |4       |
|Pennsylvania  |1       |
|Nebraska      |2       |
|Puerto Rico   |5       |
|Washington    |1       |
|Illinois      |1       |
|Oklahoma      |2       |
|Delaware      |1       |
|Missouri      |7       |
|Georgia       |1       |
|Virginia      |8       |
|North Carolina|5       |
|New Jersey    |1       |
|Alabama       |1       |
|Arizona       |1       |
|Iowa          |1       |
|Massachusetts |3       |
|Louisiana     |1       |
|Tennessee     |6       |
|New Hampshire |2       |
|Florida       |3       |
|South Carolina|2       |
|California    |19      |
|New York      |12      |
+--------------+--------+



In [88]:
# worst performing new york 2022 yoy

store_sales_agg = (
    store_sales
    .where(F.col('period_start') >= '2021-01-01')
    .groupBy('locationid')
    .agg(
        F.sum(F.when((F.col('period_start') >= '2021-01-01') 
               & (F.col('period_start') < '2022-01-01'), F.col('amount') ).otherwise(0)).alias('2021_spend'),
        F.sum(F.when((F.col('period_start') >= '2022-01-01') 
               & (F.col('period_start') < '2023-01-01'), F.col('amount') ).otherwise(0)).alias('2022_spend')
    
    )
    .withColumn('yoy',F.round((F.col('2022_spend') - F.col('2021_spend'))/F.col('2021_spend'), 4))

)

(
store_info
 .withColumn('locationid', F.col('location_id'))
 .join(store_sales_agg, 'locationid', 'inner')
    .orderBy(F.col('yoy'))
.select('state', 'city', 'yoy')
    .where(F.col('state') == 'New York' )
 .show(10, truncate = False)
)

+--------+-------------+-------+
|state   |city         |yoy    |
+--------+-------------+-------+
|New York|Brooklyn     |-0.5073|
|New York|Brooklyn     |-0.4267|
|New York|Schenectady  |-0.4041|
|New York|Bronx        |-0.3777|
|New York|Staten Island|-0.3517|
|New York|Patchogue    |-0.3448|
|New York|Bronx        |-0.3423|
|New York|Brooklyn     |-0.3408|
|New York|Rosedale     |-0.327 |
|New York|New Hartford |-0.2781|
+--------+-------------+-------+
only showing top 10 rows



In [87]:
# worst performing every state 2022 yoy
store_sales_agg = (
    store_sales
    .where(F.col('period_start') >= '2021-01-01')
    .groupBy('locationid')
    .agg(
        F.sum(F.when((F.col('period_start') >= '2021-01-01') 
               & (F.col('period_start') < '2022-01-01'), F.col('amount') ).otherwise(0)).alias('2021_spend'),
        F.sum(F.when((F.col('period_start') >= '2022-01-01') 
               & (F.col('period_start') < '2023-01-01'), F.col('amount') ).otherwise(0)).alias('2022_spend')
    
    )
    .withColumn('2022_yoy_change',(F.round((F.col('2022_spend') - F.col('2021_spend'))/F.col('2021_spend'), 4)))

)

(
store_info
 .withColumn('locationid', F.col('location_id'))
 .join(store_sales_agg, 'locationid', 'inner')
    .orderBy(F.col('2022_yoy_change'))
.select('state', 'city', '2022_yoy_change')
    # .where(F.col('state') == 'California' )
 .show(50, truncate = False)
)

+------------+----------------+---------------+
|state       |city            |2022_yoy_change|
+------------+----------------+---------------+
|Maryland    |Pasadena        |NULL           |
|New Jersey  |South Plainfield|-0.7736        |
|California  |Rialto          |-0.7357        |
|California  |Indio           |-0.6306        |
|Arkansas    |Little Rock     |-0.6078        |
|California  |Santa Rosa      |-0.5997        |
|California  |Lancaster       |-0.5941        |
|Michigan    |Roseville       |-0.5903        |
|Nevada      |Las Vegas       |-0.5879        |
|California  |Riverside       |-0.5724        |
|California  |Anaheim         |-0.5683        |
|California  |Downey          |-0.5615        |
|California  |Santa Clarita   |-0.5558        |
|California  |West Covina     |-0.5545        |
|Illinois    |Berwyn          |-0.5507        |
|Texas       |Brownsville     |-0.5444        |
|Illinois    |Chicago         |-0.5358        |
|Texas       |Carrollton      |-0.533   

In [89]:
# worst performing states

store_sales_agg = (
    store_sales
    .where(F.col('period_start') >= '2021-01-01')


)

(
store_info
 .withColumn('locationid', F.col('location_id'))
 .join(store_sales_agg, 'locationid', 'inner')
    
    .groupBy('state')
    .agg(
        F.sum(F.when((F.col('period_start') >= '2021-01-01') 
               & (F.col('period_start') < '2022-01-01'), F.col('amount') ).otherwise(0)).alias('2021_spend'),
        F.sum(F.when((F.col('period_start') >= '2022-01-01') 
               & (F.col('period_start') < '2023-01-01'), F.col('amount') ).otherwise(0)).alias('2022_spend'),
        F.count_distinct('location_id').alias('store_ct')
    
    )
    .withColumn('2022_yoy_change',F.round((F.col('2022_spend') - F.col('2021_spend'))/F.col('2021_spend'), 4))
.select('state', '2022_yoy_change', 'store_ct')
    # .where(F.col('state') == 'California' )
    .orderBy(F.col('2022_yoy_change'))
 .show(50, truncate = False)
)

+--------------+---------------+--------+
|state         |2022_yoy_change|store_ct|
+--------------+---------------+--------+
|Arkansas      |-0.6078        |1       |
|Hawaii        |-0.4799        |2       |
|New Jersey    |-0.3937        |25      |
|Nevada        |-0.3837        |6       |
|Michigan      |-0.3581        |21      |
|California    |-0.2964        |69      |
|Colorado      |-0.2011        |13      |
|Illinois      |-0.1973        |38      |
|Florida       |-0.1094        |60      |
|Indiana       |-0.1025        |17      |
|New Mexico    |-0.0926        |3       |
|Rhode Island  |-0.066         |2       |
|Georgia       |-0.0582        |26      |
|Missouri      |-0.0476        |8       |
|Kentucky      |-0.0399        |8       |
|New Hampshire |-0.0192        |1       |
|Virginia      |-0.0096        |13      |
|New York      |0.0107         |37      |
|Oregon        |0.0184         |1       |
|Oklahoma      |0.0411         |5       |
|Pennsylvania  |0.0413         |25

In [83]:
# worst performing group of servies

store_sales_agg = (
    store_sales
    .where(F.col('period_start') >= '2021-01-01')


)

(
store_info
 .withColumn('locationid', F.col('location_id'))
 .join(store_sales_agg, 'locationid', 'inner')
    
    .groupBy('store_services')
    .agg(
        F.sum(F.when((F.col('period_start') >= '2021-01-01') 
               & (F.col('period_start') < '2022-01-01'), F.col('amount') ).otherwise(0)).alias('2021_spend'),
        F.sum(F.when((F.col('period_start') >= '2022-01-01') 
               & (F.col('period_start') < '2023-01-01'), F.col('amount') ).otherwise(0)).alias('2022_spend'),
        F.count_distinct('location_id').alias('store_ct')
    
    )
    .withColumn('2022_yoy_change',F.round((F.col('2022_spend') - F.col('2021_spend'))/F.col('2021_spend'), 4))
.select('store_services', '2022_yoy_change', 'store_ct')
    # .where(F.col('state') == 'California' )
    .orderBy(F.col('2022_yoy_change'))
 .show(50, truncate = False)
)

+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------+--------+
|store_services                                                                                                                                 |2022_yoy_change     |store_ct|
+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------+--------+
|In-Store Shopping | Next Gen | Helium                                                                                                          |-0.5878762814390801 |1       |
|In-Store Shopping                                                                                                                              |-0.43166200141841643|4       |
|In-Store Shopping | Next Gen | In-Store Pickup | Curbside Pickup | Scheduled or Same Day Delivery | Balloon Delivery | 

In [76]:
# worst performing services in general

store_sales_agg = (
    store_sales
    .where(F.col('period_start') >= '2021-01-01')


)

for service in ['Flag Ship Store', 
                'In-Store Shopping', 
                'Next Gen', 
                'In-Store Pickup', 
                'Curbside Pickup', 
                'Scheduled or Same Day Delivery', 
                'Balloon Delivery', 'Helium']:
    (
    store_info
     .withColumn('locationid', F.col('location_id'))
     .join(store_sales_agg, 'locationid', 'inner')
        .withColumn('store_services', F.when(F.col('store_services').rlike(service), service).otherwise(f'No {service}'))
        .groupBy('store_services')
        .agg(
            F.sum(F.when((F.col('period_start') >= '2021-01-01') 
                   & (F.col('period_start') < '2022-01-01'), F.col('amount') ).otherwise(0)).alias('2021_spend'),
            F.sum(F.when((F.col('period_start') >= '2022-01-01') 
                   & (F.col('period_start') < '2023-01-01'), F.col('amount') ).otherwise(0)).alias('2022_spend'),
            F.count_distinct('location_id').alias('store_ct')
        
        )
        .withColumn('2022_yoy_change',F.round((F.col('2022_spend') - F.col('2021_spend'))/F.col('2021_spend'), 4))
    .select('store_services', '2022_yoy_change', 'store_ct')
        # .where(F.col('state') == 'California' )
        .orderBy(F.col('2022_yoy_change'))
     .show(50, truncate = False)
    )

+------------------+---------------+--------+
|store_services    |2022_yoy_change|store_ct|
+------------------+---------------+--------+
|No Flag Ship Store|-0.0555        |607     |
|Flag Ship Store   |0.0581         |25      |
+------------------+---------------+--------+

+-----------------+---------------+--------+
|store_services   |2022_yoy_change|store_ct|
+-----------------+---------------+--------+
|In-Store Shopping|-0.0516        |632     |
+-----------------+---------------+--------+

+--------------+---------------+--------+
|store_services|2022_yoy_change|store_ct|
+--------------+---------------+--------+
|Next Gen      |-0.0556        |33      |
|No Next Gen   |-0.0514        |599     |
+--------------+---------------+--------+

+------------------+---------------+--------+
|store_services    |2022_yoy_change|store_ct|
+------------------+---------------+--------+
|No In-Store Pickup|-0.0527        |33      |
|In-Store Pickup   |-0.0516        |599     |
+-------------

In [53]:
# best performing store's services

store_sales_agg = (
    store_sales
    .where(F.col('period_start') >= '2021-01-01')
    .groupBy('locationid')
    .agg(
        F.sum(F.when((F.col('period_start') >= '2021-01-01') 
               & (F.col('period_start') < '2022-01-01'), F.col('amount') ).otherwise(0)).alias('2021_spend'),
        F.sum(F.when((F.col('period_start') >= '2022-01-01') 
               & (F.col('period_start') < '2023-01-01'), F.col('amount') ).otherwise(0)).alias('2022_spend')
    
    )
    .withColumn('yoy',(F.col('2022_spend') - F.col('2021_spend'))/F.col('2021_spend'))

)

(
store_info
 .withColumn('locationid', F.col('location_id'))
 .join(store_sales_agg, 'locationid', 'inner')
    .orderBy(F.col('yoy').desc())
.select('state', 'store_services', 'yoy')
 .show(50, truncate = False)
)

+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|state        |store_services                                                                                                                                 |yoy               |
+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|Mississippi  |In-Store Shopping | Helium                                                                                                                     |19.149014757508375|
|Mississippi  |In-Store Shopping | In-Store Pickup | Curbside Pickup | Scheduled or Same Day Delivery | Helium                                                |16.933944281408696|
|Texas        |In-Store Shopping | In-Store Pickup | Curbside Pickup | Scheduled or Same Day Delivery | B

In [30]:
store_sales.where(F.col('locationid') == '1020').show()

+----------+------------+----------+-----+------+------+
|locationid|period_start|period_end|label|amount|qt_end|
+----------+------------+----------+-----+------+------+
+----------+------------+----------+-----+------+------+



In [26]:
# I believe this the online store
store_open_close.where(F.col('locationid') == '1002').show()

+----------+-------------------+-------------------+
|locationid|      est_open_date|     est_close_date|
+----------+-------------------+-------------------+
|      1002|2018-07-01 00:00:00|2024-09-01 00:00:00|
+----------+-------------------+-------------------+



In [30]:
# every store has a close and end date
store_sales.select('locationid').distinct().join(store_open_close, 'locationid', 'left').where(F.col('est_open_date').isNull()).show()

+----------+-------------+--------------+
|locationid|est_open_date|est_close_date|
+----------+-------------+--------------+
+----------+-------------+--------------+



In [37]:
# any non US stores? nope

states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
    "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho",
    "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
    "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
    "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota",
    "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia",
    "Washington", "West Virginia", "Wisconsin", "Wyoming"
]

store_info.where(~F.col('state').isin(states)).show()

+--------------------+----------+--------------------+--------------+-----------+-----------+--------------------+--------------------+--------------------+-------+--------+----------+------------------+
|             address|as_of_date|                city|         phone|      state|location_id|      store_services|               title|                 url|zipcode|postcode|  latitude|         longitude|
+--------------------+----------+--------------------+--------------+-----------+-----------+--------------------+--------------------+--------------------+-------+--------+----------+------------------+
|22 González Guist...|2024-10-07|            Guaynabo|(787) 781-3605|Puerto Rico|         49|In-Store Shopping...|San Patricio Village|https://stores.pa...|    968|     968| 18.406803|       -66.1053842|
|  RD #2, Bldg #2-828|2024-10-07|Bldg #2-828, Maya...|(787) 831-5121|Puerto Rico|        138|In-Store Shopping...|Villa Capitán Bld...|https://stores.pa...|    680|     680| 18.200345|

In [23]:
# looking at some timeseries going downwards 
store_sales.groupBy('period_start','period_end','label').agg(F.count('*')).where(F.col('period_start') >= '2022-01-01').orderBy('period_start').show(300)

+------------+----------+---------+--------+
|period_start|period_end|    label|count(1)|
+------------+----------+---------+--------+
|  2022-01-01|2022-01-31|2022-MS01|     744|
|  2022-02-01|2022-02-28|2022-MS02|     756|
|  2022-03-01|2022-03-31|2022-MS03|     764|
|  2022-04-01|2022-04-30|2022-MS04|     761|
|  2022-05-01|2022-05-31|2022-MS05|     762|
|  2022-06-01|2022-06-30|2022-MS06|     763|
|  2022-07-01|2022-07-31|2022-MS07|     762|
|  2022-08-01|2022-08-31|2022-MS08|     767|
|  2022-09-01|2022-09-30|2022-MS09|     769|
|  2022-10-01|2022-10-31|2022-MS10|     800|
|  2022-11-01|2022-11-30|2022-MS11|     738|
|  2022-12-01|2022-12-31|2022-MS12|     759|
|  2023-01-01|2023-01-31|2023-MS01|     720|
|  2023-02-01|2023-02-28|2023-MS02|     748|
|  2023-03-01|2023-03-31|2023-MS03|     739|
|  2023-04-01|2023-04-30|2023-MS04|     733|
|  2023-05-01|2023-05-31|2023-MS05|     729|
|  2023-06-01|2023-06-30|2023-MS06|     715|
|  2023-07-01|2023-07-31|2023-MS07|     699|
|  2023-08

In [27]:
store_sales.where(( F.col('locationid') == 520)).orderBy('period_start').show(300)

+----------+------------+----------+---------+------------------+----------+
|locationid|period_start|period_end|    label|            amount|    qt_end|
+----------+------------+----------+---------+------------------+----------+
|       520|  2019-08-01|2019-08-31|2019-MS08|13762.204956521222|2019-09-30|
|       520|  2019-09-01|2019-09-30|2019-MS09|17568.949302166387|2019-09-30|
|       520|  2019-10-01|2019-10-31|2019-MS10| 42124.30597344486|2019-12-31|
|       520|  2019-11-01|2019-11-30|2019-MS11|16947.354632566363|2019-12-31|
|       520|  2019-12-01|2019-12-31|2019-MS12| 15097.94879155212|2019-12-31|
|       520|  2020-01-01|2020-01-31|2020-MS01|14818.486750882654|2020-03-31|
|       520|  2020-02-01|2020-02-29|2020-MS02|14696.926720930622|2020-03-31|
|       520|  2020-03-01|2020-03-31|2020-MS03| 9439.515307622163|2020-03-31|
|       520|  2020-05-01|2020-05-31|2020-MS05| 6230.144739798236|2020-06-30|
|       520|  2020-06-01|2020-06-30|2020-MS06|  26508.2283590802|2020-06-30|

In [21]:
store_sales.where(( F.col('locationid') == 1002)).orderBy('period_start').toPandas().to_csv('1002_online.csv')

In [22]:
store_sales.where((F.col('period_start') == '2020-04-01')).orderBy(F.col('amount').desc()).show(100)

+----------+------------+----------+---------+------------------+----------+
|locationid|period_start|period_end|    label|            amount|    qt_end|
+----------+------------+----------+---------+------------------+----------+
|      1002|  2020-04-01|2020-04-30|2020-MS04|129055.45267063857|2020-06-30|
|       579|  2020-04-01|2020-04-30|2020-MS04|1711.2043577651048|2020-06-30|
|       363|  2020-04-01|2020-04-30|2020-MS04|1278.8285292146238|2020-06-30|
|       410|  2020-04-01|2020-04-30|2020-MS04|1116.9570249283904|2020-06-30|
|        17|  2020-04-01|2020-04-30|2020-MS04|  980.267392939502|2020-06-30|
|       351|  2020-04-01|2020-04-30|2020-MS04|  784.636004623168|2020-06-30|
|       254|  2020-04-01|2020-04-30|2020-MS04| 722.7976337925091|2020-06-30|
|       344|  2020-04-01|2020-04-30|2020-MS04| 669.1305107219584|2020-06-30|
|       175|  2020-04-01|2020-04-30|2020-MS04| 590.7904275232106|2020-06-30|
|       973|  2020-04-01|2020-04-30|2020-MS04| 526.8724402098672|2020-06-30|

In [36]:
store_sales.groupBy('locationid','period_start','period_end','label').agg(F.count('*').alias('count')).where(F.col('count') >= 1).orderBy(F.col('count').desc()).show(300)

+----------+------------+----------+---------+-----+
|locationid|period_start|period_end|    label|count|
+----------+------------+----------+---------+-----+
|      1041|  2019-08-01|2019-08-31|2019-MS08|    1|
|      1080|  2019-08-01|2019-08-31|2019-MS08|    1|
|       331|  2019-08-01|2019-08-31|2019-MS08|    1|
|       336|  2019-08-01|2019-08-31|2019-MS08|    1|
|       474|  2019-08-01|2019-08-31|2019-MS08|    1|
|       583|  2019-08-01|2019-08-31|2019-MS08|    1|
|       651|  2019-08-01|2019-08-31|2019-MS08|    1|
|       754|  2019-08-01|2019-08-31|2019-MS08|    1|
|       902|  2019-08-01|2019-08-31|2019-MS08|    1|
|       125|  2019-09-01|2019-09-30|2019-MS09|    1|
|       364|  2019-09-01|2019-09-30|2019-MS09|    1|
|       223|  2019-09-01|2019-09-30|2019-MS09|    1|
|      1150|  2019-11-01|2019-11-30|2019-MS11|    1|
|       422|  2019-11-01|2019-11-30|2019-MS11|    1|
|       555|  2019-11-01|2019-11-30|2019-MS11|    1|
|       621|  2019-11-01|2019-11-30|2019-MS11|

In [144]:
store_sales.groupBy('qt_end').agg(F.sum('amount').alias('amount'),F.count_distinct('locationid').alias('store_ct')).orderBy('qt_end').show(100)

+----------+------------------+--------+
|    qt_end|            amount|store_ct|
+----------+------------------+--------+
|2019-09-30|2601353.0933812144|     770|
|2019-12-31| 5968705.615264985|     779|
|2020-03-31|2934950.1018679305|     770|
|2020-06-30|2276428.8576691262|     741|
|2020-09-30|4115140.8896111823|     774|
|2020-12-31| 5567378.338841764|     766|
|2021-03-31|3753022.1131896004|     761|
|2021-06-30| 5115723.719401804|     779|
|2021-09-30| 4496018.231040375|     780|
|2021-12-31| 6414808.341609914|     773|
|2022-03-31| 3892243.701251664|     771|
|2022-06-30|4673587.1481143795|     796|
|2022-09-30| 4146296.521482139|     798|
|2022-12-31| 5927458.843006935|     794|
|2023-03-31|4066617.2181202797|     774|
|2023-06-30|4077826.9274084843|     766|
|2023-09-30| 3432942.175473987|     735|
|2023-12-31| 4686094.578831631|     744|
+----------+------------------+--------+



In [119]:
# comp store calculations

store_sales_w_fp=(
    store_sales
    .where(F.col('qt_end')>= '2019-12-31')
     .groupBy('qt_end','locationid')
 .agg(F.sum('amount').alias('amount'))
 .join(store_open_close, 'locationid', 'left' )
)

comp_yoy = (store_sales_w_fp.alias('current')
    .where(# need 13 months to count as a comp
        (F.add_months(F.col('current.est_open_date'),13) <= F.col('current.qt_end'))
    )
     .join(store_sales_w_fp.alias('ly'), 
           # join exactly a year ago and by store
           (F.add_months(F.col('ly.qt_end'),12) == F.col('current.qt_end')) & 
           (F.col('ly.locationid') == F.col('current.locationid')) 
           ,
           'inner')
    .select('current.*', F.col('ly.amount').alias('ly_amount'), F.col('ly.qt_end').alias('ly_qt_end'))
      .groupBy('qt_end')
          .agg(
              F.sum('amount').alias('ty_amount'), 
              F.sum('ly_amount').alias('ly_amount'),
              F.count_distinct('locationid').alias('store_ct'),
               
                )
     .withColumn('yoy_change',(F.col('ty_amount')-F.col('ly_amount'))/F.col('ly_amount'))
    .orderBy('qt_end')
)

comp_yoy.show(100)

+----------+------------------+------------------+--------+--------------------+
|    qt_end|         ty_amount|         ly_amount|store_ct|          yoy_change|
+----------+------------------+------------------+--------+--------------------+
|2020-12-31| 5547010.950353009| 5861563.863975732|     752|-0.05366365033671589|
|2021-03-31|   3731197.5049473|2890111.5831117462|     740|  0.2910219545675698|
|2021-06-30| 5009076.694535113|2235273.6773891114|     721|  1.2409232234980345|
|2021-09-30| 4446626.898198184| 4063062.204639172|     750| 0.09440286026658916|
|2021-12-31| 6330074.937772987| 5466850.967281493|     743| 0.15790150045388018|
|2022-03-31| 3846958.901788982| 3702896.631956586|     744|0.038905290682196114|
|2022-06-30|4546657.4958752105| 5046201.573961108|     754|-0.09899407916314593|
|2022-09-30|  3887253.16254667|4445384.2333673835|     760|-0.12555294245013518|
|2022-12-31| 5527187.867566972| 6312042.202794009|     753|-0.12434237763486802|
|2023-03-31| 3825933.2330489

In [100]:
comp_yoy.toPandas().to_csv('comp_yoy.csv')

In [7]:
# simple yoy calculations

w = Window.partitionBy(F.lit('')).orderBy("qt_end")
last_year_amount = F.lag(F.col("amount"), 4).over(w)
quarterly_sales = (
    store_sales
    .where(F.col('qt_end')>= '2019-12-31')
     .groupBy('qt_end')
     .agg(F.sum('amount').alias('amount'),F.count_distinct('locationid').alias('store_ct'))
     .withColumn('yoy', last_year_amount)
     .withColumn('yoy_change',(F.col('amount') - last_year_amount) / last_year_amount)
     .orderBy('qt_end')
)

quarterly_sales.show(20)

+----------+------------------+--------+------------------+--------------------+
|    qt_end|            amount|store_ct|               yoy|          yoy_change|
+----------+------------------+--------+------------------+--------------------+
|2019-12-31| 5968705.615264985|     779|              NULL|                NULL|
|2020-03-31|2934950.1018679305|     770|              NULL|                NULL|
|2020-06-30|2276428.8576691262|     741|              NULL|                NULL|
|2020-09-30|4115140.8896111823|     774|              NULL|                NULL|
|2020-12-31| 5567378.338841764|     766| 5968705.615264985| -0.0672385777239247|
|2021-03-31|3753022.1131896004|     761|2934950.1018679305| 0.27873455524883134|
|2021-06-30| 5115723.719401804|     779|2276428.8576691262|  1.2472583327905442|
|2021-09-30| 4496018.231040375|     780|4115140.8896111823| 0.09255511576547264|
|2021-12-31| 6414808.341609914|     773| 5567378.338841764|  0.1522134748515131|
|2022-03-31| 3892243.7012516

In [8]:
quarterly_sales.toPandas().to_csv('quarterly_sales.csv')

In [103]:
# monthly calculations

w = Window.partitionBy(F.lit('')).orderBy("period_start")
last_year_amount = F.lag(F.col("amount"), 12).over(w)
monthly_sales = (
    store_sales
    .where(F.col('period_start')>= '2019-12-31')
    .groupBy('period_start')
     .agg(F.sum('amount').alias('amount'),F.count_distinct('locationid').alias('store_ct'))
     .withColumn('yoy', last_year_amount)
     .withColumn('yoy_change',(F.col('amount') - last_year_amount) / last_year_amount)
     .orderBy('period_start')
)

monthly_sales.show(300)

[Stage 926:>                                                        (0 + 1) / 1]

+------------+------------------+--------+------------------+--------------------+
|period_start|            amount|store_ct|               yoy|          yoy_change|
+------------+------------------+--------+------------------+--------------------+
|  2020-01-01|1104449.4093486872|     738|              NULL|                NULL|
|  2020-02-01| 1126917.447651824|     754|              NULL|                NULL|
|  2020-03-01| 703583.2448674168|     706|              NULL|                NULL|
|  2020-04-01|144698.39209704567|      68|              NULL|                NULL|
|  2020-05-01| 742517.5725841038|     505|              NULL|                NULL|
|  2020-06-01|1389212.8929879726|     734|              NULL|                NULL|
|  2020-07-01|1353286.8233231446|     730|              NULL|                NULL|
|  2020-08-01|1349112.9344967334|     736|              NULL|                NULL|
|  2020-09-01|1412741.1317913022|     728|              NULL|                NULL|
|  2

                                                                                

In [94]:
monthly_sales.toPandas().to_csv('monthly_sales.csv')