In [1]:
import os
import warnings

import matplotlib.pylab as plt
import matplotlib.ticker as tick
import pandas as pd
import plotly.express as px
import seaborn as sns
from sqlalchemy.engine import create_engine

from util.reformat_large_tick_values import reformat_large_tick_values as ticker

warnings.filterwarnings('ignore', message='Cannot create BigQuery Storage client*')

In [2]:
# Location of the service-account key file handed to the BigQuery engine.
# BIGQUERY_CREDENTIALS_PATH, when set in the environment, overrides the path so the
# notebook is not tied to one checkout layout; when unset, the original key path is used.
BIGQUERY_CREDENTIALS_PATH = os.environ.get(
    'BIGQUERY_CREDENTIALS_PATH',
    'key/iowa-liquor-sales-365322-12ff7a5498e6.json',
)

# SQLAlchemy engine shared by every pd.read_sql_query call in this notebook.
engine = create_engine('bigquery://', credentials_path=BIGQUERY_CREDENTIALS_PATH)

In [None]:
# One row per sale (county, volume_sold_liters) for the ten listed counties, 2018-2023.
# Inside the query, a row whose county is NULL takes the county found for the same
# store_number elsewhere in the window (MAX(county) per store) before the county filter.
county_liters_sold_sql_statement = """
WITH 
source_table AS (
  SELECT
    volume_sold_liters,
    date,
    store_number,
    county,
    EXTRACT(YEAR FROM date) AS year
  FROM `bigquery-public-data.iowa_liquor_sales.sales`
  WHERE EXTRACT(YEAR FROM date) IN (2018, 2019, 2020, 2021, 2022, 2023)
  ),

store_county_map AS ( 
  SELECT 
    store_number,
    MAX(county) AS county
  FROM source_table
  WHERE store_number IS NOT NULL
  GROUP BY store_number
  ),

joined_table AS (
  SELECT a.volume_sold_liters, a.date, a.year, COALESCE(a.county, b.county) AS county
  FROM source_table a
  LEFT JOIN store_county_map b ON a.store_number = b.store_number
  ),

cleaned_source_table AS (
  SELECT 
    volume_sold_liters,
    date,
    year,
    county
  FROM joined_table
  WHERE county IN ("POLK", "LINN", "SCOTT", "JOHNSON", "BLACK HAWK", "WOODBURY", "DUBUQUE", "STORY", "DALLAS", "POTTAWATTAMIE")
  )

SELECT 
    county,
    volume_sold_liters
FROM cleaned_source_table


  """.strip()

# Run the query, then convert the upper-case county names to title case.
liters_per_county_df = pd.read_sql_query(
    sql=county_liters_sold_sql_statement,
    con=engine,
).assign(county=lambda frame: frame['county'].str.title())
liters_per_county_df

In [13]:
# Count the source rows matching the 2018-2023 / ten-county filter.
row_count_sql_statement = """
SELECT count(*)
FROM `bigquery-public-data.iowa_liquor_sales.sales`
WHERE EXTRACT(YEAR FROM date) IN (2018, 2019, 2020, 2021, 2022, 2023) AND county IN ("POLK", "LINN", "SCOTT", "JOHNSON", "BLACK HAWK", "WOODBURY", "DUBUQUE", "STORY", "DALLAS", "POTTAWATTAMIE")
  """.strip()
row_count_sql_df = pd.read_sql_query(
    sql=row_count_sql_statement,
    con=engine,
)
row_count_sql_df

Unnamed: 0,f0_
0,8738982


In [15]:
# Fetch every matching row (no sampling, no LIMIT) into a single DataFrame.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# all_rows_df may not exist in the kernel that produced the outputs further down.
all_rows_sql_statement = """  
SELECT
    volume_sold_liters,
    date,
    store_number,
    county,
    EXTRACT(YEAR FROM date) AS year
  FROM `bigquery-public-data.iowa_liquor_sales.sales`
  WHERE EXTRACT(YEAR FROM date) IN (2018, 2019, 2020, 2021, 2022, 2023) AND county IN ("POLK", "LINN", "SCOTT", "JOHNSON", "BLACK HAWK", "WOODBURY", "DUBUQUE", "STORY", "DALLAS", "POTTAWATTAMIE")
  """.strip()
all_rows_df = pd.read_sql_query(
    sql=all_rows_sql_statement,
    con=engine,
)

KeyboardInterrupt: 

In [7]:
# Same columns and filter as the full pull, but read from a 10 percent table sample.
# NOTE(review): TABLESAMPLE SYSTEM presumably samples storage blocks rather than
# individual rows and takes no seed, so the rows returned may differ per run — confirm.
undersampled_sql_statement = """  
SELECT
    volume_sold_liters,
    date,
    store_number,
    county,
    EXTRACT(YEAR FROM date) AS year
  FROM `bigquery-public-data.iowa_liquor_sales.sales`
  TABLESAMPLE SYSTEM (10 PERCENT)
  WHERE EXTRACT(YEAR FROM date) IN (2018, 2019, 2020, 2021, 2022, 2023) AND county IN ("POLK", "LINN", "SCOTT", "JOHNSON", "BLACK HAWK", "WOODBURY", "DUBUQUE", "STORY", "DALLAS", "POTTAWATTAMIE")
  """.strip()
undersampled_df = pd.read_sql_query(
    sql=undersampled_sql_statement,
    con=engine,
)
# Preview the first 40 sampled rows.
undersampled_df.head(40)

Unnamed: 0,volume_sold_liters,date,store_number,county,year
0,0.05,2020-06-03,2590,LINN,2020
1,0.05,2023-06-19,2647,LINN,2023
2,0.05,2022-11-11,2560,LINN,2022
3,0.05,2023-08-04,2560,LINN,2023
4,0.05,2023-01-03,3666,LINN,2023
5,0.05,2020-11-16,2552,LINN,2020
6,0.05,2022-08-15,5462,LINN,2022
7,0.05,2023-08-07,4530,LINN,2023
8,0.05,2023-05-30,4492,LINN,2023
9,0.05,2022-09-02,2618,LINN,2022


In [8]:
# Show the sampled frame as the cell's last expression instead of print(), so the
# notebook uses its rich table rendering; pandas still truncates long frames and
# reports the overall shape beneath the table.
undersampled_df

        volume_sold_liters        date store_number         county  year
0                     0.05  2020-06-03         2590           LINN  2020
1                     0.05  2023-06-19         2647           LINN  2023
2                     0.05  2022-11-11         2560           LINN  2022
3                     0.05  2023-08-04         2560           LINN  2023
4                     0.05  2023-01-03         3666           LINN  2023
...                    ...         ...          ...            ...   ...
863720              504.00  2023-02-23         3477  POTTAWATTAMIE  2023
863721              540.00  2021-07-21         4312  POTTAWATTAMIE  2021
863722              735.00  2023-07-12         4743  POTTAWATTAMIE  2023
863723              787.50  2019-02-08         3477  POTTAWATTAMIE  2019
863724             2016.00  2022-03-30         3477  POTTAWATTAMIE  2022

[863725 rows x 5 columns]


In [9]:
# Number of sampled rows per county, most frequent county first.
county_counts = undersampled_df.loc[:, 'county'].value_counts()
print(county_counts)

county
POLK             282737
LINN             125416
SCOTT             87616
BLACK HAWK        85346
JOHNSON           73869
POTTAWATTAMIE     48947
WOODBURY          48500
STORY             45293
DUBUQUE           42078
DALLAS            23923
Name: count, dtype: int64


In [27]:
# Number of sampled rows per year, most frequent year first.
year_counts = undersampled_df.loc[:, 'year'].value_counts()
print(year_counts)

year
2023    150153
2020    148531
2021    148409
2022    144226
2019    136589
2018    135817
Name: count, dtype: int64


The sample appears evenly distributed across years, but not across counties. Next step to consider: undersampling each county separately, then combining the per-county results.
