In [1]:
import os
import warnings

import matplotlib.pylab as plt
import matplotlib.ticker as tick
import pandas as pd
import plotly.express as px
import seaborn as sns
from sqlalchemy.engine import create_engine

from util.reformat_large_tick_values import reformat_large_tick_values as ticker
warnings.filterwarnings('ignore', message='Cannot create BigQuery Storage client*')

In [2]:
# Build the SQLAlchemy engine used by every query cell in this notebook.
# The service-account key location can be overridden with the standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable, so the notebook is not
# tied to one checkout's key file; when the variable is unset or empty, the
# original relative path is used unchanged.
BIGQUERY_KEY_PATH = (
    os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
    or 'key/iowa-liquor-sales-365322-12ff7a5498e6.json'
)
engine = create_engine('bigquery://', credentials_path=BIGQUERY_KEY_PATH)

In [None]:
# Per-sale liters for ten selected counties, restricted to sales dated 2018-2023.
# Inside the query, a row with a NULL county takes the county recorded for the
# same store_number (MAX(county) per store) before the county filter is applied.
# NOTE(review): LIMIT is used without ORDER BY, so the query itself does not pin
# which 1,000,000 rows are returned -- confirm an arbitrary subset is acceptable.
county_liters_sold_sql_statement = """
WITH 
source_table AS (
  SELECT
    volume_sold_liters,
    date,
    store_number,
    county,
    EXTRACT(YEAR FROM date) AS year
  FROM `bigquery-public-data.iowa_liquor_sales.sales`
  WHERE EXTRACT(YEAR FROM date) IN (2018, 2019, 2020, 2021, 2022, 2023)
  ),

store_county_map AS ( 
  SELECT 
    store_number,
    MAX(county) AS county
  FROM source_table
  WHERE store_number IS NOT NULL
  GROUP BY store_number
  ),

joined_table AS (
  SELECT a.volume_sold_liters, a.date, a.year, COALESCE(a.county, b.county) AS county
  FROM source_table a
  LEFT JOIN store_county_map b ON a.store_number = b.store_number
  ),

cleaned_source_table AS (
  SELECT 
    volume_sold_liters,
    date,
    year,
    county
  FROM joined_table
  WHERE county IN ("POLK", "LINN", "SCOTT", "JOHNSON", "BLACK HAWK", "WOODBURY", "DUBUQUE", "STORY", "DALLAS", "POTTAWATTAMIE")
  )

SELECT 
    county,
    volume_sold_liters
FROM cleaned_source_table
LIMIT 1000000

  """.strip()

# Run the query, then title-case the county names (e.g. "BLACK HAWK" -> "Black Hawk").
liters_per_county_df = (
    pd.read_sql_query(county_liters_sold_sql_statement, engine)
    .assign(county=lambda sales: sales['county'].str.title())
)
liters_per_county_df

In [None]:
# Exploratory pull: every column of the sales table, sampled with
# TABLESAMPLE SYSTEM (50 PERCENT).
# NOTE(review): the sample is presumably different on each run -- confirm
# against the BigQuery table-sampling documentation. The later 10 PERCENT
# cells rebind both names assigned here.
undersampled_sql_statement = """
SELECT * 
FROM `bigquery-public-data.iowa_liquor_sales.sales`
TABLESAMPLE SYSTEM (50 PERCENT)
""".strip()
undersampled_df = pd.read_sql_query(sql=undersampled_sql_statement, con=engine)
undersampled_df

In [None]:
# Exploratory pull: every column of the sales table, sampled with
# TABLESAMPLE SYSTEM (10 PERCENT).
# NOTE(review): the sample is presumably different on each run -- confirm
# against the BigQuery table-sampling documentation.
undersampled_sql_statement = """
SELECT * 
FROM `bigquery-public-data.iowa_liquor_sales.sales`
TABLESAMPLE SYSTEM (10 PERCENT)
""".strip()
undersampled_df = pd.read_sql_query(sql=undersampled_sql_statement, con=engine)
undersampled_df

In [None]:
# Sampled pull limited to the columns used for the county analysis:
# TABLESAMPLE SYSTEM (10 PERCENT) of the sales table, keeping only sales dated
# 2018-2023 in the ten selected counties, plus a derived `year` column.
# NOTE(review): the sample is presumably different on each run -- confirm
# against the BigQuery table-sampling documentation.
undersampled_sql_statement = """  
SELECT
    volume_sold_liters,
    date,
    store_number,
    county,
    EXTRACT(YEAR FROM date) AS year
  FROM `bigquery-public-data.iowa_liquor_sales.sales`
  TABLESAMPLE SYSTEM (10 PERCENT)
  WHERE EXTRACT(YEAR FROM date) IN (2018, 2019, 2020, 2021, 2022, 2023) AND county IN ("POLK", "LINN", "SCOTT", "JOHNSON", "BLACK HAWK", "WOODBURY", "DUBUQUE", "STORY", "DALLAS", "POTTAWATTAMIE")
  """.strip()
undersampled_df = pd.read_sql_query(sql=undersampled_sql_statement, con=engine)
# Preview only the first 40 rows.
undersampled_df.head(40)