In [6]:
import pandas as pd
import matplotlib.ticker as tick
import matplotlib.pylab as plt
import seaborn as sns

from util.reformat_large_tick_values import reformat_large_tick_values
from sqlalchemy.engine import create_engine
import warnings
import os

# Ignore warnings whose message starts with "Cannot create BigQuery Storage client".
warnings.filterwarnings('ignore', message='Cannot create BigQuery Storage client*')

# Path to the service-account key file used to authenticate against BigQuery.
# Prefer the standard GOOGLE_APPLICATION_CREDENTIALS environment variable so the
# key location is not tied to one checkout; fall back to the original
# repository-relative path when the variable is unset or empty.
BIGQUERY_CREDENTIALS_PATH = (
    os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
    or 'key/iowa-liquor-sales-365322-12ff7a5498e6.json'
)

# SQLAlchemy engine shared by every pd.read_sql_query call in this notebook.
engine = create_engine('bigquery://', credentials_path=BIGQUERY_CREDENTIALS_PATH)

# Introduction:  

This independent exploratory analysis uses public data on liquor sold in Iowa and US census population data to examine sales trends. This analysis focuses on the ten most populous Iowa counties and the time period 2018-2021. 

The ten most populous Iowa counties (out of 99 total counties) comprise 52.6% of the adult (18+) population and represent 63% of the state's total liquor sales (by volume).

## Public Datasets

Iowa Liquor Sales dataset, Iowa Department of Commerce, Alcoholic Beverages Division
`bigquery-public-data.iowa_liquor_sales`  
Data represents sales to Iowa Class "E" liquor licensees. Examples include grocery stores, liquor stores etc. which sell liquor for off-premises consumption. Unaggregated dataset stored in BigQuery.


Population data, United States Census Bureau, https://www.census.gov/quickfacts/fact/table/IA/POP010220
Accurate population numbers are limited to 2020, the year of the US census. In this analysis, population counts and percent minors for 2020 are used for all years.

## Import Census Data
#### Data taken directly from US census website. No additional cleaning performed. 

In [7]:
# Load the census figures (county, population, percent_minors) from the local CSV
# and display the resulting frame as the cell output.
CENSUS_CSV_PATH = 'data/census_data.csv'
census_data_df = pd.read_csv(CENSUS_CSV_PATH)
census_data_df

Unnamed: 0,county,population,percent_minors
0,POLK,492401,0.245
1,LINN,230299,0.228
2,SCOTT,174669,0.235
3,JOHNSON,152854,0.197
4,BLACK HAWK,131144,0.22
5,WOODBURY,105941,0.262
6,DUBUQUE,99266,0.227
7,STORY,98537,0.164
8,DALLAS,99678,0.269
9,POTTAWATTAMIE,93667,0.234


# Cleaning Liquor Sales Data

## Duplicate Check

In [8]:
# Check all rows of the unaggregated dataset for duplicate invoice/item numbers:
# if the distinct count of invoice_and_item_number equals the total row count,
# no identifier appears more than once.
duplicate_check_sql_statement = """
SELECT COUNT(DISTINCT invoice_and_item_number) AS distinct_rows, COUNT(*) AS total_rows
FROM `bigquery-public-data.iowa_liquor_sales.sales`
""".strip()

# Keep the query result under its own name so the SQL-text variable is not
# rebound from a string to a DataFrame, and end the cell with the frame so the
# result is actually displayed.
duplicate_check = pd.read_sql_query(duplicate_check_sql_statement, engine)
duplicate_check


Unnamed: 0,distinct_rows,total_rows
0,29006010,29006010


### The distinct count equals the total row count, so every invoice_and_item_number is unique and the dataset contains no duplicate rows.

In [9]:
# Count the distinct values in the county column across the entire sales table
# (no date filter is applied here). COUNT(DISTINCT ...) skips NULLs, so the
# result is the number of distinct non-NULL county strings.
county_names_check_sql_statement = """SELECT COUNT(DISTINCT county) AS county_count
FROM `bigquery-public-data.iowa_liquor_sales.sales`"""
county_names_check = pd.read_sql_query(county_names_check_sql_statement, engine)
# Display the one-row result.
county_names_check

Unnamed: 0,county_count
0,100


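### The dataset has 100 distinct county values, but Iowa has 99 counties, so the county column contains at least one value that is not a distinct county name (for example, a variant spelling of an existing county).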

## Null check

In [10]:
# Null check on Iowa Liquor Sales dataset column: volume_sold_liters
# Computes the percentage (0-100 scale) of rows with a NULL volume_sold_liters,
# restricted to sales dated 2017 through 2023.
null_check_liters_sold_sql_statement = """
SELECT 
  countif(volume_sold_liters is null) / count(1) * 100 as percent_null_liters_sold
FROM `bigquery-public-data.iowa_liquor_sales.sales` 
WHERE EXTRACT(YEAR from date) IN (2017, 2018, 2019, 2020, 2021, 2022, 2023)
  """.strip()
percent_null_liters_sold = pd.read_sql_query(null_check_liters_sold_sql_statement, engine)
# Display the one-row result.
percent_null_liters_sold

Unnamed: 0,percent_null_liters_sold
0,0.0


In [11]:
# Null check on Iowa Liquor Sales dataset column: county
# Computes the percentage (0-100 scale) of rows with a NULL county,
# restricted to sales dated 2017 through 2023.
null_check_county_sql_statement = """SELECT 
  countif(county is null) / count(1) * 100 as percent_null_county
FROM `bigquery-public-data.iowa_liquor_sales.sales` 
WHERE EXTRACT(YEAR from date) IN (2017, 2018, 2019, 2020, 2021, 2022, 2023)
  """.strip()
percent_null_county = pd.read_sql_query(null_check_county_sql_statement, engine)
# Display the one-row result.
percent_null_county

Unnamed: 0,percent_null_county
0,0.304728


### The store_number column has no nulls. 

### About 0.3% of values in the county column are null. There are 2,299 distinct store numbers, and only two of them are missing a valid county name.

### Strategy: Clean county column by mapping each store number to a valid county name.

In [12]:
# Re-run the county null check after back-filling missing county values from
# other rows of the same store. The query is built from four CTEs:
#   source_table      - the columns needed, for sales dated 2017 through 2023.
#   store_county_map  - one county per store_number, taken as MAX(county).
#                       MAX ignores NULLs, so a store gets a county whenever at
#                       least one of its rows has one; if a store has several
#                       different county strings, the greatest one is chosen.
#   joined_table      - each sale with county replaced by
#                       COALESCE(original county, mapped county).
#   null_check_county - percentage (0-100 scale) of rows still NULL afterwards.
cleaned_null_check_county_sql_statement = """
WITH 
source_table AS (
  SELECT
    volume_sold_liters,
    date,
    EXTRACT(YEAR FROM date) AS year,
    county,
    store_number
  FROM `bigquery-public-data.iowa_liquor_sales.sales`
  WHERE EXTRACT(YEAR FROM date) IN (2017, 2018, 2019, 2020, 2021,2022, 2023)
  ),

store_county_map AS ( 
  SELECT 
    store_number,
  MAX(county) AS county
  FROM source_table
  WHERE store_number IS NOT NULL
  GROUP BY store_number
  ),

joined_table AS (
  SELECT a.volume_sold_liters, a.date, a.year, a.store_number, COALESCE(a.county, b.county) AS county
  FROM source_table a
  LEFT JOIN store_county_map b ON a.store_number = b.store_number
  ),

 null_check_county AS (
   SELECT countif(county is null) / count(1) * 100 as percent_null_county
FROM joined_table)

SELECT * FROM null_check_county
  """.strip()

cleaned_percent_null_county = pd.read_sql_query(cleaned_null_check_county_sql_statement, engine)
# Display the one-row result.
cleaned_percent_null_county

Unnamed: 0,percent_null_county
0,0.006698


### County column nulls reduced from 0.304728% to 0.006698%.

In [14]:
# Null check on Iowa Liquor Sales dataset column: category_name
# (the result is aliased percent_null_liquor_type, but the column inspected
# here is the raw category_name).
# Computes the percentage (0-100 scale) of rows with a NULL category_name,
# restricted to sales dated 2017 through 2023.
null_check_liquor_type_sql_statement = """
SELECT 
  countif(category_name is null) / count(1) * 100 as percent_null_liquor_type
FROM `bigquery-public-data.iowa_liquor_sales.sales` 
WHERE EXTRACT(YEAR from date) IN (2017, 2018, 2019, 2020, 2021, 2022, 2023)
  """.strip()
percent_null_liquor_type = pd.read_sql_query(null_check_liquor_type_sql_statement, engine)
# Display the one-row result.
percent_null_liquor_type

Unnamed: 0,percent_null_liquor_type
0,0.050096


In [18]:
# List every distinct category_name value that appears in sales dated
# 2017 through 2023. SELECT DISTINCT also returns NULL as one of the values
# when any row has a NULL category_name.
distinct_categories_sql_statement = """SELECT DISTINCT category_name 
FROM `bigquery-public-data.iowa_liquor_sales.sales` 
WHERE EXTRACT(YEAR from date) IN (2017, 2018, 2019, 2020, 2021, 2022, 2023)
  """.strip()
distinct_categories = pd.read_sql_query(distinct_categories_sql_statement, engine)
# Display the full list of categories.
distinct_categories

Unnamed: 0,category_name
0,CREAM LIQUEURS
1,AMERICAN CORDIALS & LIQUEURS
2,TENNESSEE WHISKIES
3,CORN WHISKIES
4,AGED DARK RUM
5,AMERICAN WHISKIES
6,BLENDED WHISKIES
7,IMPORTED FLAVORED VODKA
8,100% AGAVE TEQUILA
9,TRIPLE SEC


### In the original dataset, 0.05% of rows have null values in the category_name column. There are 52 distinct categories. Let's simplify the categories and fill the nulls and 'None' values.

### We'll use a CASE statement to recategorize all products. We'll combine existing categories and use the 'item_description' column to categorize all products with null values.

In [17]:
# Recategorize every sale (dated 2017 through 2023) into a simplified
# liquor_type using substring matches on category_name and item_description,
# then compute the percentage (0-100 scale) of rows whose liquor_type is NULL.
#
# The CASE branches are evaluated top to bottom and the first match wins, so
# the order of the WHEN clauses matters: for example, a row matching both the
# 'Liqueur' and the 'Whiskey' patterns is labeled 'Liqueur'.
#
# NOTE(review): because the CASE ends with ELSE 'Other', liquor_type can never
# be NULL, so this check returns 0 by construction. Rows that match no pattern
# (including rows with NULL category_name and an unmatched item_description)
# are counted under 'Other' rather than as nulls.
cleaned_null_check_liquor_type_sql_statement = """
WITH cleaned_table AS(
SELECT CASE 
          WHEN category_name LIKE '%AMAR%' OR category_name LIKE '%CREME%'OR category_name LIKE '%LIQUEUR%' OR category_name LIKE '%CORDIAL%' OR category_name LIKE '%ANIS%' OR category_name LIKE '%TRIPLE SEC%' OR item_description LIKE '%JAGERM%' OR item_description LIKE '%LIQUEUR%' OR item_description LIKE '%SAINTS N SINNERS APPLE PIE%' THEN 'Liqueur' 
          WHEN category_name LIKE '%WHIS%' OR category_name LIKE '%BOUR%'OR category_name LIKE '%RYE%' OR item_description LIKE '%RYE%' OR item_description LIKE '%WHIS%'OR item_description LIKE '%SCOTCH%' OR category_name LIKE '%SCOTCH%' OR item_description LIKE '%BEAM%' OR item_description LIKE '%BOUR%' OR item_description LIKE '%CROWN ROYAL%' OR item_description LIKE '%JACK DAN%'OR item_description LIKE '%EVAN WILL%'OR item_description LIKE '%MAKER%MARK%' OR item_description LIKE '%SIR WINSTON%' OR item_description LIKE '%ELIJAH%' OR item_description LIKE '%JOHNNIE WALKER%'THEN 'Whiskey'
          WHEN category_name LIKE '%VOD%' OR item_description LIKE '%VOD%'OR item_description LIKE '%SMIRN%'THEN 'Vodka'
          WHEN category_name LIKE '%BRANDI%' OR category_name LIKE '%BRANDY%' OR category_name LIKE '%SCHNAPPS%' OR item_description LIKE '%COGNAC%' OR item_description LIKE '%HENNESSY%'THEN 'Brandy'
          WHEN category_name LIKE '%RUM%' OR item_description LIKE '%RUM%' OR item_description LIKE '%CAPTAIN MOR%'THEN 'Rum'
          WHEN category_name LIKE '%TEQ%' OR category_name LIKE '%MEZC%' OR item_description LIKE '%JUAREZ%' OR item_description LIKE '%TEQU%' OR item_description LIKE '%REPOSADO%' OR item_description LIKE '%TORTILLA GOLD%' OR item_description LIKE '%MONTEZUMA%' THEN 'Tequila and Mezcal'
          WHEN category_name LIKE '%GIN%' OR item_description LIKE '%GIN%' THEN 'Gin'
          WHEN category_name LIKE '%COCKT%' OR item_description LIKE '%TARANTULA%' THEN 'Cocktails'
          ELSE 'Other' END AS liquor_type
FROM `bigquery-public-data.iowa_liquor_sales.sales` 
WHERE EXTRACT(YEAR from date) IN (2017, 2018, 2019, 2020, 2021, 2022, 2023)
)
SELECT
  countif(liquor_type is null) / count(1) * 100 as percent_null_liquor_type
FROM cleaned_table
  """.strip()
cleaned_percent_null_liquor_type = pd.read_sql_query(cleaned_null_check_liquor_type_sql_statement, engine)
# Display the one-row result.
cleaned_percent_null_liquor_type

Unnamed: 0,percent_null_liquor_type
0,0.0


### Our cleaning and recategorization reduced nulls from 0.05% in the category_name column to 0% in the new column (products that match no rule are labeled 'Other') and collapsed 52 categories into nine simplified categories. We will refer to this categorization as 'liquor_type' moving forward for clarity.
