# Python and BigQuery Setup
<hr>

In [None]:
# Import basic Python data science libraries
import pandas as pd
import numpy as np

# Import BigQuery library
from google.cloud import bigquery

# Import Exceptions library to help with dataset error catching
from google.cloud.exceptions import NotFound

# Install chart_studio, to use plotly in notebooks
!pip install --upgrade chart_studio --quiet

# Import and setup for plotly
import chart_studio.plotly as py

# Import plotly express for quick plots
import plotly.express as px

In [None]:
# Setup BigQuery project, create 'bqml_tutorial' dataset in project if necessary

# ENTER YOUR PROJECT ID HERE
PROJECT_ID = "gcp-data-science-demo"

# Client used for the dataset calls below, bound to the chosen project
client = bigquery.Client(project = PROJECT_ID)

# Fully qualified '<project>.bqml_tutorial' dataset name
project_dataset_name = '{}.bqml_tutorial'.format(PROJECT_ID)

# Look the dataset up first; only create it when the lookup reports NotFound
try:
    dataset_id = bigquery.Dataset(project_dataset_name)
    client.get_dataset(dataset_id)

except NotFound:
    print("Dataset {} does not exist".format(project_dataset_name))

    dataset = client.create_dataset(
        bigquery.Dataset(project_dataset_name)
    )

    print("Created dataset {}".format(project_dataset_name))

else:
    # Lookup succeeded, so there is nothing to create
    print("Dataset {} already exists".format(project_dataset_name))

# Exploratory Analysis of Sales Categories
<hr>

#### Monthly Sales by Category

In [None]:
%%bigquery sales_by_category_month
# Total gallons and dollars sold for every (calendar month, category) combination,
# largest monthly volume first
SELECT
  FORMAT_DATE('%Y-%m', date) AS month,
  category,
  category_name,
  SUM(volume_sold_gallons) AS category_monthly_gallons,
  SUM(sale_dollars) AS category_monthly_dollars
FROM `bigquery-public-data.iowa_liquor_sales.sales`
WHERE
  # Remove current month so as to avoid partial data:
  # keep only sales dated before the first day of the current month
  date < DATE_TRUNC(CURRENT_DATE(), MONTH)
GROUP BY month, category, category_name
ORDER BY category_monthly_gallons DESC

In [None]:
# Display the DataFrame that the %%bigquery cell above stored in sales_by_category_month
sales_by_category_month

#### Top Overall Sales Categories

In [None]:
# Get top overall sales categories across entire time span
NUM_TOP_CATEGORIES = 20

top_overall_sales_categories = (
  sales_by_category_month
  .groupby(['category', 'category_name'])
  # Sum only the numeric measures. A bare .sum() would also aggregate the
  # string 'month' column: older pandas silently dropped it, but pandas >= 2.0
  # concatenates the strings into one very long value per group.
  [['category_monthly_gallons', 'category_monthly_dollars']]
  .sum()
  .sort_values('category_monthly_gallons', ascending = False)
  .reset_index()
  .head(NUM_TOP_CATEGORIES)
)

# Display the top categories, ranked by total gallons sold
top_overall_sales_categories

#### Top Categories Monthly Volume Interactive Time Series Plot

In [None]:
# Plot Monthly Volume for Top Categories Using plotly

# Boolean mask: rows whose category name is one of the top overall categories
is_top_category = sales_by_category_month['category_name'].isin(
  top_overall_sales_categories['category_name']
)

# Sort by month first so each line is drawn in chronological order
top_sales_categories_by_month = (
  sales_by_category_month[is_top_category]
  .sort_values(['month', 'category_name'])
)

# One line per category name; title and labels make the figure readable on its own
top_sales_categories_by_month_plot = px.line(
  top_sales_categories_by_month,
  x = 'month',
  y = 'category_monthly_gallons',
  color = 'category_name',
  title = 'Monthly Volume Sold for Top Sales Categories',
  labels = {
    'month': 'Month',
    'category_monthly_gallons': 'Volume sold (gallons)',
    'category_name': 'Category'
  }
)

top_sales_categories_by_month_plot.show()

# Understanding Sales Patterns Across Categories
<hr>

#### Correlation Between Pairs of Categories by Volume Across Months

In [None]:
# Parameters handed to %%bigquery cells through --params
# Start w/ September 2016 since plot above shows category shift
bigquery_params = dict(start_date = '2016-09-01')

In [None]:
%%bigquery correlation_among_categories --params $bigquery_params
# Look at Correlation Among Categories' Volume Across Months
#
# Output: one row per ordered pair of categories (category1, category2) with the
# correlation of their monthly shares of total volume. The self-join below matches
# on month only, so each category is also paired with itself and every pair
# appears in both orders.
WITH
# Total gallons sold per month across all categories (denominator for the shares)
MonthlyTotals AS
(
  SELECT
    FORMAT_DATE('%Y-%m', date) AS month,
    SUM(volume_sold_gallons) AS total_monthly_volume

  FROM
    `bigquery-public-data.iowa_liquor_sales.sales`

  WHERE
    # Start w/ date given by query parameter
    date >= @start_date AND
    # Remove current month so as to avoid partial data
    FORMAT_DATE('%Y-%m', date) < FORMAT_DATE('%Y-%m', CURRENT_DATE())
    
  GROUP BY
    month
),

# Per month and category: gallons sold and that category's percentage of the
# month's total volume
MonthCategory AS
(
  SELECT
    FORMAT_DATE('%Y-%m', date) AS month,
    category,
    category_name,

    SUM(volume_sold_gallons) AS category_monthly_volume,

    # SAFE_DIVIDE yields NULL instead of an error when the monthly total is 0
    SAFE_DIVIDE(
      SUM(volume_sold_gallons),
      total_monthly_volume
      ) * 100 AS category_pct_of_month_volume

  FROM
    `bigquery-public-data.iowa_liquor_sales.sales` Sales
    
  # Attach the month's total to every sales row of that month
  LEFT JOIN
    MonthlyTotals ON 
      FORMAT_DATE('%Y-%m', Sales.date) = MonthlyTotals.month

  WHERE
    # Start w/ date given by query parameter
    date >= @start_date AND    
    # Remove current month so as to avoid partial data
    FORMAT_DATE('%Y-%m', date) < FORMAT_DATE('%Y-%m', CURRENT_DATE())

  # total_monthly_volume is selected un-aggregated above, so it has to be a
  # grouping key; it takes a single value per month
  GROUP BY
    month, category, category_name, total_monthly_volume
)

SELECT
  Category1.category AS category1,
  Category1.category_name AS category_name1,

  Category2.category AS category2,
  Category2.category_name AS category_name2,

  # Number of months in which both categories of the pair have rows
  COUNT(DISTINCT Category1.month) AS num_months,

  # Correlation of the two categories' monthly volume shares
  CORR(
    Category1.category_pct_of_month_volume,
    Category2.category_pct_of_month_volume
    ) AS category_corr_across_months,

  AVG(Category1.category_pct_of_month_volume) AS
    category1_avg_pct_of_month_volume,
  AVG(Category2.category_pct_of_month_volume) AS
    category2_avg_pct_of_month_volume

FROM
  MonthCategory Category1

# Pair every category with every category (itself included) within the same month
INNER JOIN
  MonthCategory Category2 ON
  (
    Category1.month = Category2.month
  )

GROUP BY
  category1, category_name1, category2, category_name2

# The filters below refer to the aggregate aliases defined in the SELECT list
HAVING
  # At least 2 years' worth of overlapping months
  num_months >= 24 AND
  # Each category accounts for >= 1% of monthly volume (on average)
  category1_avg_pct_of_month_volume >= 1 AND
  category2_avg_pct_of_month_volume >= 1
  
# Ascending: the lowest (most negative) correlations come first
ORDER BY
  category_corr_across_months

In [None]:
# Display the DataFrame that the %%bigquery cell above stored in correlation_among_categories
correlation_among_categories

#### Heat Map of Category Pair Correlation Coefficients

In [None]:
# OPTIONAL - show heat map of correlation coefficients

# Only this cell needs figure_factory, so import it before first use
import plotly.figure_factory as ff

# Square-ish matrix of correlations: rows = category_name1, columns = category_name2.
# Rows that share the same pair of names are averaged.
# The string 'mean' replaces np.mean, which newer pandas flags with a FutureWarning.
category_correlation_matrix = pd.pivot_table(
  correlation_among_categories,
  values = 'category_corr_across_months',
  index = ['category_name1'],
  columns = ['category_name2'],
  aggfunc = 'mean'
  )

# In a heat map z[i][j] is drawn at (x[j], y[i]), so the matrix columns label
# the x axis and the matrix index labels the y axis. The original passed them
# the other way round, which only looked right because the matrix is symmetric.
category_correlation_heatmap = ff.create_annotated_heatmap(
  z = category_correlation_matrix.values,
  x = category_correlation_matrix.columns.tolist(),
  y = category_correlation_matrix.index.tolist(),
  annotation_text = category_correlation_matrix.values.round(2),
  showscale = True
  )

# NOTE(review): width/height passed to show() are renderer options and may be
# ignored by the default notebook renderer -- confirm; if the figure size matters,
# set it with update_layout(width=..., height=...) instead.
category_correlation_heatmap.show(width = 1600, height = 4500)

#### CHOSEN Categories Monthly Volume Time Series to Understand Correlation

In [None]:
# Look at monthly sales volume by category
chosen_categories = ['Cream Liqueurs', 'American Brandies', 
  'American Schnapps', 'Flavored Rum']

# First month to plot, as 'YYYY-MM'. It is taken from the start date used by the
# correlation query so that the plot and the correlations cover the same period
# (previously a separately hard-coded '2016-09').
start_month = bigquery_params['start_date'][:7]

# 'month' holds 'YYYY-MM' strings, so string comparison orders chronologically
is_in_period = sales_by_category_month['month'] >= start_month
is_chosen_category = sales_by_category_month['category_name'].isin(
  chosen_categories)

chosen_categories_sales_by_month = (
  sales_by_category_month[is_in_period & is_chosen_category]
  .sort_values(['category_name', 'month'])
  )

# One line per chosen category; title and labels make the figure readable on its own
chosen_categories_sales_by_month_plot = px.line(
  chosen_categories_sales_by_month,
  x = 'month', 
  y = 'category_monthly_gallons',
  color = 'category_name',
  title = 'Monthly Volume Sold for Chosen Categories',
  labels = {
    'month': 'Month',
    'category_monthly_gallons': 'Volume sold (gallons)',
    'category_name': 'Category'
  }
  )

chosen_categories_sales_by_month_plot.show()

# Create and Evaluate Product Categories Using K-Means Clustering
<hr>

#### Example Sales Category Names and Item Descriptions

In [None]:
%%bigquery
# Ten example (category name, item description) pairs,
# highest dollars-per-gallon first
SELECT category_name, item_description
FROM `bigquery-public-data.iowa_liquor_sales.sales`
WHERE category_name IS NOT NULL
  AND item_description IS NOT NULL
GROUP BY category_name, item_description
ORDER BY SAFE_DIVIDE(SUM(sale_dollars), SUM(volume_sold_gallons)) DESC
LIMIT 10

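What we want to avoid is hand-maintaining a long chain of nested regular-expression rules to assign categories, like the (abbreviated) example below: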

```
IF(REGEXP_CONTAINS(LOWER(category_name), "whiskies|whisky|whiskey|scotch|single malt"), "Whiskey",   
  IF(REGEXP_CONTAINS(LOWER(category_name), "vodka"), "Vodka",   
     IF(REGEXP_CONTAINS(LOWER(category_name), "schnapps|amaretto|liqueur|creme|triple sec"), "Liqueur",   
        IF(REGEXP_CONTAINS(LOWER(category_name), "rum"), "Rum",  
           IF(REGEXP_CONTAINS(LOWER(category_name), "^gin"), "Gin",   
              IF(REGEXP_CONTAINS(LOWER(category_name), "brandy"), "Brandy",  
                 IF(REGEXP_CONTAINS(LOWER(category_name), "tequila"), "Tequila",  
                    IF(REGEXP_CONTAINS(LOWER(category_name), "bourbon"),   "Bourbon",  
                       IF(REGEXP_CONTAINS(LOWER(category_name), "cocktails"), "Cocktails",
                          IF(REGEXP_CONTAINS(LOWER(category_name), "spirit"), "Other Spirits",  
```

#### Example Categories and Items with Tokens

In [None]:
%%bigquery
# Same ten example pairs as before, now with each name split into
# lowercase alphabetic tokens (runs of the letters a-z)
SELECT
  category_name,
  REGEXP_EXTRACT_ALL(LOWER(category_name), '[a-z]+') AS cat_tokens,
  item_description,
  REGEXP_EXTRACT_ALL(LOWER(item_description), '[a-z]+') AS item_tokens
FROM `bigquery-public-data.iowa_liquor_sales.sales`
WHERE category_name IS NOT NULL
  AND item_description IS NOT NULL
GROUP BY category_name, item_description
ORDER BY SAFE_DIVIDE(SUM(sale_dollars), SUM(volume_sold_gallons)) DESC
LIMIT 10

#### Categories and Items with Tokens AND Numerical Features

In [None]:
%%bigquery
# Build the clustering input table: one row per (category name, item description)
# with name tokens plus numerical features.
# No ORDER BY here: row order is not preserved in a stored table, so sorting
# would only add cost. Queries that read the table sort as needed.
CREATE OR REPLACE TABLE bqml_tutorial.iowa_liquor_sales_category_info AS
(
  SELECT
    category_name,
    # Lowercase alphabetic tokens of the category name
    REGEXP_EXTRACT_ALL(LOWER(category_name), '[a-z]+') AS cat_tokens,
    
    item_description,
    # Lowercase alphabetic tokens of the item description
    REGEXP_EXTRACT_ALL(LOWER(item_description), '[a-z]+') AS item_tokens,
    
    SUM(volume_sold_gallons) AS total_item_volume,
    
    # Average price per gallon; NULL instead of an error if volume is 0
    SAFE_DIVIDE(
      SUM(sale_dollars),
      SUM(volume_sold_gallons)
      ) AS avg_dollars_per_gallon,
    
    # Percentage of the item's volume sold in October-December (quarter 4)
    SAFE_DIVIDE(
      SUM(IF(EXTRACT(QUARTER FROM date) = 4, volume_sold_gallons, 0)),
      SUM(volume_sold_gallons)
      ) * 100 AS pct_item_volume_Q4
    
  FROM
    `bigquery-public-data.iowa_liquor_sales.sales` Sales

  WHERE
    category_name IS NOT NULL AND
    item_description IS NOT NULL

  GROUP BY
    category_name, item_description

  # Keep only items with positive total volume
  HAVING
    total_item_volume > 0
)

In [None]:
%%bigquery
# Preview the ten highest-volume rows of the table created above
SELECT *
FROM bqml_tutorial.iowa_liquor_sales_category_info
ORDER BY total_item_volume DESC
LIMIT 10

#### Perform K-Means Clustering on Product Categories

In [None]:
%%bigquery
# Train a 10-cluster k-means model (k-means++ initialization) on the token and
# numerical feature columns of the category info table
CREATE OR REPLACE MODEL bqml_tutorial.category_names_kmeans10
OPTIONS(
  model_type = 'kmeans',
  num_clusters = 10,
  kmeans_init_method = 'KMEANS++'
) AS
(
  SELECT cat_tokens, item_tokens, avg_dollars_per_gallon, pct_item_volume_Q4
  FROM bqml_tutorial.iowa_liquor_sales_category_info
)
;

#### Evaluate K-Means Clustering

In [None]:
%%bigquery
# Evaluation metrics reported by BigQuery ML for the trained k-means model
SELECT *
FROM ML.EVALUATE(MODEL bqml_tutorial.category_names_kmeans10)
;

#### Look at K-Means Clustering Centroids

In [None]:
%%bigquery category_kmeans_centroids
# Centroid description of every cluster, stored in category_kmeans_centroids
SELECT *
FROM ML.CENTROIDS(MODEL bqml_tutorial.category_names_kmeans10)

In [None]:
# Show the centroids with column text capped at 50 characters. The option is
# applied only inside this block, so it does not change how other cells display
# DataFrames (pd.set_option would alter the setting for the whole kernel).
with pd.option_context('display.max_colwidth', 50):
    display(category_kmeans_centroids)

#### Study Clustering Results in More Detail

In [None]:
%%bigquery category_kmeans_clustering_results
# Assign every row of the category info table to a cluster with ML.PREDICT and
# summarize each cluster, largest total volume first.
# To persist the summary rather than only return it, prefix the query with:
#   CREATE OR REPLACE TABLE bqml_tutorial.category_names_kmeans_results AS
SELECT
  CENTROID_ID AS cluster,
  SUM(total_item_volume) AS total_volume,

  # Volume-weighted average price per gallon within the cluster
  SAFE_DIVIDE(
    SUM(total_item_volume * avg_dollars_per_gallon),
    SUM(total_item_volume)
  ) AS avg_dollars_per_gallon,

  # Volume-weighted percentage of volume sold in Q4 within the cluster
  SAFE_DIVIDE(
    SUM(total_item_volume * pct_item_volume_Q4),
    SUM(total_item_volume)
  ) AS pct_volume_Q4,

  # The ten highest-volume item descriptions in the cluster
  STRING_AGG(
    item_description, ', '
    ORDER BY total_item_volume DESC
    LIMIT 10
  ) AS top_items,

  # Every distinct category name in the cluster, alphabetically
  STRING_AGG(
    DISTINCT category_name, ', '
    ORDER BY category_name
  ) AS categories
FROM ML.PREDICT(
  MODEL bqml_tutorial.category_names_kmeans10,
  TABLE bqml_tutorial.iowa_liquor_sales_category_info
)
GROUP BY cluster
ORDER BY total_volume DESC

In [None]:
# Allow up to 2000 characters per column so the long aggregated strings in the
# results are shown in full. The wider setting applies only inside this block
# instead of changing the display option for the whole kernel.
with pd.option_context('display.max_colwidth', 2000):
    display(category_kmeans_clustering_results)