In [1]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Environment bootstrap for the notebook.
# When the kernel is Google Colab (detected via 'google.colab' in sys.modules),
# install PostgreSQL, create the 'contoso_100k' database and load it from the
# downloaded SQL dump. Outside Colab none of that runs and the connection below
# expects a PostgreSQL server to already be reachable on localhost:5432.
# NOTE: every shell step redirects stdout and stderr to /dev/null, so a failed
# step is silent and only surfaces later (e.g. when %sql tries to connect).
if 'google.colab' in sys.modules:
    # Refresh the apt package index
    !sudo apt-get update -qq > /dev/null 2>&1

    # Install PostgreSQL
    !sudo apt-get install postgresql -qq > /dev/null 2>&1

    # Start PostgreSQL service (suppress output)
    !sudo service postgresql start > /dev/null 2>&1

    # Set password for the 'postgres' user to avoid authentication errors (suppress output)
    # This password must match the one embedded in the %sql connection URL below
    !sudo -u postgres psql -c "ALTER USER postgres WITH PASSWORD 'password';" > /dev/null 2>&1

    # Create the 'contoso_100k' database (suppress output)
    !sudo -u postgres psql -c "CREATE DATABASE contoso_100k;" > /dev/null 2>&1

    # Download the PostgreSQL .sql dump to contoso_100k.sql in the working directory
    !wget -q -O contoso_100k.sql https://github.com/lukebarousse/Int_SQL_Data_Analytics_Course/releases/download/v.0.0.0/contoso_100k.sql

    # Restore the dump file into the 'contoso_100k' database (suppress output)
    !sudo -u postgres psql contoso_100k < contoso_100k.sql > /dev/null 2>&1

    # Swap ipython-sql for jupysql; this must happen before `%load_ext sql` below
    !pip uninstall -y ipython-sql > /dev/null 2>&1
    !pip install jupysql > /dev/null 2>&1

# Load the sql extension for SQL magic
%load_ext sql

# Connect to the 'contoso_100k' database as user 'postgres' on localhost:5432
%sql postgresql://postgres:password@localhost:5432/contoso_100k

# Enable automatic conversion of SQL results to pandas DataFrames
%config SqlMagic.autopandas = True

# Disable named parameters for SQL magic
%config SqlMagic.named_parameters = "disabled"

# Display pandas floats with two decimal places
pd.options.display.float_format = '{:.2f}'.format

### COALESCE for Average Revenue
#### Spending Customers vs. All Customers

In [2]:
%%sql

-- One row per customerkey: the sum of quantity * netprice * exchangerate over that customer's sales rows
SELECT
    s.customerkey,
    SUM(s.quantity * s.netprice * s.exchangerate) AS net_revenue
FROM sales AS s
GROUP BY s.customerkey

Unnamed: 0,customerkey,net_revenue
0,876049,2601.13
1,2089398,98.39
2,300840,1221.78
3,418360,2602.96
4,1128199,638.48
...,...,...
49482,871851,2856.86
49483,552140,3790.84
49484,1735944,2381.39
49485,1110282,2384.39


In [3]:
%%sql

-- Aggregate sales per customer in a CTE so it can be joined to the customer table
WITH sales_data AS (
    SELECT
        customerkey,
        SUM(quantity * netprice * exchangerate) AS net_revenue
    FROM sales
    GROUP BY
        customerkey
)

-- Preview the effect of COALESCE: the LEFT JOIN keeps every customer row,
-- so customers with no row in sales_data get a NULL net_revenue
SELECT
    c.customerkey,
    s.net_revenue,  -- NULL when the customer has no row in sales_data
    COALESCE(s.net_revenue, 0) AS cleaned_net_revenue  -- same value, with NULL replaced by 0
FROM customer c
LEFT JOIN sales_data s ON c.customerkey = s.customerkey
ORDER BY c.customerkey  -- without an ORDER BY, which 10 rows LIMIT returns is not guaranteed
LIMIT 10

Unnamed: 0,customerkey,net_revenue,cleaned_net_revenue
0,15,2217.41,2217.41
1,23,,0.0
2,36,,0.0
3,120,,0.0
4,180,2510.22,2510.22
5,185,1395.52,1395.52
6,189,,0.0
7,210,,0.0
8,225,,0.0
9,243,287.67,287.67


In [4]:
%%sql

-- Per-customer revenue totals, computed once in a CTE
WITH sales_data AS (
    SELECT
        customerkey,
        SUM(quantity * netprice * exchangerate) AS net_revenue
    FROM sales
    GROUP BY customerkey
)

SELECT
    -- AVG skips NULLs, so customers without a sales_data row are left out of this average
    AVG(s.net_revenue) AS spending_customers_avg_net_revenue,
    -- COALESCE replaces those NULLs with 0, so every customer row is counted
    AVG(COALESCE(s.net_revenue, 0)) AS all_customers_avg_net_revenue
FROM customer AS c
LEFT JOIN sales_data AS s
    ON s.customerkey = c.customerkey

Unnamed: 0,spending_customers_avg_net_revenue,all_customers_avg_net_revenue
0,4170.94,1965.97


### NULLIF for Average Revenue

In [5]:
%%sql

-- Aggregate sales per customer in a CTE
WITH sales_data AS (
    SELECT
        customerkey,
        SUM(quantity * netprice * exchangerate) AS net_revenue
    FROM sales
    GROUP BY
        customerkey
)

SELECT
    -- AVG skips NULLs, so customers without a sales_data row are excluded
    AVG(s.net_revenue) AS spending_customers_avg_net_revenue,
    -- NULLIF(x, 0) returns NULL when x = 0 and x otherwise; it never turns NULL into 0.
    -- AVG then skips those NULLs too, so this averages only customers whose net
    -- revenue is non-zero (not all customers; use COALESCE(x, 0) for that).
    AVG(NULLIF(s.net_revenue, 0)) AS nonzero_revenue_customers_avg_net_revenue
FROM customer c
LEFT JOIN sales_data s ON c.customerkey = s.customerkey

Unnamed: 0,spending_customers_avg_net_revenue,all_customers_avg_net_revenue
0,4170.94,4170.94
