In [1]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Environment bootstrap for the notebook.
# On Google Colab this installs a local PostgreSQL server, creates the
# 'contoso_100k' database and restores it from a downloaded SQL dump.
# NOTE: every shell command below redirects stdout and stderr to /dev/null,
# so a failed step (apt, psql, pip) is silent and only surfaces later when the
# connection or a query fails. Drop the redirection when debugging the setup.
# If running in Google Colab, install PostgreSQL and restore the database
if 'google.colab' in sys.modules:
    # Refresh the apt package index (output suppressed)
    !sudo apt-get update -qq > /dev/null 2>&1

    # Install PostgreSQL (output suppressed)
    !sudo apt-get install postgresql -qq > /dev/null 2>&1

    # Start PostgreSQL service (suppress output)
    !sudo service postgresql start > /dev/null 2>&1

    # Set password for the 'postgres' user to avoid authentication errors (suppress output)
    # The same password is used in the connection string further down.
    !sudo -u postgres psql -c "ALTER USER postgres WITH PASSWORD 'password';" > /dev/null 2>&1

    # Create the 'contoso_100k' database (suppress output)
    !sudo -u postgres psql -c "CREATE DATABASE contoso_100k;" > /dev/null 2>&1

    # Download the PostgreSQL .sql dump
    !wget -q -O contoso_100k.sql https://github.com/lukebarousse/Int_SQL_Data_Analytics_Course/releases/download/v.0.0.0/contoso_100k.sql

    # Restore the dump file into the PostgreSQL database (suppress output)
    !sudo -u postgres psql contoso_100k < contoso_100k.sql > /dev/null 2>&1

    # Replace ipython-sql with jupysql (output suppressed)
    !pip uninstall -y ipython-sql > /dev/null 2>&1
    !pip install jupysql > /dev/null 2>&1

# Load the sql extension for SQL magic
%load_ext sql

# Connect to the PostgreSQL database (local server, user 'postgres', database 'contoso_100k')
%sql postgresql://postgres:password@localhost:5432/contoso_100k

# Enable automatic conversion of SQL results to pandas DataFrames
%config SqlMagic.autopandas = True

# Disable named parameters for SQL magic
# NOTE(review): presumably so that colons inside SQL text are not treated as
# variable placeholders - confirm against the jupysql documentation.
%config SqlMagic.named_parameters = "disabled"

# Display pandas floats with two decimal places
pd.options.display.float_format = '{:.2f}'.format

### FIRST_VALUE(), LAST_VALUE(), NTH_VALUE(), LAG(), LEAD()
#### Month-Over-Month Revenue Growth

In [5]:
%%sql
WITH monthly_revenue AS (
    -- Net revenue per calendar month of 2023.
    -- The month label is formatted as YYYY-MM, so sorting it as text is also
    -- chronological order.
    SELECT
      TO_CHAR(orderdate, 'YYYY-MM') AS month,
      SUM(quantity * netprice * exchangerate) AS net_revenue
    FROM
      sales
    WHERE EXTRACT(YEAR FROM orderdate) = 2023
    GROUP BY month
)
SELECT
  *,
  -- The default frame runs from the first row to the current row, which is
  -- all FIRST_VALUE, LAG and LEAD need.
  FIRST_VALUE(net_revenue) OVER by_month AS first_month_revenue,
  -- LAST_VALUE and NTH_VALUE only see rows inside the frame, so the frame is
  -- widened to the whole result set. With the default frame LAST_VALUE would
  -- simply return the current row.
  LAST_VALUE(net_revenue) OVER whole_year AS last_month_revenue,
  NTH_VALUE(net_revenue, 3) OVER whole_year AS third_month_revenue,
  LAG(net_revenue) OVER by_month AS previous_month_revenue,
  LEAD(net_revenue) OVER by_month AS next_month_revenue
FROM monthly_revenue
WINDOW
  by_month AS (ORDER BY month),
  whole_year AS (by_month ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
-- An ORDER BY inside the CTE does not fix the order of the final result,
-- so the rows are sorted explicitly here.
ORDER BY month

Unnamed: 0,month,net_revenue,first_month_revenue,last_month_revenue,third_month_revenue,previous_month_revenue,next_month_revenue
0,2023-01,3664431.34,3664431.34,2928550.93,2244316.52,,4465204.57
1,2023-02,4465204.57,3664431.34,2928550.93,2244316.52,3664431.34,2244316.52
2,2023-03,2244316.52,3664431.34,2928550.93,2244316.52,4465204.57,1162796.16
3,2023-04,1162796.16,3664431.34,2928550.93,2244316.52,2244316.52,2943005.99
4,2023-05,2943005.99,3664431.34,2928550.93,2244316.52,1162796.16,2864500.03
5,2023-06,2864500.03,3664431.34,2928550.93,2244316.52,2943005.99,2337639.34
6,2023-07,2337639.34,3664431.34,2928550.93,2244316.52,2864500.03,2623919.79
7,2023-08,2623919.79,3664431.34,2928550.93,2244316.52,2337639.34,2622774.85
8,2023-09,2622774.85,3664431.34,2928550.93,2244316.52,2623919.79,2551322.61
9,2023-10,2551322.61,3664431.34,2928550.93,2244316.52,2622774.85,2700103.38


In [8]:
%%sql
WITH monthly_revenue AS (
    -- Net revenue per calendar month of 2023, labelled YYYY-MM.
    SELECT
      TO_CHAR(orderdate, 'YYYY-MM') AS month,
      SUM(quantity * netprice * exchangerate) AS net_revenue
    FROM
      sales
    WHERE EXTRACT(YEAR FROM orderdate) = 2023
    GROUP BY month
),
revenue_with_previous AS (
    -- Look up the previous month once, instead of repeating the same LAG
    -- expression in every derived column.
    SELECT
      month,
      net_revenue,
      LAG(net_revenue) OVER (ORDER BY month) AS previous_month_revenue
    FROM
      monthly_revenue
)
SELECT
  month,
  net_revenue,
  previous_month_revenue,
  -- Absolute change versus the previous month (NULL for the first month).
  net_revenue - previous_month_revenue AS monthly_rev_growth,
  -- Percentage change versus the previous month. NULLIF turns a zero
  -- denominator into NULL so the query does not fail with division by zero.
  100 * (net_revenue - previous_month_revenue) / NULLIF(previous_month_revenue, 0) AS rate_of_change
FROM
  revenue_with_previous
-- Sort explicitly so the row order does not depend on the query plan.
ORDER BY month

Unnamed: 0,month,net_revenue,previous_month_revenue,monthly_rev_growth,rate_of_change
0,2023-01,3664431.34,,,
1,2023-02,4465204.57,3664431.34,800773.22,21.85
2,2023-03,2244316.52,4465204.57,-2220888.05,-49.74
3,2023-04,1162796.16,2244316.52,-1081520.36,-48.19
4,2023-05,2943005.99,1162796.16,1780209.83,153.1
5,2023-06,2864500.03,2943005.99,-78505.96,-2.67
6,2023-07,2337639.34,2864500.03,-526860.69,-18.39
7,2023-08,2623919.79,2337639.34,286280.45,12.25
8,2023-09,2622774.85,2623919.79,-1144.94,-0.04
9,2023-10,2551322.61,2622774.85,-71452.24,-2.72


### LAG() and LEAD()
#### LTV Change from Cohort-to-Cohort

In [9]:
%%sql

WITH yearly_cohort AS (
  -- One row per customer. The cohort is the year of the first order and the
  -- lifetime value is the total net revenue across all of the customer orders.
  SELECT
    customerkey,
    EXTRACT(YEAR FROM MIN(orderdate)) AS cohort_year,
    SUM(quantity * netprice * exchangerate) AS customer_ltv
  FROM
    sales
  GROUP BY
    customerkey
),
cohort_final AS (
  -- One row per cohort with the average customer lifetime value.
  -- A plain GROUP BY replaces the previous window AVG followed by
  -- SELECT DISTINCT, which produced the same rows in a roundabout way.
  SELECT
    cohort_year,
    AVG(customer_ltv) AS avg_cohort_ltv
  FROM
    yearly_cohort
  GROUP BY
    cohort_year
),
cohort_with_previous AS (
  -- Attach the average of the preceding cohort (NULL for the earliest cohort).
  SELECT
    cohort_year,
    avg_cohort_ltv,
    LAG(avg_cohort_ltv) OVER (ORDER BY cohort_year) AS prev_cohort_ltv
  FROM
    cohort_final
)
SELECT
  cohort_year,
  avg_cohort_ltv,
  prev_cohort_ltv,
  -- Percentage change versus the preceding cohort. NULLIF guards against a
  -- zero denominator, which would otherwise raise a division by zero error.
  100 * (avg_cohort_ltv - prev_cohort_ltv) / NULLIF(prev_cohort_ltv, 0) AS ltv_change
FROM
  cohort_with_previous
-- Sort explicitly so the row order does not depend on the query plan.
ORDER BY
  cohort_year

Unnamed: 0,cohort_year,avg_cohort_ltv,prev_cohort_ltv,ltv_change
0,2015,5271.59,,
1,2016,5404.92,5271.59,2.53
2,2017,5403.08,5404.92,-0.03
3,2018,4896.64,5403.08,-9.37
4,2019,4731.95,4896.64,-3.36
5,2020,3933.32,4731.95,-16.88
6,2021,3943.33,3933.32,0.25
7,2022,3315.52,3943.33,-15.92
8,2023,2543.18,3315.52,-23.29
9,2024,2037.55,2543.18,-19.88
