**Querying postgreSQL in Jupyter notebook**


# Setup and custom tables

In [1]:
import pandas as pd
import sqlalchemy
import sqlalchemy_utils
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

In [2]:
# Define a database name
# Set your postgres username
dbname = "baseball"
username = "lacar"  # change this to your username

# Working with PostgreSQL in Python
# Connect to make queries using psycopg2
con = None
con = psycopg2.connect(database=dbname, user=username)

# Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine("postgres://%s@localhost/%s" % (username, dbname))
print(engine.url)

postgres://lacar@localhost/baseball


## Using CTE

In [4]:
# Just use a date difference less than 30 days

sql_query = """
WITH posts (user_id, post, time)
AS (VALUES
(1, 'A', CAST('2-14-20' AS date)),
(2, 'B', CAST('2-14-20' AS date)),
(3, 'C', CAST('2-15-20' AS date)),
(1, 'B', CAST('2-15-20' AS date)),
(2, 'A', CAST('2-16-20' AS date)),
(3, 'B', CAST('2-17-20' AS date)),
(1, 'D', CAST('2-18-20' AS date)))

SELECT *
FROM posts
"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,user_id,post,time
0,1,A,2020-02-14
1,2,B,2020-02-14
2,3,C,2020-02-15
3,1,B,2020-02-15
4,2,A,2020-02-16
5,3,B,2020-02-17
6,1,D,2020-02-18


## Using pandas

### Dates

In [4]:
# Generate random date ranges
# From https://towardsdatascience.com/mastering-dates-and-timestamps-in-pandas-and-python-in-general-5b8c6edcc50c

import random
import time
from dateutil.parser import parse
def str_time_prop(start, end, format, prop):
    stime = time.mktime(time.strptime(start, format))
    etime = time.mktime(time.strptime(end, format))
    ptime = stime + prop * (etime - stime)
    return time.strftime(format, time.localtime(ptime))

selected_format = '%Y-%m-%d %H:%M:%S'

def random_date(start, end, prop):
    return parse(str_time_prop(start, end, selected_format, prop)).strftime(selected_format)

def make_date(begin_dt, end_dt):
    return random_date(begin_dt, end_dt, random.random())

# e.g. make_date("2020-01-01 13:40:00", "2020-01-14 14:50:00")

# Generate dates (my function)
def generate_dates(n_dates, begin_dt, end_dt):
    return sorted([make_date(begin_dt, end_dt) for x in range(n_dates)])

### Names

In [5]:
# Generate names (my function)
def generate_name_list(n_names):
    import names   # needed to pip install
    name_list = list()
    for i in range(n_names):
        name_list.append(names.get_first_name())
    return name_list

### Random values within a range

In [6]:
def generate_values(n_vals, lowest, highest):
    import random
    random_vals = random.sample(range(lowest, highest), n_vals)
    return random_vals

In [7]:
# Also try random.random()

In [8]:
generate_values(10, 1, 20)

[6, 4, 2, 13, 17, 11, 8, 10, 3, 9]

### Multipurpose 3-digit codes

In [9]:
# Generate 3-digit codes (e.g. city ids) (my function)
def generate_codes(n_codes):
    # 3 digits between 110 and 999 without repeating
    import random
    code_ids = random.sample(range(110, 1000), n_codes)
    return code_ids

### Multipurpose custom values

In [10]:
# Generate random list following input of a set of values to choose
def generate_custom_vals(list2consider, n_items):
    custom_list = np.random.choice(list2consider, size=n_items, replace=True).tolist()
    return custom_list

### Example

In [11]:
my_date_list = generate_dates(10, "2020-01-01 13:40:00", "2020-01-14 14:50:00")
my_name_list = generate_name_list(10)
my_city_codes = generate_codes(10)

# Custom list 1
my_list2consider = ['desktop-browser','mobile-browser','ios-native','android-native']
my_list2consider4table = generate_custom_vals(my_list2consider, 10)

# Custom list 2
my_list2consider = ['US', 'Canada', 'Mexico']
my_countries4table = generate_custom_vals(my_list2consider, 10)

In [12]:
col_1 = range(1, 11)
col_2 = pd.to_datetime(my_date_list)
col_3 = my_list2consider4table
col_4 = my_countries4table
table1 = pd.DataFrame([col_1, col_2, col_3, col_4]).T
table1.columns = ['user_id', 'join_ts', 'join_client', 'country']

table1

# Temp table created here that I'll just over-write with each new problem
# table1.to_sql('user_summary', engine, if_exists='replace')


Unnamed: 0,user_id,join_ts,join_client,country
0,1,2020-01-04 09:36:15,desktop-browser,Mexico
1,2,2020-01-04 13:49:17,ios-native,Canada
2,3,2020-01-07 02:19:36,mobile-browser,Mexico
3,4,2020-01-07 20:11:17,mobile-browser,US
4,5,2020-01-08 07:05:53,desktop-browser,Mexico
5,6,2020-01-10 17:25:59,mobile-browser,Canada
6,7,2020-01-11 22:10:44,desktop-browser,Canada
7,8,2020-01-11 23:08:14,desktop-browser,US
8,9,2020-01-12 08:17:45,mobile-browser,Mexico
9,10,2020-01-14 13:55:38,mobile-browser,US


# 1205. Monthly Transactions II

https://leetcode.com/problems/monthly-transactions-ii/

Write an SQL query to find for each month and country, the number of approved transactions and their total amount, the number of chargebacks and their total amount.

Note: In your query, given the month and country, ignore rows with all zeros.

The query result format is in the following example:

Transactions table:
+------+---------+----------+--------+------------+
| id   | country | state    | amount | trans_date |
+------+---------+----------+--------+------------+
| 101  | US      | approved | 1000   | 2019-05-18 |
| 102  | US      | declined | 2000   | 2019-05-19 |
| 103  | US      | approved | 3000   | 2019-06-10 |
| 104  | US      | approved | 4000   | 2019-06-13 |
| 105  | US      | approved | 5000   | 2019-06-15 |
+------+---------+----------+--------+------------+

Chargebacks table:
+------------+------------+
| trans_id   | trans_date |
+------------+------------+
| 102        | 2019-05-29 |
| 101        | 2019-06-30 |
| 105        | 2019-09-18 |
+------------+------------+

Result table:
+----------+---------+----------------+-----------------+-------------------+--------------------+
| month    | country | approved_count | approved_amount | chargeback_count  | chargeback_amount  |
+----------+---------+----------------+-----------------+-------------------+--------------------+
| 2019-05  | US      | 1              | 1000            | 1                 | 2000               |
| 2019-06  | US      | 3              | 12000           | 1                 | 1000               |
| 2019-09  | US      | 0              | 0               | 1                 | 5000               |
+----------+---------+----------------+-----------------+-------------------+--------------------+

In [6]:
# Checking created tables
sql_query = """
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date )))
 
SELECT *
FROM Transactions;
"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,id,country,state,amount,trans_date
0,101,US,approved,1000,2019-05-18
1,102,US,declined,2000,2019-05-19
2,103,US,approved,3000,2019-06-10
3,104,US,approved,4000,2019-06-13
4,105,US,approved,5000,2019-06-15


In [7]:
# Checking created tables
sql_query = """
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date )))
 
SELECT *
FROM Chargebacks;
"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,trans_id,trans_date
0,102,2019-05-29
1,101,2019-06-30
2,105,2019-09-18


## First attempt, trying all at once

In [34]:
# Trying query all at once
sql_query = """
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date )))
 
-- month    | country | approved_count | approved_amount | chargeback_count  | chargeback_amount 
-- extract year-month from date
-- group by country
-- use a case when for approved count
-- need to join on trans_id for the chargeback amount, but the count/amount is assigned based on trans_date
-- then will need to re-join on year-month 



SELECT CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date)) AS month,
       country,
       SUM(CASE WHEN state='approved' THEN 1
                  END) AS approved_count,
       SUM(CASE WHEN state='approved' THEN amount
                  END) AS
                  approved_amount
                
FROM Transactions T

JOIN (SELECT CONCAT(EXTRACT(YEAR FROM T1.trans_date), '-', EXTRACT(MONTH FROM T1.trans_date)) AS month,
              SUM (CASE WHEN state='declined' THEN 1
                  END) AS chargeback_count,
              SUM (CASE WHEN state='approved' THEN amount
                  END) AS chargeback_amount
       FROM Transactions T1
       RIGHT JOIN Chargebacks CB
       ON T1.id=CB.trans_id
       GROUP BY CONCAT(EXTRACT(YEAR FROM T1.trans_date), '-', EXTRACT(MONTH FROM T1.trans_date)),
                      T1.country) cb_table
                      
ON T.CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date))=cb_table.CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date))
AND T.country=cb_table.country

GROUP BY CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date)),
         T.country

"""
pd.read_sql_query(sql_query,con)


DatabaseError: Execution failed on sql '
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date )))
 
-- month    | country | approved_count | approved_amount | chargeback_count  | chargeback_amount 
-- extract year-month from date
-- group by country
-- use a case when for approved count
-- need to join on trans_id for the chargeback amount, but the count/amount is assigned based on trans_date
-- then will need to re-join on year-month 



SELECT CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date)) AS month,
       country,
       SUM(CASE WHEN state='approved' THEN 1
                  END) AS approved_count,
       SUM(CASE WHEN state='approved' THEN amount
                  END) AS
                  approved_amount
                
FROM Transactions T

JOIN (SELECT CONCAT(EXTRACT(YEAR FROM T1.trans_date), '-', EXTRACT(MONTH FROM T1.trans_date)) AS month,
              SUM (CASE WHEN state='declined' THEN 1
                  END) AS chargeback_count,
              SUM (CASE WHEN state='approved' THEN amount
                  END) AS chargeback_amount
       FROM Transactions T1
       RIGHT JOIN Chargebacks CB
       ON T1.id=CB.trans_id
       GROUP BY CONCAT(EXTRACT(YEAR FROM T1.trans_date), '-', EXTRACT(MONTH FROM T1.trans_date)),
                      T1.country) cb_table
                      
ON T.CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date))=cb_table.CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date))
AND T.country=cb_table.country

GROUP BY CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date)),
         T.country

': schema "t" does not exist
LINE 46: ON T.CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONT...
            ^


## Approved counts only

In [48]:
# Query with approved counts only
sql_query = """
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date )))

SELECT CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date)) AS month,
       country,
       SUM(CASE WHEN state='approved' THEN 1
                ELSE 0 END) AS approved_count,
       SUM(CASE WHEN state='approved' THEN amount
                ELSE 0 END) AS approved_amount
                
FROM Transactions T

GROUP BY CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date)),
         T.country

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,month,country,approved_count,approved_amount
0,2019-5,US,1,1000
1,2019-6,US,3,12000


## Chargebacks only

In [49]:
# Query with chargebacks only
sql_query = """
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date )))

SELECT CONCAT(EXTRACT(YEAR FROM CB.trans_date), '-', EXTRACT(MONTH FROM CB.trans_date)) AS month,
       T1.country,
      SUM (CASE WHEN state='declined' THEN 1
           ELSE 0 END) AS chargeback_count,
      SUM (CASE WHEN state='declined' THEN amount
           ELSE 0 END) AS chargeback_amount
       
FROM Transactions T1
RIGHT JOIN Chargebacks CB
ON T1.id=CB.trans_id
GROUP BY CONCAT(EXTRACT(YEAR FROM CB.trans_date), '-', EXTRACT(MONTH FROM CB.trans_date)),
                T1.country

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,month,country,chargeback_count,chargeback_amount
0,2019-5,US,1,2000
1,2019-9,US,0,0
2,2019-6,US,0,0


**Key was recognizing that both approved and declined can be counted**

In [56]:
# Query with chargebacks only
sql_query = """
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date )))

SELECT CONCAT(EXTRACT(YEAR FROM CB.trans_date), '-', EXTRACT(MONTH FROM CB.trans_date)) AS month,
       T1.country,
       COUNT(T1.amount) AS chargeback_count,
       SUM(T1.amount) AS chargeback_amount
       
FROM Transactions T1
RIGHT JOIN Chargebacks CB
ON T1.id=CB.trans_id
GROUP BY CONCAT(EXTRACT(YEAR FROM CB.trans_date), '-', EXTRACT(MONTH FROM CB.trans_date)),
         T1.country

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,month,country,chargeback_count,chargeback_amount
0,2019-5,US,1,2000
1,2019-9,US,1,5000
2,2019-6,US,1,1000


## Putting it all together with correction

Need outer join so that 0s from each table are included

In [83]:
# Trying query all at once
sql_query = """
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date ))),


-- need to have two CTEs

approved_table AS
    (SELECT CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date)) AS month,
           T.country,
           SUM(CASE WHEN state='approved' THEN 1
                    ELSE 0 END) AS approved_count,
           SUM(CASE WHEN state='approved' THEN amount
                    ELSE 0 END) AS approved_amount
    FROM Transactions T
    GROUP BY CONCAT(EXTRACT(YEAR FROM trans_date), '-', EXTRACT(MONTH FROM trans_date)), T.country),

cb_table AS
    (SELECT CONCAT(EXTRACT(YEAR FROM CB.trans_date), '-', EXTRACT(MONTH FROM CB.trans_date)) AS month,
           T1.country,
           COUNT(T1.amount) AS chargeback_count,
           SUM(T1.amount) AS chargeback_amount      
    FROM Transactions T1
    RIGHT JOIN Chargebacks CB
    ON T1.id=CB.trans_id
    GROUP BY CONCAT(EXTRACT(YEAR FROM CB.trans_date), '-', EXTRACT(MONTH FROM CB.trans_date)),
             T1.country)
 


SELECT *
FROM approved_table a_t
FULL JOIN cb_table cbt
ON a_t.month=cbt.month
AND a_t.country=cbt.country

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,month,country,approved_count,approved_amount,month.1,country.1,chargeback_count,chargeback_amount
0,2019-5,US,1.0,1000.0,2019-5,US,1,2000
1,2019-6,US,3.0,12000.0,2019-6,US,1,1000
2,,,,,2019-9,US,1,5000


## Check discussion

Not all the way there

In [89]:
# To parse the date if it was a string

sql_query = """
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date )))

SELECT LEFT(state, 4)
FROM Transactions

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,left
0,appr
1,decl
2,appr
3,appr
4,appr


In [92]:
# Interesting use of SUM but didn't work for me - safer to use CASE

sql_query = """
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date )))

-- This didn't work for me: SELECT SUM(state='approved') AS approved_count

SELECT SUM(CASE WHEN state='approved' THEN 1 END) AS approved_count
FROM Transactions

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,approved_count
0,4


In [97]:
# date format trick also doesn't work for me

# sql_query = """
# WITH Transactions (id, country, state, amount, trans_date)
# AS (VALUES
# (101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
# (102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
# (103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
# (104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
# (105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

# Chargebacks (trans_id, trans_date)
# AS (VALUES
# (102, CAST('2019-05-29' AS date )),
# (101, CAST('2019-06-30' AS date )),
# (105, CAST('2019-09-18' AS date )))


# -- SELECT DATE_FORMAT(trans_date,"%Y-%m") AS month  # doesn't work
# -- FROM Transactions


# """
# pd.read_sql_query(sql_query,con)


## My solution after looking at discussion

Key is to have a chargeback table look like the transaction table, then do something with that

In [108]:
# UNION ALL chargeback table
sql_query = """
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date )))


SELECT T.id, country, state, amount, CB.trans_date
FROM Transactions T
RIGHT JOIN Chargebacks CB
ON T.id=CB.trans_id
"""
pd.read_sql_query(sql_query,con)

Unnamed: 0,id,country,state,amount,trans_date
0,101,US,approved,1000,2019-06-30
1,102,US,declined,2000,2019-05-29
2,105,US,approved,5000,2019-09-18


In [109]:
# UNION ALL chargeback table
sql_query = """
WITH Transactions (id, country, state, amount, trans_date)
AS (VALUES
(101, 'US', 'approved', 1000, CAST('2019-05-18' AS date )),
(102, 'US', 'declined', 2000, CAST('2019-05-19' AS date )),
(103, 'US', 'approved', 3000, CAST('2019-06-10' AS date )),
(104, 'US', 'approved', 4000, CAST('2019-06-13' AS date )),
(105, 'US', 'approved', 5000, CAST('2019-06-15' AS date ))),

Chargebacks (trans_id, trans_date)
AS (VALUES
(102, CAST('2019-05-29' AS date )),
(101, CAST('2019-06-30' AS date )),
(105, CAST('2019-09-18' AS date )))


SELECT T.id, country, 'chargeback' AS state, amount, CB.trans_date
FROM Transactions T
RIGHT JOIN Chargebacks CB
ON T.id=CB.trans_id

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,id,country,state,amount,trans_date
0,101,US,chargeback,1000,2019-06-30
1,102,US,chargeback,2000,2019-05-29
2,105,US,chargeback,5000,2019-09-18


## **Finish this query**

# 1393. Capital Gain/Loss (SQL mock 7/27/20)

**note: problem can be made harder**

https://leetcode.com/problems/capital-gainloss/


+---------------+---------+
| Column Name   | Type    |
+---------------+---------+
| stock_name    | varchar |
| operation     | enum    |
| operation_day | int     |
| price         | int     |
+---------------+---------+
(stock_name, day) is the primary key for this table.
The operation column is an ENUM of type ('Sell', 'Buy')
Each row of this table indicates that the stock which has stock_name had an operation on the day operation_day with the price.
It is guaranteed that each 'Sell' operation for a stock has a corresponding 'Buy' operation in a previous day.

Write an SQL query to report the Capital gain/loss for each stock.

The capital gain/loss of a stock is total gain or loss after buying and selling the stock one or many times.

Return the result table in any order.

The query result format is in the following example:

Stocks table:
+---------------+-----------+---------------+--------+
| stock_name    | operation | operation_day | price  |
+---------------+-----------+---------------+--------+
| Leetcode      | Buy       | 1             | 1000   |
| Corona Masks  | Buy       | 2             | 10     |
| Leetcode      | Sell      | 5             | 9000   |
| Handbags      | Buy       | 17            | 30000  |
| Corona Masks  | Sell      | 3             | 1010   |
| Corona Masks  | Buy       | 4             | 1000   |
| Corona Masks  | Sell      | 5             | 500    |
| Corona Masks  | Buy       | 6             | 1000   |
| Handbags      | Sell      | 29            | 7000   |
| Corona Masks  | Sell      | 10            | 10000  |
+---------------+-----------+---------------+--------+

In [10]:
# input table
sql_query = """
WITH Stocks (stock_name, operation, operation_day, price)
AS (VALUES
('Leetcode', 'Buy', 1, 1000),
('Corona Masks', 'Buy', 2, 10),
('Leetcode', 'Sell', 5, 9000),
('Handbags', 'Buy', 17, 30000),
('Corona Masks', 'Sell', 3, 1010),
('Corona Masks', 'Buy', 4, 1000),
('Corona Masks', 'Sell', 5, 500),
('Corona Masks', 'Buy', 6, 1000),
('Handbags', 'Sell', 19, 7000),
('Corona Masks', 'Sell', 10, 10000))

SELECT *
FROM Stocks

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,stock_name,operation,operation_day,price
0,Leetcode,Buy,1,1000
1,Corona Masks,Buy,2,10
2,Leetcode,Sell,5,9000
3,Handbags,Buy,17,30000
4,Corona Masks,Sell,3,1010
5,Corona Masks,Buy,4,1000
6,Corona Masks,Sell,5,500
7,Corona Masks,Buy,6,1000
8,Handbags,Sell,19,7000
9,Corona Masks,Sell,10,10000


In [25]:
# Attempt 1
sql_query = """
WITH Stocks (stock_name, operation, operation_day, price)
AS (VALUES
('Leetcode', 'Buy', 1, 1000),
('Corona Masks', 'Buy', 2, 10),
('Leetcode', 'Sell', 5, 9000),
('Handbags', 'Buy', 17, 30000),
('Corona Masks', 'Sell', 3, 1010),
('Corona Masks', 'Buy', 4, 1000),
('Corona Masks', 'Sell', 5, 500),
('Corona Masks', 'Buy', 6, 1000),
('Handbags', 'Sell', 19, 7000),
('Corona Masks', 'Sell', 10, 10000))

SELECT stock_name,
       SUM(CASE WHEN operation='Buy' THEN -1.0*price
                ELSE 1.0*price END) AS capital_gain_loss
FROM stocks
GROUP BY stock_name
"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,stock_name,capital_gain_loss
0,Leetcode,8000.0
1,Corona Masks,9500.0
2,Handbags,-23000.0


# 1364. Number of Trusted Contacts of a Customer (SQL mock 7/27/20)

https://leetcode.com/problems/number-of-trusted-contacts-of-a-customer/

Write an SQL query to find the following for each invoice_id:

customer_name: The name of the customer the invoice is related to.
price: The price of the invoice.
contacts_cnt: The number of contacts related to the customer.
trusted_contacts_cnt: The number of contacts related to the customer and at the same time they are customers to the shop. (i.e His/Her email exists in the Customers table.)
Order the result table by invoice_id.

The query result format is in the following example:

Customers table:
+-------------+---------------+--------------------+
| customer_id | customer_name | email              |
+-------------+---------------+--------------------+
| 1           | Alice         | alice@leetcode.com |
| 2           | Bob           | bob@leetcode.com   |
| 13          | John          | john@leetcode.com  |
| 6           | Alex          | alex@leetcode.com  |
+-------------+---------------+--------------------+
Contacts table:
+-------------+--------------+--------------------+
| user_id     | contact_name | contact_email      |
+-------------+--------------+--------------------+
| 1           | Bob          | bob@leetcode.com   |
| 1           | John         | john@leetcode.com  |
| 1           | Jal          | jal@leetcode.com   |
| 2           | Omar         | omar@leetcode.com  |
| 2           | Meir         | meir@leetcode.com  |
| 6           | Alice        | alice@leetcode.com |
+-------------+--------------+--------------------+
Invoices table:
+------------+-------+---------+
| invoice_id | price | user_id |
+------------+-------+---------+
| 77         | 100   | 1       |
| 88         | 200   | 1       |
| 99         | 300   | 2       |
| 66         | 400   | 2       |
| 55         | 500   | 13      |
| 44         | 60    | 6       |
+------------+-------+---------+
Result table:
+------------+---------------+-------+--------------+----------------------+
| invoice_id | customer_name | price | contacts_cnt | trusted_contacts_cnt |
+------------+---------------+-------+--------------+----------------------+
| 44         | Alex          | 60    | 1            | 1                    |
| 55         | John          | 500   | 0            | 0                    |
| 66         | Bob           | 400   | 2            | 0                    |
| 77         | Alice         | 100   | 3            | 2                    |
| 88         | Alice         | 200   | 3            | 2                    |
| 99         | Bob           | 300   | 2            | 0                    |
+------------+---------------+-------+--------------+----------------------+
Alice has three contacts, two of them are trusted contacts (Bob and John).
Bob has two contacts, none of them is a trusted contact.
Alex has one contact and it is a trusted contact (Alice).
John doesn't have any contacts.



In [34]:
# input table
sql_query = """
WITH
Customers (customer_id, customer_name, email)
AS (VALUES
(2, 'Alice', 'alice@leetcode.com'),
(3, 'Bob', 'bob@leetcode.com'),
(13, 'John', 'john@leetcode.com'),
(6, 'Alex', 'alex@leetcode.com')),

Contacts (user_id, contact_name, contact_email)
AS (VALUES
(1, 'Bob', 'bob@leetcode.com'),
(1, 'John', 'john@leetcode.com'),
(1, 'Jal', 'jal@leetcode.com'),
(2, 'Omar', 'omar@leetcode.com'),
(2, 'Meir', 'meir@leetcode.com'),
(6, 'Alice', 'alice@leetcode.com')),

Invoices (invoice_id, price, user_id)
AS (VALUES
(77, 100, 1),
(88, 200, 1),
(99, 300, 2),
(66, 400, 2),
(55, 500, 13),
(44, 60, 6))

SELECT *
FROM Customers

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,customer_id,customer_name,email
0,2,Alice,alice@leetcode.com
1,3,Bob,bob@leetcode.com
2,13,John,john@leetcode.com
3,6,Alex,alex@leetcode.com


In [None]:
# input table
sql_query = """
WITH
Customers (customer_id, customer_name, email)
AS (VALUES
(2, 'Alice', 'alice@leetcode.com'),
(3, 'Bob', 'bob@leetcode.com'),
(13, 'John', 'john@leetcode.com'),
(6, 'Alex', 'alex@leetcode.com')),

Contacts (user_id, contact_name, contact_email)
AS (VALUES
(1, 'Bob', 'bob@leetcode.com'),
(1, 'John', 'john@leetcode.com'),
(1, 'Jal', 'jal@leetcode.com'),
(2, 'Omar', 'omar@leetcode.com'),
(2, 'Meir', 'meir@leetcode.com'),
(6, 'Alice', 'alice@leetcode.com')),

Invoices (invoice_id, price, user_id)
AS (VALUES
(77, 100, 1),
(88, 200, 1),
(99, 300, 2),
(66, 400, 2),
(55, 500, 13),
(44, 60, 6))

SELECT *
FROM Customers

"""
pd.read_sql_query(sql_query,con)





In [46]:
# first attempt
sql_query = """
WITH
Customers (customer_id, customer_name, email)
AS (VALUES
(2, 'Alice', 'alice@leetcode.com'),
(3, 'Bob', 'bob@leetcode.com'),
(13, 'John', 'john@leetcode.com'),
(6, 'Alex', 'alex@leetcode.com')),

Contacts (user_id, contact_name, contact_email)
AS (VALUES
(1, 'Bob', 'bob@leetcode.com'),
(1, 'John', 'john@leetcode.com'),
(1, 'Jal', 'jal@leetcode.com'),
(2, 'Omar', 'omar@leetcode.com'),
(2, 'Meir', 'meir@leetcode.com'),
(6, 'Alice', 'alice@leetcode.com')),

Invoices (invoice_id, price, user_id)
AS (VALUES
(77, 100, 1),
(88, 200, 1),
(99, 300, 2),
(66, 400, 2),
(55, 500, 13),
(44, 60, 6)),

---- query starts here

contacts_count AS
    (SELECT DISTINCT c1.customer_id,
           SUM(CASE WHEN con1.contact_name IS NOT NULL THEN 1
                    ELSE 0 END) AS contacts_cnt
    FROM Customers c1
    LEFT JOIN Contacts con1
    ON c1.customer_id=con1.user_id
    GROUP BY c1.customer_id),

customer_contacts_overlap AS
    (SELECT c1.customer_id,
           c1.customer_name
    FROM Customers c1
    WHERE c1.customer_id IN
          (SELECT DISTINCT user_id
          FROM Contacts)),

trusted_contacts AS
    (SELECT DISTINCT c1.customer_id,
           SUM(CASE WHEN con1.customer_name IS NOT NULL THEN 1
                    ELSE 0 END) AS trusted_contacts_cnt
    FROM Customers c1
    LEFT JOIN customer_contacts_overlap con2
    ON c1.customer_id=con2.customer_id
    GROUP BY c1.customer_id)

SELECT i.invoice_id,
       cu.customer_name,
       i.price,
       contacts_count.contacts_cnt AS contacts_cnt,
       trusted_contacts. trusted_contacts_cnt AS trusted_contacts_cnt
FROM Customers cu
LEFT JOIN Invoices i
ON cu.customer_id=i.user_id

"""
pd.read_sql_query(sql_query,con)


DatabaseError: Execution failed on sql '
WITH
Customers (customer_id, customer_name, email)
AS (VALUES
(2, 'Alice', 'alice@leetcode.com'),
(3, 'Bob', 'bob@leetcode.com'),
(13, 'John', 'john@leetcode.com'),
(6, 'Alex', 'alex@leetcode.com')),

Contacts (user_id, contact_name, contact_email)
AS (VALUES
(1, 'Bob', 'bob@leetcode.com'),
(1, 'John', 'john@leetcode.com'),
(1, 'Jal', 'jal@leetcode.com'),
(2, 'Omar', 'omar@leetcode.com'),
(2, 'Meir', 'meir@leetcode.com'),
(6, 'Alice', 'alice@leetcode.com')),

Invoices (invoice_id, price, user_id)
AS (VALUES
(77, 100, 1),
(88, 200, 1),
(99, 300, 2),
(66, 400, 2),
(55, 500, 13),
(44, 60, 6)),

---- query starts here

contacts_count AS
    (SELECT DISTINCT c1.customer_id,
           SUM(CASE WHEN con1.contact_name IS NOT NULL THEN 1
                    ELSE 0 END) AS contacts_cnt
    FROM Customers c1
    LEFT JOIN Contacts con1
    ON c1.customer_id=con1.user_id
    GROUP BY c1.customer_id),

customer_contacts_overlap AS
    (SELECT c1.customer_id,
           c1.customer_name
    FROM Customers c1
    WHERE c1.customer_id IN
          (SELECT DISTINCT user_id
          FROM Contacts)),

trusted_contacts AS
    (SELECT DISTINCT c1.customer_id,
           SUM(CASE WHEN con1.customer_name IS NOT NULL THEN 1
                    ELSE 0 END) AS trusted_contacts_cnt
    FROM Customers c1
    LEFT JOIN customer_contacts_overlap con2
    ON c1.customer_id=con2.customer_id
    GROUP BY c1.customer_id)

SELECT i.invoice_id,
       cu.customer_name,
       i.price,
       contacts_count.contacts_cnt AS contacts_cnt,
       trusted_contacts. trusted_contacts_cnt AS trusted_contacts_cnt
FROM Customers cu
LEFT JOIN Invoices i
ON cu.customer_id=i.user_id

': missing FROM-clause entry for table "con1"
LINE 49:            SUM(CASE WHEN con1.customer_name IS NOT NULL THEN...
                                  ^


In [39]:
# diagnosing
sql_query = """
WITH
Customers (customer_id, customer_name, email)
AS (VALUES
(2, 'Alice', 'alice@leetcode.com'),
(3, 'Bob', 'bob@leetcode.com'),
(13, 'John', 'john@leetcode.com'),
(6, 'Alex', 'alex@leetcode.com')),

Contacts (user_id, contact_name, contact_email)
AS (VALUES
(1, 'Bob', 'bob@leetcode.com'),
(1, 'John', 'john@leetcode.com'),
(1, 'Jal', 'jal@leetcode.com'),
(2, 'Omar', 'omar@leetcode.com'),
(2, 'Meir', 'meir@leetcode.com'),
(6, 'Alice', 'alice@leetcode.com')),

Invoices (invoice_id, price, user_id)
AS (VALUES
(77, 100, 1),
(88, 200, 1),
(99, 300, 2),
(66, 400, 2),
(55, 500, 13),
(44, 60, 6)),

---- query starts here

contacts_count AS
    (SELECT DISTINCT c1.customer_id,
            SUM(WHEN con1.customer_name NOT NULL THEN 1
                ELSE 0 END) AS contacts_cnt
    FROM Customers c1
    LEFT JOIN Contacts con1
    ON c1.customer_id=con1.user_id),

SELECT *
FROM contacts_count

"""
pd.read_sql_query(sql_query,con)





DatabaseError: Execution failed on sql '
WITH
Customers (customer_id, customer_name, email)
AS (VALUES
(2, 'Alice', 'alice@leetcode.com'),
(3, 'Bob', 'bob@leetcode.com'),
(13, 'John', 'john@leetcode.com'),
(6, 'Alex', 'alex@leetcode.com')),

Contacts (user_id, contact_name, contact_email)
AS (VALUES
(1, 'Bob', 'bob@leetcode.com'),
(1, 'John', 'john@leetcode.com'),
(1, 'Jal', 'jal@leetcode.com'),
(2, 'Omar', 'omar@leetcode.com'),
(2, 'Meir', 'meir@leetcode.com'),
(6, 'Alice', 'alice@leetcode.com')),

Invoices (invoice_id, price, user_id)
AS (VALUES
(77, 100, 1),
(88, 200, 1),
(99, 300, 2),
(66, 400, 2),
(55, 500, 13),
(44, 60, 6)),

---- query starts here

contacts_count AS
    (SELECT DISTINCT c1.customer_id,
           SUM(WHEN con1.customer_name NOT NULL THEN 1
               ELSE 0 END) AS contacts_cnt
    FROM Customers c1
    LEFT JOIN Contacts con1
    ON c1.customer_id=con1.user_id),

SELECT *
FROM contacts_count

': syntax error at or near "WHEN"
LINE 32:            SUM(WHEN con1.customer_name NOT NULL THEN 1
                        ^


In [54]:
# Mike's entry

# diagnosing
sql_query = """
WITH
Customers (customer_id, customer_name, email)
AS (VALUES
(2, 'Alice', 'alice@leetcode.com'),
(3, 'Bob', 'bob@leetcode.com'),
(13, 'John', 'john@leetcode.com'),
(6, 'Alex', 'alex@leetcode.com')),

Contacts (user_id, contact_name, contact_email)
AS (VALUES
(1, 'Bob', 'bob@leetcode.com'),
(1, 'John', 'john@leetcode.com'),
(1, 'Jal', 'jal@leetcode.com'),
(2, 'Omar', 'omar@leetcode.com'),
(2, 'Meir', 'meir@leetcode.com'),
(6, 'Alice', 'alice@leetcode.com')),

Invoices (invoice_id, price, user_id)
AS (VALUES
(77, 100, 1),
(88, 200, 1),
(99, 300, 2),
(66, 400, 2),
(55, 500, 13),
(44, 60, 6)),


t1 AS (
SELECT user_id, count(contact_name) AS contacts_cnt
FROM contacts
GROUP BY user_id ),

-- User_id | contacts_cnt


t2 AS (
SELECT user_id, count(contact_name) AS trusted_contacts_cnt
FROM contacts
WHERE user_id IN (SELECT customer_id FROM customers)
GROUP BY user_id )

-- User_id | trusted_contacts_cnt [excludes those not in customers table]

SELECT invoice_id, c.customer_name, price, 
COALESCE(t1.contacts_cnt, 0), 
COALESCE(t2.trusted_contacts_cnt, 0)
FROM invoices i
LEFT JOIN customers c
ON c.customer_id = i.user_id
LEFT JOIN t1
ON i.user_id = t1.user_id
LEFT JOIN t2
ON i.user_id = t2.user_id
ORDER BY i.invoice_id

"""
pd.read_sql_query(sql_query,con)




Unnamed: 0,invoice_id,customer_name,price,coalesce,coalesce.1
0,44,Alex,60,1,1
1,55,John,500,0,0
2,66,Alice,400,2,2
3,77,,100,3,0
4,88,,200,3,0
5,99,Alice,300,2,2


# 1532. The Most Recent Three Orders

https://leetcode.com/problems/the-most-recent-three-orders/


Table: Customers

+---------------+---------+
| Column Name   | Type    |
+---------------+---------+
| customer_id   | int     |
| name          | varchar |
+---------------+---------+
customer_id is the primary key for this table.
This table contains information about customers.
 

Table: Orders

+---------------+---------+
| Column Name   | Type    |
+---------------+---------+
| order_id      | int     |
| order_date    | date    |
| customer_id   | int     |
| cost          | int     |
+---------------+---------+
order_id is the primary key for this table.
This table contains information about the orders made customer_id.
Each customer has one order per day.
 

Write an SQL query to find the most recent 3 orders of each user. If a user ordered less than 3 orders return all of their orders.

Return the result table sorted by customer_name in ascending order and in case of a tie by the customer_id in ascending order. If there still a tie, order them by the order_date in descending order.

The query result format is in the following example:

Customers
+-------------+-----------+
| customer_id | name      |
+-------------+-----------+
| 1           | Winston   |
| 2           | Jonathan  |
| 3           | Annabelle |
| 4           | Marwan    |
| 5           | Khaled    |
+-------------+-----------+

Orders
+----------+------------+-------------+------+
| order_id | order_date | customer_id | cost |
+----------+------------+-------------+------+
| 1        | 2020-07-31 | 1           | 30   |
| 2        | 2020-07-30 | 2           | 40   |
| 3        | 2020-07-31 | 3           | 70   |
| 4        | 2020-07-29 | 4           | 100  |
| 5        | 2020-06-10 | 1           | 1010 |
| 6        | 2020-08-01 | 2           | 102  |
| 7        | 2020-08-01 | 3           | 111  |
| 8        | 2020-08-03 | 1           | 99   |
| 9        | 2020-08-07 | 2           | 32   |
| 10       | 2020-07-15 | 1           | 2    |
+----------+------------+-------------+------+

Result table:
+---------------+-------------+----------+------------+
| customer_name | customer_id | order_id | order_date |
+---------------+-------------+----------+------------+
| Annabelle     | 3           | 7        | 2020-08-01 |
| Annabelle     | 3           | 3        | 2020-07-31 |
| Jonathan      | 2           | 9        | 2020-08-07 |
| Jonathan      | 2           | 6        | 2020-08-01 |
| Jonathan      | 2           | 2        | 2020-07-30 |
| Marwan        | 4           | 4        | 2020-07-29 |
| Winston       | 1           | 8        | 2020-08-03 |
| Winston       | 1           | 1        | 2020-07-31 |
| Winston       | 1           | 10       | 2020-07-15 |
+---------------+-------------+----------+------------+
Winston has 4 orders, we discard the order of "2020-06-10" because it is the oldest order.
Annabelle has only 2 orders, we return them.
Jonathan has exactly 3 orders.
Marwan ordered only one time.
We sort the result table by customer_name in ascending order, by customer_id in ascending order and by order_date in descending order in case of a tie.

In [9]:
# input table
sql_query = """
WITH Customers (customer_id, name)
AS (VALUES
(1, 'Winston'),
(2, 'Jonathan'),
(3, 'Annabelle'),
(4, 'Marwan'),
(5, 'Khaled')),

Orders (order_id, order_date, customer_id, cost)
AS (VALUES
(1, CAST('2020-07-31' AS date), 1, 30),
(2, CAST('2020-07-30' AS date), 2, 40),
(3, CAST('2020-07-31' AS date), 3, 70),
(4, CAST('2020-07-29' AS date), 4, 100),
(5, CAST('2020-06-10' AS date), 1, 1010),
(6, CAST('2020-08-01' AS date), 2, 102),
(7, CAST('2020-08-01' AS date), 3, 111),
(8, CAST('2020-08-03' AS date), 1, 99),
(9, CAST('2020-08-07' AS date), 2, 32),
(10, CAST('2020-07-15' AS date), 1, 2))

SELECT *
FROM Customers

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,customer_id,name
0,1,Winston
1,2,Jonathan
2,3,Annabelle
3,4,Marwan
4,5,Khaled


In [16]:
# first attempt
sql_query = """
WITH Customers (customer_id, name)
AS (VALUES
(1, 'Winston'),
(2, 'Jonathan'),
(3, 'Annabelle'),
(4, 'Marwan'),
(5, 'Khaled')),

Orders (order_id, order_date, customer_id, cost)
AS (VALUES
(1, CAST('2020-07-31' AS date), 1, 30),
(2, CAST('2020-07-30' AS date), 2, 40),
(3, CAST('2020-07-31' AS date), 3, 70),
(4, CAST('2020-07-29' AS date), 4, 100),
(5, CAST('2020-06-10' AS date), 1, 1010),
(6, CAST('2020-08-01' AS date), 2, 102),
(7, CAST('2020-08-01' AS date), 3, 111),
(8, CAST('2020-08-03' AS date), 1, 99),
(9, CAST('2020-08-07' AS date), 2, 32),
(10, CAST('2020-07-15' AS date), 1, 2))

--strategy is to add a column containing count of orders for each customer_id
--using a window function
--then filter by seeing if that order is within the last 3

SELECT *,
       RANK() OVER(PARTITION BY customer_id ORDER BY order_date) AS order_val,
       COUNT(*) OVER(PARTITION BY customer_id) AS n_orders
FROM Orders

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,order_id,order_date,customer_id,cost,order_val,n_orders
0,5,2020-06-10,1,1010,1,4
1,10,2020-07-15,1,2,2,4
2,1,2020-07-31,1,30,3,4
3,8,2020-08-03,1,99,4,4
4,2,2020-07-30,2,40,1,3
5,6,2020-08-01,2,102,2,3
6,9,2020-08-07,2,32,3,3
7,3,2020-07-31,3,70,1,2
8,7,2020-08-01,3,111,2,2
9,4,2020-07-29,4,100,1,1


In [22]:
# finish query
sql_query = """
WITH Customers (customer_id, name)
AS (VALUES
(1, 'Winston'),
(2, 'Jonathan'),
(3, 'Annabelle'),
(4, 'Marwan'),
(5, 'Khaled')),

Orders (order_id, order_date, customer_id, cost)
AS (VALUES
(1, CAST('2020-07-31' AS date), 1, 30),
(2, CAST('2020-07-30' AS date), 2, 40),
(3, CAST('2020-07-31' AS date), 3, 70),
(4, CAST('2020-07-29' AS date), 4, 100),
(5, CAST('2020-06-10' AS date), 1, 1010),
(6, CAST('2020-08-01' AS date), 2, 102),
(7, CAST('2020-08-01' AS date), 3, 111),
(8, CAST('2020-08-03' AS date), 1, 99),
(9, CAST('2020-08-07' AS date), 2, 32),
(10, CAST('2020-07-15' AS date), 1, 2)),

--add a column containing count of orders for each customer_id using a window function
--then have a column of that customer's order number
--then filter by seeing if that order is within the last 3

t1 AS
(SELECT *,
       RANK() OVER(PARTITION BY customer_id ORDER BY order_date) AS order_val,
       COUNT(*) OVER(PARTITION BY customer_id) AS n_orders
FROM Orders),

t2 AS
(SELECT *
FROM t1
WHERE n_orders-order_val < 3)


--result table: customer_name, customer_id, order_id, order_date

SELECT c.name AS customer_name,
       c.customer_id,
       t2.order_id,
       t2.order_date
FROM t2
JOIN Customers c
ON t2.customer_id=c.customer_id
ORDER BY customer_name ASC, order_date DESC

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,customer_name,customer_id,order_id,order_date
0,Annabelle,3,7,2020-08-01
1,Annabelle,3,3,2020-07-31
2,Jonathan,2,9,2020-08-07
3,Jonathan,2,6,2020-08-01
4,Jonathan,2,2,2020-07-30
5,Marwan,4,4,2020-07-29
6,Winston,1,8,2020-08-03
7,Winston,1,1,2020-07-31
8,Winston,1,10,2020-07-15


## Iterate by trying without window function

In [25]:
# input table
sql_query = """
WITH Customers (customer_id, name)
AS (VALUES
(1, 'Winston'),
(2, 'Jonathan'),
(3, 'Annabelle'),
(4, 'Marwan'),
(5, 'Khaled')),

Orders (order_id, order_date, customer_id, cost)
AS (VALUES
(1, CAST('2020-07-31' AS date), 1, 30),
(2, CAST('2020-07-30' AS date), 2, 40),
(3, CAST('2020-07-31' AS date), 3, 70),
(4, CAST('2020-07-29' AS date), 4, 100),
(5, CAST('2020-06-10' AS date), 1, 1010),
(6, CAST('2020-08-01' AS date), 2, 102),
(7, CAST('2020-08-01' AS date), 3, 111),
(8, CAST('2020-08-03' AS date), 1, 99),
(9, CAST('2020-08-07' AS date), 2, 32),
(10, CAST('2020-07-15' AS date), 1, 2))

SELECT *
FROM Orders
ORDER BY customer_id, order_date

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,order_id,order_date,customer_id,cost
0,5,2020-06-10,1,1010
1,10,2020-07-15,1,2
2,1,2020-07-31,1,30
3,8,2020-08-03,1,99
4,2,2020-07-30,2,40
5,6,2020-08-01,2,102
6,9,2020-08-07,2,32
7,3,2020-07-31,3,70
8,7,2020-08-01,3,111
9,4,2020-07-29,4,100


In [44]:
# try rank for each customer
sql_query = """
WITH Customers (customer_id, name)
AS (VALUES
(1, 'Winston'),
(2, 'Jonathan'),
(3, 'Annabelle'),
(4, 'Marwan'),
(5, 'Khaled')),

Orders (order_id, order_date, customer_id, cost)
AS (VALUES
(1, CAST('2020-07-31' AS date), 1, 30),
(2, CAST('2020-07-30' AS date), 2, 40),
(3, CAST('2020-07-31' AS date), 3, 70),
(4, CAST('2020-07-29' AS date), 4, 100),
(5, CAST('2020-06-10' AS date), 1, 1010),
(6, CAST('2020-08-01' AS date), 2, 102),
(7, CAST('2020-08-01' AS date), 3, 111),
(8, CAST('2020-08-03' AS date), 1, 99),
(9, CAST('2020-08-07' AS date), 2, 32),
(10, CAST('2020-07-15' AS date), 1, 2)),

t AS
(SELECT *
FROM Orders
ORDER BY customer_id, order_date),

t2 AS
(SELECT *,
      (SELECT COUNT(*) FROM t
       WHERE t.order_date < t1.order_date
       AND t.customer_id=t1.customer_id) AS order_val,
       (SELECT COUNT(*) FROM t
       WHERE t.customer_id=t1.customer_id) AS n_orders
FROM t AS T1),

t3 AS
    (SELECT *
    FROM t2
    WHERE n_orders-order_val <= 3)


SELECT *
FROM t3
JOIN Customers c
ON t3.customer_id=c.customer_id

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,order_id,order_date,customer_id,cost,order_val,n_orders,customer_id.1,name
0,10,2020-07-15,1,2,1,4,1,Winston
1,1,2020-07-31,1,30,2,4,1,Winston
2,8,2020-08-03,1,99,3,4,1,Winston
3,2,2020-07-30,2,40,0,3,2,Jonathan
4,6,2020-08-01,2,102,1,3,2,Jonathan
5,9,2020-08-07,2,32,2,3,2,Jonathan
6,3,2020-07-31,3,70,0,2,3,Annabelle
7,7,2020-08-01,3,111,1,2,3,Annabelle
8,4,2020-07-29,4,100,0,1,4,Marwan


In [45]:
# final query
sql_query = """
WITH Customers (customer_id, name)
AS (VALUES
(1, 'Winston'),
(2, 'Jonathan'),
(3, 'Annabelle'),
(4, 'Marwan'),
(5, 'Khaled')),

Orders (order_id, order_date, customer_id, cost)
AS (VALUES
(1, CAST('2020-07-31' AS date), 1, 30),
(2, CAST('2020-07-30' AS date), 2, 40),
(3, CAST('2020-07-31' AS date), 3, 70),
(4, CAST('2020-07-29' AS date), 4, 100),
(5, CAST('2020-06-10' AS date), 1, 1010),
(6, CAST('2020-08-01' AS date), 2, 102),
(7, CAST('2020-08-01' AS date), 3, 111),
(8, CAST('2020-08-03' AS date), 1, 99),
(9, CAST('2020-08-07' AS date), 2, 32),
(10, CAST('2020-07-15' AS date), 1, 2)),

t AS
(SELECT *
FROM Orders
ORDER BY customer_id, order_date),

t2 AS
(SELECT *,
      (SELECT COUNT(*) FROM t
       WHERE t.order_date < t1.order_date
       AND t.customer_id=t1.customer_id) AS order_val,
       (SELECT COUNT(*) FROM t
       WHERE t.customer_id=t1.customer_id) AS n_orders
FROM t AS T1),

t3 AS
    (SELECT *
    FROM t2
    WHERE n_orders-order_val <= 3)


SELECT c.name AS customer_name,
       c.customer_id,
       t3.order_id,
       t3.order_date
FROM t3
JOIN Customers c
ON t3.customer_id=c.customer_id
ORDER BY customer_name ASC, order_date DESC

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,customer_name,customer_id,order_id,order_date
0,Annabelle,3,7,2020-08-01
1,Annabelle,3,3,2020-07-31
2,Jonathan,2,9,2020-08-07
3,Jonathan,2,6,2020-08-01
4,Jonathan,2,2,2020-07-30
5,Marwan,4,4,2020-07-29
6,Winston,1,8,2020-08-03
7,Winston,1,1,2020-07-31
8,Winston,1,10,2020-07-15


## For SQL practice

In [47]:
# For practice - Minting 1
sql_query = """
WITH Customers (customer_id, name)
AS (VALUES
(1, 'Winston'),
(2, 'Jonathan'),
(3, 'Annabelle'),
(4, 'Marwan'),
(5, 'Khaled')),

Orders (order_id, order_date, customer_id, cost)
AS (VALUES
(1, CAST('2020-07-31' AS date), 1, 30),
(2, CAST('2020-07-30' AS date), 2, 40),
(3, CAST('2020-07-31' AS date), 3, 70),
(4, CAST('2020-07-29' AS date), 4, 100),
(5, CAST('2020-06-10' AS date), 1, 1010),
(6, CAST('2020-08-01' AS date), 2, 102),
(7, CAST('2020-08-01' AS date), 3, 111),
(8, CAST('2020-08-03' AS date), 1, 99),
(9, CAST('2020-08-07' AS date), 2, 32),
(10, CAST('2020-07-15' AS date), 1, 2))

SELECT c.name AS customer_name, c.customer_id, o.order_id, o.order_date
FROM customers c
JOIN orders o
ON c.customer_id = o.customer_id
WHERE row_number() over (order by o.order_date DESC partition by c.customer_id) AS date_rank<=3
ORDER BY 1, 2,4 DESC


"""
pd.read_sql_query(sql_query,con)


DatabaseError: Execution failed on sql '
WITH Customers (customer_id, name)
AS (VALUES
(1, 'Winston'),
(2, 'Jonathan'),
(3, 'Annabelle'),
(4, 'Marwan'),
(5, 'Khaled')),

Orders (order_id, order_date, customer_id, cost)
AS (VALUES
(1, CAST('2020-07-31' AS date), 1, 30),
(2, CAST('2020-07-30' AS date), 2, 40),
(3, CAST('2020-07-31' AS date), 3, 70),
(4, CAST('2020-07-29' AS date), 4, 100),
(5, CAST('2020-06-10' AS date), 1, 1010),
(6, CAST('2020-08-01' AS date), 2, 102),
(7, CAST('2020-08-01' AS date), 3, 111),
(8, CAST('2020-08-03' AS date), 1, 99),
(9, CAST('2020-08-07' AS date), 2, 32),
(10, CAST('2020-07-15' AS date), 1, 2))

SELECT c.name AS customer_name, c.customer_id, o.order_id, o.order_date
FROM customers c
JOIN orders o
ON c.customer_id = o.customer_id
WHERE row_number() over (order by o.order_date DESC partition by c.customer_id) AS date_rank<=3
ORDER BY 1, 2,4 DESC


': syntax error at or near "partition"
LINE 27: ...ERE row_number() over (order by o.order_date DESC partition ...
                                                              ^


In [48]:
# For practice - Minting 2
sql_query = """
WITH Customers (customer_id, name)
AS (VALUES
(1, 'Winston'),
(2, 'Jonathan'),
(3, 'Annabelle'),
(4, 'Marwan'),
(5, 'Khaled')),

Orders (order_id, order_date, customer_id, cost)
AS (VALUES
(1, CAST('2020-07-31' AS date), 1, 30),
(2, CAST('2020-07-30' AS date), 2, 40),
(3, CAST('2020-07-31' AS date), 3, 70),
(4, CAST('2020-07-29' AS date), 4, 100),
(5, CAST('2020-06-10' AS date), 1, 1010),
(6, CAST('2020-08-01' AS date), 2, 102),
(7, CAST('2020-08-01' AS date), 3, 111),
(8, CAST('2020-08-03' AS date), 1, 99),
(9, CAST('2020-08-07' AS date), 2, 32),
(10, CAST('2020-07-15' AS date), 1, 2))

SELECT c.name AS customer_name, c.customer_id, o.order_id, o.order_date
FROM customers c
JOIN orders o
ON c.customer_id = o.customer_id
WHERE row_number() over (order by o.order_date DESC partition by c.customer_id) <=3
ORDER BY 1, 2,4 DESC


"""
pd.read_sql_query(sql_query,con)


DatabaseError: Execution failed on sql '
WITH Customers (customer_id, name)
AS (VALUES
(1, 'Winston'),
(2, 'Jonathan'),
(3, 'Annabelle'),
(4, 'Marwan'),
(5, 'Khaled')),

Orders (order_id, order_date, customer_id, cost)
AS (VALUES
(1, CAST('2020-07-31' AS date), 1, 30),
(2, CAST('2020-07-30' AS date), 2, 40),
(3, CAST('2020-07-31' AS date), 3, 70),
(4, CAST('2020-07-29' AS date), 4, 100),
(5, CAST('2020-06-10' AS date), 1, 1010),
(6, CAST('2020-08-01' AS date), 2, 102),
(7, CAST('2020-08-01' AS date), 3, 111),
(8, CAST('2020-08-03' AS date), 1, 99),
(9, CAST('2020-08-07' AS date), 2, 32),
(10, CAST('2020-07-15' AS date), 1, 2))

SELECT c.name AS customer_name, c.customer_id, o.order_id, o.order_date
FROM customers c
JOIN orders o
ON c.customer_id = o.customer_id
WHERE row_number() over (order by o.order_date DESC partition by c.customer_id) <=3
ORDER BY 1, 2,4 DESC


': syntax error at or near "partition"
LINE 27: ...ERE row_number() over (order by o.order_date DESC partition ...
                                                              ^


In [53]:
# For practice - Steve
sql_query = """
WITH Customers (customer_id, name)
AS (VALUES
(1, 'Winston'),
(2, 'Jonathan'),
(3, 'Annabelle'),
(4, 'Marwan'),
(5, 'Khaled')),

Orders (order_id, order_date, customer_id, cost)
AS (VALUES
(1, CAST('2020-07-31' AS date), 1, 30),
(2, CAST('2020-07-30' AS date), 2, 40),
(3, CAST('2020-07-31' AS date), 3, 70),
(4, CAST('2020-07-29' AS date), 4, 100),
(5, CAST('2020-06-10' AS date), 1, 1010),
(6, CAST('2020-08-01' AS date), 2, 102),
(7, CAST('2020-08-01' AS date), 3, 111),
(8, CAST('2020-08-03' AS date), 1, 99),
(9, CAST('2020-08-07' AS date), 2, 32),
(10, CAST('2020-07-15' AS date), 1, 2))

SELECT c.name, c.customer_id,  t.order_id, t.order_date
FROM customers AS c
JOIN 
 (SELECT order_date, customer_id, order_id, 
  RANK() OVER(PARTITION BY customer_id ORDER BY order_date DESC) AS rnk
  FROM orders) AS t
ON t.customer_id = c.customer_id
WHERE t.rnk <= 3
ORDER BY c.name, t.order_date DESC

--no comma in the OVER() statement
--don't forget to order by name and order date

"""
pd.read_sql_query(sql_query,con)


Unnamed: 0,name,customer_id,order_id,order_date
0,Annabelle,3,7,2020-08-01
1,Annabelle,3,3,2020-07-31
2,Jonathan,2,9,2020-08-07
3,Jonathan,2,6,2020-08-01
4,Jonathan,2,2,2020-07-30
5,Marwan,4,4,2020-07-29
6,Winston,1,8,2020-08-03
7,Winston,1,1,2020-07-31
8,Winston,1,10,2020-07-15


# 7/9/20

You have a database consisting of a single table table of 9 students listed in alphabetical order. Each student has a class ID that is also alphabetical. Write a SQL query so that each consecutive pair of students are assigned the other students ID (Ex: Students 1 and 2 swap IDs, Students 3 and 4 swap IDs, etc…) 
	Table is Student, Columns are Name and ID


In [5]:
# Just use a date difference less than 30 days

sql_query = """
WITH Student (id, name)
AS (VALUES
(1, 'A'),
(2, 'B'),
(3, 'C'),
(4, 'D'),
(5, 'E'),
(6, 'F'),
(7, 'G'),
(8, 'H'),
(9, 'i'))

SELECT *
FROM Student
"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,id,name
0,1,A
1,2,B
2,3,C
3,4,D
4,5,E
5,6,F
6,7,G
7,8,H
8,9,i


SELECT name,
       CASE WHEN id IN (1,3,5,7) THEN id+1
            WHEN id IN (2,4,6,8) THEN id-1
            ELSE id END AS new_id 
FROM Student

In [7]:
# Just use a date difference less than 30 days

sql_query = """
WITH Student (id, name)
AS (VALUES
(1, 'A'),
(2, 'B'),
(3, 'C'),
(4, 'D'),
(5, 'E'),
(6, 'F'),
(7, 'G'),
(8, 'H'),
(9, 'i'))

SELECT name,
       CASE WHEN id IN (1,3,5,7) THEN id+1
            WHEN id IN (2,4,6,8) THEN id-1
            ELSE id END AS new_id 
FROM Student
"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,name,new_id
0,A,2
1,B,1
2,C,4
3,D,3
4,E,6
5,F,5
6,G,8
7,H,7
8,i,9


SELECT name,
       CASE WHEN id IN (1,3,5,7) THEN id+1
            WHEN id IN (2,4,6,8) THEN id-1
            ELSE id END AS new_id 
FROM Student

SELECT name,
       CASE WHEN id < 9 AND id % 2 <> 0 THEN id+1
            WHEN id < 9 AND id % 2 = 0 THEN id-1
            ELSE id END AS new_id 
FROM Student

In [8]:
# Just use a date difference less than 30 days

sql_query = """
WITH Student (id, name)
AS (VALUES
(1, 'A'),
(2, 'B'),
(3, 'C'),
(4, 'D'),
(5, 'E'),
(6, 'F'),
(7, 'G'),
(8, 'H'),
(9, 'i'))

SELECT name,
       CASE WHEN id < 9 AND id % 2 <> 0 THEN id+1
            WHEN id < 9 AND id % 2 = 0 THEN id-1
            ELSE id END AS new_id 
FROM Student
"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,name,new_id
0,A,2
1,B,1
2,C,4
3,D,3
4,E,6
5,F,5
6,G,8
7,H,7
8,i,9


In [None]:
# Mike's solution 

t1 AS (
SELECT *, row_number() OVER (ORDER BY name) AS row_number
FROM students )

SELECT name, CASE WHEN row_number = max(row_number) THEN id 
ELSE WHEN mod(row_number) = 0 THEN lag(id, 1) 
ELSE lead(id, 1) END AS new_id
FROM t1


# 7/16/20


1-Write a query that counts how many orders have a total quantity of 500 and how many have a total equal or less than 500

SELECT COUNT(CASE WHEN total = 500 THEN 1 END ) AS n_orders_500,
       COUNT(CASE WHEN total <= 500 THEN 1 END ) AS n_orders_500orbelow
FROM orders


2-We would like to understand 3 different levels of customers based on the amount associated with their purchases. The top level includes anyone with a Lifetime Value (total sales of all orders) greater than 200,000 usd. The second level is between 200,000 and 100,000 usd. The lowest level is anyone under 100,000 usd. Provide a table that includes the level associated with each account. You should provide the account name, the total sales of all orders for the customer, and the level. Order with the top spending customers listed first.

In [None]:

-- account_name, total_sales_, level



WITH t AS
(SELECT a.name,
       SUM(o.total_amt_usd) AS total_sales
FROM accounts a
LEFT JOIN orders o
ON a.id=o.id
GROUP BY a.name)

SELECT name, 
       total_sales,
       CASE (WHEN total_sales > 200000  THEN 'high'
             WHEN total_sales < 100000  THEN 'low'
             ELSE 'mid' END) AS level
FROM t
        



# QotD 7/27/20

You work at Facebook. Your manager wants you to find out if users are more likely to leave facebook if their parents join. So what do you do?


AB testing is out.
Retention rate - Control for everything else besides parents joining
identifying that someone is the parents of another user
Active users a given # of days after their parents join
30 day retention?
Active user?
Liked, posted, commented on a post
Parent activity? Type of activity


USERS
user_id, name, DOB, gender, Looking_for, Mother, Father, Brother, Sister, Cousin, Significant Other

FRIEND_NETWORK
TO (USER ID), FROM (USER ID), RELATIONSHIP (MOTHER, FATHER, BROTHER, SISTER, COUSIN, SIGNIFICANT OTHER)

ACTIONS
user_id, date, action (sign up, login, deactivate, like, posts)


In [None]:

--user_id, pct_activity_from_other, activity_from_parents, activity_to_other, activity_to_parents, decativate (0 / 1)



WITH t1 AS
(SELECT user_id,
        AVG(CASE WHEN a.actions IN ('like', 'posts') THEN 1
            ELSE 0 END) AS pct_activity_from_other
FROM users u
LEFT JOIN friend_network f
ON f.to_user_id=u.user_id
JOIN actions a
ON f.from_user_id=a.user_id
WHERE relationship NOT IN ('mother', 'father')),

 t2 AS
(SELECT user_id,
        AVG(CASE WHEN a.actions IN ('like', 'posts') THEN 1
            ELSE 0 END) AS pct_activity_to_other
FROM users u
LEFT JOIN friend_network f
ON f.from_user_id=u.user_id
JOIN actions a
ON f.to_user_id=a.user_id
WHERE relationship NOT IN ('mother', 'father')),

t3 AS
(SELECT user_id,
        AVG(CASE WHEN a.actions IN ('like', 'posts') THEN 1
            ELSE 0 END) AS pct_activity_from_other
FROM users u
LEFT JOIN friend_network f
ON f.to_user_id=u.user_id
JOIN actions a
ON f.from_user_id=a.user_id
WHERE relationship IN ('mother', 'father')),

 t4 AS
(SELECT user_id,
        AVG(CASE WHEN a.actions IN ('like', 'posts') THEN 1
            ELSE 0 END) AS pct_activity_to_other
FROM users u
LEFT JOIN friend_network f
ON f.from_user_id=u.user_id
JOIN actions a
ON f.to_user_id=a.user_id
WHERE relationship IN ('mother', 'father'))




# QotD 7/29/20

You’re given the following table displaying weather stations across US states. 

A. Return the average latitude of weather stations from each state, rounding to nearest tenth of a degree.


In [7]:
# initial query
sql_query = """

WITH stations (id, city, state, latitude, longitude)
AS (VALUES
(1, 'Asheville', 'North Carolina', 35.6, 82.6),
(2, 'Burlington', 'North Carolina', 36.1, 79.4),
(3, 'Chapel Hill', 'North Carolina', 35.9, 79.1),
(4, 'Davidson', 'North Carolina', 35.5, 80.8),
(5, 'Elizabeth City', 'North Carolina', 36.3, 76.3),
(6, 'Fargo', 'North Dakota', 46.9, 96.8),
(7, 'Grand Forks', 'North Dakota', 47.9, 97.0),
(8, 'Hettinger', 'North Dakota', 46.0, 102.6),
(9, 'Inkster', 'North Dakota', 48.2, 97.6))

SELECT *
FROM stations

"""
pd.read_sql_query(sql_query,con)

Unnamed: 0,id,city,state,latitude,longitude
0,1,Asheville,North Carolina,35.6,82.6
1,2,Burlington,North Carolina,36.1,79.4
2,3,Chapel Hill,North Carolina,35.9,79.1
3,4,Davidson,North Carolina,35.5,80.8
4,5,Elizabeth City,North Carolina,36.3,76.3
5,6,Fargo,North Dakota,46.9,96.8
6,7,Grand Forks,North Dakota,47.9,97.0
7,8,Hettinger,North Dakota,46.0,102.6
8,9,Inkster,North Dakota,48.2,97.6


In [9]:
# Initial query
sql_query = """

WITH stations (id, city, state, latitude, longitude)
AS (VALUES
(1, 'Asheville', 'North Carolina', 35.6, 82.6),
(2, 'Burlington', 'North Carolina', 36.1, 79.4),
(3, 'Chapel Hill', 'North Carolina', 35.9, 79.1),
(4, 'Davidson', 'North Carolina', 35.5, 80.8),
(5, 'Elizabeth City', 'North Carolina', 36.3, 76.3),
(6, 'Fargo', 'North Dakota', 46.9, 96.8),
(7, 'Grand Forks', 'North Dakota', 47.9, 97.0),
(8, 'Hettinger', 'North Dakota', 46.0, 102.6),
(9, 'Inkster', 'North Dakota', 48.2, 97.6))

SELECT state,
       ROUND(AVG(latitude), 1) AS avg_latitude
FROM stations
GROUP BY state

"""
pd.read_sql_query(sql_query,con)

Unnamed: 0,state,avg_latitude
0,North Dakota,47.3
1,North Carolina,35.9


B. Return the median latitude of weather stations from each state, rounding to nearest tenth of a degree.

In [18]:
# Median function

# order by and do a row_number


sql_query = """

WITH stations (id, city, state, latitude, longitude)
AS (VALUES
(1, 'Asheville', 'North Carolina', 35.6, 82.6),
(2, 'Burlington', 'North Carolina', 36.1, 79.4),
(3, 'Chapel Hill', 'North Carolina', 35.9, 79.1),
(4, 'Davidson', 'North Carolina', 35.5, 80.8),
(5, 'Elizabeth City', 'North Carolina', 36.3, 76.3),
(6, 'Fargo', 'North Dakota', 46.9, 96.8),
(7, 'Grand Forks', 'North Dakota', 47.9, 97.0),
(8, 'Hettinger', 'North Dakota', 46.0, 102.6),
(9, 'Inkster', 'North Dakota', 48.2, 97.6))

SELECT state,
       latitude,
       ROW_NUMBER() OVER(PARTITION BY state ORDER BY latitude) AS latitude_rank,
       COUNT(*) OVER(PARTITION BY state) AS n_stations
FROM stations

"""
pd.read_sql_query(sql_query,con)

Unnamed: 0,state,latitude,latitude_rank,n_stations
0,North Carolina,35.5,1,5
1,North Carolina,35.6,2,5
2,North Carolina,35.9,3,5
3,North Carolina,36.1,4,5
4,North Carolina,36.3,5,5
5,North Dakota,46.0,1,4
6,North Dakota,46.9,2,4
7,North Dakota,47.9,3,4
8,North Dakota,48.2,4,4


In [35]:
# Median function
# order by and do a row_number

sql_query = """

WITH stations (id, city, state, latitude, longitude)
AS (VALUES
(1, 'Asheville', 'North Carolina', 35.6, 82.6),
(2, 'Burlington', 'North Carolina', 36.1, 79.4),
(3, 'Chapel Hill', 'North Carolina', 35.9, 79.1),
(4, 'Davidson', 'North Carolina', 35.5, 80.8),
(5, 'Elizabeth City', 'North Carolina', 36.3, 76.3),
(6, 'Fargo', 'North Dakota', 46.9, 96.8),
(7, 'Grand Forks', 'North Dakota', 47.9, 97.0),
(8, 'Hettinger', 'North Dakota', 46.0, 102.6),
(9, 'Inkster', 'North Dakota', 48.2, 97.6)),

t AS
(SELECT state,
       latitude,
       ROW_NUMBER() OVER(PARTITION BY state ORDER BY latitude) AS latitude_rank,
       COUNT(*) OVER(PARTITION BY state) AS n_stations
FROM stations)

SELECT state,
       latitude
FROM t
WHERE CASE WHEN n_stations % 2!=0 THEN latitude_rank=(ROUND(n_stations/2)+1)
           ELSE latitude_rank=(ROUND(n_stations/2)) OR latitude_rank=(ROUND(n_stations/2)+1))
           END

"""
pd.read_sql_query(sql_query,con)

DatabaseError: Execution failed on sql '

WITH stations (id, city, state, latitude, longitude)
AS (VALUES
(1, 'Asheville', 'North Carolina', 35.6, 82.6),
(2, 'Burlington', 'North Carolina', 36.1, 79.4),
(3, 'Chapel Hill', 'North Carolina', 35.9, 79.1),
(4, 'Davidson', 'North Carolina', 35.5, 80.8),
(5, 'Elizabeth City', 'North Carolina', 36.3, 76.3),
(6, 'Fargo', 'North Dakota', 46.9, 96.8),
(7, 'Grand Forks', 'North Dakota', 47.9, 97.0),
(8, 'Hettinger', 'North Dakota', 46.0, 102.6),
(9, 'Inkster', 'North Dakota', 48.2, 97.6)),

t AS
(SELECT state,
       latitude,
       ROW_NUMBER() OVER(PARTITION BY state ORDER BY latitude) AS latitude_rank,
       COUNT(*) OVER(PARTITION BY state) AS n_stations
FROM stations)

SELECT state,
       latitude
FROM t
WHERE CASE WHEN n_stations % 2!=0 THEN latitude_rank=(ROUND(n_stations/2)+1)
           ELSE latitude_rank=(ROUND(n_stations/2)) OR latitude_rank=(ROUND(n_stations/2)+1))
           END

': syntax error at or near ")"
LINE 26: ...UND(n_stations/2)) OR latitude_rank=(ROUND(n_stations/2)+1))
                                                                       ^


In [38]:
# Mike's median

sql_query = """

WITH stations (id, city, state, latitude, longitude)
AS (VALUES
(1, 'Asheville', 'North Carolina', 35.6, 82.6),
(2, 'Burlington', 'North Carolina', 36.1, 79.4),
(3, 'Chapel Hill', 'North Carolina', 35.9, 79.1),
(4, 'Davidson', 'North Carolina', 35.5, 80.8),
(5, 'Elizabeth City', 'North Carolina', 36.3, 76.3),
(6, 'Fargo', 'North Dakota', 46.9, 96.8),
(7, 'Grand Forks', 'North Dakota', 47.9, 97.0),
(8, 'Hettinger', 'North Dakota', 46.0, 102.6),
(9, 'Inkster', 'North Dakota', 48.2, 97.6)),

t1 AS (
SELECT *, row_number() OVER (PARTITION by state ORDER BY latitude asc) AS row_number_state,
        	count(*) OVER (PARTITION by state) AS row_count
FROM stations)

SELECT state, round(avg(latitude), 1) AS median_latitude
FROM t1
WHERE row_number_state >= 1.0*row_count/2
AND row_number_state <= 1.0*row_count/2 + 1
GROUP BY state
"""
pd.read_sql_query(sql_query,con)

Unnamed: 0,state,median_latitude
0,North Carolina,35.9
1,North Dakota,47.4


# HR The Blunder

[HR](https://www.hackerrank.com/challenges/the-blunder/problem)

In [5]:
# check input table

sql_query = """
WITH employees (id, name, salary)
AS (VALUES
(1, 'Kristeen', 1420),
(2, 'Ashley', 2006),
(3, 'Julia', 2210),
(4, 'Maria', 300))

SELECT *
FROM employees
"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,id,name,salary
0,1,Kristeen,1420
1,2,Ashley,2006
2,3,Julia,2210
3,4,Maria,300


In [11]:
# check input table

sql_query = """
WITH employees (id, name, salary)
AS (VALUES
(1, 'Kristeen', 1420),
(2, 'Ashley', 2006),
(3, 'Julia', 2210),
(4, 'Maria', 300))

SELECT CAST(salary AS varchar)
FROM employees
"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,salary
0,1420
1,2006
2,2210
3,300


In [30]:
# check input table

sql_query = """
WITH employees (id, name, salary)
AS (VALUES
(1, 'Kristeen', 1420),
(2, 'Ashley', 2006),
(3, 'Julia', 2210),
(4, 'Maria', 3000)),

t1 AS
(SELECT *,
       CAST(salary AS varchar) AS salary_string
FROM employees),

t2 AS
(SELECT REPLACE(salary_string, '0', '') AS sal_r
FROM t1)

SELECT (SELECT AVG(salary)
        FROM employees) - 
AVG(CAST(sal_r AS int)) 
FROM t2

"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,?column?
0,2061.0


In [33]:
# check input table

sql_query = """
WITH employees (id, name, salary)
AS (VALUES
(1, 'Kristeen', 1420),
(2, 'Ashley', 2006),
(3, 'Julia', 2210),
(4, 'Maria', 3000)),

t1 AS
(SELECT *,
       CAST(salary AS varchar) AS salary_string
FROM employees),

t2 AS
(SELECT REPLACE(salary_string, '0', '') AS sal_r
FROM t1)

SELECT AVG(salary)
        FROM employees

"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,avg
0,2159.0


**REPLACE in postgreSQL seems to work on strings specifically while solutions in HR seem to work on float/ints in mySQL**

# HR Top Earners

[HR](https://www.hackerrank.com/challenges/earnings-of-employees/problem)

In [34]:
# check input table

sql_query = """
WITH employees (employee_id, name, months, salary)
AS (VALUES
(1, 'Rose', 15, 1968),
(2, 'Angela', 1, 3443),
(3, 'Frank', 17, 1608),
(4, 'Patrick', 7,1345),
(5, 'Lisa', 11, 2330),
(6, 'Kimberly', 16, 4372),
(7, 'Bonnie', 8, 1771),
(8, 'Michael', 6, 2017),
(9, 'Todd', 5, 3396),
(10, 'Joe', 9, 3573))

SELECT *
FROM employees
"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,employee_id,name,months,salary
0,1,Rose,15,1968
1,2,Angela,1,3443
2,3,Frank,17,1608
3,4,Patrick,7,1345
4,5,Lisa,11,2330
5,6,Kimberly,16,4372
6,7,Bonnie,8,1771
7,8,Michael,6,2017
8,9,Todd,5,3396
9,10,Joe,9,3573


In [50]:
# check input table

sql_query = """
WITH employees (employee_id, name, months, salary)
AS (VALUES
(1, 'Rose', 15, 1968),
(2, 'Angela', 1, 3443),
(3, 'Frank', 17, 1608),
(4, 'Patrick', 7,1345),
(5, 'Lisa', 11, 2330),
(6, 'Kimberly', 16, 4372),
(7, 'Bonnie', 8, 1771),
(8, 'Michael', 6, 2017),
(9, 'Todd', 5, 3396),
(10, 'Joe', 9, 3573))

SELECT *,
       months*salary AS total_earnings
FROM employees
"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,employee_id,name,months,salary,total_earnings
0,1,Rose,15,1968,29520
1,2,Angela,1,3443,3443
2,3,Frank,17,1608,27336
3,4,Patrick,7,1345,9415
4,5,Lisa,11,2330,25630
5,6,Kimberly,16,4372,69952
6,7,Bonnie,8,1771,14168
7,8,Michael,6,2017,12102
8,9,Todd,5,3396,16980
9,10,Joe,9,3573,32157


In [43]:
# check input table

sql_query = """
WITH employees (employee_id, name, months, salary)
AS (VALUES
(1, 'Rose', 15, 1968),
(2, 'Angela', 1, 3443),
(3, 'Frank', 17, 1608),
(4, 'Patrick', 7,1345),
(5, 'Lisa', 11, 2330),
(6, 'Kimberly', 16, 4372),
(7, 'Bonnie', 8, 1771),
(8, 'Michael', 6, 2017),
(9, 'Todd', 5, 3396),
(10, 'Joe', 9, 3573)),

--strategy: get total_earnings for each, determine max, COUNT the number that has max

t1 AS
(SELECT months*salary AS total_earnings
FROM employees),

t2 AS
(SELECT(MAX(total_earnings)) AS max_total_earnings
FROM t1)

SELECT max_total_earnings,
       (SELECT COUNT(*)
       FROM t1
       WHERE total_earnings = (SELECT max_total_earnings FROM t2))
FROM t2


"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,max_total_earnings,count
0,69952,1


## Alternate solution

In [49]:
# check input table

sql_query = """
WITH employees (employee_id, name, months, salary)
AS (VALUES
(1, 'Rose', 15, 1968),
(2, 'Angela', 1, 3443),
(3, 'Frank', 17, 1608),
(4, 'Patrick', 7,1345),
(5, 'Lisa', 11, 2330),
(6, 'Kimberly', 16, 4372),
(7, 'Bonnie', 8, 1771),
(8, 'Michael', 6, 2017),
(9, 'Todd', 5, 3396),
(10, 'Joe', 9, 3573),
(11, 'Rose2', 15, 1968))

--strategy: get total_earnings for each, determine max, COUNT the number that has max


SELECT months*salary AS total_earnings,
       COUNT(*)
FROM employees
GROUP BY total_earnings
ORDER BY total_earnings DESC


"""
df_query = pd.read_sql_query(sql_query,con)    
df_query

Unnamed: 0,total_earnings,count
0,69952,1
1,32157,1
2,29520,2
3,27336,1
4,25630,1
5,16980,1
6,14168,1
7,12102,1
8,9415,1
9,3443,1


# --

# --