In [3]:
import sqlite3
import pandas as pd
# Open the SQLite database through sqlite3.connect(), the documented way to
# create a Connection object, rather than instantiating the class directly.
# NOTE: SQLite creates an empty database when the file does not exist, so a
# wrong path shows up later as "no such table" instead of failing here.
conn = sqlite3.connect('data (3).sqlite')

In [4]:
# Count the number of customer records in each country by combining GROUP BY
# with the COUNT aggregate function.
# Another way of writing the same query; the result is exactly the same.
# COUNT(*) counts every row in the group, whether or not its columns are NULL.
# COUNT(1) is an equivalent spelling that yields the same number, whereas
# something like COUNT(customerNumber) counts only the rows in which that
# particular column is non-NULL.

q = """
SELECT country, COUNT(*)
FROM customers
GROUP BY country
;
"""
# Only the first 10 countries are shown, to keep the output readable.
pd.read_sql(sql=q, con=conn).head(n=10)

Unnamed: 0,country,COUNT(*)
0,Australia,5
1,Austria,2
2,Belgium,2
3,Canada,3
4,Denmark,2
5,Finland,3
6,France,12
7,Germany,13
8,Hong Kong,1
9,Ireland,2


In [5]:
# GROUP BY by position instead of by name.
# The 1 refers to the first column of the SELECT list, i.e. country
# (the only non-aggregated column selected here).

q = """
SELECT country, COUNT(*)
FROM customers
GROUP BY 1
;
"""
# Only the first 10 countries are shown, to keep the output readable.
pd.read_sql(sql=q, con=conn).head(n=10)

Unnamed: 0,country,COUNT(*)
0,Australia,5
1,Austria,2
2,Belgium,2
3,Canada,3
4,Denmark,2
5,Finland,3
6,France,12
7,Germany,13
8,Hong Kong,1
9,Ireland,2


In [6]:
# Alias: AS gives the COUNT(*) result column a readable name (customer_count).

q = """
SELECT country, COUNT(*) AS customer_count
FROM customers
GROUP BY country
;
"""
# Only the first 10 countries are shown, to keep the output readable.
pd.read_sql(sql=q, con=conn).head(n=10)

Unnamed: 0,country,customer_count
0,Australia,5
1,Austria,2
2,Belgium,2
3,Canada,3
4,Denmark,2
5,Finland,3
6,France,12
7,Germany,13
8,Hong Kong,1
9,Ireland,2


In [7]:
# Other aggregate functions: for each customerNumber, report the number of
# payments plus the MIN, MAX, AVG and SUM of the payment amounts.
# NOTE(review): CAST(amount AS INTEGER) discards any fractional part of each
# amount before aggregating, so these figures are whole-number approximations
# if amounts carry cents -- confirm that is intended.

q = """
SELECT
   customerNumber,
   COUNT(*) AS number_payments,
   MIN(CAST(amount AS INTEGER)) AS min_purchase,
   MAX(CAST(amount AS INTEGER)) AS max_purchase,
   AVG(CAST(amount AS INTEGER)) AS avg_purchase,
   SUM(CAST(amount AS INTEGER)) AS total_spent
FROM payments
GROUP BY customerNumber
;
"""
pd.read_sql(sql=q, con=conn)

Unnamed: 0,customerNumber,number_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,103,3,1676,14571,7437.666667,22313
1,112,3,14191,33347,26726.333333,80179
2,114,4,7565,82261,45146.000000,180584
3,119,3,19501,49523,38982.666667,116948
4,121,4,1491,50218,26055.750000,104223
...,...,...,...,...,...,...
93,486,3,5899,45994,25908.666667,77726
94,487,2,12573,29997,21285.000000,42570
95,489,2,7310,22275,14792.500000,29585
96,495,2,6276,59265,32770.500000,65541


In [9]:
# WHERE filters the rows that go into the aggregation: only payments whose
# paymentDate falls in the year 2004 are grouped and summarised.
# strftime('%Y', ...) yields the year as text, hence the comparison to '2004'.

q = """
SELECT
   customerNumber,
   COUNT(*) AS number_payments,
   MIN(CAST(amount AS INTEGER)) AS min_purchase,
   MAX(CAST(amount AS INTEGER)) AS max_purchase,
   AVG(CAST(amount AS INTEGER)) AS avg_purchase,
   SUM(CAST(amount AS INTEGER)) AS total_spent
FROM payments
WHERE strftime('%Y', paymentDate) = '2004'
GROUP BY customerNumber
;
"""
pd.read_sql(sql=q, con=conn)

Unnamed: 0,customerNumber,number_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,103,2,1676,6066,3871.0,7742
1,112,2,14191,33347,23769.0,47538
2,114,2,44894,82261,63577.5,127155
3,119,2,19501,47924,33712.5,67425
4,121,2,17876,34638,26257.0,52514
...,...,...,...,...,...,...
83,486,2,5899,45994,25946.5,51893
84,487,1,12573,12573,12573.0,12573
85,489,1,7310,7310,7310.0,7310
86,495,1,6276,6276,6276.0,6276


In [11]:
# HAVING
# HAVING works like WHERE, except that its condition is applied after the
# GROUP BY clause: it filters the aggregated groups, not the individual rows.
# Here only customers whose average purchase exceeds 50000 are kept.

q = """
SELECT
   customerNumber,
   COUNT(*) AS number_payments,
   MIN(CAST(amount AS INTEGER)) AS min_purchase,
   MAX(CAST(amount AS INTEGER)) AS max_purchase,
   AVG(CAST(amount AS INTEGER)) AS avg_purchase,
   SUM(CAST(amount AS INTEGER)) AS total_spent
FROM payments
GROUP BY customerNumber
HAVING avg_purchase > 50000
;
"""
pd.read_sql(sql=q, con=conn)

Unnamed: 0,customerNumber,number_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,124,9,11044,111654,64909.333333,584184
1,141,13,20009,120166,55056.384615,715733
2,239,1,80375,80375,80375.0,80375
3,298,2,47375,61402,54388.5,108777
4,321,2,46781,85559,66170.0,132340
5,450,1,59551,59551,59551.0,59551


In [12]:
# HAVING and WHERE together.
# WHERE first restricts the rows to payments with amount > 50000; HAVING then
# keeps only the customers who made at least two such payments.

q = """
SELECT
   customerNumber,
   COUNT(*) AS number_payments,
   MIN(CAST(amount AS INTEGER)) AS min_purchase,
   MAX(CAST(amount AS INTEGER)) AS max_purchase,
   AVG(CAST(amount AS INTEGER)) AS avg_purchase,
   SUM(CAST(amount AS INTEGER)) AS total_spent
FROM payments
WHERE amount > 50000
GROUP BY customerNumber
HAVING number_payments >= 2
;
"""
pd.read_sql(sql=q, con=conn)

Unnamed: 0,customerNumber,number_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,124,5,55639,111654,87509.0,437545
1,141,5,59830,120166,85023.6,425118
2,151,2,58793,58841,58817.0,117634
3,363,2,50799,55425,53112.0,106224


In [14]:
# HAVING, WHERE, ORDER BY and LIMIT together.
# Same filtering as the WHERE + HAVING query; the surviving customers are then
# sorted by total_spent (ascending, the default order) and LIMIT 1 keeps only
# the first row, i.e. the qualifying customer with the smallest total.

q = """
SELECT
   customerNumber,
   COUNT(*) AS number_payments,
   MIN(CAST(amount AS INTEGER)) AS min_purchase,
   MAX(CAST(amount AS INTEGER)) AS max_purchase,
   AVG(CAST(amount AS INTEGER)) AS avg_purchase,
   SUM(CAST(amount AS INTEGER)) AS total_spent
FROM payments
WHERE amount > 50000
GROUP BY customerNumber
HAVING number_payments >= 2
ORDER BY total_spent
LIMIT 1
;
"""
pd.read_sql(sql=q, con=conn)

Unnamed: 0,customerNumber,number_payments,min_purchase,max_purchase,avg_purchase,total_spent
0,363,2,50799,55425,53112.0,106224
