<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#SQL-Introduction" data-toc-modified-id="SQL-Introduction-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>SQL Introduction</a></span></li><li><span><a href="#Connect-to-the-SQL" data-toc-modified-id="Connect-to-the-SQL-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Connect to the SQL</a></span></li><li><span><a href="#Know-your-database" data-toc-modified-id="Know-your-database-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Know your database</a></span></li><li><span><a href="#Create-pandas-dataframes" data-toc-modified-id="Create-pandas-dataframes-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create pandas dataframes</a></span></li><li><span><a href="#Make-dataframes-columns-dtype-good" data-toc-modified-id="Make-dataframes-columns-dtype-good-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Make dataframes columns dtype good</a></span></li><li><span><a href="#Create-pandas-df-of-all-tables-and-columns-names" data-toc-modified-id="Create-pandas-df-of-all-tables-and-columns-names-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Create pandas df of all tables and columns names</a></span></li><li><span><a href="#Section-3:-GROUP-BY" data-toc-modified-id="Section-3:-GROUP-BY-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Section 3: GROUP BY</a></span><ul class="toc-item"><li><span><a href="#Challanges" data-toc-modified-id="Challanges-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Challanges</a></span><ul class="toc-item"><li><span><a href="#qn1" data-toc-modified-id="qn1-7.1.1"><span class="toc-item-num">7.1.1&nbsp;&nbsp;</span>qn1</a></span></li><li><span><a href="#qn2" data-toc-modified-id="qn2-7.1.2"><span class="toc-item-num">7.1.2&nbsp;&nbsp;</span>qn2</a></span></li><li><span><a href="#qn3" data-toc-modified-id="qn3-7.1.3"><span class="toc-item-num">7.1.3&nbsp;&nbsp;</span>qn3</a></span></li></ul></li><li><span><a href="#Having-clause" data-toc-modified-id="Having-clause-7.2"><span 
class="toc-item-num">7.2&nbsp;&nbsp;</span>Having clause</a></span><ul class="toc-item"><li><span><a href="#Challanges" data-toc-modified-id="Challanges-7.2.1"><span class="toc-item-num">7.2.1&nbsp;&nbsp;</span>Challanges</a></span><ul class="toc-item"><li><span><a href="#qn1" data-toc-modified-id="qn1-7.2.1.1"><span class="toc-item-num">7.2.1.1&nbsp;&nbsp;</span>qn1</a></span></li><li><span><a href="#qn2" data-toc-modified-id="qn2-7.2.1.2"><span class="toc-item-num">7.2.1.2&nbsp;&nbsp;</span>qn2</a></span></li></ul></li></ul></li></ul></li></ul></div>

# SQL Introduction

![](../images/sql_clauses.png)
![](../images/sql_mnemonic.png)

# Connect to the SQL

In [1]:
import numpy as np
import pandas as pd
import os
import yaml

# Read the PostgreSQL connection settings from a YAML file in the user's home
# directory so that the password is never written into the notebook itself.
conf_path = os.path.join(os.path.expanduser('~'), '.postgres_conf.yml')
with open(conf_path, 'r') as stream:
    # A malformed file raises yaml.YAMLError right here.  The error is
    # deliberately not caught: printing it and carrying on would only lead to
    # a confusing NameError on `yaml_dict` a few lines further down.
    yaml_dict = yaml.safe_load(stream)

# The two settings interpolated into the %sql connection string below.
pw = yaml_dict['password']
port = yaml_dict['port']

%load_ext sql
%sql postgres://postgres:$pw@localhost:$port/dvdrental

'Connected: postgres@dvdrental'

# Know your database

In [2]:
%%sql
SELECT TABLE_NAME
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_TYPE = 'BASE TABLE'
AND TABLE_CATALOG='dvdrental'
and TABLE_NAME  not like 'pg_%'
and TABLE_NAME  not like 'sql_%'
-- order by table_name

 * postgres://postgres:***@localhost:5432/dvdrental
20 rows affected.


table_name
actor
store
address
category
city
country
customer
film_actor
film_category
inventory


In [3]:
%%sql
select column_name, data_type, character_maximum_length
    from INFORMATION_SCHEMA.COLUMNS 
    where table_name = 'customer';

 * postgres://postgres:***@localhost:5432/dvdrental
10 rows affected.


column_name,data_type,character_maximum_length
customer_id,integer,
store_id,smallint,
first_name,character varying,45.0
last_name,character varying,45.0
email,character varying,50.0
address_id,smallint,
activebool,boolean,
create_date,date,
last_update,timestamp without time zone,
active,integer,


In [4]:
%%sql
-- select * from staff limit 2; -- This gives error in jupyter notebook
select * from actor limit 2;

 * postgres://postgres:***@localhost:5432/dvdrental
2 rows affected.


actor_id,first_name,last_name,last_update
1,Penelope,Guiness,2013-05-26 14:47:57.620000
2,Nick,Wahlberg,2013-05-26 14:47:57.620000


# Create pandas dataframes

In [5]:
# The staff table is loaded from the raw tab-separated dump file instead of
# through %sql (selecting from it is reported to fail in this notebook).
staffs_raw = pd.read_csv('../data/dvdrental/2187.dat', sep=r'\t',
                         header=None, engine='python')

# The dump has no header row, so the column names are supplied by hand.
cols = ['staff_id', 'first_name', 'last_name', 'address_id', 'email',
        'store_id', 'active', 'username', 'password', 'last_update',
        'picture']

staffs_raw.columns = cols

# Build the cleaned frame in one non-mutating chain.  Assigning a column on
# the result of .head() is the chained-assignment pattern pandas warns about
# (SettingWithCopyWarning); .assign()/.drop() return new objects instead.
# NOTE(review): `active` is overwritten with True for both rows -- presumably
# the dump stores it as text; confirm against the source table.
staffs = (staffs_raw
          .head(2)                    # keep the first two lines of the dump
          .assign(active=True)
          .drop(columns='picture'))
print(staffs.shape)
staffs.head()

(2, 10)


Unnamed: 0,staff_id,first_name,last_name,address_id,email,store_id,active,username,password,last_update
0,1,Mike,Hillyer,3,Mike.Hillyer@sakilastaff.com,1,True,Mike,8cb2237d0679ca88db6464eac60da96345513964,2006-05-16 16:13:11.79328
1,2,Jon,Stephens,4,Jon.Stephens@sakilastaff.com,2,True,Jon,8cb2237d0679ca88db6464eac60da96345513964,2006-05-16 16:13:11.79328


In [6]:
# Table names, in the order used by every positional lookup in this notebook.
# The first entry is 'staffs' (the name of the dataframe built from the dump
# file above) rather than the name of a database table.
tables = ['staffs', 'category', 'film_category', 'country', 'actor',
          'language', 'inventory', 'payment', 'rental', 'city',
          'store', 'film', 'address', 'film_actor', 'customer']

# Bind one variable per entry, each holding the table name as a string.
# `staff` is not used for querying; the `staffs` dataframe is used instead.
(staff, category, film_category, country, actor,
 language, inventory, payment, rental, city,
 store, film, address, film_actor, customer) = tables

In [7]:
# First fetch every table with the %sql line magic so that the results can be
# converted to pandas dataframes in the next cell.
# Each `$name` is interpolated from the variable of that name, which holds the
# table-name string assigned in the previous cell; the query result is then
# bound to that same name, replacing the string.
# NOTE(review): because the names are overwritten, re-running this cell alone
# presumably no longer works -- re-run the cell that defines `tables` first.
# staff = %sql select * from $staff;  # this fails
category = %sql select * from $category;
film_category = %sql select * from $film_category;
country = %sql select * from $country;
actor = %sql select * from $actor;
language = %sql select * from $language;
inventory = %sql select * from $inventory;
payment = %sql select * from $payment;
rental = %sql select * from $rental;
city = %sql select * from $city;
store = %sql select * from $store;
film = %sql select * from $film;
address = %sql select * from $address;
film_actor = %sql select * from $film_actor;
customer = %sql select * from $customer;

 * postgres://postgres:***@localhost:5432/dvdrental
16 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
1000 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
109 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
200 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
6 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
4581 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
14596 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
16044 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
600 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
2 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
1000 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
603 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
5462 rows affected.
 * postgres://postgres:***@localhost:5432/dvdrental
599 rows affected.


In [8]:
# Convert each %sql result fetched in the previous cell into a pandas
# dataframe, rebinding the same name.
# NOTE(review): the names are rebound in place, so this cell is meant to run
# exactly once after the fetch cell; a pandas DataFrame has no `.DataFrame()`
# method, so a second run without re-fetching is expected to fail.
category = category.DataFrame()
film_category = film_category.DataFrame()
country = country.DataFrame()
actor = actor.DataFrame()
language = language.DataFrame()
inventory = inventory.DataFrame()
payment = payment.DataFrame()
rental = rental.DataFrame()
city = city.DataFrame()
store = store.DataFrame()
film = film.DataFrame()
address = address.DataFrame()
film_actor = film_actor.DataFrame()
customer = customer.DataFrame()

# Make dataframes columns dtype good

In [9]:
# Dataframes listed in the same order as the names in `tables`, so that a
# position pairs a dataframe with its name (show_first_value_and_dtype uses
# df_tables[num] together with tables[num]).
df_tables = [staffs, category, film_category, country, actor,
            language, inventory, payment, rental, city,
            store, film, address, film_actor, customer]

In [10]:
def show_first_value_and_dtype(num):
    """Display the first row and the column dtypes of one table side by side.

    Parameters
    ----------
    num : int
        Position of the table in the notebook-level ``df_tables`` list; the
        same position in ``tables`` supplies the name shown in the caption.

    Returns
    -------
    None
        The styled table is rendered with ``display``.  It has one row per
        column of the inspected dataframe (sorted by column name) with the
        columns ``value`` and ``dtype``; rows whose dtype is ``object`` are
        highlighted in light blue.
    """
    # Only the requested table is inspected; there is no need to build the
    # first-row and dtype frames for every table on each call.
    frame = df_tables[num]
    first_value = frame.head(1).T
    dtypes = frame.dtypes.to_frame()

    # ignore_index=True relabels the two concatenated columns as 0 and 1,
    # which are then given readable names.
    summary = (pd.concat([first_value, dtypes],
                         axis=1, sort=True, ignore_index=True)
               .rename(columns={0: 'value', 1: 'dtype'}))

    def highlight_object(row):
        """Return one CSS string per cell: light blue for object-dtype rows."""
        css = 'background: lightblue' if row['dtype'] == 'object' else ''
        return [css for _ in row]

    display(summary.style
            .apply(highlight_object, axis=1)
            .set_caption('Dataframe name: ' + tables[num]))

In [11]:
len(tables)

15

In [12]:
show_first_value_and_dtype(0)

Unnamed: 0,value,dtype
active,True,bool
address_id,3,int64
email,Mike.Hillyer@sakilastaff.com,object
first_name,Mike,object
last_name,Hillyer,object
last_update,2006-05-16 16:13:11.79328,object
password,8cb2237d0679ca88db6464eac60da96345513964,object
staff_id,1,int64
store_id,1,int64
username,Mike,object


In [13]:
# Date-like columns are converted to pandas datetimes.
staffs['last_update'] = pd.to_datetime(staffs['last_update'])
customer['create_date'] = pd.to_datetime(customer['create_date'])

# Money columns are converted to numeric dtype; errors='coerce' turns any
# value that cannot be parsed into NaN instead of raising.
for _frame, _column in ((payment, 'amount'),
                        (film, 'rental_rate'),
                        (film, 'replacement_cost')):
    _frame[_column] = pd.to_numeric(_frame[_column], errors='coerce')

# Create pandas df of all tables and columns names

In [14]:
# Dataframes in the same order as the names in `tables`.
df_tables = [staffs, category, film_category, country, actor,
             language, inventory, payment, rental, city,
             store, film, address, film_actor, customer]

# One list of column names per table.  The tables have different numbers of
# columns, so after transposing the shorter ones are padded with ''.
all_columns = [list(frame.columns) for frame in df_tables]
df_tables_cols = pd.DataFrame(all_columns, index=tables).T.fillna('')
df_tables_cols

Unnamed: 0,staffs,category,film_category,country,actor,language,inventory,payment,rental,city,store,film,address,film_actor,customer
0,staff_id,category_id,film_id,country_id,actor_id,language_id,inventory_id,payment_id,rental_id,city_id,store_id,film_id,address_id,actor_id,customer_id
1,first_name,name,category_id,country,first_name,name,film_id,customer_id,rental_date,city,manager_staff_id,title,address,film_id,store_id
2,last_name,last_update,last_update,last_update,last_name,last_update,store_id,staff_id,inventory_id,country_id,address_id,description,address2,last_update,first_name
3,address_id,,,,last_update,,last_update,rental_id,customer_id,last_update,last_update,release_year,district,,last_name
4,email,,,,,,,amount,return_date,,,language_id,city_id,,email
5,store_id,,,,,,,payment_date,staff_id,,,rental_duration,postal_code,,address_id
6,active,,,,,,,,last_update,,,rental_rate,phone,,activebool
7,username,,,,,,,,,,,length,last_update,,create_date
8,password,,,,,,,,,,,replacement_cost,,,last_update
9,last_update,,,,,,,,,,,rating,,,active


In [15]:
# find repeated column names
repeated_cols = (pd.Series([i for sub in all_columns for i in sub])
                 .value_counts()
                 .loc[lambda x: x>1]
                 .index.values.tolist()
                )
# create colors dict
cells = repeated_cols
colors = ['salmon', 'khaki','rosybrown','tomato',
          'olive', 'gray',  'mediumpurple',
          'orchid',  'plum','lavender', 'lightgreen',
          'lightsteelblue', 
          'lightblue','skyblue','orange','orangered'][:len(cells)]
cell_colors = dict(zip(cells,colors))

# colored dataframe
df_tables_cols.style.apply(lambda x: ["background: %s" % cell_colors[v] 
                          if  v in cell_colors.keys()
                          else "" for v in x], axis = 1)

Unnamed: 0,staffs,category,film_category,country,actor,language,inventory,payment,rental,city,store,film,address,film_actor,customer
0,staff_id,category_id,film_id,country_id,actor_id,language_id,inventory_id,payment_id,rental_id,city_id,store_id,film_id,address_id,actor_id,customer_id
1,first_name,name,category_id,country,first_name,name,film_id,customer_id,rental_date,city,manager_staff_id,title,address,film_id,store_id
2,last_name,last_update,last_update,last_update,last_name,last_update,store_id,staff_id,inventory_id,country_id,address_id,description,address2,last_update,first_name
3,address_id,,,,last_update,,last_update,rental_id,customer_id,last_update,last_update,release_year,district,,last_name
4,email,,,,,,,amount,return_date,,,language_id,city_id,,email
5,store_id,,,,,,,payment_date,staff_id,,,rental_duration,postal_code,,address_id
6,active,,,,,,,,last_update,,,rental_rate,phone,,activebool
7,username,,,,,,,,,,,length,last_update,,create_date
8,password,,,,,,,,,,,replacement_cost,,,last_update
9,last_update,,,,,,,,,,,rating,,,active


# Section 3: GROUP BY

![](../images/sql_aggregation.png)
![](../images/where_having.png)
![](../images/sql-count-group-by.png)

In [16]:
%%sql
select round(avg(amount),2) from payment;

 * postgres://postgres:***@localhost:5432/dvdrental
1 rows affected.


round
4.2


In [17]:
payment.amount.mean().round(2)

4.2

In [18]:
%%sql
select amount from payment
order by amount limit 3;

 * postgres://postgres:***@localhost:5432/dvdrental
3 rows affected.


amount
0.0
0.0
0.0


In [19]:
%%sql
select count(amount) from payment
where amount = 0.00;

 * postgres://postgres:***@localhost:5432/dvdrental
1 rows affected.


count
24


In [20]:
payment.query('amount == 0.00').shape

(24, 6)

In [21]:
%%sql
select customer_id, sum(amount)
from payment
group by customer_id
order by sum(amount) desc
limit 5;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


customer_id,sum
148,211.55
526,208.58
178,194.61
137,191.62
144,189.6


In [22]:
payment[['customer_id','amount']]\
.groupby('customer_id')['amount']\
.sum()\
.sort_values(ascending=False)\
.reset_index()\
.head()

Unnamed: 0,customer_id,amount
0,148,211.55
1,526,208.58
2,178,194.61
3,137,191.62
4,144,189.6


In [23]:
 %%sql
select * from payment limit 5;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


payment_id,customer_id,staff_id,rental_id,amount,payment_date
17503,341,2,1520,7.99,2007-02-15 22:25:46.996577
17504,341,1,1778,1.99,2007-02-16 17:23:14.996577
17505,341,1,1849,7.99,2007-02-16 22:41:45.996577
17506,341,2,2829,2.99,2007-02-19 19:39:56.996577
17507,341,2,3130,7.99,2007-02-20 17:31:48.996577


In [24]:
%%sql
select staff_id, count(payment_id)
from payment
group by staff_id;

 * postgres://postgres:***@localhost:5432/dvdrental
2 rows affected.


staff_id,count
1,7292
2,7304


In [25]:
payment[['staff_id','payment_id']]\
.groupby('staff_id')\
.payment_id.count()

staff_id
1    7292
2    7304
Name: payment_id, dtype: int64

In [26]:
%%sql
select * from film limit 5;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


film_id,title,description,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
133,Chamber Italian,A Fateful Reflection of a Moose And a Husband who must Overcome a Monkey in Nigeria,2006,1,7,4.99,117,14.99,NC-17,2013-05-26 14:50:58.951000,['Trailers'],'chamber':1 'fate':4 'husband':11 'italian':2 'monkey':16 'moos':8 'must':13 'nigeria':18 'overcom':14 'reflect':5
384,Grosse Wonderful,A Epic Drama of a Cat And a Explorer who must Redeem a Moose in Australia,2006,1,5,4.99,49,19.99,R,2013-05-26 14:50:58.951000,['Behind the Scenes'],'australia':18 'cat':8 'drama':5 'epic':4 'explor':11 'gross':1 'moos':16 'must':13 'redeem':14 'wonder':2
8,Airport Pollock,A Epic Tale of a Moose And a Girl who must Confront a Monkey in Ancient India,2006,1,6,4.99,54,15.99,R,2013-05-26 14:50:58.951000,['Trailers'],'airport':1 'ancient':18 'confront':14 'epic':4 'girl':11 'india':19 'monkey':16 'moos':8 'must':13 'pollock':2 'tale':5
98,Bright Encounters,A Fateful Yarn of a Lumberjack And a Feminist who must Conquer a Student in A Jet Boat,2006,1,4,4.99,73,12.99,PG-13,2013-05-26 14:50:58.951000,['Trailers'],'boat':20 'bright':1 'conquer':14 'encount':2 'fate':4 'feminist':11 'jet':19 'lumberjack':8 'must':13 'student':16 'yarn':5
1,Academy Dinosaur,A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies,2006,1,6,0.99,86,20.99,PG,2013-05-26 14:50:58.951000,"['Deleted Scenes', 'Behind the Scenes']",'academi':1 'battl':15 'canadian':20 'dinosaur':2 'drama':5 'epic':4 'feminist':8 'mad':11 'must':14 'rocki':21 'scientist':12 'teacher':17


In [27]:
%%sql
select rating, count(rating) from film
group by rating;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


rating,count
R,195
NC-17,210
G,178
PG,194
PG-13,223


In [28]:
film.rating.value_counts()

PG-13    223
NC-17    210
R        195
PG       194
G        178
Name: rating, dtype: int64

In [29]:
# Two-column (rating, count) frame.  The index and the values are named
# explicitly instead of renaming the columns produced by reset_index():
# pandas >= 2.0 already calls them 'rating' and 'count', so the old rename
# {'index': 'rating', 'rating': 'count'} would yield two 'count' columns.
film.rating.value_counts()\
.rename_axis('rating')\
.reset_index(name='count')

Unnamed: 0,rating,count
0,PG-13,223
1,NC-17,210
2,R,195
3,PG,194
4,G,178


In [30]:
%%sql
select rental_duration, count(rental_duration)
from film
group by rental_duration;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


rental_duration,count
4,203
6,212
7,191
3,203
5,191


In [31]:
film.rental_duration.value_counts()

6    212
4    203
3    203
7    191
5    191
Name: rental_duration, dtype: int64

In [32]:
%%sql
select rating, avg(rental_rate)
from film
group by rating;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


rating,avg
R,2.9387179487179487
NC-17,2.970952380952381
G,2.888876404494382
PG,3.0518556701030928
PG-13,3.0348430493273546


In [33]:
film.head(2)

Unnamed: 0,film_id,title,description,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
0,133,Chamber Italian,A Fateful Reflection of a Moose And a Husband ...,2006,1,7,4.99,117,14.99,NC-17,2013-05-26 14:50:58.951,[Trailers],'chamber':1 'fate':4 'husband':11 'italian':2 ...
1,384,Grosse Wonderful,A Epic Drama of a Cat And a Explorer who must ...,2006,1,5,4.99,49,19.99,R,2013-05-26 14:50:58.951,[Behind the Scenes],'australia':18 'cat':8 'drama':5 'epic':4 'exp...


In [34]:
film[['rating','rental_rate']].head(2)

Unnamed: 0,rating,rental_rate
0,NC-17,4.99
1,R,4.99


In [35]:
film[['rating','rental_rate']].dtypes

rating          object
rental_rate    float64
dtype: object

In [36]:
film['rental_rate'] = pd.to_numeric(film.rental_rate, errors='coerce')

In [37]:
film[['rating','rental_rate']].dtypes

rating          object
rental_rate    float64
dtype: object

In [38]:
%%sql
select rating, avg(rental_rate)
from film
group by rating
order by rating;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


rating,avg
G,2.888876404494382
PG,3.0518556701030928
PG-13,3.0348430493273546
R,2.9387179487179487
NC-17,2.970952380952381


In [39]:
film[['rating','rental_rate']].groupby('rating')['rental_rate'].mean().sort_index()

rating
G        2.888876
NC-17    2.970952
PG       3.051856
PG-13    3.034843
R        2.938718
Name: rental_rate, dtype: float64

## Challanges

### qn1
- We have two staff members with Staff IDs 1 and 2.
We want to give a bonus to the staff member that handled the most payments.
- How many payments did each staff member handle?
And how much was the total amount processed by each staff member?

In [40]:
%%sql
select * from payment limit 5;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


payment_id,customer_id,staff_id,rental_id,amount,payment_date
17503,341,2,1520,7.99,2007-02-15 22:25:46.996577
17504,341,1,1778,1.99,2007-02-16 17:23:14.996577
17505,341,1,1849,7.99,2007-02-16 22:41:45.996577
17506,341,2,2829,2.99,2007-02-19 19:39:56.996577
17507,341,2,3130,7.99,2007-02-20 17:31:48.996577


In [41]:
%%sql
select staff_id, count(*), sum(amount) from payment
group by staff_id;

 * postgres://postgres:***@localhost:5432/dvdrental
2 rows affected.


staff_id,count,sum
1,7292,30252.12
2,7304,31059.92


In [42]:
payment[['staff_id','amount']]\
.groupby('staff_id')\
.agg(['count','sum'])

Unnamed: 0_level_0,amount,amount
Unnamed: 0_level_1,count,sum
staff_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,7292,30252.12
2,7304,31059.92


### qn2
- Corporate headquarters is auditing our store!
They want to know the average replacement
cost of movies by rating.
- For example, R-rated movies have an average
replacement cost of $20.23.

In [43]:
%%sql
select * from film limit 2;

 * postgres://postgres:***@localhost:5432/dvdrental
2 rows affected.


film_id,title,description,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
133,Chamber Italian,A Fateful Reflection of a Moose And a Husband who must Overcome a Monkey in Nigeria,2006,1,7,4.99,117,14.99,NC-17,2013-05-26 14:50:58.951000,['Trailers'],'chamber':1 'fate':4 'husband':11 'italian':2 'monkey':16 'moos':8 'must':13 'nigeria':18 'overcom':14 'reflect':5
384,Grosse Wonderful,A Epic Drama of a Cat And a Explorer who must Redeem a Moose in Australia,2006,1,5,4.99,49,19.99,R,2013-05-26 14:50:58.951000,['Behind the Scenes'],'australia':18 'cat':8 'drama':5 'epic':4 'explor':11 'gross':1 'moos':16 'must':13 'redeem':14 'wonder':2


In [44]:
%%sql
select rating, avg(replacement_cost)
from film
group by rating;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


rating,avg
R,20.23102564102564
NC-17,20.137619047619047
G,20.12483146067416
PG,18.959072164948452
PG-13,20.402556053811654


In [45]:
film.head(2)

Unnamed: 0,film_id,title,description,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
0,133,Chamber Italian,A Fateful Reflection of a Moose And a Husband ...,2006,1,7,4.99,117,14.99,NC-17,2013-05-26 14:50:58.951,[Trailers],'chamber':1 'fate':4 'husband':11 'italian':2 ...
1,384,Grosse Wonderful,A Epic Drama of a Cat And a Explorer who must ...,2006,1,5,4.99,49,19.99,R,2013-05-26 14:50:58.951,[Behind the Scenes],'australia':18 'cat':8 'drama':5 'epic':4 'exp...


In [46]:
film.columns

Index(['film_id', 'title', 'description', 'release_year', 'language_id',
       'rental_duration', 'rental_rate', 'length', 'replacement_cost',
       'rating', 'last_update', 'special_features', 'fulltext'],
      dtype='object')

In [47]:
film[['rating','replacement_cost']].head(2)

Unnamed: 0,rating,replacement_cost
0,NC-17,14.99
1,R,19.99


In [48]:
film[['rating','replacement_cost']].dtypes

rating               object
replacement_cost    float64
dtype: object

In [49]:
film['replacement_cost'] = pd.to_numeric(film['replacement_cost'], errors='coerce')

In [50]:
film[['rating','replacement_cost']].dtypes

rating               object
replacement_cost    float64
dtype: object

In [51]:
film[['rating','replacement_cost']].groupby('rating')['replacement_cost'].mean().round(2)

rating
G        20.12
NC-17    20.14
PG       18.96
PG-13    20.40
R        20.23
Name: replacement_cost, dtype: float64

In [52]:
%%sql
select rating, round(avg(replacement_cost),2)
from film
group by rating;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


rating,round
R,20.23
NC-17,20.14
G,20.12
PG,18.96
PG-13,20.4


### qn3
- We want to send coupons to the 5 customers
who have spent the most money.
- Get me the customer ids of the top 5
spenders.

In [53]:
%%sql
select * from payment limit 5;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


payment_id,customer_id,staff_id,rental_id,amount,payment_date
17503,341,2,1520,7.99,2007-02-15 22:25:46.996577
17504,341,1,1778,1.99,2007-02-16 17:23:14.996577
17505,341,1,1849,7.99,2007-02-16 22:41:45.996577
17506,341,2,2829,2.99,2007-02-19 19:39:56.996577
17507,341,2,3130,7.99,2007-02-20 17:31:48.996577


In [54]:
%%sql
select  sum(amount) from payment
limit 5;

 * postgres://postgres:***@localhost:5432/dvdrental
1 rows affected.


sum
61312.04


In [55]:
%%sql
select column_name, data_type, character_maximum_length
    from INFORMATION_SCHEMA.COLUMNS 
    where table_name = 'payment';

 * postgres://postgres:***@localhost:5432/dvdrental
6 rows affected.


column_name,data_type,character_maximum_length
payment_id,integer,
customer_id,smallint,
staff_id,smallint,
rental_id,integer,
amount,numeric,
payment_date,timestamp without time zone,


In [56]:
%%sql
select customer_id
from payment limit 5;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


customer_id
341
341
341
341
341


In [57]:
%%sql
-- select customer_id, sum(amount) from payment limit 5; -- fails

 * postgres://postgres:***@localhost:5432/dvdrental
(psycopg2.ProgrammingError) can't execute an empty query [SQL: '-- select customer_id, sum(amount) from payment limit 5; -- fails'] (Background on this error at: http://sqlalche.me/e/f405)


In [58]:
%%sql
select customer_id, sum(amount)
from payment
group by customer_id
order by sum(amount) desc
limit 5;

 * postgres://postgres:***@localhost:5432/dvdrental
5 rows affected.


customer_id,sum
148,211.55
526,208.58
178,194.61
137,191.62
144,189.6


In [59]:
payment.dtypes

payment_id               int64
customer_id              int64
staff_id                 int64
rental_id                int64
amount                 float64
payment_date    datetime64[ns]
dtype: object

In [60]:
payment['amount'] = pd.to_numeric(payment.amount, errors='coerce')

In [61]:
payment[['customer_id','amount']]\
.groupby('customer_id')['amount']\
.sum()\
.sort_values(ascending=False)\
.head()

customer_id
148    211.55
526    208.58
178    194.61
137    191.62
144    189.60
Name: amount, dtype: float64

In [62]:
payment[['customer_id','amount']]\
.groupby('customer_id')['amount']\
.sum()\
.nlargest(5)

customer_id
148    211.55
526    208.58
178    194.61
137    191.62
144    189.60
Name: amount, dtype: float64

## Having clause
```sql
SELECT column_name(s)
FROM table_name
WHERE condition
GROUP BY column_name(s)
HAVING condition
ORDER BY column_name(s);
```

![](../images/having.png)
![](../images/sql_mnemonic.png)

In [63]:
%%sql
select store_id, count(customer_id)
from customer
group by store_id;

 * postgres://postgres:***@localhost:5432/dvdrental
2 rows affected.


store_id,count
1,326
2,273


In [64]:
%%sql
select store_id, count(customer_id)
from customer
group by store_id
having count(customer_id) > 300;

 * postgres://postgres:***@localhost:5432/dvdrental
1 rows affected.


store_id,count
1,326


In [65]:
%%sql
select rating, avg(rental_rate)
from film
where rating in ('R','G','PG')
group by rating
having avg(rental_rate) < 3
order by avg(rental_rate) desc;

 * postgres://postgres:***@localhost:5432/dvdrental
2 rows affected.


rating,avg
R,2.9387179487179487
G,2.888876404494382


### Challanges

#### qn1
> We want to know which customers are eligible
for our platinum credit card. The requirement
is that the customer has made at least 40
payment transactions in total.

> Which customers (by customer id) are eligible
for the credit card?

In [66]:
%%sql
select * from payment limit 2;

 * postgres://postgres:***@localhost:5432/dvdrental
2 rows affected.


payment_id,customer_id,staff_id,rental_id,amount,payment_date
17503,341,2,1520,7.99,2007-02-15 22:25:46.996577
17504,341,1,1778,1.99,2007-02-16 17:23:14.996577


In [67]:
%%sql
select customer_id, count(amount)
from payment
group by customer_id
having count(amount) >= 40;

 * postgres://postgres:***@localhost:5432/dvdrental
3 rows affected.


customer_id,count
144,40
526,42
148,45


In [68]:
payment[['customer_id','amount']]\
.groupby('customer_id')['amount']\
.count()\
.reset_index()\
.query('amount >= 40')

Unnamed: 0,customer_id,amount
143,144,40
147,148,45
525,526,42


In [69]:
payment[['customer_id','amount']]\
.groupby('customer_id')['amount']\
.count()\
.loc[lambda x: x >= 40]

customer_id
144    40
148    45
526    42
Name: amount, dtype: int64

#### qn2
> When grouped by rating, what movie ratings
have an average rental duration of more than
5 days?

In [70]:
%%sql
select * from film limit 1;

 * postgres://postgres:***@localhost:5432/dvdrental
1 rows affected.


film_id,title,description,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
133,Chamber Italian,A Fateful Reflection of a Moose And a Husband who must Overcome a Monkey in Nigeria,2006,1,7,4.99,117,14.99,NC-17,2013-05-26 14:50:58.951000,['Trailers'],'chamber':1 'fate':4 'husband':11 'italian':2 'monkey':16 'moos':8 'must':13 'nigeria':18 'overcom':14 'reflect':5


In [71]:
%%sql
select rating, avg(rental_duration)
from film
group by rating
having avg(rental_duration) > 5;

 * postgres://postgres:***@localhost:5432/dvdrental
3 rows affected.


rating,avg
NC-17,5.142857142857143
PG,5.082474226804124
PG-13,5.053811659192824


In [72]:
film[['rating','rental_duration']]\
.groupby('rating')['rental_duration']\
.mean().loc[lambda x: x > 5]

rating
NC-17    5.142857
PG       5.082474
PG-13    5.053812
Name: rental_duration, dtype: float64