# [Intermediate SQL](https://www.datacamp.com/completed/statement-of-accomplishment/course/f13fa89f9723010be3d128e1391f56ac914d3dbb)

In [1]:
# Load the IPython extension named "sql", which supplies the %sql / %%sql magics used below.
%load_ext sql

# SqlMagic display options (meanings per the ipython-sql/JupySQL docs — confirm for the installed version):
#   autopandas = False -> results stay as plain result sets instead of pandas DataFrames
#   displaycon = False -> do not echo the connection string with each result
#   feedback   = 0     -> suppress "rows affected"-style messages
%config SqlMagic.autopandas = False
%config SqlMagic.displaycon = False
%config SqlMagic.feedback = 0

In [2]:
import duckdb

# Open a writable in-memory DuckDB database and hand the connection to the %sql magic,
# so every later %sql / %%sql cell runs against it.
conn = duckdb.connect(database=':memory:', read_only=False)
%sql conn

# Load data: each statement copies one CSV file into a table of the same name.
# The paths are relative, so the files are presumably expected in the working directory — TODO confirm.
# No schema is declared; the column types are whatever DuckDB infers (see the DESCRIBE cells below).
%sql CREATE TABLE films AS SELECT * FROM 'films.csv';
%sql CREATE TABLE people AS SELECT * FROM 'people.csv';
%sql CREATE TABLE reviews AS SELECT * FROM 'reviews.csv';
%sql CREATE TABLE roles AS SELECT * FROM 'roles.csv';

Count
19791


In [3]:
%%sql
-- List the column names and inferred types of the films table.
DESCRIBE films;

column_name,column_type,null,key,default,extra
id,BIGINT,YES,,,
title,VARCHAR,YES,,,
release_year,DOUBLE,YES,,,
country,VARCHAR,YES,,,
duration,DOUBLE,YES,,,
language,VARCHAR,YES,,,
certification,VARCHAR,YES,,,
gross,DOUBLE,YES,,,
budget,DOUBLE,YES,,,


In [4]:
%%sql
-- List the column names and inferred types of the people table.
DESCRIBE people;

column_name,column_type,null,key,default,extra
id,BIGINT,YES,,,
name,VARCHAR,YES,,,
birthdate,DATE,YES,,,
deathdate,DATE,YES,,,


In [5]:
%%sql
-- List the column names and inferred types of the reviews table.
DESCRIBE reviews;

column_name,column_type,null,key,default,extra
id,BIGINT,YES,,,
film_id,BIGINT,YES,,,
num_user,DOUBLE,YES,,,
num_critic,DOUBLE,YES,,,
imdb_score,DOUBLE,YES,,,
num_votes,BIGINT,YES,,,
facebook_likes,BIGINT,YES,,,


In [6]:
%%sql
-- List the column names and inferred types of the roles table.
DESCRIBE roles;

column_name,column_type,null,key,default,extra
id,BIGINT,YES,,,
film_id,BIGINT,YES,,,
person_id,BIGINT,YES,,,
role,VARCHAR,YES,,,


## Selecting

### `COUNT`

The `COUNT` [aggregate function](https://www.postgresql.org/docs/current/functions-aggregate.html) returns the number of rows in a group. You can use `COUNT(*)` to count the number of rows in a table.

In [7]:
%%sql
-- How many rows the people table holds, aliased for a readable header
SELECT
    COUNT(*) AS count_records
FROM people;

count_records
8397


In [8]:
%%sql
-- People whose birthdate is recorded (rows with a NULL birthdate are filtered out)
SELECT
    COUNT(*) AS count_birthdate
FROM people
WHERE birthdate IS NOT NULL;

count_birthdate
6152


In [9]:
%%sql
-- Films with a known language, and films with a known country.
-- COUNT(column) skips NULLs, so these are row counts, not counts of distinct values.
SELECT
    COUNT(language) AS count_languages,
    COUNT(country) AS count_countries
FROM films;

count_languages,count_countries
4957,4966


### `DISTINCT`

The [`DISTINCT` clause](https://www.postgresql.org/docs/current/sql-select.html#SQL-DISTINCT) filters duplicate rows from the result set.

In [10]:
%%sql
-- Every country that appears in films, listed once
SELECT DISTINCT
    country
FROM films;

country
USA
Germany
Japan
Denmark
UK
Italy
France
West Germany
Sweden
Soviet Union


In [11]:
%%sql
-- How many different countries appear in films
SELECT
    COUNT(DISTINCT country) AS count_distinct_countries
FROM films;

count_distinct_countries
64


## Filtering

### `WHERE`

The [`WHERE` clause](https://www.postgresql.org/docs/current/sql-select.html#SQL-WHERE) filters rows from the result set that don't match the given condition.

In [12]:
%%sql
-- Reviews whose IMDb score is strictly above 7.0
SELECT
    film_id,
    imdb_score
FROM reviews
WHERE imdb_score > 7.0;

film_id,imdb_score
3934,7.0999999
74,7.5999999
1254,8.0
4841,8.10000038
3252,7.19999981
1181,7.30000019
3929,7.0999999
3298,7.4000001
2744,7.4000001
4707,7.4000001


In [13]:
%%sql
-- Ten reviews with fewer than 1000 Facebook likes
-- (there is no ORDER BY, so which ten rows come back is up to the engine)
SELECT
    film_id,
    facebook_likes
FROM reviews
WHERE facebook_likes < 1000
LIMIT 10;

film_id,facebook_likes
3405,0
478,491
74,930
740,0
2869,689
1181,0
2020,0
2312,912
1820,872
831,975


In [14]:
%%sql
-- Number of reviews with at least 100,000 votes (the boundary value is included)
SELECT
    COUNT(*) AS films_over_100k_votes
FROM reviews
WHERE num_votes >= 100000;

films_over_100k_votes
1211


In [15]:
%%sql
-- How many films are in Spanish
SELECT
    COUNT(*) AS count_spanish
FROM films
WHERE language = 'Spanish';

count_spanish
40


In [16]:
%%sql
-- German-language films released earlier than 2000
SELECT
    title,
    release_year
FROM films
WHERE release_year < 2000
    AND language = 'German';

title,release_year
Metropolis,1927.0
Pandora's Box,1929.0
The Torture Chamber of Dr. Sadism,1967.0
Das Boot,1981.0
Run Lola Run,1998.0
Aimee & Jaguar,1999.0


In [17]:
%%sql
-- German-language films released after 2000 and before 2010.
-- Both comparisons are strict, so the boundary years are excluded (2001 through 2009).
-- Columns are listed explicitly, in table order, so the result shape does not
-- silently change if the films table gains or reorders columns.
SELECT
    id,
    title,
    release_year,
    country,
    duration,
    language,
    certification,
    gross,
    budget
FROM films
WHERE release_year > 2000
    AND release_year < 2010
    AND language = 'German';

id,title,release_year,country,duration,language,certification,gross,budget
1952,Good Bye Lenin!,2003.0,Germany,121.0,German,R,4063859.0,4800000.0
2130,Downfall,2004.0,Germany,178.0,German,R,5501940.0,13500000.0
2224,Summer Storm,2004.0,Germany,98.0,German,R,95016.0,2700000.0
2709,The Lives of Others,2006.0,Germany,137.0,German,R,11284657.0,2000000.0
3100,The Baader Meinhof Complex,2008.0,Germany,184.0,German,R,476270.0,20000000.0
3143,The Wave,2008.0,Germany,107.0,German,,,5000000.0
3220,Cargo,2009.0,Switzerland,112.0,German,,,4500000.0
3346,Soul Kitchen,2009.0,Germany,99.0,German,,274385.0,4000000.0
3412,The White Ribbon,2009.0,Germany,144.0,German,R,2222647.0,12000000.0


In [18]:
%%sql
-- English or Spanish films from 1990 or 1999 that grossed more than 2,000,000
SELECT
    title,
    release_year
FROM films
WHERE release_year IN (1990, 1999)
    AND language IN ('English', 'Spanish')
    AND gross > 2000000;

title,release_year
Arachnophobia,1990.0
Back to the Future Part III,1990.0
Child's Play 2,1990.0
Dances with Wolves,1990.0
Days of Thunder,1990.0
Dick Tracy,1990.0
Die Hard 2,1990.0
Edward Scissorhands,1990.0
Flatliners,1990.0
Ghost,1990.0


In [19]:
%%sql
-- Spanish or French films from 1990 through 2000 (inclusive) with a budget above 100,000,000
SELECT
    title,
    release_year
FROM films
WHERE release_year BETWEEN 1990 AND 2000
    AND budget > 100000000
    AND language IN ('Spanish', 'French');

title,release_year
Tango,1998.0
Les couloirs du temps: Les visiteurs II,1998.0


In [20]:
%%sql
-- People whose name begins with "B" (% stands for any run of characters)
SELECT
    name
FROM people
WHERE name LIKE 'B%';

name
B.J. Novak
Babak Najafi
Babar Ahmed
Bahare Seddiqi
Bai Ling
Bailee Madison
Balinese Tari Legong Dancers
Bálint Péntek
Baltasar Kormákur
Balthazar Getty


In [21]:
%%sql
-- People whose name has "r" as its second character (_ stands for exactly one character)
SELECT
    name
FROM people
WHERE name LIKE '_r%';

name
Ara Celi
Aramis Knight
Arben Bajraktaraj
Arcelia Ramírez
Archie Kao
Archie Panjabi
Aretha Franklin
Ari Folman
Ari Gold
Ari Graynor


In [22]:
%%sql
-- People whose name does not begin with "A"
SELECT
    name
FROM people
WHERE name NOT LIKE 'A%';

name
50 Cent
Álex Angulo
Álex de la Iglesia
Ángela Molina
B.J. Novak
Babak Najafi
Babar Ahmed
Bahare Seddiqi
Bai Ling
Bailee Madison


In [23]:
%%sql
-- Films running longer than 120 minutes that were released in 1990 or 2000
SELECT
    title,
    release_year
FROM films
WHERE release_year IN (1990, 2000)
    AND duration > 120;

title,release_year
Dances with Wolves,1990.0
Die Hard 2,1990.0
Ghost,1990.0
Goodfellas,1990.0
Mo' Better Blues,1990.0
Pretty Woman,1990.0
The Godfather: Part III,1990.0
The Hunt for Red October,1990.0
All the Pretty Horses,2000.0
Almost Famous,2000.0


In [24]:
%%sql
-- Films in English, Spanish, or French
SELECT
    title,
    language
FROM films
WHERE language IN ('English', 'Spanish', 'French');

title,language
The Broadway Melody,English
Hell's Angels,English
A Farewell to Arms,English
42nd Street,English
She Done Him Wrong,English
It Happened One Night,English
Top Hat,English
Modern Times,English
The Charge of the Light Brigade,English
Snow White and the Seven Dwarfs,English


In [25]:
%%sql
-- NC-17 or R certified films in English, Italian, or Greek
SELECT
    title,
    certification,
    language
FROM films
WHERE certification IN ('NC-17', 'R')
    AND language IN ('English', 'Italian', 'Greek');

title,certification,language
Pink Flamingos,NC-17,English
The Evil Dead,NC-17,English
Showgirls,NC-17,English
Orgazmo,NC-17,English
L.I.E.,NC-17,English
Psycho,R,English
Rosemary's Baby,R,English
The Wild Bunch,R,English
Catch-22,R,English
Cotton Comes to Harlem,R,English


In [26]:
%%sql
-- Distinct titles of English-language films from 1990 through 1999
-- that carry a G, PG, or PG-13 certification
SELECT
    COUNT(DISTINCT title) AS nineties_english_films_for_teens
FROM films
WHERE release_year BETWEEN 1990 AND 1999
    AND language = 'English'
    AND certification IN ('G', 'PG', 'PG-13');

nineties_english_films_for_teens
310


### `NULL`

The `NULL` value represents a missing or unknown value. You can test for `NULL` values using the `IS NULL` and `IS NOT NULL` operators in the `WHERE` clause.

Note that `COUNT(column_name)` doesn't count `NULL` values, whereas `COUNT(*)` does.

In [27]:
%%sql
-- Titles of films whose budget is missing
SELECT
    title AS no_budget_info
FROM films
WHERE budget IS NULL;

no_budget_info
Pandora's Box
The Prisoner of Zenda
The Blue Bird
Bambi
State Fair
Open Secret
Deadline - U.S.A.
Ordet
The Party's Over
The Torture Chamber of Dr. Sadism


In [28]:
%%sql
-- How many films have a recorded language
SELECT
    COUNT(*) AS count_language_known
FROM films
WHERE language IS NOT NULL;

count_language_known
4957


## Summarizing

### Aggregate Functions

In addition to `COUNT`, there are a number of other _aggregate functions_. Common functions include:
  * `AVG`
  * `SUM`
  * `MIN`
  * `MAX`
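  * `ROUND` (strictly a scalar function rather than an aggregate, but commonly wrapped around an aggregate's result)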

`AVG`, `SUM` and `ROUND` can only be used with numeric values; `MIN` and `MAX` may be used with various data types.

In [29]:
%%sql
-- Combined running time of all films (NULL durations are ignored by SUM)
SELECT
    SUM(duration) AS total_duration
FROM films;

total_duration
534882.0


In [30]:
%%sql
-- Mean running time across films that have a duration
SELECT
    AVG(duration) AS average_duration
FROM films;

average_duration
107.94793138244198


In [31]:
%%sql
-- Most recent release year in the table
SELECT
    MAX(release_year) AS latest_year
FROM films;

latest_year
2016.0


In [32]:
%%sql
-- Smallest recorded running time
SELECT
    MIN(duration) AS shortest_film
FROM films;

shortest_film
7.0


In [33]:
%%sql
-- Combined gross of films released in 2000 or later
SELECT
    SUM(gross) AS total_gross
FROM films
WHERE release_year >= 2000;

total_gross
150900926358.0


In [34]:
%%sql
-- Mean gross of films whose title begins with "A"
SELECT
    AVG(gross) AS avg_gross_A
FROM films
WHERE title LIKE 'A%';

avg_gross_A
47893236.42248062


In [35]:
%%sql
-- Smallest recorded gross among films released in 1994
SELECT
    MIN(gross) AS lowest_gross
FROM films
WHERE release_year = 1994;

lowest_gross
125169.0


In [36]:
%%sql
-- Largest gross among films released from 2000 through 2012 (both years included)
SELECT
    MAX(gross) AS highest_gross
FROM films
WHERE release_year BETWEEN 2000 AND 2012;

highest_gross
760505847.0


In [37]:
%%sql
-- Mean Facebook likes per review, rounded to one decimal place
SELECT
    ROUND(AVG(facebook_likes), 1) AS avg_facebook_likes
FROM reviews;

avg_facebook_likes
7802.9


In [38]:
%%sql
-- Mean budget rounded to the nearest thousand
-- (a negative precision rounds to the left of the decimal point)
SELECT
    ROUND(AVG(budget), -3) AS avg_budget_thousands
FROM films;

avg_budget_thousands
39903000.0


### Aliasing and Arithmetic

You can use the `AS` keyword to alias column names and values in the result set. You can use arithmetic operators like `+`, `-`, `*` and `/` in the `SELECT` clause to compute values.

In [39]:
%%sql
-- Running time converted from minutes to hours, rounded to two decimals
SELECT
    title,
    ROUND(duration / 60.0, 2) AS duration_hours
FROM films;

title,duration_hours
Intolerance: Love's Struggle Throughout the Ages,2.05
Over the Hill to the Poorhouse,1.83
The Big Parade,2.52
Metropolis,2.42
Pandora's Box,1.83
The Broadway Melody,1.67
Hell's Angels,1.6
A Farewell to Arms,1.32
42nd Street,1.48
She Done Him Wrong,1.1


In [40]:
%%sql
-- Share of people with a recorded deathdate, as a percentage.
-- COUNT(deathdate) counts only non-NULL values; COUNT(*) counts every row.
SELECT
    COUNT(deathdate) * 100.0 / COUNT(*) AS percentage_dead
FROM people;

percentage_dead
9.372394902941526


In [41]:
%%sql
-- Span between the earliest and latest release years, expressed in decades
SELECT
    (MAX(release_year) - MIN(release_year)) / 10.0 AS number_of_decades
FROM films;

number_of_decades
10.0


## Sorting and Grouping

### `ORDER BY`

The [`ORDER BY` clause](https://www.postgresql.org/docs/current/sql-select.html#SQL-ORDERBY) sorts the result set. `ASC` (default) or `DESC` can be used to specify sort order.

Note that `ORDER BY` always comes after `GROUP BY`, but before `LIMIT`.

In [42]:
%%sql
-- Every name in ascending (the default) order
SELECT
    name
FROM people
ORDER BY name ASC;

name
50 Cent
A. Michael Baldwin
A. Raven Cruz
A.J. Buckley
A.J. DeLucia
A.J. Langer
AJ Michalka
Aaliyah
Aaron Ashmore
Aaron Hann


In [43]:
%%sql
-- Films from longest to shortest.
-- NULLS LAST is spelled out because engines disagree on where NULL values land
-- under DESC (PostgreSQL sorts them first by default); without it, films with
-- no recorded duration could push the longest films off the top of the list.
SELECT
    title,
    duration
FROM films
ORDER BY duration DESC NULLS LAST;

title,duration
Carlos,334.0
"Blood In, Blood Out",330.0
Heaven's Gate,325.0
The Legend of Suriyothai,300.0
Das Boot,293.0
Apocalypse Now,289.0
The Company,286.0
Gods and Generals,280.0
Gettysburg,271.0
Arn: The Knight Templar,270.0


In [44]:
%%sql
-- Oldest films first; films from the same year are ordered shortest to longest
SELECT
    release_year,
    duration,
    title
FROM films
ORDER BY
    release_year ASC,
    duration ASC;

release_year,duration,title
1916.0,123.0,Intolerance: Love's Struggle Throughout the Ages
1920.0,110.0,Over the Hill to the Poorhouse
1925.0,151.0,The Big Parade
1927.0,145.0,Metropolis
1929.0,100.0,The Broadway Melody
1929.0,110.0,Pandora's Box
1930.0,96.0,Hell's Angels
1932.0,79.0,A Farewell to Arms
1933.0,66.0,She Done Him Wrong
1933.0,89.0,42nd Street


In [45]:
%%sql
-- Certifications A to Z; within each certification, newest films first
SELECT
    certification,
    release_year,
    title
FROM films
ORDER BY
    certification ASC,
    release_year DESC;

certification,release_year,title
Approved,1967.0,You Only Live Twice
Approved,1967.0,In Cold Blood
Approved,1967.0,Point Blank
Approved,1966.0,A Funny Thing Happened on the Way to the Forum
Approved,1966.0,A Man for All Seasons
Approved,1966.0,Batman: The Movie
Approved,1966.0,"The Good, the Bad and the Ugly"
Approved,1966.0,Torn Curtain
Approved,1965.0,Major Dundee
Approved,1965.0,Thunderball


### `GROUP BY`

The [`GROUP BY` clause](https://www.postgresql.org/docs/current/sql-select.html#SQL-GROUPBY) groups rows that have matching values in the specified column(s). It is frequently used with aggregate functions to calculate summary statistics for each "group". For example, you might want to get the average salary for each department.

In [46]:
%%sql
-- Number of films released in each year.
-- GROUP BY on its own guarantees nothing about row order, so the result is
-- sorted explicitly to keep the listing chronological and repeatable.
SELECT
    release_year,
    COUNT(*) AS film_count
FROM films
GROUP BY release_year
ORDER BY release_year;

release_year,film_count
1916.0,1
1920.0,1
1925.0,1
1927.0,1
1929.0,2
1930.0,1
1932.0,1
1933.0,2
1934.0,1
1935.0,1


In [47]:
%%sql
-- Mean running time of the films released in each year.
-- GROUP BY on its own guarantees nothing about row order, so the result is
-- sorted explicitly to keep the listing chronological and repeatable.
SELECT
    release_year,
    AVG(duration) AS avg_duration
FROM films
GROUP BY release_year
ORDER BY release_year;

release_year,avg_duration
1916.0,123.0
1920.0,110.0
1925.0,151.0
1927.0,145.0
1929.0,105.0
1930.0,96.0
1932.0,79.0
1933.0,77.5
1934.0,65.0
1935.0,81.0


In [48]:
%%sql
-- Largest budget for each (release year, country) pair, listed by year then country.
-- A group in which every budget is NULL yields a NULL maximum.
SELECT
    release_year,
    country,
    MAX(budget) AS max_budget
FROM films
GROUP BY
    release_year,
    country
ORDER BY
    release_year ASC,
    country ASC;

release_year,country,max_budget
1916.0,USA,385907.0
1920.0,USA,100000.0
1925.0,USA,245000.0
1927.0,Germany,6000000.0
1929.0,Germany,
1929.0,USA,379000.0
1930.0,USA,3950000.0
1932.0,USA,800000.0
1933.0,USA,439000.0
1934.0,USA,325000.0


### `HAVING`

`WHERE` filters individual records; `HAVING` filters grouped records. This is because of the execution order in SQL. `WHERE` filtering is done before grouping and aggregation, whereas `HAVING` filtering is done after.

In [49]:
%%sql
-- Countries whose films span more than 10 different certifications,
-- together with that number of certifications
SELECT
    country,
    COUNT(DISTINCT certification) AS certification_count
FROM films
GROUP BY country
HAVING COUNT(DISTINCT certification) > 10;

country,certification_count
USA,12


In [50]:
%%sql
-- Countries whose average film budget exceeds one billion, largest average first.
-- Filtering and sorting use the unrounded average; only the displayed value is
-- rounded to 2 decimals.
SELECT
    country,
    ROUND(AVG(budget), 2) AS average_budget
FROM films
GROUP BY country
HAVING AVG(budget) > 1000000000
ORDER BY AVG(budget) DESC;

country,average_budget
South Korea,1383960000.0
Hungary,1260000000.0


In [51]:
%%sql
-- Among years after 1990 whose average budget exceeds 60,000,000,
-- return the single year with the highest average gross.
-- WHERE trims rows before grouping; HAVING then filters the per-year averages.
-- NULLS LAST keeps a year with no gross figures at all from taking the top
-- slot on engines that sort NULLs first under DESC.
SELECT
    release_year,
    AVG(budget) AS avg_budget,
    AVG(gross) AS avg_gross
FROM films
WHERE release_year > 1990
GROUP BY release_year
HAVING AVG(budget) > 60000000
ORDER BY avg_gross DESC NULLS LAST
LIMIT 1;

release_year,avg_budget,avg_gross
2005.0,70323938.23152709,41159143.29064039
