In [None]:
%load_ext sql
%sql postgres://jovyan:si330studentuser@localhost:5432/si330

* To get data out of the database and work with it we use the SELECT statement
* The select statement requires us at a minimum to indicate the columns we are interested in and the table we are interested in
* Base form is SELECT cols FROM table
* Note that the return value of the select statement is itself a relation (table)

In [None]:
%sql select first_name from person;
# we can limit this to a certain number of rows with the LIMIT clause
%sql select first_name from person limit 10;

* Statements in SQL are terminated with a semicolon, but when we are executing them in python the library which handles the connection will generally terminate the statement for us at the end of the line
* Notice that the return relation isn't sorted. It's up to the database to determine what order the items you get are in
* Beyond this, the database can choose any ten items when you make a LIMIT call. There is no intrinsic ordering of your results unless you ask for one with ORDER BY, though some database vendors may happen to return rows based on recency

In [None]:
# when you select multiple columns from a single database the results are row consistent, e.g. the first name and last names align
%sql select first_name, last_name from person limit 5

In [None]:
# it's common to use an * as a wildcard for any column
%sql select * from person limit 5

In [None]:
# we can limit the results we want to return using a WHERE clouse
%sql select first_name from person where last_name='King' limit 5

In [None]:
# note the single quotes for strings in sql!
# there are two wildcard options with varchar matching in SQL:
# _ matches a single character
# % matches any number of characters
# to use these we must use the LIKE operator

# find all people who have a name which starts with Chris
%sql select first_name from person where first_name like 'Chris%' limit 5

In [None]:
%sql select first_name from person where first_name like 'Chris_' limit 5

* Unfortunately, this form of string comparison is super limited. But it's pretty easy to optimize to be fast, so you should be aware of how to use it
* SQL has no regex functionality built into it :(
* (But the dirty truth is everyone loves regex so much you can use regex with a few custom functions)
* String matching is, of course, case sensitive
* We can negate the like operator too with NOT

In [None]:
%sql select first_name from person where last_name not like 'K%' limit 5

In [None]:
# for numeric columns we can also use our regular numeric operators
%sql select * from person where index <2

In [None]:
# SQL uses an odd syntax for not equals, the <> operator
%sql select * from person where index <> 2 limit 5

In [None]:
# we can chain multiple where comparisons together using AND
%sql select * from person where index <100 and index > 20 and first_name like '%ar%' limit 5

In [None]:
# how would you have written the above in pandas?

In [None]:
# another nice function in sql for ranges is BETWEEN and NOT BETWEEN (which are inclusive)
%sql select * from person where index between 10 and 13

In [None]:
# another important operator is IN, which does set comparison
%sql select * from person where first_name in ('Christopher','Michael','George') limit 10

In [None]:
# it is common to format your sql statements over multiple lines, and in jupyter we can do this with the sql cell magic
# we use -- to denote a comment

In [None]:
%%sql
select first_name, last_name, street_address
from person
where first_name like 'Chris%' -- first name has to start with Chris
and last_name like '____' -- last name has to be exactly four characters long, one _ per character
-- note that all the text highlighting/code formatting is off because jupyter thinks this is python

In [None]:
# in sql we can use aggregation functions as well.
# An aggregation converts a vector into a scalar, just like in pandas
# lots of values in, one value out.
# we use these on the columns
%sql select count(first_name) from person
# How many first_name rows are there in the table person?

In [None]:
# It's more common to see people count all of the columns, functionally there is a difference but the pattern is so common
# datbases return the result quickly. Of course, the length (count) of each column is the same
%sql select count(*) from person

In [None]:
# lots of other aggregation functions exist as you might expect
%sql select max(index) from person

In [None]:
%sql select max(index), min(index), count(*) from person

In [None]:
# this is an interesting query, because the return table has one row, three columns, and is just a bunch of
# summary information. Remember, the return value of a select statement is always itself a table (relation)
# How might we try and get a list of all unique firstnames with a count of how many occur in our dataset?

In [None]:
# first attempt: put the aggregate count(first_name) right next to the plain first_name column
# NOTE(review): there is no GROUP BY here, so PostgreSQL is expected to reject this query rather
# than return rows - presumably that failure is the point of the demo; confirm before "fixing" it
%sql select count(first_name), first_name from person

In [None]:
# this doesn't do what we want, just like in pandas we need to tell SQL how we want to group the data.
# once we group the data then the return result is just a combination of the aggregation functions
%sql select first_name, count(first_name) from person group by first_name limit 5

In [None]:
# this is just like we've been doing in pandas!

In [None]:
%%sql
SELECT first_name, COUNT(*)
FROM person
WHERE first_name LIKE 'Chris%' -- keep only the first names that start with Chris
GROUP BY first_name -- one output row per distinct first name

In [None]:
# just like in pandas we can group by multiple columns. This means we need a unique combination of the two columns
# remember that cell magics (%%) must start the cell, can't have comments up top :(

In [None]:
%%sql
SELECT first_name, last_name, COUNT(*)
FROM person
WHERE first_name LIKE 'Chris%' -- keep only the first names that start with Chris
GROUP BY first_name, last_name -- one output row per distinct pair of names
LIMIT 5

In [None]:
# we can have other functions operate on columns as well these functions can be user defined (hard, unusual) or 
# are more likely built into the rdbms. They are not standard so each engine has it's own set of functions
# Often this breaks with SQL norms as well, which can be frustrating for portability
%sql select * from person where first_name ~ '[A|B|C].*' limit 5
# this returns the number of people who have a name starting with A B or C
# yum, regex!

In [None]:
# ok, you've heard me say again and again that every select returns a table, and we know that
# select statements work on tables, so why not have a select statement work on a select statement
# result?

# These are called subselects, and it's a beautiful beautiful thing!


In [None]:
%sql select * from person where first_name in (select first_name from person where city like 'Port%') and index <100