In [None]:
# this code is needed for the lecture, but I want to defer discussion of how it works to the next lecture...
%load_ext sql
%sql postgres://jovyan:si330studentuser@localhost:5432/si330
import psycopg2
import sqlalchemy
import pandas as pd
engine = sqlalchemy.create_engine('postgres://jovyan:si330studentuser@localhost:5432/si330')
%sql drop table if exists person cascade
pd.read_csv("person.csv").to_sql("person",engine)
%sql drop table if exists sales cascade
pd.read_csv("sales.csv").to_sql("sales",engine)

# A brief reminder of the power of SQL
* There is no inherent ordering of rows in SQL -- unless you ask for one with an `order by`, rows can come back in any order
* Everything is a table (relation), and we can thus pass around select statements as if they were tables
* This allows us to make subselects, and doing so can be very efficient
* We can restrict which columns we get back by naming them in the select, and we can use where clauses to restrict which rows we get back based on criteria
* We can run aggregation functions across sets of data, and we can group data by column values if we want to

In [None]:
%%sql 
-- Summarise sales priced from 1000 to 2000 (between is inclusive), one row per credit card provider
select
   count(*) as num,
   avg(price) as avg_price,
   sum(price) as total,
   credit_card_provider 
from
   sales 
where
   price between 1000 and 2000 
   -- the subselect is used as if it were a table and keeps only sales whose id appears in person
   and id in 
   (
      select
         id 
      from
         person
   )
group by
   credit_card_provider

# Knowing SQL is like waterbending
* Practice shaping the data to your needs, not fighting to force it to be your result
* Think about writing complex SQL queries as a choreography of manipulations and not a single monolithic "correct" statement
<img src="https://media1.tenor.com/images/2bea107f3e0d43b0610ab11324ba2b6d/tenor.gif?itemid=4849726" />

# Joins
* It is common to bring together datasets through SQL joins
* Unfortunately, there are many different syntaxes for joining in SQL, which can impede learning
* Just like in pandas, joining in SQL (which we called merging in pandas) is based on 2 tables only, a left and a right
* A quick reminder on joining in general

<img src="https://i.stack.imgur.com/hMKKt.jpg" />

In [None]:
# to join we need a common column between the two tables
# usually this column is unique in one table (like a person
# identifier in our person table)
%sql select * from person limit 1;

In [None]:
# and usually this column is not unique in the other table (like
# the person identifier in the sales table)
%sql select * from sales limit 1;

In [None]:
# whoever designed this data did a poor job, since it's unclear
# where we expect to link data. We can strengthen this in a few ways,
# including key constraints. Or a better naming convention, like
%sql alter table sales rename column id to person_id;

In [None]:
%sql select * from sales limit 1;

In [None]:
# in this instance, we might want to answer a question like "Give me all
# of the sales and people information for people who bought things". This is
# an inner join, since we want all data but on strict condition checking.
%sql select * from sales s, person p where s.person_id=p.id limit 5;

In [None]:
# we can limit these columns to just include those from the two named tables
# using glob functions, or specific names
%sql select p.*, s.price, s.purchase_date from sales s, person p where s.person_id=p.id limit 5;

In [None]:
# ok, the inner join is easy, and that's the syntax I use all the time
# but we can do the same inner join using the SQL join syntax, and this gives us the ability
# to do other kinds of joins as well
# the join syntax is
# SELECT FROM table1 JOIN table2 ON table1.column=table2.column
%sql select * from person p join sales s on p.id=s.person_id limit 2;

* to do the other kinds of joins, we just change JOIN to one of
  * INNER JOIN (this is a synonym for what we already have)
  * LEFT JOIN (give us everything in the left as well as things that might match in the right, or null if no match)
  * RIGHT JOIN (give us everything in the right as well as things that might match in the left, or null if no match)
  * FULL JOIN (give us everything everywhere with nulls as appropriate)

In [None]:
# First, let's do a sanity check: how many people are in the system but have not bought anything?
# NOTE(review): `not in` matches no rows at all if the subquery returns any NULL, so this
# count assumes sales.person_id is never NULL -- confirm against the data
%sql select count(*) from person p where p.id not in (select person_id from sales);

In [None]:
# and how many people are in the sales but are not in the system?
%sql select count(*) from sales s where s.person_id not in (select id from person);

In [None]:
# So our sales table is, with respect to person identifier, a subset of our person table

# This means that if we do a left join on person, we will have all of the data from sales
# and all of the data from person, including details of people who have no sales

# Question 1 -- how many rows should we expect back in our result set if we do this right?

# Question 2 -- how would you write the query for this, given what I just wrote above?

In [None]:
# To demonstrate the other kinds of joins we need to remove some data from persons so that
# the one table isn't a complete superset of the other. This is just for demonstration purposes
%sql delete from person where id %2=0

In [None]:
# What did I just do?

In [None]:
# Ok, let's demonstrate the inner join again
%sql select p.first_name, s.price from person p join sales s on p.id=s.person_id;

In [None]:
# now if I left join it, I could have more records. These would be people who are in our table
# but are not in our sales table. And we should see some null values on the right hand side (sales)
%sql select p.first_name, s.price from person p left join sales s on p.id=s.person_id;

In [None]:
# now if I right join it, I could have more records. These would be sales that are in our sales table
# but are not in our people table. And we should see some null values on the left hand side (person)
%sql select p.first_name, s.price from person p right join sales s on p.id=s.person_id limit 10;

In [None]:
# now a full join will bring me the most rows back, because it will bring me back all people and
# all sales regardless of whether they exist in each table, but it will align them if possible (and
# null pad if not)
%sql select p.first_name, s.price from person p full join sales s on p.id=s.person_id;

# Joining
* joining is fundamental in SQL
* I've covered it here in a nutshell, you can already use your pandas powers to do this!
* You can use joining to weed out values, (many ways to skin a cat with SQL!)

In [None]:
# how would we get the top 25 people we have made money from?

And frankly, this makes SQL some bad assed waterbending.
<img src="https://img.memecdn.com/badass-korra_o_3075605.jpg" />