# Lecture 11: Baseball Only Examples

In [1]:
import numpy as np
import pandas as pd

---

# Scalar Functions and Query Plans

In [2]:
## we'll use the Lahman baseball database in our examples today.
## replace the database connection with a database of your own!
## (Re)load the sql extension so the %sql / %%sql magics are available.
%reload_ext sql
## Connect to the `baseball` database on the local PostgreSQL server (port 5432).
%sql postgresql://localhost:5432/baseball
## Set SqlMagic's displaylimit option (how many result rows get rendered) to 20.
%config SqlMagic.displaylimit = 20

In [3]:
%%sql
WITH short_year AS (
    -- keep the full season alongside its last two digits
    SELECT year_id,
           year_id % 100 AS yy
    FROM batting
)
SELECT year_id,
       -- left-pad to two characters, then prefix an apostrophe
       CONCAT('''', LPAD(yy::text, 2, '0')) AS year
FROM short_year
LIMIT 5;

year_id,year
2004,'04
2007,'07
2009,'09
2010,'10
2012,'12


Let's analyze the below query (we've flattened it for convenience):

In [4]:
%%sql
EXPLAIN (VERBOSE)
-- VERBOSE makes each plan node list the expressions it outputs
SELECT
    year_id,
    CONCAT('''', LPAD((year_id % 100)::text, 2, '0')) AS year
FROM batting;


QUERY PLAN
Seq Scan on public.batting (cost=0.00..3922.29 rows=104324 width=36)
"Output: year_id, concat('''', lpad(((year_id % 100))::text, 2, '0'::text))"


What if scalar functions mention multiple tables?

The below query computes an arbitrary statistic for pitchers:
* 1 point for every strikeout they throw as a pitcher
* –1 point for every time they themselves struck out as a batter

If the notebook-like output is hard to read, try out the query in `psql`. Note that notebooks don't preserve whitespace when displaying dataframes.

In [5]:
%%sql
EXPLAIN (VERBOSE)
SELECT p.player_id,
       -- scalar expression that reads one column from each joined table
       p.so - b.so
FROM pitching AS p
JOIN batting AS b
  ON p.player_id = b.player_id;

QUERY PLAN
Nested Loop (cost=0.43..12961.23 rows=336307 width=13)
"Output: p.player_id, (p.so - b.so)"
-> Seq Scan on public.pitching p (cost=0.00..1374.06 rows=45806 width=13)
"Output: p.player_id, p.year_id, p.stint, p.team_id, p.lg_id, p.w, p.l, p.g, p.gs, p.cg, p.sho, p.sv, p.ipouts, p.h, p.er, p.hr, p.bb, p.so, p.baopp, p.era, p.ibb, p.wp, p.hbp, p.bk, p.bfp, p.gf, p.r, p.sh, p.sf, p.gidp"
-> Memoize (cost=0.43..0.73 rows=7 width=13)
"Output: b.so, b.player_id"
Cache Key: p.player_id
Cache Mode: logical
-> Index Scan using batting_pkey on public.batting b (cost=0.42..0.72 rows=7 width=13)
"Output: b.so, b.player_id"


### Window Functions

In [6]:
%%sql
SELECT name_first, name_last, year_id, HR,
       -- overall rank across every returned row (no partition)
       rank() OVER (ORDER BY HR DESC),
       -- Review note - ROWS 3 PRECEDING frames the current row plus the three
       -- rows before it (four rows), even though the alias says 3yr.
       avg(HR)    OVER (career ROWS 3 PRECEDING) AS avg_3yr,
       -- Review note - this looks back seven rows, not one, despite the alias.
       lag(HR, 7) OVER career AS previous,
       lag(HR, 2) OVER career AS lag2
FROM batting AS b
JOIN people AS p
  ON p.player_id = b.player_id
WHERE name_last = 'Bonds' OR name_last = 'Ruth'
-- shared per-player window, ordered by season
WINDOW career AS (PARTITION BY b.player_id ORDER BY year_id)
ORDER BY HR DESC
LIMIT 10;

name_first,name_last,year_id,hr,rank,avg_3yr,previous,lag2
Barry,Bonds,2001,73,1,48.25,37.0,34
Babe,Ruth,1927,60,2,44.5,54.0,25
Babe,Ruth,1921,59,3,38.25,0.0,29
Babe,Ruth,1920,54,4,24.0,,11
Babe,Ruth,1928,54,4,46.5,59.0,47
Barry,Bonds,2000,49,6,40.0,46.0,37
Babe,Ruth,1930,49,6,52.25,41.0,54
Babe,Ruth,1926,47,8,39.75,29.0,46
Barry,Bonds,1993,46,9,34.5,16.0,25
Barry,Bonds,2002,46,9,50.5,33.0,49


### Inverse Distribution (Ordered-Set) Aggregate Functions

In [7]:
%%sql
SELECT MIN(HR),
       -- ordered-set aggregates - each returns the value at the given fraction
       -- of the HR ordering (percentile_cont interpolates when needed)
       percentile_cont(0.25) WITHIN GROUP (ORDER BY HR) AS p25,
       percentile_cont(0.50) WITHIN GROUP (ORDER BY HR) AS median,
       percentile_cont(0.75) WITHIN GROUP (ORDER BY HR) AS p75,
       percentile_cont(0.99) WITHIN GROUP (ORDER BY HR) AS p99,
       MAX(HR),
       -- Review note - HR is treated as home runs elsewhere in this notebook,
       -- so this is a mean home-run count; the hit-rate wording of the alias
       -- looks like a misnomer (confirm before renaming).
       AVG(HR) AS "average hit rate"
FROM batting;

min,p25,median,p75,p99,max,average hit rate
0,0.0,0.0,2.0,31.0,73,2.831582377976305


In [8]:
%%sql
-- how many batting rows exist for each HR value, largest HR first
SELECT HR,
       COUNT(*)
FROM batting
GROUP BY HR
ORDER BY HR DESC;

hr,count
73,1
70,1
66,1
65,1
64,1
63,1
61,1
60,1
59,2
58,3


### Hypothetical-Set Aggregate Functions

In [9]:
hrs = 4  # hypothetically, four home runs; the next %%sql cell interpolates this as {{hrs}}

In [10]:
%%sql
SELECT {{hrs}} as hypothetical,
       -- each aggregate answers where a hypothetical row with this HR value
       -- would fall within the HR ordering of batting
       rank({{hrs}}) WITHIN GROUP (ORDER BY HR DESC),
       dense_rank({{hrs}}) WITHIN GROUP (ORDER BY HR DESC),
       percent_rank({{hrs}}) WITHIN GROUP (ORDER BY HR DESC) * 100 AS pct_rank,
       -- unlike the three above, this one orders HR ascending
       cume_dist({{hrs}}) WITHIN GROUP (ORDER BY HR)
FROM batting
-- the aggregates collapse batting to a single row, so LIMIT has no effect here
LIMIT 10;

hypothetical,rank,dense_rank,pct_rank,cume_dist
4,18420,63,17.655573022506807,0.823445962137551


Without `jupysql` variable substitution:

In [11]:
%%sql
-- same query as the templated version, with the literal 4 written in place
SELECT 4 as hypothetical,
       rank(4) WITHIN GROUP (ORDER BY HR DESC),
       dense_rank(4) WITHIN GROUP (ORDER BY HR DESC),
       percent_rank(4) WITHIN GROUP (ORDER BY HR DESC) * 100 AS pct_rank,
       -- ascending order here, descending in the three above
       cume_dist(4) WITHIN GROUP (ORDER BY HR)
FROM batting
LIMIT 10;

hypothetical,rank,dense_rank,pct_rank,cume_dist
4,18420,63,17.655573022506807,0.823445962137551


<hr style="height: 3px">

# Demo 2: Connections to Statistics
## ...back to baseball... ⚾️

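## Roll up with marginal distributions

*Note: the cells in this section query `national`, `state_elevations`, and `fips_counties` rather than the baseball tables, so they presumably need a connection to the database that holds those relations; the notebook reconnects to `baseball` again further down.*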

In [36]:
%%sql
SELECT state_numeric,
       AVG(elev_in_m),
       -- STDDEV comes back empty for a group that holds a single sampled row
       STDDEV(elev_in_m), COUNT(*),
       -- window over the grouped rows - sums the per-state counts into a grand total
       SUM(COUNT(*)) OVER () AS total,
       -- share of the whole sample that falls in this state
       COUNT(*)/SUM(COUNT(*)) OVER () AS marginal
-- the Bernoulli argument is a percentage (.07 means 0.07 percent of rows);
-- with no REPEATABLE clause the sample differs on every run
FROM national TABLESAMPLE Bernoulli(.07)
GROUP BY state_numeric;

state_numeric,avg,stddev,count,total,marginal
1,62.0,,1,6,0.1666666666666666
2,95.0,,1,6,0.1666666666666666
6,835.0,,1,6,0.1666666666666666
12,3.0,,1,6,0.1666666666666666
48,129.0,,1,6,0.1666666666666666
55,391.0,,1,6,0.1666666666666666


In [37]:
%%sql
-- number of distinct county_numeric values appearing in national
SELECT COUNT(DISTINCT county_numeric)
FROM national;

count
291


## Drill down with normally-distributed elevations:

Start with the `state_elevations` view from earlier:

In [38]:
# Display the state_elevations view: per-state avg, stddev, and count.
%sql SELECT * FROM state_elevations;

state_numeric,avg,stddev,count
54,363.6190476190476,199.26650831834743,204
29,246.09152542372885,80.2483078596168,343
68,6.666666666666667,7.99166232186187,14
4,1315.3798076923076,672.6305522946129,208
34,40.08943089430894,59.88896941733248,123
51,254.55197132616487,260.54513270095333,283
70,18.33333333333333,31.75426480542942,3
10,22.11111111111111,28.015563440198648,27
35,1756.8467432950192,471.8002505531821,273
45,122.83240223463687,123.96059930539184,181


The `fips_counties` relation has all counties, including those not in `national`:

In [39]:
# Peek at up to 10 rows of fips_counties (fips, county, state_numeric).
%sql SELECT * FROM fips_counties LIMIT 10;

fips,county,state_numeric
1000,Alabama,1
1001,Autauga County,1
1003,Baldwin County,1
1005,Barbour County,1
1007,Bibb County,1
1009,Blount County,1
1011,Bullock County,1
1013,Butler County,1
1015,Calhoun County,1
1017,Chambers County,1


If we wanted to **drill down** to the FIPS counties, we'd need to simulate an elevation for those counties that don't exist in `national`.

Here's the first step in that process, which creates a simulated value for *every* county in `fips_counties`.
* The value is simulated from a normal distribution using that state's elevation statistics (average, standard deviation).
* Just like a Python package, we would need to import `tablefunc` in order to use the `normal_rand` function.

In [40]:
# Enable the tablefunc extension (skipped if it already exists); normal_rand comes from it.
%sql CREATE EXTENSION IF NOT EXISTS tablefunc;

In [41]:
%%sql
WITH state_cty AS
-- pair every FIPS county with the elevation statistics of its state
(SELECT s.state_numeric, f.fips as county_numeric, s.avg, s.stddev, s.count
  FROM state_elevations s, fips_counties f
  WHERE s.state_numeric = f.state_numeric
)
SELECT s.*,
       n.n AS elev_in_m,
       true as elev_in_m_sim -- user-facing flag
  FROM state_cty s,
       -- LATERAL lets the function call read columns of the current state_cty row.
       -- Review note - the first argument of normal_rand is how many values to
       -- generate, so this emits s.count simulated rows per county (the sample
       -- output repeats county 1000). If a single value per county is intended,
       -- that argument should be 1 (confirm).
       LATERAL normal_rand(CAST(s.count AS INTEGER), s.avg, s.stddev) AS n
LIMIT 10;

state_numeric,county_numeric,avg,stddev,count,elev_in_m,elev_in_m_sim
1,1000,146.37888198757764,102.92185851771194,339,-20.93026613338796,True
1,1000,146.37888198757764,102.92185851771194,339,58.95021780764658,True
1,1000,146.37888198757764,102.92185851771194,339,212.70376441982532,True
1,1000,146.37888198757764,102.92185851771194,339,205.5423727961629,True
1,1000,146.37888198757764,102.92185851771194,339,77.34247267304332,True
1,1000,146.37888198757764,102.92185851771194,339,70.2839794758738,True
1,1000,146.37888198757764,102.92185851771194,339,27.07306266006553,True
1,1000,146.37888198757764,102.92185851771194,339,-73.27433823237695,True
1,1000,146.37888198757764,102.92185851771194,339,-109.03278980437204,True
1,1000,146.37888198757764,102.92185851771194,339,180.1364815259515,True


# Assembling an Explicit Hierarchy

In [42]:
## we'll use the Lahman baseball database in our initial examples today.
## replace the database connection with a database of your own!
## (Re)load the sql extension, then point the connection at the `baseball` database.
%reload_ext sql
%sql postgresql://localhost:5432/baseball

Two relations have the pieces of the hierarchy we want:

In [43]:
# Two sample Appearances rows from seasons after 1970.
%sql SELECT * FROM Appearances WHERE year_id > 1970 LIMIT 2;

year_id,team_id,lg_id,player_id,g_all,gs,g_batting,g_defense,g_p,g_c,g_1b,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_of,g_dh,g_ph,g_pr
1971,ATL,NL,aaronha01,139,129,139,129,0,0,71,0,0,0,0,0,60,60,0,10,0
1971,ATL,NL,aaronto01,25,10,25,18,0,0,11,0,7,0,0,0,0,0,0,8,0


In [44]:
# One sample Teams row, to see which columns the relation carries.
%sql SELECT * FROM Teams LIMIT 1;

year_id,lg_id,team_id,franch_id,div_id,rank,g,ghome,w,l,divwin,wcwin,lgwin,wswin,r,ab,h,h2b,h3b,hr,bb,so,sb,cs,hbp,sf,ra,er,era,cg,sho,sv,ipouts,ha,hra,bba,soa,e,dp,fp,name,park,attendance,bpf,ppf,team_idbr,team_idlahman45,team_idretro
1871,,BS1,BNA,,3,31,,20,10,,,N,,401,1372,426,70,37,3,60,19,73,16,,,303,109,3.55,22,1,3,828,367,2,42,23,243,24,0.834,Boston Red Stockings,South End Grounds I,,103,98,BOS,BS1,BS1


Let's join these two to make our hierarchy! Which way should we make this?

In [45]:
%%sql
SELECT a.player_id, a.team_id, t.div_id, a.*
FROM Appearances AS a
-- join keys written out - these are the three column names both tables share
JOIN Teams AS t USING (year_id, lg_id, team_id)
WHERE a.year_id = 2015
LIMIT 100;

player_id,team_id,div_id,year_id,team_id_1,lg_id,player_id_1,g_all,gs,g_batting,g_defense,g_p,g_c,g_1b,g_2b,g_3b,g_ss,g_lf,g_cf,g_rf,g_of,g_dh,g_ph,g_pr
alvarda02,BAL,E,2015,BAL,AL,alvarda02,12,10,12,12,0,0,0,0,0,0,0,1,12,12,0,0,0
brachbr01,BAL,E,2015,BAL,AL,brachbr01,62,0,5,62,62,0,0,0,0,0,0,0,0,0,0,0,0
brittza01,BAL,E,2015,BAL,AL,brittza01,64,0,2,64,64,0,0,0,0,0,0,0,0,0,0,0,0
cabrace01,BAL,E,2015,BAL,AL,cabrace01,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0
cabreev01,BAL,E,2015,BAL,AL,cabreev01,29,28,29,28,0,0,0,2,0,27,0,0,0,0,0,0,1
chenwe02,BAL,E,2015,BAL,AL,chenwe02,31,31,0,31,31,0,0,0,0,0,0,0,0,0,0,0,0
clevest01,BAL,E,2015,BAL,AL,clevest01,30,24,30,10,0,9,1,0,0,0,0,0,0,0,18,4,0
davisch02,BAL,E,2015,BAL,AL,davisch02,160,159,160,138,0,0,111,0,0,0,0,0,30,30,22,0,0
deazaal01,BAL,E,2015,BAL,AL,deazaal01,30,27,30,27,0,0,0,0,0,0,19,0,13,27,0,3,0
drakeol01,BAL,E,2015,BAL,AL,drakeol01,13,0,1,13,13,0,0,0,0,0,0,0,0,0,0,0,0


In [47]:
%%sql
CREATE OR REPLACE VIEW bball_tree AS (
    -- Hierarchy lookup with one row per distinct
    -- (player_id, team_id, div_id, lg_id, year_id) combination.
    SELECT DISTINCT
        a.player_id, a.team_id, t.div_id,
        a.lg_id, a.year_id
    FROM appearances AS a
    -- Explicit keys instead of NATURAL JOIN. These are the three columns the
    -- two tables currently share, so the rows are unchanged, but the view no
    -- longer silently changes meaning if either table gains another
    -- same-named column before this cell is re-run.
    JOIN teams AS t USING (year_id, lg_id, team_id)
);

In [48]:
# Sample up to 25 rows of the bball_tree view.
%sql SELECT * FROM bball_tree LIMIT 25;

player_id,team_id,div_id,lg_id,year_id
gumbeha01,NY1,,NL,1935
gradymi01,SLN,,NL,1897
deshoji01,WS1,,AL,1938
prattla01,BRF,,FL,1915
thompsa01,PHI,,NL,1890
hollica01,DET,,AL,1922
halege01,SLA,,AL,1916
mamaual01,NYA,,AL,1924
henryji01,BOS,,AL,1937
cristch01,PHI,,NL,1906


### Revisiting the Home Run Query

Recall our old home run query:

In [49]:
%%sql
-- per (name, season) home-run summary, highest single-row HR first
SELECT name_first, name_last, year_id,
       MIN(hr), MAX(hr), AVG(hr), STDDEV(hr), SUM(hr)
FROM batting AS b
JOIN people AS p
  ON b.player_id = p.player_id
GROUP BY name_last, name_first, year_id
ORDER BY MAX(hr) DESC
LIMIT 10;

name_first,name_last,year_id,min,max,avg,stddev,sum
Barry,Bonds,2001,73,73,73.0,,73
Mark,McGwire,1998,70,70,70.0,,70
Sammy,Sosa,1998,66,66,66.0,,66
Mark,McGwire,1999,65,65,65.0,,65
Sammy,Sosa,2001,64,64,64.0,,64
Sammy,Sosa,1999,63,63,63.0,,63
Roger,Maris,1961,61,61,61.0,,61
Babe,Ruth,1927,60,60,60.0,,60
Babe,Ruth,1921,59,59,59.0,,59
Giancarlo,Stanton,2017,59,59,59.0,,59


Set up for roll up/drill down on `bball_tree` hierarchy.
* Join each (raw) batting record with the associated `bball_tree` entry by `(player_id, year_id)` in a CTE
* Use this result for roll-up and drill-down.

(blank space before we get to the next exercise....)
<br/><br/><br/><br/><br/>
<br/><br/><br/><br/><br/>
<br/><br/><br/><br/><br/>
<br/><br/><br/><br/><br/>

In [50]:
%%sql
WITH batting_tree AS (
    -- Tag every batting row with the division of the team it was recorded for.
    SELECT b.*, t.div_id
    FROM batting b, bball_tree t
    WHERE b.player_id = t.player_id
      AND b.year_id = t.year_id
      -- bball_tree has one row per team a player appeared for in a season, so
      -- matching on player and season alone would pair each batting row with
      -- every one of those teams and attach the wrong div_id to some rows
      AND b.team_id = t.team_id
)
-- Home-run summary at the (player, team, league, division, season) level.
SELECT name_first, name_last,
       bt.team_id, bt.lg_id, bt.div_id, bt.year_id,
       MIN(hr), MAX(hr), AVG(hr), STDDEV(hr), SUM(hr)
FROM batting_tree bt, people p
WHERE bt.player_id = p.player_id
GROUP BY bt.player_id, bt.team_id, bt.lg_id, bt.div_id, bt.year_id, name_last, name_first
ORDER BY max DESC
LIMIT 10;


name_first,name_last,team_id,lg_id,div_id,year_id,min,max,avg,stddev,sum
Barry,Bonds,SFN,NL,W,2001,73,73,73.0,,73
Mark,McGwire,SLN,NL,C,1998,70,70,70.0,,70
Sammy,Sosa,CHN,NL,C,1998,66,66,66.0,,66
Mark,McGwire,SLN,NL,C,1999,65,65,65.0,,65
Sammy,Sosa,CHN,NL,C,2001,64,64,64.0,,64
Sammy,Sosa,CHN,NL,C,1999,63,63,63.0,,63
Roger,Maris,NYA,AL,,1961,61,61,61.0,,61
Babe,Ruth,NYA,AL,,1927,60,60,60.0,,60
Babe,Ruth,NYA,AL,,1921,59,59,59.0,,59
Giancarlo,Stanton,MIA,NL,E,2017,59,59,59.0,,59
