**Querying PostgreSQL in a Jupyter notebook**

Updated Padres info for SQL querying.

# Setup and custom tables

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import datetime

# Multiprocessing/threading
import multiprocess
import threading
from threading import Thread

# Code formatting with Jupyter black
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Web/database stuff
import sqlalchemy
import sqlalchemy_utils
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

# Multiprocessing/threading
import multiprocess
import threading  # included in base

<IPython.core.display.Javascript object>

In [3]:
# Report the installed version of every key dependency, one per line,
# so the environment that produced the outputs below is on record.
_versioned_modules = [
    ("numpy", np),
    ("pandas", pd),
    ("matplotlib", matplotlib),
    ("seaborn", sns),
    ("sklearn", sklearn),
    ("psycopg2", psycopg2),
    ("sqlalchemy", sqlalchemy),
    ("sqlalchemy_utils", sqlalchemy_utils),
    ("multiprocess", multiprocess),
]

for _label, _module in _versioned_modules:
    # Two positional args keep the original "name:  version" spacing.
    print(_label + ": ", _module.__version__)

numpy:  1.17.4
pandas:  0.25.3
matplotlib:  3.1.3
seaborn:  0.9.0
sklearn:  0.22
psycopg2:  2.8.4 (dt dec pq3 ext lo64)
sqlalchemy:  1.3.11
sqlalchemy_utils:  0.36.1
multiprocess:  0.70.9


<IPython.core.display.Javascript object>

In [4]:
from pybaseball import pitching_stats
from pybaseball import batting_stats
from pybaseball import statcast_pitcher
from pybaseball import playerid_lookup
from pybaseball import statcast
from pybaseball import playerid_reverse_lookup

<IPython.core.display.Javascript object>

In [5]:
# Database name and PostgreSQL role used by every connection in this notebook.
dbname = "baseball"
username = "lacar"  # change this to your username

# psycopg2 connection: handed to pd.read_sql_query for the raw SQL queries.
con = psycopg2.connect(database=dbname, user=username)

# SQLAlchemy engine: handed to DataFrame.to_sql when writing tables.
# Here, we're using postgres, but sqlalchemy can connect to other things too.
# Use the canonical "postgresql" dialect name: the "postgres" alias is
# deprecated in SQLAlchemy 1.x and no longer accepted from 1.4 onwards.
engine = create_engine("postgresql://%s@localhost/%s" % (username, dbname))
print(engine.url)

postgres://lacar@localhost/baseball


<IPython.core.display.Javascript object>

## Importing new data with pybaseball, sending to database

### Player key info

### Statcast data

In [56]:
# Today's date (local clock); used as the end of the Statcast date range
# requested in the update cell below.
current_date = datetime.date.today()
print(current_date)

2020-09-01


<IPython.core.display.Javascript object>

In [57]:
# Bring the statcast2020 table up to date: download the pitches between a
# start date and today, then append them to the existing table.
#
# NOTE: this appends unconditionally, so a window that overlaps rows already
# stored adds those rows a second time. This can be made more efficient
# (and safer) when appending.
#
# An earlier draft looped over several date pairs, writing the first in
# "replace" mode and the rest in "append" mode; it was abandoned
# ("don't think this makes sense"). The ("2016-04-03", "2016-10-02") range
# was never fetched.

date_list = [
    ("2020-08-30", str(current_date)),
]

# Only the first (start, end) pair is used.
date_pair = date_list[0]
start_date, end_date = date_pair

df_sc = statcast(start_date, end_date)
df_sc.to_sql("statcast2020", engine, if_exists="append")

<IPython.core.display.Javascript object>

In [58]:
# (rows, columns) of the Statcast frame that was just downloaded
df_sc.shape

(7923, 90)

<IPython.core.display.Javascript object>

In [59]:
# Column names of the downloaded Statcast frame
df_sc.columns

Index(['index', 'pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
     

<IPython.core.display.Javascript object>

In [60]:
# Query every unique player id that appears in statcast2020

# Pitcher ids and batter ids are combined into one column; UNION drops duplicates
sql_query = """
(SELECT DISTINCT pitcher FROM statcast2020)
UNION
(SELECT DISTINCT batter FROM statcast2020);
"""

# Note: the parentheses around each SELECT keep any ORDER BY or LIMIT
# scoped to that SELECT instead of applying to the whole UNION

p_from_sql = pd.read_sql_query(sql_query, con)
# Take the first (only) result column as a plain Python list of ids
p_list = p_from_sql.iloc[:, 0].tolist()

# Number of unique players
print("No. of unique players: ", len(p_list))

No. of unique players:  1171


<IPython.core.display.Javascript object>

In [61]:
# Find the names of the players in p_list, along with their ids from other
# data sources (Retrosheet, Baseball-Reference, FanGraphs).
df_pid = playerid_reverse_lookup(p_list, key_type="mlbam")

# Append only players that player_id does not already hold. Appending the
# full lookup on every run stores the same player again each time, which is
# why a name lookup on player_id can return several identical rows.
# Rows already in the table are left untouched (not refreshed or de-duplicated).
with engine.connect() as connection:
    player_id_exists = engine.dialect.has_table(connection, "player_id")

if player_id_exists:
    known_ids = pd.read_sql_query(
        "SELECT DISTINCT key_mlbam FROM player_id;", engine
    )["key_mlbam"]
    df_pid_new = df_pid[~df_pid["key_mlbam"].isin(known_ids)]
else:
    # First run: the table does not exist yet, so to_sql creates it.
    df_pid_new = df_pid

df_pid_new.to_sql("player_id", engine, if_exists="append")

Gathering player lookup table. This may take a moment.


<IPython.core.display.Javascript object>

# Show tables and what statcast table looks like

In [62]:
# Show all tables: list every base table (views excluded) in the public schema
sql_query = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema='public'
AND table_type='BASE TABLE';
"""
df_query = pd.read_sql_query(sql_query, con)
df_query

Unnamed: 0,table_name
0,statcast
1,player_id
2,batting_stats
3,pitching_stats
4,pitching_stats_wpid
5,batting_stats18_wpid
6,statcast_15
7,statcast_16
8,temp_table
9,input_table


<IPython.core.display.Javascript object>

## Test queries

In [63]:
# Sanity-check statcast2020: show the three rows with the most recent
# game_date to confirm the latest date that was loaded
sql_query = """
SELECT *
FROM statcast2020
ORDER BY game_date DESC
LIMIT 3;
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,level_0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,...,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,0,117,FC,2020-08-31,91.9,-2.645198,6.464136,Mark Melancon,606299.0,453343.0,...,3.0,6.0,3.0,6.0,6.0,3.0,3.0,6.0,Standard,Standard
1,1,123,FC,2020-08-31,92.8,-2.661022,6.411351,Mark Melancon,606299.0,453343.0,...,3.0,6.0,3.0,6.0,6.0,3.0,3.0,6.0,Standard,Standard
2,2,133,KC,2020-08-31,82.6,-2.377211,6.449001,Mark Melancon,660620.0,453343.0,...,3.0,6.0,3.0,6.0,6.0,3.0,3.0,6.0,Infield shift,Standard


<IPython.core.display.Javascript object>

In [19]:
# Sanity-check player_id: peek at three rows to see the available columns
sql_query = """
SELECT *
FROM player_id
LIMIT 3;
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,index,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,0,abad,fernando,472551,abadf001,abadfe01,4994,2010.0,2019.0
1,1,abreu,bryan,650556,abreb002,abreubr01,16609,2019.0,2019.0
2,2,abreu,jose,547989,abrej003,abreujo02,15676,2014.0,2019.0


<IPython.core.display.Javascript object>

In [20]:
# Count the rows currently stored in player_id
sql_query = """
SELECT COUNT(*)
FROM player_id;
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,count
0,5134


<IPython.core.display.Javascript object>

# Generating a running total of hits

In [None]:
# Get Padres statcast data only
# NOTE(review): the query below only counts the rows of player_id; it does not
# touch statcast data or filter by team. It looks like a placeholder copied
# from an earlier cell. TODO: replace with the intended Padres query.

sql_query = """
SELECT COUNT(*)
FROM player_id;
"""
pd.read_sql_query(sql_query, con)

# Machado's stats

In [64]:
# List every player_id row whose last name is 'machado'
# (names are matched in lower case, as stored in the table)
sql_query = """
SELECT *
FROM player_id
WHERE name_last = 'machado';
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,index,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,1050,machado,andres,600921,macha003,machaan02,14681,2017.0,2017.0
1,1051,machado,dixon,553988,machd001,machadi01,11472,2015.0,2018.0
2,1052,machado,manny,592518,machm001,machama01,11493,2012.0,2019.0
3,560,machado,manny,592518,machm001,machama01,11493,2012.0,2020.0
4,560,machado,manny,592518,machm001,machama01,11493,2012.0,2020.0
5,632,machado,manny,592518,machm001,machama01,11493,2012.0,2020.0
6,643,machado,manny,592518,machm001,machama01,11493,2012.0,2020.0


<IPython.core.display.Javascript object>

In [25]:
# Get Manny Machado's MLBAM id; DISTINCT collapses repeated player_id rows
# for the same player into a single value
sql_query = """
SELECT DISTINCT key_mlbam
FROM player_id
WHERE name_last = 'machado'
AND name_first = 'manny';
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,key_mlbam
0,592518


<IPython.core.display.Javascript object>

In [31]:
# Building query: statcast2020 rows where Machado is the batter and an event
# was recorded (events IS NOT NULL), newest three by game_date.
# The subquery resolves his MLBAM id from player_id by name.
sql_query = """
SELECT *
FROM statcast2020
WHERE batter = (SELECT DISTINCT key_mlbam
                FROM player_id
                WHERE name_last = 'machado'
                AND name_first = 'manny')
AND events IS NOT NULL
ORDER BY game_date DESC
LIMIT 3;
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,level_0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,...,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,5510,3538,FF,2020-08-25,94.0,-2.660588,5.220247,Joey Gerber,592518.0,680702.0,...,3.0,8.0,3.0,8.0,8.0,3.0,3.0,8.0,Strategic,Standard
1,5577,4121,SL,2020-08-25,88.1,-2.159494,5.443239,Dan Altavilla,592518.0,656186.0,...,3.0,8.0,3.0,8.0,8.0,3.0,3.0,8.0,Strategic,Standard
2,5800,7606,FC,2020-08-25,86.2,2.496718,5.634573,Marco Gonzales,592518.0,594835.0,...,0.0,2.0,0.0,2.0,2.0,0.0,0.0,2.0,Standard,Standard


<IPython.core.display.Javascript object>

In [35]:
# Check what the database reports as CURRENT_DATE; the "last 10 days"
# filters in the following queries are computed relative to this value
sql_query = """
SELECT CURRENT_DATE
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,current_date
0,2020-08-27


<IPython.core.display.Javascript object>

In [65]:
# Machado's stats in last 10 days: one row per recorded event with him as
# batter, where game_date is fewer than 10 days before CURRENT_DATE,
# oldest first.
# NOTE(review): player_name in the result is not Machado, so it is presumably
# the pitcher he faced; confirm against the Statcast column definitions.
sql_query = """
SELECT game_date, player_name, events
FROM statcast2020
WHERE batter = (SELECT DISTINCT key_mlbam
                FROM player_id
                WHERE name_last = 'machado'
                AND name_first = 'manny')
AND events IS NOT NULL
AND CURRENT_DATE - CAST(game_date AS DATE) < 10
ORDER BY game_date ASC;
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,game_date,player_name,events
0,2020-08-18,Mike Minor,walk
1,2020-08-18,Joely Rodriguez,field_out
2,2020-08-18,Mike Minor,field_out
3,2020-08-18,Taylor Hearn,single
4,2020-08-18,Mike Minor,single
...,...,...,...
58,2020-08-30,James Pazos,single
59,2020-08-31,Jeff Hoffman,field_out
60,2020-08-31,German Marquez,field_out
61,2020-08-31,German Marquez,sac_fly


<IPython.core.display.Javascript object>

In [66]:
# Machado's breakdown of stats in last 10 days: number of rows per event
# type (same batter/date filter as the previous query), least frequent first
sql_query = """
SELECT COUNT(*), events
FROM statcast2020
WHERE batter = (SELECT DISTINCT key_mlbam
                FROM player_id
                WHERE name_last = 'machado'
                AND name_first = 'manny')
AND events IS NOT NULL
AND CURRENT_DATE - CAST(game_date AS DATE) < 10
GROUP BY events
ORDER BY count;
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,count,events
0,1,triple
1,1,fielders_choice_out
2,2,sac_fly
3,2,force_out
4,2,grounded_into_double_play
5,4,double
6,4,walk
7,5,strikeout
8,6,home_run
9,16,single


<IPython.core.display.Javascript object>

In [78]:
# Machado's running total of stats in last 10 days.
# The window function partitions rows by event type and orders them by
# (game_date, at_bat_number), so n_events is the number of times that event
# type has occurred up to and including the current row. Rows that tie on
# (game_date, at_bat_number) within an event type share the same count.
sql_query = """
SELECT game_date, at_bat_number, player_name, events,
       COUNT(events) OVER(PARTITION BY events ORDER BY game_date, at_bat_number) AS n_events
FROM statcast2020
WHERE batter = (SELECT DISTINCT key_mlbam
                FROM player_id
                WHERE name_last = 'machado'
                AND name_first = 'manny')
AND events IS NOT NULL
AND CURRENT_DATE - CAST(game_date AS DATE) < 10
ORDER BY game_date, at_bat_number;
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,game_date,at_bat_number,player_name,events,n_events
0,2020-08-23,9.0,Zack Greinke,force_out,1
1,2020-08-23,29.0,Zack Greinke,field_out,1
2,2020-08-23,44.0,Zack Greinke,field_out,2
3,2020-08-23,59.0,Enoli Paredes,home_run,1
4,2020-08-25,10.0,Marco Gonzales,single,1
5,2020-08-25,28.0,Marco Gonzales,double,1
6,2020-08-25,43.0,Marco Gonzales,field_out,3
7,2020-08-25,66.0,Dan Altavilla,single,2
8,2020-08-25,82.0,Joey Gerber,triple,1
9,2020-08-27,6.0,Ljay Newsome,field_out,4


<IPython.core.display.Javascript object>

## Correlated subquery (lesson from DataCamp)



In [80]:
# Find, for each event type, the row(s) where Machado reached his highest
# running count in the last 10 days.
#
# The CTE `t` is the running-total query (n_events per event type). The outer
# query keeps a row only if its n_events is >= the maximum n_events found in
# `t` for the same event type; that maximum is computed by a correlated
# subquery (it references main.events from the outer row).
#
# NOTE(review): the original comment described this as finding where he
# "reached two events for the first time", which is not what the WHERE clause
# does; confirm the intended question.
# NOTE(review): the ORDER BY inside the CTE does not order the final result;
# the outer SELECT has no ORDER BY of its own.

sql_query = """

WITH t AS
    (SELECT game_date, at_bat_number, player_name, events,
           COUNT(events) OVER(PARTITION BY events ORDER BY game_date, at_bat_number) AS n_events
    FROM statcast2020
    WHERE batter = (SELECT DISTINCT key_mlbam
                    FROM player_id
                    WHERE name_last = 'machado'
                    AND name_first = 'manny')
    AND events IS NOT NULL
    AND CURRENT_DATE - CAST(game_date AS DATE) < 10
    ORDER BY game_date, at_bat_number)

SELECT main.game_date,
       main.player_name,
       main.events,
       main.n_events
FROM t AS main
WHERE
      (main.n_events) >= 
          (SELECT (MAX(sub.n_events))
           FROM t AS sub
           WHERE main.events = sub.events)
       
;
"""
pd.read_sql_query(sql_query, con)


Unnamed: 0,game_date,player_name,events,n_events
0,2020-08-25,Marco Gonzales,double,1
1,2020-08-25,Joey Gerber,triple,1
2,2020-08-27,Yusei Kikuchi,grounded_into_double_play,1
3,2020-08-27,Matt Magill,home_run,4
4,2020-08-27,Yoshihisa Hirano,force_out,2
5,2020-08-29,Antonio Senzatela,fielders_choice_out,1
6,2020-08-30,Ryan Castellani,strikeout,2
7,2020-08-30,Ryan Castellani,strikeout,2
8,2020-08-30,James Pazos,single,11
9,2020-08-30,James Pazos,single,11


<IPython.core.display.Javascript object>

## Run CTE, correlated subquery, nested subquery

# Starting lineup of stats

# --

# --