**Querying postgreSQL in Jupyter notebook**

Updated Padres info for SQL querying.

# Setup and custom tables

In [10]:
import os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import datetime

# Multiprocessing/threading
import multiprocess
import threading
from threading import Thread

# Code formatting with Jupyter black
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [6]:
# Web/database stuff
import sqlalchemy
import sqlalchemy_utils
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

# Multiprocessing/threading
import multiprocess
import threading  # included in base

<IPython.core.display.Javascript object>

In [7]:
# Check versioning
print("numpy: ", np.__version__)
print("pandas: ", pd.__version__)
print("matplotlib: ", matplotlib.__version__)
print("seaborn: ", sns.__version__)
print("sklearn: ", sklearn.__version__)

print("psycopg2: ", psycopg2.__version__)
print("sqlalchemy: ", sqlalchemy.__version__)
print("sqlalchemy_utils: ", sqlalchemy_utils.__version__)
print("multiprocess: ", multiprocess.__version__)

numpy:  1.17.4
pandas:  0.25.3
matplotlib:  3.1.3
seaborn:  0.9.0
sklearn:  0.22
psycopg2:  2.8.4 (dt dec pq3 ext lo64)
sqlalchemy:  1.3.11
sqlalchemy_utils:  0.36.1
multiprocess:  0.70.9


<IPython.core.display.Javascript object>

In [8]:
from pybaseball import pitching_stats
from pybaseball import batting_stats
from pybaseball import statcast_pitcher
from pybaseball import playerid_lookup
from pybaseball import statcast
from pybaseball import playerid_reverse_lookup

<IPython.core.display.Javascript object>

In [9]:
# Define a database name
# Set your postgres username
dbname = "baseball"
username = "lacar"  # change this to your username

# Working with PostgreSQL in Python
# Connect to make queries using psycopg2
con = None
con = psycopg2.connect(database=dbname, user=username)

# Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine("postgres://%s@localhost/%s" % (username, dbname))
print(engine.url)

postgres://lacar@localhost/baseball


<IPython.core.display.Javascript object>

## Importing new data with pybaseball, sending to database

### Player key info

### Statcast data

In [14]:
current_date = datetime.date.today()
print(current_date)

2020-08-09


<IPython.core.display.Javascript object>

In [19]:
# Update database based on current date

# This can be made more efficient when appending

# date_list = [
#     ("2020-07-19", str(current_date)),
#     #     ("2016-04-03", "2016-10-02"),   # didn't get
# ]

# for i, date_pair in enumerate(date_list):
#     if i < 1:
#         df_sc = statcast(date_pair[0], date_pair[1])
#         df_sc.to_sql("statcast2020", engine, if_exists="replace")
#         print(date_pair, "replace mode")
#     else:
#         df_sc = statcast(date_pair[0], date_pair[1])
#         df_sc.to_sql("statcast2020", engine, if_exists="append")
#         print(date_pair, "append mode")

This is a large query, it may take a moment to complete
Completed sub-query from 2020-07-19 to 2020-07-24
Completed sub-query from 2020-07-25 to 2020-07-30
Completed sub-query from 2020-07-31 to 2020-08-05
Completed sub-query from 2020-08-06 to 2020-08-09
('2020-07-19', '2020-08-09') replace mode


<IPython.core.display.Javascript object>

In [18]:
df_sc.shape

(60110, 90)

<IPython.core.display.Javascript object>

In [22]:
df_sc.columns

Index(['index', 'pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
     

<IPython.core.display.Javascript object>

In [25]:
# Make a query to get all unique players

# pitchers and batters
sql_query = """
(SELECT DISTINCT pitcher FROM statcast2020)
UNION
(SELECT DISTINCT batter FROM statcast2020);
"""

# Note parantheses around each one helps account for order by or limit
# which could kill subquery after first error

p_from_sql = pd.read_sql_query(sql_query, con)
p_list = p_from_sql.iloc[:, 0].tolist()

# Number of unique players
print("No. of unique players: ", len(p_list))

No. of unique players:  1009


<IPython.core.display.Javascript object>

In [26]:
# find the names of the players in player_ids, along with their ids from other data sources
df_pid = playerid_reverse_lookup(p_list, key_type="mlbam")
df_pid.to_sql("player_id", engine, if_exists="append")

Gathering player lookup table. This may take a moment.


<IPython.core.display.Javascript object>

# Show tables and what statcast table looks like

In [30]:
# Show all tables
sql_query = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema='public'
AND table_type='BASE TABLE';
"""
df_query = pd.read_sql_query(sql_query, con)
df_query

Unnamed: 0,table_name
0,statcast
1,player_id
2,batting_stats
3,pitching_stats
4,pitching_stats_wpid
5,batting_stats18_wpid
6,statcast_15
7,statcast_16
8,temp_table
9,input_table


<IPython.core.display.Javascript object>

## Test queries

In [31]:
# Check statcast query
sql_query = """
SELECT *
FROM statcast2020
LIMIT 3;
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,level_0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,...,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment
0,0,243,SI,2020-08-08,94.0,-0.429445,5.744621,Anthony Bass,598265.0,542914.0,...,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,Infield shift,Standard
1,1,246,FS,2020-08-08,86.3,-0.543656,5.754739,Anthony Bass,598265.0,542914.0,...,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,Infield shift,Standard
2,2,260,SL,2020-08-08,83.6,-0.827589,5.75878,Anthony Bass,598265.0,542914.0,...,1.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,Infield shift,Standard


<IPython.core.display.Javascript object>

In [32]:
# Check player_id query
sql_query = """
SELECT *
FROM player_id
LIMIT 3;
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,index,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,0,abad,fernando,472551,abadf001,abadfe01,4994,2010.0,2019.0
1,1,abreu,bryan,650556,abreb002,abreubr01,16609,2019.0,2019.0
2,2,abreu,jose,547989,abrej003,abreujo02,15676,2014.0,2019.0


<IPython.core.display.Javascript object>

In [33]:
# Check player_id query
sql_query = """
SELECT COUNT(*)
FROM player_id;
"""
pd.read_sql_query(sql_query, con)

Unnamed: 0,count
0,2975


<IPython.core.display.Javascript object>

# Generating a running total of hits

In [None]:
# Get Padres statcast data only

sql_query = """
SELECT COUNT(*)
FROM player_id;
"""
pd.read_sql_query(sql_query, con)

# --

# --