 # Data Sourcing

In [1]:
import pandas as pd
import numpy as np
from pybaseball import statcast
import io
import requests

# Run tag that gets embedded in the raw Statcast export's filename
# ('statRaw' + date + '.csv'). The digits look like MMDDYYYY for the end date
# of the Statcast query (2018-06-15) -- confirm and keep in sync if the range changes.
date = "06152018"

def get_lookup_table():
    """Download the Chadwick Bureau people register and return a player-ID lookup table.

    Returns:
        pandas.DataFrame with the columns name_last, name_first, key_mlbam,
        key_retro, key_bbref, key_fangraphs, mlb_played_first and
        mlb_played_last. Names are lower-cased. key_mlbam and key_fangraphs are
        integers, with -1 standing in for a missing ID (-1 is not a valid ID).

    Raises:
        requests.HTTPError: if the register download returns an error status.
    """
    print('Gathering player lookup table. This may take a moment.')
    url = "https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv"
    # Bound the wait so an unresponsive server cannot hang the notebook forever.
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of handing an error page to the CSV parser,
    # which would only surface later as a confusing missing-column error.
    response.raise_for_status()
    table = pd.read_csv(
        io.StringIO(response.content.decode('utf-8')),
        dtype={'key_sr_nfl': object, 'key_sr_nba': object, 'key_sr_nhl': object},
    )
    # Subset columns. Take an explicit copy so the column assignments below modify
    # an independent frame and do not trigger pandas' SettingWithCopyWarning.
    cols_to_keep = ['name_last', 'name_first', 'key_mlbam', 'key_retro', 'key_bbref',
                    'key_fangraphs', 'mlb_played_first', 'mlb_played_last']
    table = table[cols_to_keep].copy()
    # Make names lowercase to avoid capitalization mistakes when searching.
    for name_col in ('name_last', 'name_first'):
        table[name_col] = table[name_col].str.lower()
    # Columns containing NaN are read as floats, but IDs need to be ints for
    # successful queries in statcast, etc. Workaround: replace missing IDs with -1,
    # then convert. Callers must understand that -1 is not a valid ID.
    for id_col in ('key_mlbam', 'key_fangraphs'):
        table[id_col] = table[id_col].fillna(-1).astype(int)
    return table


In [2]:
# Scrape Statcast data with start date 2018-06-10 and end date 2018-06-15.
stat = statcast(start_dt='2018-06-10', end_dt='2018-06-15')

# Reduce the player register to a two-column lookup: the key_mlbam ID and a
# "first last" display name built from the (lower-cased) name columns.
players = (
    get_lookup_table()
    .assign(name=lambda register: register['name_first'] + ' ' + register['name_last'])
    [['key_mlbam', 'name']]
)

Gathering player lookup table. This may take a moment.


In [3]:
# This cell overwrites `stat` with the merged frame, so running it twice would
# merge the lookup a second time (yielding name_x/name_y) and then fail on
# stat['name']. Dropping any lookup columns left over from an earlier run keeps
# the cell safe to re-run; on a fresh `stat` this is a no-op.
stat = stat.drop(columns=['key_mlbam', 'name'], errors='ignore')

# Batter IDs must be integers to match the integer key_mlbam lookup column.
stat['batter'] = stat['batter'].astype(int)
# Left join so every Statcast row is kept even when the batter has no lookup entry.
stat = pd.merge(stat, players, left_on=['batter'], right_on=['key_mlbam'], how='left')

# Add explicitly named copies for the batter name, `stand`, and the pitcher name.
# The source columns ('name', 'stand', 'player_name') are left in place.
stat['batter_name'] = stat['name']
stat['b_stands'] = stat['stand']
stat['pitcher_name'] = stat['player_name']

# Persist the raw merged data, tagged with the run date.
stat.to_csv('statRaw' + date + '.csv')

In [4]:
# Save the key_mlbam -> name lookup to the working directory so it can be reused
# without downloading the register again.
players.to_csv(path_or_buf='players.csv')