<hr />

# Baseball App Example

In this example we use Pandas, sklearn, and Bokeh to explore the Lahman Baseball Statistics database.



In [2]:
## import statements
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import bokeh
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
output_notebook()

## Read data and explore

In [3]:
import sqlite3
con = sqlite3.connect("../data/baseball/lahman2013.sqlite")

# List every table stored in the database file.
with con:
    cur = con.execute("SELECT name FROM sqlite_master WHERE type='table';")
    data = cur.fetchall()
print([table_name for (table_name,) in data])

['AllstarFull', 'Appearances', 'AwardsManagers', 'AwardsPlayers', 'AwardsShareManagers', 'AwardsSharePlayers', 'Batting', 'BattingPost', 'Fielding', 'FieldingOF', 'FieldingPost', 'HallOfFame', 'Managers', 'ManagersHalf', 'Master', 'Pitching', 'PitchingPost', 'Salaries', 'Schools', 'SchoolsPlayers', 'SeriesPost', 'Teams', 'TeamsFranchises', 'TeamsHalf', 'temp']


In [5]:
# Read the whole Salaries table into a pandas DataFrame.
# `with con:` only scopes a sqlite3 transaction; the connection stays open
# for the later cells that reuse it.
with con:
    df_salaries = pd.read_sql_query("SELECT * from Salaries", con)

# Verify that the result of the SQL query is stored in the DataFrame.
# (The frame is named df_salaries; no `df` exists yet at this point.)
print(df_salaries.head())

   yearID teamID lgID   playerID    salary
0    1985    ATL   NL  barkele01  870000.0
1    1985    ATL   NL  bedrost01  550000.0
2    1985    ATL   NL  benedbr01  545000.0
3    1985    ATL   NL   campri01  633333.0
4    1985    ATL   NL  ceronri01  625000.0


In [7]:
# Distinct team IDs, in order of first appearance in the Salaries table.
df_salaries["teamID"].unique().tolist()

['ATL',
 'BAL',
 'BOS',
 'CAL',
 'CHA',
 'CHN',
 'CIN',
 'CLE',
 'DET',
 'HOU',
 'KCA',
 'LAN',
 'MIN',
 'ML4',
 'MON',
 'NYA',
 'NYN',
 'OAK',
 'PHI',
 'PIT',
 'SDN',
 'SEA',
 'SFN',
 'SLN',
 'TEX',
 'TOR',
 'COL',
 'FLO',
 'ANA',
 'ARI',
 'MIL',
 'TBA',
 'LAA',
 'WAS',
 'MIA']

In [32]:
# Per-team salary statistics over every row of the Salaries table.
gb = df_salaries.groupby('teamID')
# Aggregators are given by name so the resulting column labels do not depend
# on the numpy functions' __name__; max/min are then renamed explicitly to the
# amax/amin labels that the rest of the notebook refers to.
salary_by_team = gb['salary'].agg(['mean', 'std', 'max', 'min']).rename(
    columns={'max': 'amax', 'min': 'amin'}
)
# Spread between the best- and worst-paid player on each team.
salary_by_team['ratio'] = salary_by_team['amax'] / salary_by_team['amin']

In [30]:
# Preview the first rows of the per-team salary statistics.
salary_by_team.head()

Unnamed: 0_level_0,mean,std,amax,amin,ratio
teamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ANA,1895109.0,2667147.0,13166667.0,150000.0,87.77778
ARI,2428196.0,2971520.0,16000000.0,170000.0,94.117647
ATL,2130475.0,3373518.0,16061802.0,60000.0,267.6967
BAL,1785712.0,2491109.0,17000000.0,60000.0,283.333333
BOS,2692114.0,3861802.0,22500000.0,60000.0,375.0


In [39]:
# The groupby key becomes the index of the aggregated frame (named 'teamID').
salary_by_team.index

Index(['ANA', 'ARI', 'ATL', 'BAL', 'BOS', 'CAL', 'CHA', 'CHN', 'CIN', 'CLE',
       'COL', 'DET', 'FLO', 'HOU', 'KCA', 'LAA', 'LAN', 'MIA', 'MIL', 'MIN',
       'ML4', 'MON', 'NYA', 'NYN', 'OAK', 'PHI', 'PIT', 'SDN', 'SEA', 'SFN',
       'SLN', 'TBA', 'TEX', 'TOR', 'WAS'],
      dtype='object', name='teamID')

In [76]:
# Copy the index into a regular 'teamID' column so that later cells can refer
# to it by column name (e.g. df["teamID"]).
salary_by_team['teamID'] = salary_by_team.index

In [41]:
# Scatter of the mean salary per team, teams ordered by mean salary.
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
df = salary_by_team.sort_values('mean')
# Build the source from explicit columns taken from the index and the 'mean'
# column, so it does not rely on how a named DataFrame index gets converted.
source = ColumnDataSource(data={"teamID": list(df.index), "mean": list(df["mean"])})
# `figure` comes from bokeh.plotting; a bare `import bokeh` does not guarantee
# that the bokeh.plotting submodule is loaded.
p = figure(x_range=list(df.index))
p.scatter(x="teamID", y="mean", source=source)
show(p)

  if __name__ == '__main__':


Hmm, the team labels on the x axis overlap and are hard to read. Let's rotate them.

In [43]:
# Same scatter as before, with the x-axis tick labels rotated for legibility.
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
df = df.sort_values('mean')
# Drop the index when building the source: the frame already carries teamID as
# a regular column, and a second teamID coming from the index name can clash.
source = ColumnDataSource(df.reset_index(drop=True))
p = figure(x_range=list(df["teamID"]))
p.scatter(x="teamID", y="mean", source=source)
p.xaxis.major_label_orientation = np.pi/3

show(p)

  if __name__ == '__main__':


Let's view the average salary next to the max/min salary ratio, in two linked plots.

In [153]:
TOOLS = "pan,wheel_zoom,box_zoom,reset,save,lasso_select"

# Re-sort without mutating in place, so re-running the cell is harmless.
df = df.sort_values('mean')
# Drop the index when building the source: teamID is already a regular column.
source = ColumnDataSource(df.reset_index(drop=True))

# Left panel: mean salary per team. The categorical range is passed as a plain
# list of factors, matching how the other plotting cells build x_range.
s1 = figure(title="Pay Avg", x_range=list(source.data["teamID"]), tools=TOOLS, width=500)
s1.scatter(x="teamID", y="mean", source=source)
s1.xaxis.major_label_orientation = np.pi/3

# Right panel: max/min salary ratio. It reuses s1's x_range object and the
# same source, so both panels are driven by one range and one set of data.
s2 = figure(title="Pay Ratio", x_range=s1.x_range, tools=TOOLS, width=500)
s2.scatter(x="teamID", y="ratio", source=source)
s2.xaxis.major_label_orientation = np.pi/3

p = gridplot([[s1, s2]])
show(p)

<hr/>

Now let's bring in the AllstarFull table to see how max salaries and all-star counts correlate.

In [46]:
# Load every row of the AllstarFull table into a DataFrame.
with con:
    df_allstar = pd.read_sql_query(sql="SELECT * from AllstarFull", con=con)

In [47]:
# Preview the first rows of the all-star table.
df_allstar.head()

Unnamed: 0,playerID,yearID,gameNum,gameID,teamID,lgID,GP,startingPos
0,aaronha01,1955,0,NLS195507120,ML1,NL,1.0,
1,aaronha01,1956,0,ALS195607100,ML1,NL,1.0,
2,aaronha01,1957,0,NLS195707090,ML1,NL,1.0,9.0
3,aaronha01,1958,0,ALS195807080,ML1,NL,1.0,9.0
4,aaronha01,1959,1,NLS195907070,ML1,NL,1.0,9.0


In [53]:
# Number of all-star rows per team. `.count()` counts the non-null playerID
# entries directly, instead of relying on the truthiness of ID strings via
# np.count_nonzero, and matches how compute_df() counts all-stars per year.
df_allstar_count = df_allstar.groupby('teamID')['playerID'].count()

In [54]:
# Preview the per-team all-star counts.
df_allstar_count.head()

teamID
ANA     14
ARI     33
ARL      1
ATL    134
BAL    169
Name: playerID, dtype: int64

In [56]:
# Team(s) whose all-star count equals the overall maximum.
df_allstar_count.loc[df_allstar_count.eq(df_allstar_count.max())]

teamID
NYA    412
Name: playerID, dtype: int64

Now let's use this as the size of the circles in the scatter plot

In [58]:
# Turn counts into marker sizes: a base size of 10 plus up to 10 more in
# proportion to the team's share of the largest count, so the team with the
# most all-stars gets size 20.
df_allstar_count_circle_size = 10 * df_allstar_count / df_allstar_count.max() + 10

<hr/>

Now let's join the salary data with the all-star circle sizes.

In [67]:
# Start from an empty frame; the following cell fills it with the salary statistics.
df = pd.DataFrame()

In [68]:
# DataFrame.append was removed from pandas; pd.concat is the supported way to
# stack the salary statistics onto the existing frame.
df = pd.concat([df, salary_by_team])

In [72]:
# Attach the marker sizes as a new column. The assignment aligns on the
# teamID index, so a team missing from the size Series ends up with NaN.
salary_by_team['all_stars'] = df_allstar_count_circle_size

In [66]:
# Interactive help lookup (IPython `?` syntax) left over from exploration.
df.append?

In [71]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0_level_0,mean,std,amax,amin,ratio,all_stars
teamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ANA,1895109.0,2667147.0,13166667.0,150000.0,87.77778,10.339806
ARI,2428196.0,2971520.0,16000000.0,170000.0,94.117647,10.800971
ATL,2130475.0,3373518.0,16061802.0,60000.0,267.6967,13.252427
BAL,1785712.0,2491109.0,17000000.0,60000.0,283.333333,14.101942
BOS,2692114.0,3861802.0,22500000.0,60000.0,375.0,16.917476


In [None]:
# The original cell called `bz.join` on a frame `df1`; neither `bz` nor `df1`
# is defined in the visible notebook, so the cell fails with a NameError.
# The join on teamID is expressed with pandas instead.
# NOTE(review): assumes the intended second table was the per-team all-star
# count — confirm.
r = df.join(df_allstar_count.to_frame('all_star_count'), how='inner')
r.head()

In [78]:
# Max salary per team, with marker size driven by the all-star count.
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
df = salary_by_team.sort_values("amax")
print(df.head())
# Drop the index when building the source: teamID is already a regular column.
source = ColumnDataSource(df.reset_index(drop=True))
p = figure(x_range=list(df["teamID"]))
p.scatter(x="teamID", y="amax", size="all_stars", source=source, fill_alpha=0.5, )
p.xaxis.major_label_orientation = np.pi/3

show(p)

                mean           std        amax      amin       ratio  \
teamID                                                                 
CAL     7.390732e+05  9.606285e+05   5375000.0   60000.0   89.583333   
ML4     6.132436e+05  8.535337e+05   5875000.0   60000.0   97.916667   
TBA     1.528400e+06  2.011091e+06  10125000.0  170000.0   59.558824   
MON     7.074589e+05  1.192678e+06  11500000.0   60000.0  191.666667   
KCA     1.299026e+06  1.951653e+06  13000000.0   60000.0  216.666667   

        all_stars teamID  
teamID                    
CAL     11.626214    CAL  
ML4     11.165049    ML4  
TBA     10.703883    TBA  
MON     11.699029    MON  
KCA     12.014563    KCA  


  if __name__ == '__main__':


<hr/>

Now let's make this an interactive plot!

In [145]:
# Read sqlite query results into a pandas DataFrame
# Reload both tables, keeping only the columns the interactive plot needs.
with con:
    df_salaries = pd.read_sql_query(
        sql="SELECT yearID, teamID, salary from Salaries", con=con
    )
    df_allstar = pd.read_sql_query(
        sql="SELECT yearID, teamID, playerID from AllstarFull", con=con
    )

def compute_df(year=2012):
    """Build the per-team plotting table for one season.

    Reads the module-level ``df_salaries`` and ``df_allstar`` frames.

    Parameters
    ----------
    year : int, default 2012
        Season (``yearID``) to select from both tables.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``teamID`` and sorted by ``amax`` in descending order, with
        columns ``amax`` (highest salary on the team that season),
        ``all_stars`` (marker size: 10 plus 10 times the team's share of the
        largest per-team all-star count that season) and ``teamID`` (a copy
        of the index). Teams that have salary rows but no all-star rows for
        the season get the base size 10.
    """
    salaries_for_year = df_salaries[df_salaries.yearID == year]
    # Name the column explicitly rather than relying on the label pandas
    # derives from np.max, which is not stable across numpy versions.
    max_salaries = salaries_for_year.groupby('teamID')['salary'].max().to_frame('amax')
    result = max_salaries.sort_values('amax', ascending=False)

    allstar_for_year = df_allstar[df_allstar.yearID == year]
    allstar_count = allstar_for_year.groupby('teamID')['playerID'].count()
    allstar_count_circle_size = 10 * allstar_count / allstar_count.max() + 10

    # Align the sizes to the salary table's teams. A team without all-star
    # rows has a count of zero, i.e. the base size 10, rather than NaN.
    result['all_stars'] = allstar_count_circle_size.reindex(result.index).fillna(10.0)
    result['teamID'] = result.index
    return result

In [146]:
# Preview the table for the default season (2012).
compute_df().head()

Unnamed: 0_level_0,amax,all_stars,teamID
teamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NYA,30000000.0,14.444444,NYA
LAA,24187500.0,12.222222,LAA
NYN,23145011.0,13.333333,NYN
MIN,23000000.0,11.111111,MIN
DET,23000000.0,15.555556,DET


In [147]:
# Source and figure for the interactive plot, seeded with the default season.
# compute_df() returns teamID both as the index name and as a column; drop the
# index so only the column reaches the source and the two cannot clash.
source = ColumnDataSource(compute_df().reset_index(drop=True))
p = figure(x_range=list(source.data["teamID"]))
p.scatter(x="teamID", y="amax", size="all_stars", source=source, fill_alpha=0.5, )
p.xaxis.major_label_orientation = np.pi/3

In [150]:
from ipywidgets import interact, widgets 
def update(year):
    """Redraw the interactive plot for the given season.

    Recomputes the per-team table with ``compute_df(year)`` and pushes it to
    the notebook plot through the module-level ``source`` and ``p``.

    Parameters
    ----------
    year : int
        Season (``yearID``) to display.
    """
    df = compute_df(year)
    team_ids = list(df['teamID'])
    # compute_df() sorts teams by that season's max salary, and the set of
    # teams can differ between seasons. Replace every column in one
    # assignment so the values stay aligned with their team IDs and all
    # columns keep the same length.
    source.data = {
        'teamID': team_ids,
        'amax': list(df['amax']),
        'all_stars': list(df['all_stars']),
    }
    # Keep the categorical x axis in step with the teams now in the source.
    p.x_range.factors = team_ids
    bokeh.io.push_notebook()

In [151]:
# Request a notebook handle so that push_notebook() in update() has a
# displayed plot to send changes to.
show(p, notebook_handle=True)

In [152]:
# Slider over the seasons 1985-2013, starting at 2013; each change calls update().
year_slider = widgets.IntSlider(value=2013, min=1985, max=2013)
interact(update, year=year_slider)