In [76]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybaseball
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression

In [2]:
# Fetch the 2012 standings through pybaseball; as the output below shows, the
# result is a list of standings tables (one per division), not a single frame.
data = pybaseball.standings(2012)
data

[                  Tm   W   L  W-L%    GB
 1   New York Yankees  95  67  .586    --
 2  Baltimore Orioles  93  69  .574   2.0
 3     Tampa Bay Rays  90  72  .556   5.0
 4  Toronto Blue Jays  73  89  .451  22.0
 5     Boston Red Sox  69  93  .426  26.0,
                    Tm   W   L  W-L%    GB
 1      Detroit Tigers  88  74  .543    --
 2   Chicago White Sox  85  77  .525   3.0
 3  Kansas City Royals  72  90  .444  16.0
 4   Cleveland Indians  68  94  .420  20.0
 5     Minnesota Twins  66  96  .407  22.0,
                               Tm   W   L  W-L%    GB
 1              Oakland Athletics  94  68  .580    --
 2                  Texas Rangers  93  69  .574   1.0
 3  Los Angeles Angels of Anaheim  89  73  .549   5.0
 4               Seattle Mariners  75  87  .463  19.0,
                       Tm   W   L  W-L%    GB
 1   Washington Nationals  98  64  .605    --
 2         Atlanta Braves  94  68  .580   4.0
 3  Philadelphia Phillies  81  81  .500  17.0
 4          New York Mets  74  88

## We can observe that the team with the best record in 2012 was the Washington Nationals, at 98-64

* Let's fit a regression of Wins on Run Differential using team seasons through 2012

In [3]:
# Load the team-level table used by every regression below and preview it.
# The path is relative to the notebook's working directory.
df = pd.read_csv("baseball.csv")
df.head()

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424


In [4]:
# List the available columns before deriving new ones.
df.columns

Index(['Team', 'League', 'Year', 'RS', 'RA', 'W', 'OBP', 'SLG', 'BA',
       'Playoffs', 'RankSeason', 'RankPlayoffs', 'G', 'OOBP', 'OSLG'],
      dtype='object')

In [5]:
# Run differential: runs scored (RS) minus runs allowed (RA), row by row.
df["RD"] = df["RS"].sub(df["RA"])

In [6]:
# Confirm the new RD column is present.
df.head()

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,RD
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,46
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378,100
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403,7
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428,-72
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424,-146


### All Star Batters and Pitchers

#### Batters

In [7]:
# Fit a one-variable linear model: wins (W) as a function of run differential (RD).
# The scikit-learn imports live in the notebook's top import cell.
lr = LinearRegression()

y = df["W"]
x = df["RD"]

# scikit-learn expects a 2-D feature matrix, hence the reshape to one column.
# The target is reshaped the same way, so coef_ and intercept_ print as arrays.
reg = lr.fit(x.values.reshape(-1, 1), y.values.reshape(-1, 1))
print(reg.coef_, reg.intercept_)

[[0.10454822]] [80.90422078]


* We can write regression equation as:

        W = 0.104*RD + 80.90
        
* Setting W = 98 (the best record in 2012) and solving for RD,
    
        RD = (98 - 80.90) / 0.104 ≈ 164.4

In [113]:
# Runs allowed by each team in 2012; the maximum is the worst defensive total that year.
runs_allowed = df.loc[df["Year"] == 2012, "RA"].tolist()
max(runs_allowed)

890

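* Assume the worst case on defense: our team allows as many runs as the most porous 2012 team, i.e. RA = 890.
* To still reach a run differential of about 164, the batting lineup would need to score
        
        RS = 890 + 164 = 1054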

* To find the OBP and SLG needed to score that many runs, we fit a regression of Runs Scored on OPS (on-base plus slugging). OPS is a crude yet effective metric that is simply the sum of on-base percentage and slugging percentage, i.e., OPS = OBP + SLG.

In [120]:
# OPS (on-base plus slugging): one offensive summary number per row.
df["OPS"] = df["OBP"].add(df["SLG"])
df.head()

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,RD,OPS,OOPS
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,46,0.746,0.732
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378,100,0.709,0.684
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403,7,0.728,0.718
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428,-72,0.73,0.759
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424,-146,0.68,0.759


In [121]:
# Regress runs scored (RS) on OPS so a run target can be converted into an OPS target.
lr = LinearRegression()

x = df["OPS"]
y = df["RS"]

# Feature and target are both passed as single-column 2-D arrays.
reg = lr.fit(x.to_numpy().reshape(-1, 1), y.to_numpy().reshape(-1, 1))

print(reg.coef_, reg.intercept_)

[[1901.46929642]] [-660.95979973]


* Regression model for OPS(OBP + SLG) vs Runs Scored can be written as
        
        RS = 1901.5*OPS - 661
        
* Substituting 1054 for RS, we get
        
        OPS = 0.902

* This means we need a lineup of 9 batters who together average an OPS above 0.902.

In [122]:
# Pull per-player batting lines for the window 2012-03-28 .. 2012-10-28
# (presumably chosen to span the whole 2012 season — TODO confirm the dates).
# NOTE: this rebinds `data`, which earlier held the list of standings tables.
data = pybaseball.batting_stats_range('2012-03-28', '2012-10-28')

In [124]:
# Keep everyday hitters (more than 400 at-bats) whose OPS exceeds the 0.902 target.
data.loc[data["OPS"].gt(0.902) & data["AB"].gt(400)]

Unnamed: 0,Name,Age,#days,Lev,Tm,G,PA,AB,R,H,...,HBP,SH,SF,GDP,SB,CS,BA,OBP,SLG,OPS
61,Adrian Beltre,33,2497,MLB-AL,Texas,156,654,604,95,194,...,5,0,9,8,1,0,0.321,0.359,0.561,0.921
93,Ryan Braun,28,2497,MLB-NL,Milwaukee,154,677,598,108,191,...,11,0,5,12,30,7,0.319,0.391,0.595,0.987
119,Melky Cabrera,27,2547,MLB-NL,San Francisco,113,501,459,84,159,...,0,1,5,8,13,4,0.346,0.39,0.516,0.906
120,Miguel Cabrera,29,2497,MLB-AL,Detroit,161,697,622,109,205,...,3,0,6,28,4,1,0.33,0.393,0.606,0.999
128,Robinson Cano,29,2497,MLB-AL,New York,161,697,627,105,196,...,7,0,2,22,3,1,0.313,0.379,0.55,0.929
263,Edwin Encarnacion,29,2500,MLB-AL,Toronto,151,644,542,93,152,...,11,0,7,6,13,3,0.28,0.384,0.557,0.941
280,Prince Fielder,28,2497,MLB-AL,Detroit,162,690,581,83,182,...,17,0,7,19,1,0,0.313,0.412,0.528,0.94
371,Josh Hamilton,31,2497,MLB-AL,Texas,147,636,562,103,160,...,5,0,9,9,7,3,0.285,0.354,0.577,0.93
485,Matt Kemp,27,2497,MLB-NL,Los Angeles,106,449,403,74,122,...,3,0,3,10,9,3,0.303,0.367,0.538,0.906
603,Andrew McCutchen,25,2497,MLB-NL,Pittsburgh,157,673,593,107,194,...,5,0,5,9,20,12,0.327,0.4,0.553,0.953


* We can compare our prediction of these 13 batters to the batters in the All Star Team that year

In [125]:
# Load the All-Star table (one row per player/game entry, judging by the columns
# listed by df_as.info()). The path is relative to the notebook's working directory.
df_as = pd.read_csv("./core/AllstarFull.csv")

In [126]:
# Inspect dtypes and null counts; startingPos is populated for only part of the rows.
df_as.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5291 entries, 0 to 5290
Data columns (total 8 columns):
playerID       5291 non-null object
yearID         5291 non-null int64
gameNum        5291 non-null int64
gameID         5242 non-null object
teamID         5291 non-null object
lgID           5291 non-null object
GP             5272 non-null float64
startingPos    1640 non-null float64
dtypes: float64(2), int64(2), object(4)
memory usage: 330.8+ KB


In [127]:
# All 2012 All-Star rows. Despite the name, no position filter is applied here,
# so the frame is not limited to batters.
df_asbat = df_as.loc[df_as["yearID"].eq(2012)]
print(df_asbat.shape)
df_asbat

(74, 8)


Unnamed: 0,playerID,yearID,gameNum,gameID,teamID,lgID,GP,startingPos
4760,verlaju01,2012,0,ALS201207100,DET,AL,1.0,1.0
4761,napolmi01,2012,0,ALS201207100,TEX,AL,1.0,2.0
4762,fieldpr01,2012,0,ALS201207100,DET,AL,1.0,3.0
4763,canoro01,2012,0,ALS201207100,NYA,AL,1.0,4.0
4764,beltrad01,2012,0,ALS201207100,TEX,AL,1.0,5.0
4765,jeterde01,2012,0,ALS201207100,NYA,AL,1.0,6.0
4766,hamiljo03,2012,0,ALS201207100,TEX,AL,1.0,7.0
4767,grandcu01,2012,0,ALS201207100,NYA,AL,1.0,8.0
4768,bautijo02,2012,0,ALS201207100,TOR,AL,1.0,9.0
4769,ortizda01,2012,0,ALS201207100,BOS,AL,1.0,10.0


In [128]:
# Starting batters in the 2012 game: starting positions 2 through 10.
# Position 1 is excluded (presumably the starting pitcher — TODO confirm the position codes).
df_start = df_as.loc[
    df_as["yearID"].eq(2012)
    & df_as["startingPos"].gt(1.0)
    & df_as["startingPos"].le(10.0)
]
df_start

Unnamed: 0,playerID,yearID,gameNum,gameID,teamID,lgID,GP,startingPos
4761,napolmi01,2012,0,ALS201207100,TEX,AL,1.0,2.0
4762,fieldpr01,2012,0,ALS201207100,DET,AL,1.0,3.0
4763,canoro01,2012,0,ALS201207100,NYA,AL,1.0,4.0
4764,beltrad01,2012,0,ALS201207100,TEX,AL,1.0,5.0
4765,jeterde01,2012,0,ALS201207100,NYA,AL,1.0,6.0
4766,hamiljo03,2012,0,ALS201207100,TEX,AL,1.0,7.0
4767,grandcu01,2012,0,ALS201207100,NYA,AL,1.0,8.0
4768,bautijo02,2012,0,ALS201207100,TOR,AL,1.0,9.0
4769,ortizda01,2012,0,ALS201207100,BOS,AL,1.0,10.0
4797,poseybu01,2012,0,ALS201207100,SFN,NL,1.0,2.0


* Upon observation, we can see that 12/13 predicted batters made the 2012 MLB All-Star game.

* 7/12 of our successfully predicted batters also were starters in the All-Star Game.

#### Pitchers

* From the batting analysis, the required run differential is 164.

In [129]:
# Offensive baseline for the pitching analysis: the midpoint between the 2012
# league-average runs scored (rounded up) and the lowest team total that year.
# NOTE(review): this is a rough "below-average offense" assumption, not an actual
# percentile of RS — confirm that this is the intended benchmark.
runs_scored = df.loc[df["Year"] == 2012, "RS"]
avg_rs = runs_scored.mean()
print((math.ceil(avg_rs) + runs_scored.min()) / 2)

642.0


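* Assuming a weak offense that scores only 642 runs (the midpoint computed above between the league average and the league minimum), the pitchers must allow few enough runs to keep the run differential at 164.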
* Implies,
        
        RD = RS - RA
     => RA = RS - RD = 642 - 164
           = 478

* We can build a regression model of pitching ability: Runs Allowed (RA) vs. opponents' OPS (OOPS = opponents' on-base percentage + opponents' slugging percentage)

In [89]:
# Opponent OPS: opponents' on-base percentage plus opponents' slugging percentage.
df["OOPS"] = df["OOBP"].add(df["OSLG"])
df.head()

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,RD,OPS,OOPS
0,ARI,NL,2012,734,688,81,0.328,0.418,0.259,0,,,162,0.317,0.415,46,0.746,0.732
1,ATL,NL,2012,700,600,94,0.32,0.389,0.247,1,4.0,5.0,162,0.306,0.378,100,0.709,0.684
2,BAL,AL,2012,712,705,93,0.311,0.417,0.247,1,5.0,4.0,162,0.315,0.403,7,0.728,0.718
3,BOS,AL,2012,734,806,69,0.315,0.415,0.26,0,,,162,0.331,0.428,-72,0.73,0.759
4,CHC,NL,2012,613,759,61,0.302,0.378,0.24,0,,,162,0.335,0.424,-146,0.68,0.759


In [90]:
# Keep only the columns needed for the RA ~ OOPS model and drop rows without
# opponent stats. OOPS is the sum of OOBP and OSLG, so it is NaN whenever either
# input is missing; filtering on OOPS alone therefore guarantees that no NaN
# reaches the regression (how="all" would let half-missing rows through).
cols = ["Year", "RA", "OOBP", "OSLG", "OOPS"]
df1 = df[cols].dropna(subset=["OOPS"])
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 420 entries, 0 to 419
Data columns (total 5 columns):
Year    420 non-null int64
RA      420 non-null int64
OOBP    420 non-null float64
OSLG    420 non-null float64
OOPS    420 non-null float64
dtypes: float64(3), int64(2)
memory usage: 19.7 KB


In [91]:
# Fit runs allowed (RA) against opponent OPS (OOPS) on the rows that have opponent stats.
# A new estimator is created here, as in the other regression cells, so this cell
# neither depends on nor refits the model object built for the RS ~ OPS regression.
lr = LinearRegression()

x = df1["OOPS"]
y = df1["RA"]
# Sanity check: both lines should print False (no missing values) before fitting.
print(x.isnull().values.any())
print(y.isnull().values.any())
reg = lr.fit(x.values.reshape(-1, 1), y.values.reshape(-1, 1))

False
False


In [92]:
# Slope and intercept of the RA ~ OOPS fit from the previous cell.
print(reg.coef_, reg.intercept_)

[[2069.38916057]] [-796.65733487]


* We have obtained a regression model for Runs Allowed(RA) vs OOPS as follows,
        
        RA = 2069.4*(OOPS) - 796.66

* Plugging in RA = 478, we get,
   
       OOPS = 0.616

* We need a pitching staff whose OOPS is 0.616 or lower.

In [58]:
# Load per-pitcher 2012 batting-against stats (path relative to the working directory).
# NOTE(review): the provenance of pitch_2012.csv is not recorded in this notebook — document the source.
pitch = pd.read_csv("pitch_2012.csv")

In [60]:
# Inspect columns, dtypes and null counts of the pitcher table.
pitch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 30 columns):
Rk       782 non-null int64
Name     782 non-null object
Age      782 non-null int64
Tm       782 non-null object
IP       782 non-null float64
PAu      0 non-null float64
G        782 non-null int64
PA       782 non-null int64
AB       782 non-null int64
R        782 non-null int64
H        782 non-null int64
2B       782 non-null int64
3B       782 non-null int64
HR       782 non-null int64
SB       782 non-null int64
CS       782 non-null int64
BB       782 non-null int64
SO       782 non-null int64
BA       782 non-null float64
OBP      782 non-null float64
SLG      782 non-null float64
OPS      782 non-null float64
BAbip    781 non-null float64
TB       782 non-null int64
GDP      782 non-null int64
HBP      782 non-null int64
SH       782 non-null int64
SF       782 non-null int64
IBB      782 non-null int64
ROE      782 non-null int64
dtypes: float64(7), int64(21), object(2)


In [61]:
# OPS allowed by each pitcher, built from the OBP and SLG columns of this table
# so that it matches the OOPS definition used in the team-level regression.
pitch["OOPS"] = pitch["OBP"].add(pitch["SLG"])

In [64]:
# Confirm the OOPS column was added and has no missing values.
pitch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 31 columns):
Rk       782 non-null int64
Name     782 non-null object
Age      782 non-null int64
Tm       782 non-null object
IP       782 non-null float64
PAu      0 non-null float64
G        782 non-null int64
PA       782 non-null int64
AB       782 non-null int64
R        782 non-null int64
H        782 non-null int64
2B       782 non-null int64
3B       782 non-null int64
HR       782 non-null int64
SB       782 non-null int64
CS       782 non-null int64
BB       782 non-null int64
SO       782 non-null int64
BA       782 non-null float64
OBP      782 non-null float64
SLG      782 non-null float64
OPS      782 non-null float64
BAbip    781 non-null float64
TB       782 non-null int64
GDP      782 non-null int64
HBP      782 non-null int64
SH       782 non-null int64
SF       782 non-null int64
IBB      782 non-null int64
ROE      782 non-null int64
OOPS     782 non-null float64
dtypes: flo

In [137]:
# Starters are taken to be pitchers with more than 150 innings in the season
# (in earlier eras 200+ innings was the norm); keep those at or below the 0.616 OOPS target.
pitch.loc[pitch["OOPS"].le(0.616) & pitch["IP"].gt(150)]

Unnamed: 0,Rk,Name,Age,Tm,IP,PAu,G,PA,AB,R,...,OPS,BAbip,TB,GDP,HBP,SH,SF,IBB,ROE,OOPS
273,274,Gio Gonzalez*\gonzagi01,26,WSN,199.1,,32,822,725,69,...,0.582,0.271,217,9,5,9,7,3,8,0.582
372,373,Clayton Kershaw*\kershcl01,24,LAD,227.2,,33,901,810,70,...,0.593,0.271,262,17,5,18,4,5,10,0.593
573,574,David Price*\priceda01,26,TBR,211.0,,31,836,767,63,...,0.602,0.286,244,20,5,2,3,2,4,0.602
731,732,Justin Verlander\verlaju01,29,DET,238.1,,33,956,884,81,...,0.601,0.275,293,16,5,4,3,2,9,0.601
748,749,Jered Weaver\weaveje02,29,LAA,188.2,,30,739,686,63,...,0.605,0.241,233,13,4,0,4,0,5,0.605
