<h1>TEAM Prediction Machine Learning Models - Batting</h1>

<h4>Import Dependencies</h4>

In [1]:
%matplotlib inline

In [2]:
import os
import csv
import pandas as pd

import sqlite3
import csv
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import numpy as np

from config import pgPassword

<h4>Create a connection to SQL database</h4>

In [3]:
pg_user = 'postgres'
pg_password = pgPassword
db_name = 'baseball_db'

connection_string = f"{pg_user}:{pg_password}@localhost:5432/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')

<h4>Read in databases tables as DatFrames</h4>

In [135]:
teamsStatsDF = pd.read_sql_table("Team-Stats", con = engine)
battingDF = pd.read_sql_table("Batting", con = engine)
pitchingDF = pd.read_sql_table("Pitching", con = engine)
playersDF = pd.read_sql_table("Players", con = engine)
franchisesDF = pd.read_sql_table("Franchises", con = engine)
salariesDF = pd.read_sql_table("Salaries", con = engine)
teamsDF = pd.read_sql_table("Teams", con = engine)
fPlayersDF = pd.read_sql_table("FranchisePlayers", con = engine)

In [136]:
teamsStatsDF.head()

Unnamed: 0,franchiseID,yearID,G,W,R,H,HR,BB,SO,SB,RA,ERA,HA,HRA,BBA,SOA,statID,teamID
0,BNA,1871,31,20,401,426,3,60.0,19.0,73.0,303,3.55,367,2,42,23,1000,BS1-1871
1,CNA,1871,28,19,302,323,10,60.0,22.0,69.0,241,2.76,308,6,28,22,1003,CH1-1871
2,CFC,1871,29,10,249,328,7,26.0,25.0,18.0,341,4.11,346,13,53,34,1008,CL1-1871
3,KEK,1871,19,7,137,178,2,33.0,9.0,16.0,243,5.17,261,5,21,17,1015,FW1-1871
4,NNA,1871,33,16,302,403,1,33.0,15.0,46.0,313,3.72,373,7,42,22,1024,NY2-1871


<h4>Manipulate Data</h4>

In [137]:
# Get data going back to latest year of interest
latestDataYear = 1980
targetTeamsDF = teamsStatsDF[teamsStatsDF["yearID"] >=latestDataYear]
targetTeamsDF.head()

Unnamed: 0,franchiseID,yearID,G,W,R,H,HR,BB,SO,SB,RA,ERA,HA,HRA,BBA,SOA,statID,teamID
1787,ATL,1980,161,81,630,1352,144,434.0,899.0,73.0,660,3.77,1397,131,454,696,3197943,ATL-1980
1788,BAL,1980,162,100,805,1523,156,587.0,766.0,111.0,640,3.64,1438,134,507,789,3201520,BAL-1980
1789,BOS,1980,160,83,757,1588,162,475.0,720.0,79.0,767,4.38,1557,129,481,696,3205099,BOS-1980
1790,ANA,1980,160,65,698,1442,106,539.0,889.0,91.0,797,4.52,1548,141,529,725,3208680,CAL-1980
1791,CHW,1980,162,70,587,1408,91,399.0,670.0,68.0,722,3.92,1434,108,563,724,3212263,CHA-1980


In [138]:
# Limit to columns of interest
colsOfInterst = ["franchiseID", "yearID","teamID", "G", "W", "R", "H", "HR", "BB"]
targetTeamsDF = targetTeamsDF[colsOfInterst]
targetTeamsDF.head()

Unnamed: 0,franchiseID,yearID,teamID,G,W,R,H,HR,BB
1787,ATL,1980,ATL-1980,161,81,630,1352,144,434.0
1788,BAL,1980,BAL-1980,162,100,805,1523,156,587.0
1789,BOS,1980,BOS-1980,160,83,757,1588,162,475.0
1790,ANA,1980,CAL-1980,160,65,698,1442,106,539.0
1791,CHW,1980,CHA-1980,162,70,587,1408,91,399.0


In [139]:
# add a teams only column
targetTeamsDF["teamOnly"] = ""
for index, row in targetTeamsDF.iterrows():
    teamID = row["teamID"]
    teamOnly = teamID.split("-")[0]
    targetTeamsDF.at[index, "teamOnly"] = teamOnly
targetTeamsDF.head()

Unnamed: 0,franchiseID,yearID,teamID,G,W,R,H,HR,BB,teamOnly
1787,ATL,1980,ATL-1980,161,81,630,1352,144,434.0,ATL
1788,BAL,1980,BAL-1980,162,100,805,1523,156,587.0,BAL
1789,BOS,1980,BOS-1980,160,83,757,1588,162,475.0,BOS
1790,ANA,1980,CAL-1980,160,65,698,1442,106,539.0,CAL
1791,CHW,1980,CHA-1980,162,70,587,1408,91,399.0,CHA


In [140]:
# Find first year, last year, and total years of franchise
teamYearsDF = targetTeamsDF[["teamOnly", "yearID"]]
teamYearsDF = teamYearsDF.groupby("teamOnly").agg(['min', 'max', 'count'])
teamYearsDF = teamYearsDF.reset_index()
teamYearsDF.columns = teamYearsDF.columns.droplevel()
teamYearsDF = teamYearsDF.rename(columns={"": "teamOnly",
                                          "min": "firstYear",
                                         "max": "lastYear",
                                         "count": "totalYears"})
teamYearsDF.head()

Unnamed: 0,teamOnly,firstYear,lastYear,totalYears
0,ANA,1997,2004,8
1,ARI,1998,2019,22
2,ATL,1980,2019,40
3,BAL,1980,2019,40
4,BOS,1980,2019,40


In [141]:
# Merge Years Data with target data
targetTeamsDF = pd.merge(targetTeamsDF, teamYearsDF, on = ["teamOnly"])

# Create field for yearofTeam
# targetTeamsDF["teamYr"] = targetTeamsDF["yearID"] + 1 - targetTeamsDF["firstYear"]
targetTeamsDF.head()

Unnamed: 0,franchiseID,yearID,teamID,G,W,R,H,HR,BB,teamOnly,firstYear,lastYear,totalYears
0,ATL,1980,ATL-1980,161,81,630,1352,144,434.0,ATL,1980,2019,40
1,ATL,1981,ATL-1981,107,50,395,886,64,321.0,ATL,1980,2019,40
2,ATL,1982,ATL-1982,162,89,739,1411,146,554.0,ATL,1980,2019,40
3,ATL,1983,ATL-1983,162,88,746,1489,130,582.0,ATL,1980,2019,40
4,ATL,1984,ATL-1984,162,80,632,1338,111,555.0,ATL,1980,2019,40


In [142]:
# Create field for yearofTeam
targetTeamsDF["teamYr"] = targetTeamsDF["yearID"] + 1 - targetTeamsDF["firstYear"]
targetTeamsDF.head()

Unnamed: 0,franchiseID,yearID,teamID,G,W,R,H,HR,BB,teamOnly,firstYear,lastYear,totalYears,teamYr
0,ATL,1980,ATL-1980,161,81,630,1352,144,434.0,ATL,1980,2019,40,1
1,ATL,1981,ATL-1981,107,50,395,886,64,321.0,ATL,1980,2019,40,2
2,ATL,1982,ATL-1982,162,89,739,1411,146,554.0,ATL,1980,2019,40,3
3,ATL,1983,ATL-1983,162,88,746,1489,130,582.0,ATL,1980,2019,40,4
4,ATL,1984,ATL-1984,162,80,632,1338,111,555.0,ATL,1980,2019,40,5


In [143]:
# Add a winPCT field and make stats per game stats
targetTeamsDF["winPct"] = targetTeamsDF.W / targetTeamsDF.G
targetTeamsDF["RpG"] = targetTeamsDF.R / targetTeamsDF.G
targetTeamsDF["HpG"] = targetTeamsDF.H / targetTeamsDF.G
targetTeamsDF["HRpG"] = targetTeamsDF.HR / targetTeamsDF.G
targetTeamsDF["BBpG"] = targetTeamsDF.BB / targetTeamsDF.G

# Then keep only perGame cols
perGameCols = ['franchiseID', 'yearID', "teamID", "teamOnly", 'G', 'winPct', 
               'RpG', 'HpG', 'HRpG', 'BBpG', 'firstYear', 'lastYear', 'totalYears', 'teamYr']
targetTeamsDF = targetTeamsDF[perGameCols]
targetTeamsDF.head()

Unnamed: 0,franchiseID,yearID,teamID,teamOnly,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr
0,ATL,1980,ATL-1980,ATL,161,0.503106,3.913043,8.397516,0.89441,2.695652,1980,2019,40,1
1,ATL,1981,ATL-1981,ATL,107,0.46729,3.691589,8.280374,0.598131,3.0,1980,2019,40,2
2,ATL,1982,ATL-1982,ATL,162,0.549383,4.561728,8.709877,0.901235,3.419753,1980,2019,40,3
3,ATL,1983,ATL-1983,ATL,162,0.54321,4.604938,9.191358,0.802469,3.592593,1980,2019,40,4
4,ATL,1984,ATL-1984,ATL,162,0.493827,3.901235,8.259259,0.685185,3.425926,1980,2019,40,5


In [144]:
# NOTE - This cell take a while to execute

# Add a column to indicate rows that should be skipped
#  years < latestDataYear + 2 (we can't get 2 year previous data for these years)
#  teamYr < 3 (we can't get 2 year previous data for these years)
#  year = lastYear (we can't use next year to check model)
#  year = 2019 (last year of our data so )

targetTeamsDF["skip"] = 0
for index, row in targetTeamsDF.iterrows():
    if row["yearID"] < (latestDataYear + 2):
        targetTeamsDF.at[index, "skip"] = 1
    elif row["teamYr"] < 3:
        targetTeamsDF.at[index, "skip"] = 1
    elif row["yearID"] == row["lastYear"]:
        targetTeamsDF.at[index, "skip"] = 1
    elif row["yearID"] == 2019:
        targetTeamsDF.at[index, "skip"] = 2
        
targetTeamsDF.tail()


Unnamed: 0,franchiseID,yearID,teamID,teamOnly,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr,skip
1133,FLA,2015,MIA-2015,MIA,162,0.438272,3.783951,8.765432,0.740741,2.314815,2012,2019,8,4,0
1134,FLA,2016,MIA-2016,MIA,161,0.490683,4.068323,9.068323,0.795031,2.776398,2012,2019,8,5,0
1135,FLA,2017,MIA-2017,MIA,162,0.475309,4.802469,9.240741,1.197531,3.0,2012,2019,8,6,0
1136,FLA,2018,MIA-2018,MIA,161,0.391304,3.658385,8.093168,0.795031,2.826087,2012,2019,8,7,0
1137,FLA,2019,MIA-2019,MIA,162,0.351852,3.796296,8.185185,0.901235,2.438272,2012,2019,8,8,1


In [145]:
# SOrt by Franchise and year - to get all franchise data together
sortedTeamsDF = targetTeamsDF.sort_values(by = ["teamID"])
sortedTeamsDF = sortedTeamsDF.reset_index(drop=True)
sortedTeamsDF.head()

Unnamed: 0,franchiseID,yearID,teamID,teamOnly,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr,skip
0,ANA,1997,ANA-1997,ANA,162,0.518519,5.117284,9.450617,0.993827,3.808642,1997,2004,8,1,1
1,ANA,1998,ANA-1998,ANA,162,0.524691,4.858025,9.444444,0.907407,3.148148,1997,2004,8,2,1
2,ANA,1999,ANA-1999,ANA,162,0.432099,4.388889,8.666667,0.975309,3.154321,1997,2004,8,3,0
3,ANA,2000,ANA-2000,ANA,162,0.506173,5.333333,9.716049,1.45679,3.753086,1997,2004,8,4,0
4,ANA,2001,ANA-2001,ANA,162,0.462963,4.265432,8.932099,0.975309,3.049383,1997,2004,8,5,0


In [152]:
#NOTE - This cell takes a while to execute

# Iterate through the sorted batting and grab previous stats
mlDF = sortedTeamsDF.copy()
# playersMLBatting = playersMLBatting.drop(columns=["birthYear", "debuYear"])

# Designate the stat of interes
stat = "winPct"
# Make Columns labels based on stat
p2Label = "p2-" + stat
p1Label = "p1-" + stat
f1Label = "f1-" + stat
# Add those columns to DF
mlDF[p2Label] = ""
mlDF[p1Label] = ""
mlDF[f1Label] = ""

# Itterate through DF and populate those columns
i = 0
for index, row in mlDF.iterrows():
    if row["skip"] == 1:
        continue
    p2Stat = mlDF.iloc[index - 2][stat]
    p1Stat = mlDF.iloc[index - 1][stat]
    f1Stat = mlDF.iloc[index + 1][stat]
    
    mlDF.at[index, p2Label] = p2Stat
    mlDF.at[index, p1Label] = p1Stat
    mlDF.at[index, f1Label] = f1Stat
    i +=1
    
    if (i % 100000 ==0):
        print(i)

    
    
mlDF.tail(10)

Unnamed: 0,franchiseID,yearID,teamID,teamOnly,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr,skip,p2-winPct,p1-winPct,f1-winPct
1128,WSN,2010,WAS-2010,WAS,162,0.425926,4.04321,8.364198,0.919753,3.104938,2005,2019,15,6,0,0.36646,0.364198,0.496894
1129,WSN,2011,WAS-2011,WAS,161,0.496894,3.875776,8.192547,0.956522,2.919255,2005,2019,15,7,0,0.364198,0.425926,0.604938
1130,WSN,2012,WAS-2012,WAS,162,0.604938,4.512346,9.061728,1.197531,2.95679,2005,2019,15,8,0,0.425926,0.496894,0.530864
1131,WSN,2013,WAS-2013,WAS,162,0.530864,4.049383,8.425926,0.993827,2.864198,2005,2019,15,9,0,0.496894,0.604938,0.592593
1132,WSN,2014,WAS-2014,WAS,162,0.592593,4.234568,8.660494,0.938272,3.191358,2005,2019,15,10,0,0.604938,0.530864,0.512346
1133,WSN,2015,WAS-2015,WAS,162,0.512346,4.339506,8.41358,1.092593,3.32716,2005,2019,15,11,0,0.530864,0.592593,0.58642
1134,WSN,2016,WAS-2016,WAS,162,0.58642,4.709877,8.660494,1.253086,3.308642,2005,2019,15,12,0,0.592593,0.512346,0.598765
1135,WSN,2017,WAS-2017,WAS,162,0.598765,5.055556,9.117284,1.32716,3.345679,2005,2019,15,13,0,0.512346,0.58642,0.506173
1136,WSN,2018,WAS-2018,WAS,162,0.506173,4.759259,8.654321,1.179012,3.895062,2005,2019,15,14,0,0.58642,0.598765,0.574074
1137,WSN,2019,WAS-2019,WAS,162,0.574074,5.388889,9.012346,1.425926,3.604938,2005,2019,15,15,1,,,


In [147]:
# # Get rid of the skipped rows, then all complete data
mlData = mlDF.loc[mlDF['skip'] == 0]
mlData.head()

Unnamed: 0,franchiseID,yearID,teamID,teamOnly,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr,skip,p2-winPct,p1-winPct,f1-winPct
2,ANA,1999,ANA-1999,ANA,162,0.432099,4.388889,8.666667,0.975309,3.154321,1997,2004,8,3,0,0.518519,0.524691,0.506173
3,ANA,2000,ANA-2000,ANA,162,0.506173,5.333333,9.716049,1.45679,3.753086,1997,2004,8,4,0,0.524691,0.432099,0.462963
4,ANA,2001,ANA-2001,ANA,162,0.462963,4.265432,8.932099,0.975309,3.049383,1997,2004,8,5,0,0.432099,0.506173,0.611111
5,ANA,2002,ANA-2002,ANA,162,0.611111,5.253086,9.895062,0.938272,2.851852,1997,2004,8,6,0,0.506173,0.462963,0.475309
6,ANA,2003,ANA-2003,ANA,162,0.475309,4.54321,9.092593,0.925926,2.938272,1997,2004,8,7,0,0.462963,0.611111,0.567901


In [148]:
# Now Working on a model
inputFactors = ["p2-winPct", "p1-winPct", "winPct", "RpG", "HpG", "HRpG", "BBpG"]
X = mlData[inputFactors]
y = mlData[["f1-winPct"]]
print(X.shape, y.shape)

(1033, 7) (1033, 1)


In [149]:
# Split data into training and test data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [150]:
# Create a model Using LinearRegression
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [151]:
# Fit the model to the Training Set
model.fit(X_train, y_train)

# Calculate the R2 scores
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.24005090321517164
Testing Score: 0.24014874293726748


In [132]:
# Print the Coefficients of the Model
coeffs = model.coef_.tolist()[0]
y_int = model.intercept_.tolist()[0]
print('Weight coefficients: ', coeffs)
print('y-axis intercept: ', y_int) 

Weight coefficients:  [-0.06961936549776153, 0.15283378613572646, 0.4260299051336414, -0.015717895312880963, 0.007578627580656025, 0.02547260622559739, 0.015771491513422217]
y-axis intercept:  0.17403753370739827


In [133]:
# Create a model Using LogisticRegression
from sklearn.linear_model import LogisticRegression
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# Fit the model to the Training Set
model_log = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Calculate the R2 scores
training_score_log = model_log.score(X_train, y_train)
testing_score_log = model_log.score(X_test, y_test)

print(f"Training Score: {training_score_log}")
print(f"Testing Score: {testing_score_log}")

  y = column_or_1d(y, warn=True)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0