<h1>TEAM Prediction Machine Learning Models - Batting</h1>

<h4>Import Dependencies</h4>

In [1]:
%matplotlib inline

In [2]:
import os
import csv
import pandas as pd

import sqlite3
import csv
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import numpy as np

from config import pgPassword

<h4>Create a connection to SQL database</h4>

In [3]:
pg_user = 'postgres'
pg_password = pgPassword
db_name = 'baseball_db'

connection_string = f"{pg_user}:{pg_password}@localhost:5432/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')

<h4>Read in databases tables as DatFrames</h4>

In [4]:
teamsStatsDF = pd.read_sql_table("Team-Stats", con = engine)
battingDF = pd.read_sql_table("Batting", con = engine)
pitchingDF = pd.read_sql_table("Pitching", con = engine)
playersDF = pd.read_sql_table("Players", con = engine)
franchisesDF = pd.read_sql_table("Franchises", con = engine)
salariesDF = pd.read_sql_table("Salaries", con = engine)
teamsDF = pd.read_sql_table("Teams", con = engine)
fPlayersDF = pd.read_sql_table("FranchisePlayers", con = engine)

In [63]:
teamsStatsDF.head()

Unnamed: 0,franchiseID,yearID,G,W,R,H,HR,BB,SO,SB,RA,ERA,HA,HRA,BBA,SOA,statID,teamID
0,BNA,1871,31,20,401,426,3,60.0,19.0,73.0,303,3.55,367,2,42,23,1000,BS1-1871
1,CNA,1871,28,19,302,323,10,60.0,22.0,69.0,241,2.76,308,6,28,22,1003,CH1-1871
2,CFC,1871,29,10,249,328,7,26.0,25.0,18.0,341,4.11,346,13,53,34,1008,CL1-1871
3,KEK,1871,19,7,137,178,2,33.0,9.0,16.0,243,5.17,261,5,21,17,1015,FW1-1871
4,NNA,1871,33,16,302,403,1,33.0,15.0,46.0,313,3.72,373,7,42,22,1024,NY2-1871


<h4>Manipulate Data</h4>

In [64]:
# Get data going back to latest year of interest
latestDataYear = 1980
targetTeamsDF = teamsStatsDF[teamsStatsDF["yearID"] >=latestDataYear]
targetTeamsDF.head()

Unnamed: 0,franchiseID,yearID,G,W,R,H,HR,BB,SO,SB,RA,ERA,HA,HRA,BBA,SOA,statID,teamID
1787,ATL,1980,161,81,630,1352,144,434.0,899.0,73.0,660,3.77,1397,131,454,696,3197943,ATL-1980
1788,BAL,1980,162,100,805,1523,156,587.0,766.0,111.0,640,3.64,1438,134,507,789,3201520,BAL-1980
1789,BOS,1980,160,83,757,1588,162,475.0,720.0,79.0,767,4.38,1557,129,481,696,3205099,BOS-1980
1790,ANA,1980,160,65,698,1442,106,539.0,889.0,91.0,797,4.52,1548,141,529,725,3208680,CAL-1980
1791,CHW,1980,162,70,587,1408,91,399.0,670.0,68.0,722,3.92,1434,108,563,724,3212263,CHA-1980


In [65]:
# Limit to columns of interest
colsOfInterst = ["franchiseID", "yearID", "G", "W", "R", "H", "HR", "BB"]
targetTeamsDF = targetTeamsDF[colsOfInterst]
targetTeamsDF.head()

Unnamed: 0,franchiseID,yearID,G,W,R,H,HR,BB
1787,ATL,1980,161,81,630,1352,144,434.0
1788,BAL,1980,162,100,805,1523,156,587.0
1789,BOS,1980,160,83,757,1588,162,475.0
1790,ANA,1980,160,65,698,1442,106,539.0
1791,CHW,1980,162,70,587,1408,91,399.0


In [66]:
# Find first year, last year, and total years of franchise
teamYearsDF = targetTeamsDF[["franchiseID", "yearID"]]
teamYearsDF = teamYearsDF.groupby("franchiseID").agg(['min', 'max', 'count'])
teamYearsDF = teamYearsDF.reset_index()
teamYearsDF.columns = teamYearsDF.columns.droplevel()
teamYearsDF = teamYearsDF.rename(columns={"": "franchiseID",
                                          "min": "firstYear",
                                         "max": "lastYear",
                                         "count": "totalYears"})
teamYearsDF.head()

Unnamed: 0,franchiseID,firstYear,lastYear,totalYears
0,ANA,1980,2019,40
1,ARI,1998,2019,22
2,ATL,1980,2019,40
3,BAL,1980,2019,40
4,BOS,1980,2019,40


In [67]:
# Merge Years Data with target data
targetTeamsDF = pd.merge(targetTeamsDF, teamYearsDF, on = ["franchiseID"])

# Create field for yearofTeam
targetTeamsDF["teamYr"] = targetTeamsDF["yearID"] + 1 - targetTeamsDF["firstYear"]
targetTeamsDF.head()

Unnamed: 0,franchiseID,yearID,G,W,R,H,HR,BB,firstYear,lastYear,totalYears,teamYr
0,ATL,1980,161,81,630,1352,144,434.0,1980,2019,40,1
1,ATL,1981,107,50,395,886,64,321.0,1980,2019,40,2
2,ATL,1982,162,89,739,1411,146,554.0,1980,2019,40,3
3,ATL,1983,162,88,746,1489,130,582.0,1980,2019,40,4
4,ATL,1984,162,80,632,1338,111,555.0,1980,2019,40,5


In [68]:
# Add a winPCT field and make stats per game stats
targetTeamsDF["winPct"] = targetTeamsDF.W / targetTeamsDF.G
targetTeamsDF["RpG"] = targetTeamsDF.R / targetTeamsDF.G
targetTeamsDF["HpG"] = targetTeamsDF.H / targetTeamsDF.G
targetTeamsDF["HRpG"] = targetTeamsDF.HR / targetTeamsDF.G
targetTeamsDF["BBpG"] = targetTeamsDF.BB / targetTeamsDF.G

# Then keep only perGame cols
perGameCols = ['franchiseID', 'yearID', 'G', 'winPct', 
               'RpG', 'HpG', 'HRpG', 'BBpG', 'firstYear', 'lastYear', 'totalYears', 'teamYr']
targetTeamsDF = targetTeamsDF[perGameCols]
targetTeamsDF.head()

Unnamed: 0,franchiseID,yearID,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr
0,ATL,1980,161,0.503106,3.913043,8.397516,0.89441,2.695652,1980,2019,40,1
1,ATL,1981,107,0.46729,3.691589,8.280374,0.598131,3.0,1980,2019,40,2
2,ATL,1982,162,0.549383,4.561728,8.709877,0.901235,3.419753,1980,2019,40,3
3,ATL,1983,162,0.54321,4.604938,9.191358,0.802469,3.592593,1980,2019,40,4
4,ATL,1984,162,0.493827,3.901235,8.259259,0.685185,3.425926,1980,2019,40,5


In [69]:
# NOTE - This cell take a while to execute

# Add a column to indicate rows that should be skipped
#  years < latestDataYear + 2 (we can't get 2 year previous data for these years)
#  teamYr < 3 (we can't get 2 year previous data for these years)
#  year = lastYear (we can't use next year to check model)
#  year = 2019 (last year of our data so )

targetTeamsDF["skip"] = 0
for index, row in targetTeamsDF.iterrows():
    if row["yearID"] < (latestDataYear + 2):
        targetTeamsDF.at[index, "skip"] = 1
    elif row["teamYr"] < 3:
        targetTeamsDF.at[index, "skip"] = 1
    elif row["yearID"] == row["lastYear"]:
        targetTeamsDF.at[index, "skip"] = 1
    elif row["yearID"] == 2019:
        targetTeamsDF.at[index, "skip"] = 1
        
targetTeamsDF.tail()


Unnamed: 0,franchiseID,yearID,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr,skip
1133,TBD,2015,162,0.493827,3.975309,8.537037,1.030864,2.691358,1998,2019,22,18,0
1134,TBD,2016,162,0.419753,4.148148,8.228395,1.333333,2.771605,1998,2019,22,19,0
1135,TBD,2017,162,0.493827,4.283951,8.271605,1.407407,3.364198,1998,2019,22,20,0
1136,TBD,2018,162,0.555556,4.419753,8.734568,0.925926,3.333333,1998,2019,22,21,0
1137,TBD,2019,162,0.592593,4.746914,8.808642,1.339506,3.345679,1998,2019,22,22,1


In [70]:
# SOrt by Franchise and year - to get all franchise data together
sortedTeamsDF = targetTeamsDF.sort_values(by = ["franchiseID", "yearID"])
sortedTeamsDF = sortedTeamsDF.reset_index(drop=True)
sortedTeamsDF.head()

Unnamed: 0,franchiseID,yearID,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr,skip
0,ANA,1980,160,0.40625,4.3625,9.0125,0.6625,3.36875,1980,2019,40,1,1
1,ANA,1981,110,0.463636,4.327273,8.581818,0.881818,3.572727,1980,2019,40,2,1
2,ANA,1982,162,0.574074,5.024691,9.37037,1.148148,3.783951,1980,2019,40,3,0
3,ANA,1983,162,0.432099,4.45679,9.055556,0.950617,3.141975,1980,2019,40,4,0
4,ANA,1984,162,0.5,4.296296,8.41358,0.925926,3.432099,1980,2019,40,5,0


In [71]:
#NOTE - This cell takes a while to execute

# Iterate through the sorted batting and grab previous stats
mlDF = sortedTeamsDF.copy()
# playersMLBatting = playersMLBatting.drop(columns=["birthYear", "debuYear"])

# Designate the stat of interes
stat = "winPct"
# Make Columns labels based on stat
p2Label = "p2-" + stat
p1Label = "p1-" + stat
f1Label = "f1-" + stat
# Add those columns to DF
mlDF[p2Label] = ""
mlDF[p1Label] = ""
mlDF[f1Label] = ""

# Itterate through DF and populate those columns
i = 0
for index, row in mlDF.iterrows():
    if row["skip"] == 1:
        continue
    p2Stat = mlDF.iloc[index - 2][stat]
    p1Stat = mlDF.iloc[index - 1][stat]
    f1Stat = mlDF.iloc[index + 1][stat]
    
    mlDF.at[index, p2Label] = p2Stat
    mlDF.at[index, p1Label] = p1Stat
    mlDF.at[index, f1Label] = f1Stat
    i +=1
    
    if (i % 100000 ==0):
        print(i)

    
    
mlDF.head(10)

Unnamed: 0,franchiseID,yearID,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr,skip,p2-winPct,p1-winPct,f1-winPct
0,ANA,1980,160,0.40625,4.3625,9.0125,0.6625,3.36875,1980,2019,40,1,1,,,
1,ANA,1981,110,0.463636,4.327273,8.581818,0.881818,3.572727,1980,2019,40,2,1,,,
2,ANA,1982,162,0.574074,5.024691,9.37037,1.148148,3.783951,1980,2019,40,3,0,0.40625,0.463636,0.432099
3,ANA,1983,162,0.432099,4.45679,9.055556,0.950617,3.141975,1980,2019,40,4,0,0.463636,0.574074,0.5
4,ANA,1984,162,0.5,4.296296,8.41358,0.925926,3.432099,1980,2019,40,5,0,0.574074,0.432099,0.555556
5,ANA,1985,162,0.555556,4.518519,8.419753,0.944444,4.0,1980,2019,40,6,0,0.432099,0.5,0.567901
6,ANA,1986,162,0.567901,4.851852,8.561728,1.030864,4.141975,1980,2019,40,7,0,0.5,0.555556,0.462963
7,ANA,1987,162,0.462963,4.753086,8.679012,1.061728,3.641975,1980,2019,40,8,0,0.555556,0.567901,0.462963
8,ANA,1988,162,0.462963,4.407407,9.0,0.765432,2.895062,1980,2019,40,9,0,0.567901,0.462963,0.561728
9,ANA,1989,162,0.561728,4.12963,8.777778,0.895062,2.648148,1980,2019,40,10,0,0.462963,0.462963,0.493827


In [72]:
# # Get rid of the skipped rows, then all complete data
mlData = mlDF.loc[mlDF['skip'] == 0]
mlData.head()

Unnamed: 0,franchiseID,yearID,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr,skip,p2-winPct,p1-winPct,f1-winPct
2,ANA,1982,162,0.574074,5.024691,9.37037,1.148148,3.783951,1980,2019,40,3,0,0.40625,0.463636,0.432099
3,ANA,1983,162,0.432099,4.45679,9.055556,0.950617,3.141975,1980,2019,40,4,0,0.463636,0.574074,0.5
4,ANA,1984,162,0.5,4.296296,8.41358,0.925926,3.432099,1980,2019,40,5,0,0.574074,0.432099,0.555556
5,ANA,1985,162,0.555556,4.518519,8.419753,0.944444,4.0,1980,2019,40,6,0,0.432099,0.5,0.567901
6,ANA,1986,162,0.567901,4.851852,8.561728,1.030864,4.141975,1980,2019,40,7,0,0.5,0.555556,0.462963


In [86]:
# Now Working on a model
inputFactors = ["p2-winPct", "p1-winPct", "winPct", "RpG", "HpG", "HRpG", "BBpG"]
X = mlData[inputFactors]
y = mlData[["f1-winPct"]]
print(X.shape, y.shape)

(1048, 7) (1048, 1)


In [87]:
# Split data into training and test data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [88]:
# Create a model Using LinearRegression
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [89]:
# Fit the model to the Training Set
model.fit(X_train, y_train)

# Calculate the R2 scores
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.239473036978051
Testing Score: 0.23792586096036472


In [77]:
# Print the Coefficients of the Model
coeffs = model.coef_.tolist()[0]
y_int = model.intercept_.tolist()[0]
print('Weight coefficients: ', coeffs)
print('y-axis intercept: ', y_int) 

Weight coefficients:  [-0.05834660507350355, 0.15762533483212976, 0.39805548220715725, -0.008463274352752462, 0.007982508438001223, 0.014318026824395171, 0.01076024404924125]
y-axis intercept:  0.17077072633737866


In [85]:
# Create a model Using LogisticRegression
from sklearn.linear_model import LogisticRegression
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# Fit the model to the Training Set
model_log = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Calculate the R2 scores
training_score_log = model_log.score(X_train, y_train)
testing_score_log = model_log.score(X_test, y_test)

print(f"Training Score: {training_score_log}")
print(f"Testing Score: {testing_score_log}")

  y = column_or_1d(y, warn=True)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0