<h1>Player Prediction Machine Learning Models - Batting</h1>

<h4>Import Dependencies</h4>

In [5]:
%matplotlib inline

In [61]:
import warnings
warnings.simplefilter('ignore')

import os
import csv
import pandas as pd

import sqlite3
import csv
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import numpy as np

from config import pgPassword

<h4>Create a connection to SQL database</h4>

In [2]:
pg_user = 'postgres'
pg_password = pgPassword
db_name = 'baseball_db'

connection_string = f"{pg_user}:{pg_password}@localhost:5432/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')

<h4>Read in databases tables as DatFrames</h4>

In [3]:
teamsStatsDF = pd.read_sql_table("Team-Stats", con = engine)
battingDF = pd.read_sql_table("Batting", con = engine)
pitchingDF = pd.read_sql_table("Pitching", con = engine)
playersDF = pd.read_sql_table("Players", con = engine)
franchisesDF = pd.read_sql_table("Franchises", con = engine)
salariesDF = pd.read_sql_table("Salaries", con = engine)
teamsDF = pd.read_sql_table("Teams", con = engine)
fPlayersDF = pd.read_sql_table("FranchisePlayers", con = engine)

In [4]:
battingDF.head()

Unnamed: 0,yearID,stint,G,R,H,HR,BB,IBB,SO,SB,fpID
0,1954,1,35,2,10,0,3,0.0,15.0,0.0,336907024
1,1955,1,46,1,2,0,1,0.0,0.0,0.0,336907024
2,1956,1,49,4,11,0,2,0.0,28.0,0.0,336907024
3,1957,1,49,4,14,0,6,0.0,26.0,0.0,336907024
4,1958,1,46,1,6,0,5,0.0,13.0,1.0,336907024


In [7]:
# Function to do the dataManipulation steps in PlayerBattingML
def manipulateDF(dfBatting, dfPlayers, dfFranchPlayers, latestDataYear, stat):
    #get only data from latest year forward
    dfBatting = dfBatting[dfBatting["yearID"] >= latestDataYear]
    #combine stats of players with multiple stints in a year
    dfBatting = dfBatting.groupby(["yearID", "fpID"]).sum().reset_index()
    
    # Merge FranchisePlayer and PLayer Tables
    dfFP = dfFranchPlayers[["playerID", "fpID"]]
    playerMergedDF = pd.merge(dfPlayers, dfFP, on = ["playerID"])
    # Get years out of columns
    playerMergedDF["debutYear"] = playerMergedDF["debut"].dt.year
    playerMergedDF["finalYear"] = playerMergedDF["finalGame"].dt.year
    playerMergedDF["totalYears"] = playerMergedDF.finalYear + 1 - playerMergedDF.debutYear
    playerMergedDF = playerMergedDF[["fpID", "playerID", "birthYear", "debutYear", "finalYear", "totalYears"]]
    
    # Merga Player and Batting Data
    mergedBatting = pd.merge(dfBatting, playerMergedDF, on = ["fpID"])
    
    # Normalize statistics to be per game

    # Divide statistics by games played
    mergedBatting["RpG"] = mergedBatting.R / mergedBatting.G
    mergedBatting["HpG"] = mergedBatting.H / mergedBatting.G
    mergedBatting["HRpG"] = mergedBatting.HR / mergedBatting.G
    mergedBatting["BBpG"] = (mergedBatting.BB + mergedBatting.IBB) / mergedBatting.G
    mergedBatting["SOpG"] = mergedBatting.SO / mergedBatting.G
    mergedBatting["SBpG"] = mergedBatting.SB / mergedBatting.G

    #Limit to only columns of interest
    desiredCols = ["yearID", "fpID", "G","RpG", "HpG", "HRpG", "BBpG", "SOpG", "SBpG",
                   "birthYear", "debutYear", "finalYear", "totalYears"]
    mergedBatting = mergedBatting[desiredCols]
    
    # Add age and careerYears columns
    mergedBatting["careerYear"] = mergedBatting.yearID + 1 - mergedBatting.debutYear
    mergedBatting["age"] = mergedBatting.yearID - mergedBatting.birthYear
    
    # Add a column to indicate rows that should be skipped
    #  years < latestDataYear + 2 (we can't get 2 year previous data for these years)
    #  careerYear < 3 (we can't get 2 year previous data for these years)
    #  year = finalYear (we can't use next year to check model)
    #  year = 2019 (last year of our data so )

    mergedBatting["skip"] = 0
    for index, row in mergedBatting.iterrows():
        if row["yearID"] < (latestDataYear + 2):
            mergedBatting.at[index, "skip"] = 1
        elif row["careerYear"] < 3:
            mergedBatting.at[index, "skip"] = 1
        elif row["yearID"] == row["finalYear"]:
            mergedBatting.at[index, "skip"] = 1
        elif row["yearID"] == 2019:
            mergedBatting.at[index, "skip"] = 1
            
    #  Sort by player and year - to get career all together
    sortedBatting = mergedBatting.sort_values(by = ["fpID", "yearID"])
    sortedBatting = sortedBatting.reset_index(drop=True)
    
    
    mlDF = sortedBatting.copy()
# playersMLBatting = playersMLBatting.drop(columns=["birthYear", "debuYear"])

    # Iterate through the sorted batting and grab previous stats
    # Make Columns labels based on stat
    p2Label = "p2-" + stat
    p1Label = "p1-" + stat
    f1Label = "f1-" + stat
    # Add those columns to DF
    mlDF[p2Label] = ""
    mlDF[p1Label] = ""
    mlDF[f1Label] = ""

    # Itterate through DF and populate those columns
    for index, row in mlDF.iterrows():
        if row["skip"] == 1:
            continue
        p2Stat = mlDF.iloc[index - 2][stat]
        p1Stat = mlDF.iloc[index - 1][stat]
        f1Stat = mlDF.iloc[index + 1][stat]

        mlDF.at[index, p2Label] = p2Stat
        mlDF.at[index, p1Label] = p1Stat
        mlDF.at[index, f1Label] = f1Stat
        
    # Get rid of the skipped rows, then all complete data
    mlData = mlDF.loc[mlDF['skip'] == 0]
    
    return mlData


In [8]:
rpgDF = manipulateDF(battingDF, playersDF, fPlayersDF, 1980, 'RpG')
rpgDF.tail(20)

Unnamed: 0,yearID,fpID,G,RpG,HpG,HRpG,BBpG,SOpG,SBpG,birthYear,debutYear,finalYear,totalYears,careerYear,age,skip,p2-RpG,p1-RpG,f1-RpG
285170,2009,698387328,159,0.616352,0.962264,0.226415,0.622642,0.981132,0.125786,1979.0,2002,2017,16,8,30.0,0,0.457447,0.544776,0.679487
285171,2010,698387328,156,0.679487,1.051282,0.173077,0.564103,0.942308,0.083333,1979.0,2002,2017,16,9,31.0,0,0.544776,0.616352,0.46
285172,2011,698387328,150,0.46,0.866667,0.133333,0.526667,1.066667,0.126667,1979.0,2002,2017,16,10,32.0,0,0.616352,0.679487,0.518519
285173,2012,698387328,81,0.518519,1.111111,0.061728,0.54321,0.703704,0.098765,1979.0,2002,2017,16,11,33.0,0,0.679487,0.46,0.651163
285174,2013,698387328,129,0.651163,1.139535,0.193798,0.488372,0.782946,0.077519,1979.0,2002,2017,16,12,34.0,0,0.46,0.518519,0.578231
285175,2014,698387328,147,0.578231,1.061224,0.108844,0.585034,0.768707,0.061224,1979.0,2002,2017,16,13,35.0,0,0.518519,0.651163,0.579545
285176,2015,698387328,88,0.579545,0.829545,0.136364,0.431818,0.954545,0.0,1979.0,2002,2017,16,14,36.0,0,0.651163,0.578231,0.587413
285177,2016,698387328,143,0.587413,0.895105,0.146853,0.496503,0.972028,0.034965,1979.0,2002,2017,16,15,37.0,0,0.578231,0.579545,0.5
285181,2007,698440183,162,0.611111,1.074074,0.148148,0.395062,0.771605,0.024691,1984.0,2005,2019,15,3,23.0,0,0.3,0.535032,0.481132
285182,2008,698440183,106,0.481132,1.141509,0.132075,0.301887,0.669811,0.009434,1984.0,2005,2019,15,4,24.0,0,0.535032,0.611111,0.700637


In [167]:
# Function to Split data and run model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

def run_ML_Model(mlDF, stat):
    mlData = mlDF.loc[mlDF['skip'] == 0]
    p2Label = "p2-" + stat
    p1Label = "p1-" + stat
    f1Label = "f1-" + stat
    
#     print("== All Inputs ==")
    inputFactors = [p2Label, p1Label, "RpG", "HpG", "HRpG", "BBpG", "age"]
    X = mlData[inputFactors]
    y = mlData[[f1Label]]
    
    # Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
    
    #LinearModel===============
    model_lin = LinearRegression()
    model_lin.fit(X_train, y_train)
    # Calculate the R2 scores
    training_score_lin = model_lin.score(X_train, y_train)
    testing_score_lin = model_lin.score(X_test, y_test)
    # Get coeffs and Y-int
    coeffs_lin = model_lin.coef_.tolist()[0]
    y_int_lin = model_lin.intercept_.tolist()[0]
#     print("===================")
#     print("=== Linear ===")
#     print(f"{stat}: Training Score: {training_score_lin}")
#     print(f"{stat}:Testing Score: {testing_score_lin}")
#     print('Weight coefficients: ', coeffs_lin)
#     print('y-axis intercept: ', y_int_lin)
    
    
    
    inputFactors2 = [p2Label, p1Label, stat]
    X2 = mlData[inputFactors2]
    y2 = mlData[[f1Label]]
    
    # Split Data
    X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=5)
    
    #LinearModel===============
    model_lin2 = LinearRegression()
    model_lin2.fit(X_train2, y_train2)
    # Calculate the R2 scores
    training_score_lin2 = model_lin2.score(X_train2, y_train2)
    testing_score_lin2 = model_lin2.score(X_test2, y_test2)
    # Get coeffs and Y-int
    coeffs_lin2 = model_lin2.coef_.tolist()[0]
    y_int_lin2 = model_lin2.intercept_.tolist()[0]
#     print("===================")
#     print("=== Linear ===")
#     print(f"{stat}: Training Score: {training_score_lin2}")
#     print(f"{stat}:Testing Score: {testing_score_lin2}")
#     print('Weight coefficients: ', coeffs_lin2)
#     print('y-axis intercept: ', y_int_lin2)
#     print("===================")

    
    
    
#     inputFactors.append("Y-Int")
    
#     coeffs_lin.append(y_int_lin)
#     mlFormulaLin = {'Stat': stat,
#                  'Input': inputFactors,
#                  'Coeff': coeffs_lin,
#                 'TrainingScore': training_score_lin,
#                 'TestingScore': testing_score_lin}
    
#     coeffs_log.append(y_int_log)
#     mlFormulaLog = {'Stat': stat,
#                  'Input': inputFactors,
#                  'Coeff': coeffs_log,
#                 'TrainingScore': training_score_log,
#                 'TestingScore': testing_score_log}
    
#     mlFormulaDF = pd.DataFrame(rpgFormula)
    
    return (model_lin, model_lin2)

    
    
    

In [103]:
# Comment after done - Takes a long time to execute

# rpgDF = manipulateDF(battingDF, playersDF, fPlayersDF, 1980, 'RpG')

In [166]:
rpgFormulaDF = run_ML_Model(rpgDF, "RpG")

ValueError: Unknown label type: 'continuous'

In [105]:
# Comment after done - Takes a long time to execute

# hpgDF = manipulateDF(battingDF, playersDF, fPlayersDF, 1980, 'HpG')

In [106]:
hpgFormulaDF = run_ML_Model(hpgDF, "HpG")

== All Inputs ==
=== Linear ===
HpG: Training Score: 0.8715164888744652
HpG:Testing Score: 0.8687481175957179
Weight coefficients:  [0.10314856376993395, 0.19415132905121496, 0.18162821828050904, 0.5194927746888385, 0.0012229294781802417, 0.06165500712347907, -0.0074503876487092]
y-axis intercept:  0.24246860780617735
=== Logistic ===
HpG: Training Score: 0.8916378820479751
HpG:Testing Score: 0.8930432117683114
Weight coefficients:  [-0.9996337357623933, -1.844898940235335, -2.564941974599952, -4.743071592344707, 1.3936783890665807, 0.03613243172052249, 0.12274534568955929]
y-axis intercept:  5.197697811621753


== Limited Inputs ==
=== Linear ===
HpG: Training Score: 0.8651676346473534
HpG:Testing Score: 0.8622514632316883
Weight coefficients:  [0.08714549900524404, 0.2037562939094359, 0.6477022175134244]
y-axis intercept:  0.01424028097213692
=== Logistic ===
HpG: Training Score: 0.8878644898897352
HpG:Testing Score: 0.8874693533558076
Weight coefficients:  [-0.4286468213503534, -1.8

In [107]:
# Comment after done - Takes a long time to execute

# HRpgDF = manipulateDF(battingDF, playersDF, fPlayersDF, 1980, 'HRpG')

In [108]:
HRpgFormulaDF = run_ML_Model(HRpgDF, "HRpG")

== All Inputs ==
=== Linear ===
HRpG: Training Score: 0.7506382109274283
HRpG:Testing Score: 0.7477265889288973
Weight coefficients:  [0.12387728987114553, 0.23604852946665117, 0.001395048142083439, 0.0076065087334711455, 0.43672490880096254, 0.02728005527867255, -0.0014774201660191725]
y-axis intercept:  0.04530755224799877
=== Logistic ===
HRpG: Training Score: 0.999993615241695
HRpG:Testing Score: 0.9999808458473797
Weight coefficients:  [0.09255537432606839, -0.05702119932325963, -0.1315496157641472, -0.24862163345395707, 0.020708289380589155, 0.11139373446565075, -0.4103042900674223]
y-axis intercept:  -0.20466408268298275


== Limited Inputs ==
=== Linear ===
HRpG: Training Score: 0.7384870704767421
HRpG:Testing Score: 0.7360142716247464
Weight coefficients:  [0.12516680723610585, 0.26313077206790725, 0.511731353126307]
y-axis intercept:  0.0045198199349791995
=== Logistic ===
HRpG: Training Score: 0.999993615241695
HRpG:Testing Score: 0.9999808458473797
Weight coefficients:  [-0

In [126]:
# Comment after done - Takes a long time to execute

# BBpGDF = manipulateDF(battingDF, playersDF, fPlayersDF, 1980, 'BBpG')

In [110]:
BBpgFormulaDF = run_ML_Model(BBpgDF, "BBpG")

== All Inputs ==
=== Linear ===
BBpG: Training Score: 0.8283702767374077
BBpG:Testing Score: 0.8242480220140065
Weight coefficients:  [0.12563436656202243, 0.23236127243766133, 0.0789105498461265, 0.020226277782279756, 0.23356937432531713, 0.40644771112956873, -0.002947620874419216]
y-axis intercept:  0.09164923392698321
=== Logistic ===
BBpG: Training Score: 0.9978419516929187
BBpG:Testing Score: 0.9974908060067422
Weight coefficients:  [-2.6461982864138984, -1.69571567579117, -1.829750250478428, 3.057365664121115, -5.563884883935782, -3.979795088736442, 0.1514541529270679]
y-axis intercept:  4.50782975453504


== Limited Inputs ==
=== Linear ===
BBpG: Training Score: 0.8141481062825023
BBpG:Testing Score: 0.8113784351047519
Weight coefficients:  [0.12523186388768584, 0.27092556666006384, 0.5406527328159626]
y-axis intercept:  0.010861249330943623
=== Logistic ===
BBpG: Training Score: 0.9976695632186844
BBpG:Testing Score: 0.9972801103279191
Weight coefficients:  [-1.978932938793326,

In [149]:
BBpGDF.head(1)

Unnamed: 0,yearID,fpID,G,RpG,HpG,HRpG,BBpG,SOpG,SBpG,birthYear,debutYear,finalYear,totalYears,careerYear,age,skip,p2-BBpG,p1-BBpG,f1-BBpG
2,1982,1000,33,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,1976,1987,12,7,27.0,0,0,0,0


In [150]:
rowSel = BBpGDF.iloc[[0]]
rowSel

Unnamed: 0,yearID,fpID,G,RpG,HpG,HRpG,BBpG,SOpG,SBpG,birthYear,debutYear,finalYear,totalYears,careerYear,age,skip,p2-BBpG,p1-BBpG,f1-BBpG
2,1982,1000,33,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,1976,1987,12,7,27.0,0,0,0,0


In [164]:
model1 = BBpgFormulaDF[0]
row1 = BBpgDF.head(1)
rowSel = BBpGDF.loc[[2]]
rowSel
predVal = model1.predict(row1[["p2-BBpG", "p1-BBpG", "RpG", "HpG", "HRpG", "BBpG", "age"]])
predVal2 = model1.predict(rowSel[["p2-BBpG", "p1-BBpG", "RpG", "HpG", "HRpG", "BBpG", "age"]])
print(predVal.tolist()[0][0])
print(predVal2)

0.012063470317664374
[[0.01206347]]


In [128]:
model0R = run_ML_Model(rpgDF, "RpG")[0]
model1R = run_ML_Model(rpgDF, "RpG")[1]
model2R = run_ML_Model(rpgDF, "RpG")[2]
model3R = run_ML_Model(rpgDF, "RpG")[3]

model0H = run_ML_Model(hpgDF, "HpG")[0]
model1H = run_ML_Model(hpgDF, "HpG")[1]
model2H = run_ML_Model(hpgDF, "HpG")[2]
model3H = run_ML_Model(hpgDF, "HpG")[3]

model0HR = run_ML_Model(HRpgDF, "HRpG")[0]
model1HR = run_ML_Model(HRpgDF, "HRpG")[1]
model2HR = run_ML_Model(HRpgDF, "HRpG")[2]
model3HR = run_ML_Model(HRpgDF, "HRpG")[3]

model0BB = run_ML_Model(BBpGDF, "BBpG")[0]
model1BB = run_ML_Model(BBpGDF, "BBpG")[1]
model2BB = run_ML_Model(BBpGDF, "BBpG")[2]
model3BB = run_ML_Model(BBpGDF, "BBpG")[3]

In [159]:
newBBpGDF = BBpGDF.copy()
# statsList = ["RpG", "HpG", "HRpG", "BBpG"]
# for s in statsList:
#     new_label = "pred" + s
#     newBBpGDF[new_label] = ""
newBBpGDF[predBBpG] = ""
newBBpGDF.head()
for index, row in newBBpGDF.iterrows():
    rowSelected = newBBpGDF.loc[[index]]
    
#     predR = model0R.predict(rowSelected[["p2-RpG", "p1-RpG", "RpG", "HpG", "HRpG", "BBpG", "age"]])
#     predH = model0H.predict(rowSelected[["p2-HpG", "p1-HpG", "RpG", "HpG", "HRpG", "BBpG", "age"]])
#     predHR = model0HR.predict(rowSelected[["p2-HRpG", "p1-HRpG", "RpG", "HpG", "HRpG", "BBpG", "age"]])
    predBB = model0BB.predict(rowSelected[["p2-BBpG", "p1-BBpG", "RpG", "HpG", "HRpG", "BBpG", "age"]])
    
#     newBBpGDF.at[index, "predRpG"] = predR
#     newBBpGDF.at[index, "predHpG"] = predH
#     newBBpGDF.at[index, "predHRpG"] = predHR
    newBBpGDF.at[index, "predBBpG"] = predBB[0][0]
    
newBBpGDF.head()
    

Unnamed: 0,yearID,fpID,G,RpG,HpG,HRpG,BBpG,SOpG,SBpG,birthYear,...,careerYear,age,skip,p2-BBpG,p1-BBpG,f1-BBpG,predRpG,predHpG,predHRpG,predBBpG
2,1982,1000,33,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,...,7,27.0,0,0,0.0,0.0,,,,[[0.012063470317664374]]
3,1983,1000,30,0.033333,0.033333,0.0,0.0,0.166667,0.0,1955.0,...,8,28.0,0,0,0.0,0.285714,,,,[[0.01242041036419203]]
4,1984,1000,21,0.095238,0.095238,0.0,0.285714,0.904762,0.0,1955.0,...,9,29.0,0,0,0.0,0.0,,,,[[0.1317377486656938]]
5,1985,1000,20,0.0,0.0,0.0,0.0,0.35,0.0,1955.0,...,10,30.0,0,0,0.285714,0.0,,,,[[0.06960954267659568]]
9,1983,1003,70,0.0,0.028571,0.0,0.0,0.1,0.0,1957.0,...,3,26.0,0,0,0.015625,0.0,,,,[[0.019219629724844325]]
