<h1>Player Prediction Machine Learning Models - Batting</h1>

<h4>Import Dependencies</h4>

In [27]:
%matplotlib inline

In [28]:
import os
import csv
import pandas as pd

import sqlite3
import csv
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import numpy as np

from config import pgPassword

<h4>Create a connection to SQL database</h4>

In [29]:
pg_user = 'postgres'
pg_password = pgPassword
db_name = 'baseball_db'

connection_string = f"{pg_user}:{pg_password}@localhost:5432/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')

<h4>Read in databases tables as DatFrames</h4>

In [30]:
teamsStatsDF = pd.read_sql_table("Team-Stats", con = engine)
battingDF = pd.read_sql_table("Batting", con = engine)
pitchingDF = pd.read_sql_table("Pitching", con = engine)
playersDF = pd.read_sql_table("Players", con = engine)
franchisesDF = pd.read_sql_table("Franchises", con = engine)
salariesDF = pd.read_sql_table("Salaries", con = engine)
teamsDF = pd.read_sql_table("Teams", con = engine)
fPlayersDF = pd.read_sql_table("FranchisePlayers", con = engine)

In [31]:
battingDF.head()

Unnamed: 0,yearID,stint,G,R,H,HR,BB,IBB,SO,SB,fpID
0,1954,1,35,2,10,0,3,0.0,15.0,0.0,336907024
1,1955,1,46,1,2,0,1,0.0,0.0,0.0,336907024
2,1956,1,49,4,11,0,2,0.0,28.0,0.0,336907024
3,1957,1,49,4,14,0,6,0.0,26.0,0.0,336907024
4,1958,1,46,1,6,0,5,0.0,13.0,1.0,336907024


<h4>Manipulate Data</h4>

In [32]:
# Get data going back to latest year of interest
latestDataYear = 1980
targetBattingDF = battingDF[battingDF["yearID"] >=latestDataYear]
targetBattingDF.head()

Unnamed: 0,yearID,stint,G,R,H,HR,BB,IBB,SO,SB,fpID
87,1980,1,36,0,0,0,0,0.0,0.0,0.0,143883
88,1980,1,36,0,0,0,0,0.0,0.0,0.0,424800
89,1980,1,36,0,0,0,0,0.0,0.0,0.0,989035
90,1980,1,36,0,0,0,0,0.0,0.0,0.0,2809975
91,1980,1,36,0,0,0,0,0.0,0.0,0.0,5323248


In [33]:
# Combine the stats of players with multiple stints in a year
combinedStints = targetBattingDF.groupby(["yearID", "fpID"]).sum().reset_index()
combinedStints.head()

Unnamed: 0,yearID,fpID,stint,G,R,H,HR,BB,IBB,SO,SB
0,1980,1000,1,36,0,0,0,0,0.0,0.0,0.0
1,1980,1008,1,120,18,91,2,28,8.0,36.0,3.0
2,1980,1015,1,77,0,1,0,0,0.0,6.0,0.0
3,1980,1024,1,147,70,144,14,32,2.0,56.0,1.0
4,1980,1035,1,158,83,170,18,49,6.0,73.0,7.0


In [34]:
# Check to make sure combining Stints worked properly
totalRecords = targetBattingDF.shape[0]
combinedRecords = combinedStints.shape[0]

multipleStints = targetBattingDF.loc[targetBattingDF["stint"] > 1]
multipleStintRecords = multipleStints.shape[0]

if totalRecords - multipleStintRecords == combinedRecords:
    print("Success")
else:
    print("Failure")

Success


In [35]:
# Get Player Info

#just the player ID and fpID
fPlayersDF1 = fPlayersDF[["playerID", "fpID"]]
# Merge players with fplayers1
playerMergedDF = pd.merge(playersDF, fPlayersDF1, on = ["playerID"])
playerMergedDF.head()

Unnamed: 0,playerID,birthYear,nameFirst,nameLast,debut,finalGame,fpID
0,aardsda01,1981.0,David,Aardsma,2004-04-06,2015-08-23,266506624
1,aardsda01,1981.0,David,Aardsma,2004-04-06,2015-08-23,332151624
2,aardsda01,1981.0,David,Aardsma,2004-04-06,2015-08-23,362751115
3,aardsda01,1981.0,David,Aardsma,2004-04-06,2015-08-23,417304183
4,aardsda01,1981.0,David,Aardsma,2004-04-06,2015-08-23,451903563


In [36]:
# Create DF with  Years fields

playerYearsDF = playerMergedDF
playerYearsDF["debutYear"] = playerYearsDF["debut"].dt.year
playerYearsDF["finalYear"] = playerYearsDF["finalGame"].dt.year
playerYearsDF["totalYears"] = playerYearsDF.finalYear + 1 - playerYearsDF.debutYear
playerYearsDF = playerYearsDF[["fpID", "playerID", "birthYear", "debutYear", "finalYear", "totalYears"]]
playerYearsDF.head()

Unnamed: 0,fpID,playerID,birthYear,debutYear,finalYear,totalYears
0,266506624,aardsda01,1981.0,2004,2015,12
1,332151624,aardsda01,1981.0,2004,2015,12
2,362751115,aardsda01,1981.0,2004,2015,12
3,417304183,aardsda01,1981.0,2004,2015,12
4,451903563,aardsda01,1981.0,2004,2015,12


In [37]:
# Merge Player Years data with batting data

mergedBatting = pd.merge(combinedStints, playerYearsDF, on = ["fpID"])
mergedBatting.head()

Unnamed: 0,yearID,fpID,stint,G,R,H,HR,BB,IBB,SO,SB,playerID,birthYear,debutYear,finalYear,totalYears
0,1980,1000,1,36,0,0,0,0,0.0,0.0,0.0,barkele01,1955.0,1976,1987,12
1,1981,1000,1,22,0,0,0,0,0.0,0.0,0.0,barkele01,1955.0,1976,1987,12
2,1982,1000,1,33,0,0,0,0,0.0,0.0,0.0,barkele01,1955.0,1976,1987,12
3,1983,1000,3,30,1,1,0,0,0.0,5.0,0.0,barkele01,1955.0,1976,1987,12
4,1984,1000,1,21,2,2,0,6,0.0,19.0,0.0,barkele01,1955.0,1976,1987,12


In [38]:
# Normalize statistics to be per game

# Divide statistics by games played
mergedBatting["RpG"] = mergedBatting.R / mergedBatting.G
mergedBatting["HpG"] = mergedBatting.H / mergedBatting.G
mergedBatting["HRpG"] = mergedBatting.HR / mergedBatting.G
mergedBatting["BBpG"] = (mergedBatting.BB + mergedBatting.IBB) / mergedBatting.G
mergedBatting["SOpG"] = mergedBatting.SO / mergedBatting.G
mergedBatting["SBpG"] = mergedBatting.SB / mergedBatting.G

#Limit to only columns of interest
desiredCols = ["yearID", "fpID", "G","RpG", "HpG", "HRpG", "BBpG", "SOpG", "SBpG",
               "birthYear", "debutYear", "finalYear", "totalYears"]
mergedBatting = mergedBatting[desiredCols]
mergedBatting.head()

Unnamed: 0,yearID,fpID,G,RpG,HpG,HRpG,BBpG,SOpG,SBpG,birthYear,debutYear,finalYear,totalYears
0,1980,1000,36,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,1976,1987,12
1,1981,1000,22,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,1976,1987,12
2,1982,1000,33,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,1976,1987,12
3,1983,1000,30,0.033333,0.033333,0.0,0.0,0.166667,0.0,1955.0,1976,1987,12
4,1984,1000,21,0.095238,0.095238,0.0,0.285714,0.904762,0.0,1955.0,1976,1987,12


In [39]:
# Add age and careerYears columns

mergedBatting["careerYear"] = mergedBatting.yearID + 1 - mergedBatting.debutYear
mergedBatting["age"] = mergedBatting.yearID - mergedBatting.birthYear

mergedBatting.head(5)

Unnamed: 0,yearID,fpID,G,RpG,HpG,HRpG,BBpG,SOpG,SBpG,birthYear,debutYear,finalYear,totalYears,careerYear,age
0,1980,1000,36,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,1976,1987,12,5,25.0
1,1981,1000,22,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,1976,1987,12,6,26.0
2,1982,1000,33,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,1976,1987,12,7,27.0
3,1983,1000,30,0.033333,0.033333,0.0,0.0,0.166667,0.0,1955.0,1976,1987,12,8,28.0
4,1984,1000,21,0.095238,0.095238,0.0,0.285714,0.904762,0.0,1955.0,1976,1987,12,9,29.0


<h4>Now, extract previous year data to be added to DataFrame</h4>

In [40]:
#testing ability to grab desired column value

currentYear = 1984
fpID = 1000

oneYearRow = mergedBatting.loc[(mergedBatting["yearID"] == currentYear - 1) & (mergedBatting["fpID"] == fpID)]
oneYearRpG = oneYearRow["RpG"].values[0]
print(oneYearRpG)


0.03333333333333333


In [41]:
# get a dataframe that will only hold valid inputs 
#  to get historical data for

# get only data from latestDataYear + 2
#  that way we can grab 2 years previous data
validHistoric = mergedBatting.loc[mergedBatting["yearID"] >= latestDataYear + 2]

# get only players who are in at least their 3rd career year and who played at least 4
#  we are using 3 years data to predict a 4th year value
validHistoric = validHistoric.loc[(validHistoric["careerYear"] >= 3) & (validHistoric["totalYears"] >= 4)]

# Get only data prior to 2019 season 
#  we can't supervise a model based of  2019 data becaus ewe don't have 2020 data to check it against
validHistoric = validHistoric.loc[validHistoric["yearID"] < 2019]
validHistoric.sort_values(by = ["yearID"], ascending = False).head()
validHistoric.head(4)

Unnamed: 0,yearID,fpID,G,RpG,HpG,HRpG,BBpG,SOpG,SBpG,birthYear,debutYear,finalYear,totalYears,careerYear,age
2,1982,1000,33,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,1976,1987,12,7,27.0
3,1983,1000,30,0.033333,0.033333,0.0,0.0,0.166667,0.0,1955.0,1976,1987,12,8,28.0
4,1984,1000,21,0.095238,0.095238,0.0,0.285714,0.904762,0.0,1955.0,1976,1987,12,9,29.0
5,1985,1000,20,0.0,0.0,0.0,0.0,0.35,0.0,1955.0,1976,1987,12,10,30.0


In [59]:
# nums = [83,84, 85,86,87, 88, 89, 90, 91, 92, 95, 96, 97]
# lastYear = nums[-1]
# gap = 0
# gapTimes = 0
# for num in nums:
#     print(f"===== {num} =====")
#     if num >= lastYear:
#         continue
#     if gapTimes == 2:
#         print("Executing 2")
#         prev2 = num - 2 - gap
#         prev1 = num - 1 - gap
#         next1 = num + 1
#         gapTimes = 1
#     elif gapTimes == 1:
#         print("Executing 1")
#         prev2 = num - 2 - gap
#         prev1 = num - 1
#         next1 = num + 1
#         gapTimes = 0
#     else:
#         prev2 = num - 2
#         prev1 = num - 1
#         next1 = num + 1
    
#     while next1 not in nums and next1 <lastYear:
#         print(f"Missing {next1}")
#         next1 += 1
#         gapTimes = 2
#         gap = next1 - num - 1
    
#     print(prev2, prev1, num, next1)


In [60]:
# Practice looping though dataframes
sampHistoric = validHistoric.head(10)

# Set up columns for the new data to be entered into
sampHistoric["prevTwo"] = ""
sampHistoric["prevOne"] = ""
sampHistoric["nextOne"] = ""
stat = "HpG"
print("=================")
gap = 0
gapTimes = 0
old_player = 0
for index, row in sampHistoric.iterrows():
    player = row["fpID"]
    if player != old_player:
        print("NEW PLAYER")
        gap = 0
        gapTimes = 0
        old_player = player
    print(f"playerID: {player}")
    yr = row["yearID"]
    print(f"===== {yr} =====")
    finalYear = row["finalYear"]
    if yr >= finalYear:
        continue
    if gapTimes == 2:
        print("Executing 2")
        prevTwoYear = yr - 2 - gap
        prevOneYear = yr - 1 - gap
        nextYear = yr + 1
        gapTimes = 1
    elif gapTimes == 1:
        print("Executing 1")
        prevTwoYear = yr - 2 - gap
        prevOneYear = yr - 1
        nextYear = yr + 1
        gapTimes = 0
    else:
        prevTwoYear = yr - 2
        prevOneYear = yr - 1
        nextYear = yr + 1

    

    prevTwoRow = mergedBatting.loc[(mergedBatting["yearID"] == prevTwoYear)
                                     & (mergedBatting["fpID"] == player)]
    prevOneRow = mergedBatting.loc[(mergedBatting["yearID"] == prevOneYear)
                                             & (mergedBatting["fpID"] == player)]
    nextYearRow = mergedBatting.loc[(mergedBatting["yearID"] == nextYear)
                                             & (mergedBatting["fpID"] == player)]
    while nextYearRow.empty:
        print(f"{nextYear} Missing")
        nextYear += 1
        nextYearRow = mergedBatting.loc[(mergedBatting["yearID"] == nextYear)
                                             & (mergedBatting["fpID"] == player)]
        gapTimes = 2
        gap = nextYear - yr - 1
    
    prevTwoStat = prevTwoRow[stat].values[0]
    prevOneStat = prevOneRow[stat].values[0]    
    nextYearStat = nextYearRow[stat].values[0]    
    
    sampHistoric.at[index, "prevTwo"] = prevTwoStat
    sampHistoric.at[index, "prevOne"] = prevOneStat
    sampHistoric.at[index, "nextOne"] = nextYearStat
    
    print(f"{prevTwoYear}: {prevTwoStat}")
    print(f"{prevOneYear}: {prevOneStat}")
    print(f"{yr}: {sampHistoric.loc[index, stat]}")
    print(f"{nextYear}: {nextYearStat}")
          
    print("=================")

        
sampHistoric.head()


NEW PLAYER
playerID: 1000
===== 1982 =====
1980: 0.0
1981: 0.0
1982: 0.0
1983: 0.03333333333333333
playerID: 1000
===== 1983 =====
1981: 0.0
1982: 0.0
1983: 0.03333333333333333
1984: 0.09523809523809523
playerID: 1000
===== 1984 =====
1982: 0.0
1983: 0.03333333333333333
1984: 0.09523809523809523
1985: 0.0
playerID: 1000
===== 1985 =====
1986 Missing
1983: 0.03333333333333333
1984: 0.09523809523809523
1985: 0.0
1987: 0.0
playerID: 1000
===== 1987 =====
NEW PLAYER
playerID: 1008
===== 1982 =====
1980: 0.7583333333333333
1981: 0.8666666666666667
1982: 0.8050847457627118
1983: 0.9402985074626866
playerID: 1008
===== 1983 =====
1981: 0.8666666666666667
1982: 0.8050847457627118
1983: 0.9402985074626866
1984: 0.7052631578947368
playerID: 1008
===== 1984 =====
1982: 0.8050847457627118
1983: 0.9402985074626866
1984: 0.7052631578947368
1985: 0.6
playerID: 1008
===== 1985 =====
1983: 0.9402985074626866
1984: 0.7052631578947368
1985: 0.6
1986: 0.5625
playerID: 1008
===== 1986 =====
1984: 0.7052631

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,yearID,fpID,G,RpG,HpG,HRpG,BBpG,SOpG,SBpG,birthYear,debutYear,finalYear,totalYears,careerYear,age,prevTwo,prevOne,nextOne
2,1982,1000,33,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,1976,1987,12,7,27.0,0.0,0.0,0.0333333
3,1983,1000,30,0.033333,0.033333,0.0,0.0,0.166667,0.0,1955.0,1976,1987,12,8,28.0,0.0,0.0,0.0952381
4,1984,1000,21,0.095238,0.095238,0.0,0.285714,0.904762,0.0,1955.0,1976,1987,12,9,29.0,0.0,0.0333333,0.0
5,1985,1000,20,0.0,0.0,0.0,0.0,0.35,0.0,1955.0,1976,1987,12,10,30.0,0.0333333,0.0952381,0.0
6,1987,1000,11,0.0,0.0,0.0,0.0,0.0,0.0,1955.0,1976,1987,12,12,32.0,,,
