<h1>TEAM Prediction Machine Learning Models - Batting</h1>

<h4>Import Dependencies</h4>

In [2]:
%matplotlib inline

In [3]:
import os
import csv
import pandas as pd

import sqlite3
import csv
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import numpy as np

from config import pgPassword

<h4>Create a connection to SQL database</h4>

In [4]:
pg_user = 'postgres'
pg_password = pgPassword
db_name = 'baseball_db'

connection_string = f"{pg_user}:{pg_password}@localhost:5432/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')

<h4>Read in databases tables as DatFrames</h4>

In [5]:
teamsStatsDF = pd.read_sql_table("Team-Stats", con = engine)
battingDF = pd.read_sql_table("Batting", con = engine)
pitchingDF = pd.read_sql_table("Pitching", con = engine)
playersDF = pd.read_sql_table("Players", con = engine)
franchisesDF = pd.read_sql_table("Franchises", con = engine)
salariesDF = pd.read_sql_table("Salaries", con = engine)
teamsDF = pd.read_sql_table("Teams", con = engine)
fPlayersDF = pd.read_sql_table("FranchisePlayers", con = engine)

In [6]:
teamsStatsDF.head()
teamsStatsDF.shape
teams1980 = teamsStatsDF[teamsStatsDF["yearID"] >=1980]
teams1980.head()

Unnamed: 0,franchiseID,yearID,G,W,R,H,HR,BB,SO,SB,RA,ERA,HA,HRA,BBA,SOA,statID,teamID
1787,ATL,1980,161,81,630,1352,144,434.0,899.0,73.0,660,3.77,1397,131,454,696,3197943,ATL-1980
1788,BAL,1980,162,100,805,1523,156,587.0,766.0,111.0,640,3.64,1438,134,507,789,3201520,BAL-1980
1789,BOS,1980,160,83,757,1588,162,475.0,720.0,79.0,767,4.38,1557,129,481,696,3205099,BOS-1980
1790,ANA,1980,160,65,698,1442,106,539.0,889.0,91.0,797,4.52,1548,141,529,725,3208680,CAL-1980
1791,CHW,1980,162,70,587,1408,91,399.0,670.0,68.0,722,3.92,1434,108,563,724,3212263,CHA-1980


In [7]:
# function to do the data manipulation steps in TeamBattingML
# dfTeamStats - the teamStatsDF as read from DB
# latesDataYear - the furthest year back you want data from 
# stat - the stat you are trying to predict (winPct)
# reachYears - how many years of historical data you want to use
 

def manipulateTeamDF(dfTeamStats, latestDataYear, stat, reachYears):
    
    # Get data going back to latest year of interest
    targetTeamsDF = teamsStatsDF[teamsStatsDF["yearID"] >=latestDataYear]
    
    # Limit to columns of interest
    colsOfInterst = ["franchiseID", "yearID","teamID", "G", "W", "R", "H", "HR", "BB"]
    targetTeamsDF = targetTeamsDF[colsOfInterst]
    
    # add a teams only column
    targetTeamsDF["teamOnly"] = ""
    for index, row in targetTeamsDF.iterrows():
        teamID = row["teamID"]
        teamOnly = teamID.split("-")[0]
        targetTeamsDF.at[index, "teamOnly"] = teamOnly
    
    # Find first year, last year, and total years of franchise
    teamYearsDF = targetTeamsDF[["teamOnly", "yearID"]]
    teamYearsDF = teamYearsDF.groupby("teamOnly").agg(['min', 'max', 'count'])
    teamYearsDF = teamYearsDF.reset_index()
    teamYearsDF.columns = teamYearsDF.columns.droplevel()
    teamYearsDF = teamYearsDF.rename(columns={"": "teamOnly",
                                              "min": "firstYear",
                                             "max": "lastYear",
                                             "count": "totalYears"})
    # Merge Years Data with target data
    targetTeamsDF = pd.merge(targetTeamsDF, teamYearsDF, on = ["teamOnly"])

    # Create field for yearofTeam
    targetTeamsDF["teamYr"] = targetTeamsDF["yearID"] + 1 - targetTeamsDF["firstYear"]
    
    # Add a winPCT field and make stats per game stats
    targetTeamsDF["winPct"] = targetTeamsDF.W / targetTeamsDF.G
    targetTeamsDF["RpG"] = targetTeamsDF.R / targetTeamsDF.G
    targetTeamsDF["HpG"] = targetTeamsDF.H / targetTeamsDF.G
    targetTeamsDF["HRpG"] = targetTeamsDF.HR / targetTeamsDF.G
    targetTeamsDF["BBpG"] = targetTeamsDF.BB / targetTeamsDF.G

    # Then keep only perGame cols
    perGameCols = ['franchiseID', 'yearID', 'teamID', 'G', 'winPct', 
                   'RpG', 'HpG', 'HRpG', 'BBpG', 'firstYear', 'lastYear', 'totalYears', 'teamYr']
    targetTeamsDF = targetTeamsDF[perGameCols]
    
    
    # Add a column to indicate rows that should be skipped
    #  years < latestDataYear + 2 (we can't get 2 year previous data for these years)
    #  teamYr < 3 (we can't get 2 year previous data for these years)
    #  year = lastYear (we can't use next year to check model)
    #  year = 2019 (last year of our data so )

    targetTeamsDF["skip"] = 0
    for index, row in targetTeamsDF.iterrows():
        if row["yearID"] < (latestDataYear + reachYears):
            targetTeamsDF.at[index, "skip"] = 1
        elif row["teamYr"] < 3:
            targetTeamsDF.at[index, "skip"] = 1
        elif row["yearID"] == row["lastYear"]:
            targetTeamsDF.at[index, "skip"] = 1
        elif row["yearID"] == 2019:
            targetTeamsDF.at[index, "skip"] = 2
            
    # SOrt by teamID (team and year) - to get all franchise data together
    sortedTeamsDF = targetTeamsDF.sort_values(by = ["teamID"])
    sortedTeamsDF = sortedTeamsDF.reset_index(drop=True)
    
    # Iterate through the sorted batting and grab previous stats
    mlDF = sortedTeamsDF.copy()
    # playersMLBatting = playersMLBatting.drop(columns=["birthYear", "debuYear"])

    # Make Columns labels based on stat
    p2Label = "p2-" + stat
    p1Label = "p1-" + stat
    f1Label = "f1-" + stat
    # Add those columns to DF
    mlDF[p2Label] = ""
    mlDF[p1Label] = ""
    mlDF[f1Label] = ""

    # Itterate through DF and populate those columns
    for index, row in mlDF.iterrows():
        if row["skip"] == 1:
            continue
        elif row["skip"] == 2:
            p2Stat = mlDF.iloc[index - 2][stat]
            p1Stat = mlDF.iloc[index - 1][stat]
            
            mlDF.at[index, p2Label] = p2Stat
            mlDF.at[index, p1Label] = p1Stat
        else:    
            p2Stat = mlDF.iloc[index - 2][stat]
            p1Stat = mlDF.iloc[index - 1][stat]
            f1Stat = mlDF.iloc[index + 1][stat]

            mlDF.at[index, p2Label] = p2Stat
            mlDF.at[index, p1Label] = p1Stat
            mlDF.at[index, f1Label] = f1Stat
            
            
    # # Get rid of the skipped rows, then all complete data
    mlData = mlDF.loc[mlDF['skip'] == 0]
    
    return mlData
    return mlDF
    

In [11]:
def makePredsTabel(df, inputs_list):
    newDF = df.copy()
    # Create a column to hold prediction values
    newDF["model"] = ""
    
    #Creat columns lists - for Dicts that will then be converted to DFs
    linearYears = []
    linearTeams = []
    linearActual = []
    linearModel = []
    linearType = []
    
    ridgeYears = []
    ridgeTeams = []
    ridgeActual = []
    ridgeModel = []
    ridgeType = []
    
    lassoYears = []
    lassoTeams = []
    lassoActual = []
    lassoModel = []
    lassoType = []
    
    eNetYears = []
    eNetTeams = []
    eNetActual = []
    eNetModel = []
    eNetType = []
    
    # Loop through df and make predictions and update columns lists
    inputFactors = ["p2-winPct", "p1-winPct", "winPct"]
    inputFactors.extend(inputs_list)
    for index, row in newDF.iterrows():
        yr = row["yearID"]
        team = row["teamID"]
        actual = row["winPct"]
        
        rowSelected = newDF.loc[[index]]

        lin_pred = teamBatting2Model[0].predict(rowSelected[inputFactors])[0][0]
        linearYears.append(yr)
        linearTeams.append(team)
        linearActual.append(actual)
        linearModel.append(lin_pred)
        linearType.append("ML-LN-T1")
        
        ridge_pred = teamBatting2Model[1].predict(rowSelected[inputFactors])[0][0]
        ridgeYears.append(yr)
        ridgeTeams.append(team)
        ridgeActual.append(actual)
        ridgeModel.append(ridge_pred)
        ridgeType.append("ML-RD-T1")
        
        lasso_pred = teamBatting2Model[2].predict(rowSelected[inputFactors])[0]
        lassoYears.append(yr)
        lassoTeams.append(team)
        lassoActual.append(actual)
        lassoModel.append(lasso_pred)
        lassoType.append("ML-LS-T1")
        lasso_type = "ML-LS-T1"
        
        eNet_pred = teamBatting2Model[3].predict(rowSelected[inputFactors])[0]    
        eNetYears.append(yr)
        eNetTeams.append(team)
        eNetActual.append(actual)
        eNetModel.append(eNet_pred)
        eNetType.append("ML-EN-T1")
        
    # Make dictionaries with the columns created
    linDict = {"yearID": linearYears,
              "teamID": linearTeams,
              "actual": linearActual,
              "model": linearModel,
              "model_type": linearType}
    linDF = pd.DataFrame.from_dict(linDict)
    
    ridgeDict = {"yearID": ridgeYears,
              "teamID": ridgeTeams,
              "actual": ridgeActual,
              "model": ridgeModel,
              "model_type": ridgeType}
    ridgeDF = pd.DataFrame.from_dict(ridgeDict)
    
    lassoDict = {"yearID": lassoYears,
              "teamID": lassoTeams,
              "actual": lassoActual,
              "model": lassoModel,
              "model_type": lassoType}
    lassoDF = pd.DataFrame.from_dict(lassoDict)
    
    eNetDict = {"yearID": eNetYears,
              "teamID": eNetTeams,
              "actual": eNetActual,
              "model": eNetModel,
              "model_type": eNetType}
    eNetDF = pd.DataFrame.from_dict(eNetDict)
    
    fullDF = linDF.append(ridgeDF)
    fullDF = fullDF.append(lassoDF)
    fullDF = fullDF.append(eNetDF)
    
    return fullDF

In [12]:
# Function to Split data and run model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

def run_ML_Model(mlDF, stat, inputs_list):
    mlData = mlDF.loc[mlDF['skip'] == 0]
    p2Label = "p2-" + stat
    p1Label = "p1-" + stat
    f1Label = "f1-" + stat
    
    print("== All Inputs ==")
    inputFactors = [p2Label, p1Label, stat]
    inputFactors.extend(inputs_list)
    print(f"Input Factors: {inputFactors}")
    X = mlData[inputFactors]
    y = mlData[[f1Label]]
    
    # Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
    
    #==============================LinearModel===============
    model_lin = LinearRegression()
    model_lin.fit(X_train, y_train)
    # Calculate the R2 scores
    training_score_lin = model_lin.score(X_train, y_train)
    testing_score_lin = model_lin.score(X_test, y_test)
    # Make Predictions and get MSE
    preds_lin = model_lin.predict(X_test)
    MSE_lin = mean_squared_error(y_test, preds_lin)
    # Get coeffs and Y-int
    coeffs_lin = model_lin.coef_.tolist()[0]
    y_int_lin = model_lin.intercept_.tolist()[0]
    print("===================")
    print("=== Linear ===")
    print(f"{stat}: Training Score: {training_score_lin}")
    print(f"{stat}:Testing Score: {testing_score_lin}")
    print(f"{stat}:MSE: {MSE_lin}")
    print('Weight coefficients: ', coeffs_lin)
    print('y-axis intercept: ', y_int_lin)
    print("===================")
    
    
    #===============================RidgeModel===============
    model_ridge = Ridge(alpha=0.01).fit(X_train, y_train)
    # Calculate the R2 scores
    training_score_ridge= model_ridge.score(X_train, y_train)
    testing_score_ridge = model_ridge.score(X_test, y_test)
    # Make Predictions and get MSE
    preds_ridge = model_ridge.predict(X_test)
    MSE_ridge = mean_squared_error(y_test, preds_ridge)
    # Get coeffs and Y-int
    coeffs_ridge= model_ridge.coef_.tolist()[0]
    y_int_ridge = model_ridge.intercept_.tolist()[0]
    print("===================")
    print("=== Ridge ===")
    print(f"{stat}: Training Score: {training_score_ridge}")
    print(f"{stat}:Testing Score: {testing_score_ridge}")
    print(f"{stat}:MSE: {MSE_ridge}")
    print('Weight coefficients: ', coeffs_ridge)
    print('y-axis intercept: ', y_int_ridge)
    print("===================")
    
    #=================================LassoModel===============
    model_lasso = Lasso(alpha=0.01).fit(X_train, y_train)
    # Calculate the R2 scores
    training_score_lasso= model_lasso.score(X_train, y_train)
    testing_score_lasso = model_lasso.score(X_test, y_test)
    # Make Predictions and get MSE
    preds_lasso = model_lasso.predict(X_test)
    MSE_lasso = mean_squared_error(y_test, preds_lasso)
    # Get coeffs and Y-int
    coeffs_lasso= model_lasso.coef_.tolist()[0]
    y_int_lasso = model_lasso.intercept_.tolist()[0]
    print("===================")
    print("=== Lasso ===")
    print(f"{stat}: Training Score: {training_score_lasso}")
    print(f"{stat}:Testing Score: {testing_score_lasso}")
    print(f"{stat}:MSE: {MSE_lasso}")
    print('Weight coefficients: ', coeffs_lasso)
    print('y-axis intercept: ', y_int_lasso)
    print("===================")
    
    #================================ElasticNetModel===============
    model_eNet = ElasticNet(alpha=.01, l1_ratio=0.001).fit(X_train, y_train)
    # Calculate the R2 scores
    training_score_eNet= model_eNet.score(X_train, y_train)
    testing_score_eNet = model_eNet.score(X_test, y_test)
    # Make Predictions and get MSE
    preds_eNet = model_eNet.predict(X_test)
    MSE_eNet = mean_squared_error(y_test, preds_eNet)
    # Get coeffs and Y-int
    coeffs_eNet= model_eNet.coef_.tolist()[0]
    y_int_eNet = model_eNet.intercept_.tolist()[0]
    print("===================")
    print("=== Elastic Net ===")
    print(f"{stat}: Training Score: {training_score_eNet}")
    print(f"{stat}:Testing Score: {testing_score_eNet}")
    print(f"{stat}:MSE: {MSE_eNet}")
    print('Weight coefficients: ', coeffs_eNet)
    print('y-axis intercept: ', y_int_eNet)
    print("===================")
    
    
    return (model_lin, model_ridge, model_lasso, model_eNet)

In [13]:
# Comment after Executing

teamBatting2 = manipulateTeamDF(teamsStatsDF, 1980, "winPct", 2)
teamBatting2.head()

Unnamed: 0,franchiseID,yearID,teamID,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr,skip,p2-winPct,p1-winPct,f1-winPct
2,ANA,1999,ANA-1999,162,0.432099,4.388889,8.666667,0.975309,3.154321,1997,2004,8,3,0,0.518519,0.524691,0.506173
3,ANA,2000,ANA-2000,162,0.506173,5.333333,9.716049,1.45679,3.753086,1997,2004,8,4,0,0.524691,0.432099,0.462963
4,ANA,2001,ANA-2001,162,0.462963,4.265432,8.932099,0.975309,3.049383,1997,2004,8,5,0,0.432099,0.506173,0.611111
5,ANA,2002,ANA-2002,162,0.611111,5.253086,9.895062,0.938272,2.851852,1997,2004,8,6,0,0.506173,0.462963,0.475309
6,ANA,2003,ANA-2003,162,0.475309,4.54321,9.092593,0.925926,2.938272,1997,2004,8,7,0,0.462963,0.611111,0.567901


In [14]:
teamBatting2Model = run_ML_Model(teamBatting2, "winPct", ["RpG", "HpG", "HRpG", "BBpG"])

== All Inputs ==
Input Factors: ['p2-winPct', 'p1-winPct', 'winPct', 'RpG', 'HpG', 'HRpG', 'BBpG']
=== Linear ===
winPct: Training Score: 0.25820273325126775
winPct:Testing Score: 0.18260436816470915
winPct:MSE: 0.0038878236296913176
Weight coefficients:  [-0.0810267000425587, 0.14381800079186033, 0.3980238799363427, -0.01463093975850033, 0.00847863490108637, 0.026260447619041335, 0.017738620927386164]
y-axis intercept:  0.17597878123964478
=== Ridge ===
winPct: Training Score: 0.2582007949792464
winPct:Testing Score: 0.18262078447544217
winPct:MSE: 0.003887745547893111
Weight coefficients:  [-0.08032772881931927, 0.14367997549387979, 0.3963760581540223, -0.01444987875544191, 0.008417001501874343, 0.02617151991826452, 0.017733347692016885]
y-axis intercept:  0.17635408319657803
=== Lasso ===
winPct: Training Score: 0.019551545081380683
winPct:Testing Score: 0.006771637548217413
winPct:MSE: 0.0047241464803885365
Weight coefficients:  0.0
y-axis intercept:  0.4791819595691222
=== Elastic

In [15]:
fullDFTry = makePredsTabel(teamBatting2, ["RpG", "HpG", "HRpG", "BBpG"])
fullDFTry = fullDFTry.sort_values(by=['teamID', "model_type"])
fullDFTry.head(20)

Unnamed: 0,yearID,teamID,actual,model,model_type
0,1999,ANA-1999,0.432099,0.489055,ML-EN-T1
0,1999,ANA-1999,0.432099,0.472244,ML-LN-T1
0,1999,ANA-1999,0.432099,0.498932,ML-LS-T1
0,1999,ANA-1999,0.432099,0.472354,ML-RD-T1
1,2000,ANA-2000,0.506173,0.524125,ML-EN-T1
1,2000,ANA-2000,0.506173,0.506255,ML-LN-T1
1,2000,ANA-2000,0.506173,0.503182,ML-LS-T1
1,2000,ANA-2000,0.506173,0.506321,ML-RD-T1
2,2001,ANA-2001,0.462963,0.486693,ML-EN-T1
2,2001,ANA-2001,0.462963,0.491063,ML-LN-T1


In [16]:
savePath = os.path.join("..", "..", "data", "csv", "models", "teamPredictions.csv")
fullDFTry.to_csv(savePath)