<h1>TEAM Prediction Machine Learning Models - Batting</h1>

<h4>Import Dependencies</h4>

In [1]:
%matplotlib inline

In [2]:
import os
import csv
import pandas as pd

import sqlite3
import csv
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import numpy as np

from config import pgPassword

<h4>Create a connection to SQL database</h4>

In [3]:
pg_user = 'postgres'
pg_password = pgPassword
db_name = 'baseball_db'

connection_string = f"{pg_user}:{pg_password}@localhost:5432/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')

<h4>Read in databases tables as DatFrames</h4>

In [4]:
teamsStatsDF = pd.read_sql_table("Team-Stats", con = engine)
battingDF = pd.read_sql_table("Batting", con = engine)
pitchingDF = pd.read_sql_table("Pitching", con = engine)
playersDF = pd.read_sql_table("Players", con = engine)
franchisesDF = pd.read_sql_table("Franchises", con = engine)
salariesDF = pd.read_sql_table("Salaries", con = engine)
teamsDF = pd.read_sql_table("Teams", con = engine)
fPlayersDF = pd.read_sql_table("FranchisePlayers", con = engine)

In [5]:
teamsStatsDF.head()

Unnamed: 0,franchiseID,yearID,G,W,R,H,HR,BB,SO,SB,RA,ERA,HA,HRA,BBA,SOA,statID,teamID
0,BNA,1871,31,20,401,426,3,60.0,19.0,73.0,303,3.55,367,2,42,23,1000,BS1-1871
1,CNA,1871,28,19,302,323,10,60.0,22.0,69.0,241,2.76,308,6,28,22,1003,CH1-1871
2,CFC,1871,29,10,249,328,7,26.0,25.0,18.0,341,4.11,346,13,53,34,1008,CL1-1871
3,KEK,1871,19,7,137,178,2,33.0,9.0,16.0,243,5.17,261,5,21,17,1015,FW1-1871
4,NNA,1871,33,16,302,403,1,33.0,15.0,46.0,313,3.72,373,7,42,22,1024,NY2-1871


In [9]:
# function to do the data manipulation steps in TeamBattingML
# dfTeamStats - the teamStatsDF as read from DB
# latesDataYear - the furthest year back you want data from 
# stat - the stat you are trying to predict (winPct)
# reachYears - how many years of historical data you want to use
# model_type
# 
# 
# 

def manipulateTeamDF(dfTeamStats, latestDataYear, stat, reachYears):
    
    # Get data going back to latest year of interest
    targetTeamsDF = teamsStatsDF[teamsStatsDF["yearID"] >=latestDataYear]
    
    # Limit to columns of interest
    colsOfInterst = ["franchiseID", "yearID", "G", "W", "R", "H", "HR", "BB"]
        
        
    targetTeamsDF = targetTeamsDF[colsOfInterst]
    
    # Find first year, last year, and total years of franchise
    teamYearsDF = targetTeamsDF[["franchiseID", "yearID"]]
    teamYearsDF = teamYearsDF.groupby("franchiseID").agg(['min', 'max', 'count'])
    teamYearsDF = teamYearsDF.reset_index()
    teamYearsDF.columns = teamYearsDF.columns.droplevel()
    teamYearsDF = teamYearsDF.rename(columns={"": "franchiseID",
                                              "min": "firstYear",
                                             "max": "lastYear",
                                             "count": "totalYears"})
    # Merge Years Data with target data
    targetTeamsDF = pd.merge(targetTeamsDF, teamYearsDF, on = ["franchiseID"])

    # Create field for yearofTeam
    targetTeamsDF["teamYr"] = targetTeamsDF["yearID"] + 1 - targetTeamsDF["firstYear"]
    
    # Add a winPCT field and make stats per game stats
    targetTeamsDF["winPct"] = targetTeamsDF.W / targetTeamsDF.G
    targetTeamsDF["RpG"] = targetTeamsDF.R / targetTeamsDF.G
    targetTeamsDF["HpG"] = targetTeamsDF.H / targetTeamsDF.G
    targetTeamsDF["HRpG"] = targetTeamsDF.HR / targetTeamsDF.G
    targetTeamsDF["BBpG"] = targetTeamsDF.BB / targetTeamsDF.G

    # Then keep only perGame cols
    perGameCols = ['franchiseID', 'yearID', 'G', 'winPct', 
                   'RpG', 'HpG', 'HRpG', 'BBpG', 'firstYear', 'lastYear', 'totalYears', 'teamYr']
    targetTeamsDF = targetTeamsDF[perGameCols]
    
    
    # Add a column to indicate rows that should be skipped
    #  years < latestDataYear + 2 (we can't get 2 year previous data for these years)
    #  teamYr < 3 (we can't get 2 year previous data for these years)
    #  year = lastYear (we can't use next year to check model)
    #  year = 2019 (last year of our data so )

    targetTeamsDF["skip"] = 0
    for index, row in targetTeamsDF.iterrows():
        if row["yearID"] < (latestDataYear + reachYears):
            targetTeamsDF.at[index, "skip"] = 1
        elif row["teamYr"] < 3:
            targetTeamsDF.at[index, "skip"] = 1
        elif row["yearID"] == row["lastYear"]:
            targetTeamsDF.at[index, "skip"] = 1
        elif row["yearID"] == 2019:
            targetTeamsDF.at[index, "skip"] = 1
            
    # SOrt by Franchise and year - to get all franchise data together
    sortedTeamsDF = targetTeamsDF.sort_values(by = ["franchiseID", "yearID"])
    sortedTeamsDF = sortedTeamsDF.reset_index(drop=True)
    
    # Iterate through the sorted batting and grab previous stats
    mlDF = sortedTeamsDF.copy()
    # playersMLBatting = playersMLBatting.drop(columns=["birthYear", "debuYear"])

    # Make Columns labels based on stat
    p2Label = "p2-" + stat
    p1Label = "p1-" + stat
    f1Label = "f1-" + stat
    # Add those columns to DF
    mlDF[p2Label] = ""
    mlDF[p1Label] = ""
    mlDF[f1Label] = ""

    # Itterate through DF and populate those columns
    for index, row in mlDF.iterrows():
        if row["skip"] == 1:
            continue
        p2Stat = mlDF.iloc[index - 2][stat]
        p1Stat = mlDF.iloc[index - 1][stat]
        f1Stat = mlDF.iloc[index + 1][stat]

        mlDF.at[index, p2Label] = p2Stat
        mlDF.at[index, p1Label] = p1Stat
        mlDF.at[index, f1Label] = f1Stat
            
            
    # # Get rid of the skipped rows, then all complete data
    mlData = mlDF.loc[mlDF['skip'] == 0]
    
    return mlData
    

In [81]:
# Function to Split data and run model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

def run_ML_Model(mlDF, stat, inputs_list):
    mlData = mlDF.loc[mlDF['skip'] == 0]
    p2Label = "p2-" + stat
    p1Label = "p1-" + stat
    f1Label = "f1-" + stat
    
    print("== All Inputs ==")
    inputFactors = [p2Label, p1Label, stat]
    inputFactors.extend(inputs_list)
    print(f"Input Factors: {inputFactors}")
    X = mlData[inputFactors]
    y = mlData[[f1Label]]
    
    # Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
    
    #==============================LinearModel===============
    model_lin = LinearRegression()
    model_lin.fit(X_train, y_train)
    # Calculate the R2 scores
    training_score_lin = model_lin.score(X_train, y_train)
    testing_score_lin = model_lin.score(X_test, y_test)
    # Make Predictions and get MSE
    preds_lin = model_lin.predict(X_test)
    MSE_lin = mean_squared_error(y_test, preds_lin)
    # Get coeffs and Y-int
    coeffs_lin = model_lin.coef_.tolist()[0]
    y_int_lin = model_lin.intercept_.tolist()[0]
    print("===================")
    print("=== Linear ===")
    print(f"{stat}: Training Score: {training_score_lin}")
    print(f"{stat}:Testing Score: {testing_score_lin}")
    print(f"{stat}:MSE: {MSE_lin}")
    print('Weight coefficients: ', coeffs_lin)
    print('y-axis intercept: ', y_int_lin)
    print("===================")
    
    
    #===============================RidgeModel===============
    model_ridge = Ridge(alpha=0.01).fit(X_train, y_train)
    # Calculate the R2 scores
    training_score_ridge= model_ridge.score(X_train, y_train)
    testing_score_ridge = model_ridge.score(X_test, y_test)
    # Make Predictions and get MSE
    preds_ridge = model_ridge.predict(X_test)
    MSE_ridge = mean_squared_error(y_test, preds_ridge)
    # Get coeffs and Y-int
    coeffs_ridge= model_ridge.coef_.tolist()[0]
    y_int_ridge = model_ridge.intercept_.tolist()[0]
    print("===================")
    print("=== Ridge ===")
    print(f"{stat}: Training Score: {training_score_ridge}")
    print(f"{stat}:Testing Score: {testing_score_ridge}")
    print(f"{stat}:MSE: {MSE_ridge}")
    print('Weight coefficients: ', coeffs_ridge)
    print('y-axis intercept: ', y_int_ridge)
    print("===================")
    
    #=================================LassoModel===============
    model_lasso = Lasso(alpha=0.01).fit(X_train, y_train)
    # Calculate the R2 scores
    training_score_lasso= model_lasso.score(X_train, y_train)
    testing_score_lasso = model_lasso.score(X_test, y_test)
    # Make Predictions and get MSE
    preds_lasso = model_lasso.predict(X_test)
    MSE_lasso = mean_squared_error(y_test, preds_lasso)
    # Get coeffs and Y-int
    coeffs_lasso= model_lasso.coef_.tolist()[0]
    y_int_lasso = model_lasso.intercept_.tolist()[0]
    print("===================")
    print("=== Lasso ===")
    print(f"{stat}: Training Score: {training_score_lasso}")
    print(f"{stat}:Testing Score: {testing_score_lasso}")
    print(f"{stat}:MSE: {MSE_lasso}")
    print('Weight coefficients: ', coeffs_lasso)
    print('y-axis intercept: ', y_int_lasso)
    print("===================")
    
    #================================ElasticNetModel===============
    model_eNet = ElasticNet(alpha=.01, l1_ratio=0.001).fit(X_train, y_train)
    # Calculate the R2 scores
    training_score_eNet= model_eNet.score(X_train, y_train)
    testing_score_eNet = model_eNet.score(X_test, y_test)
    # Make Predictions and get MSE
    preds_eNet = model_eNet.predict(X_test)
    MSE_eNet = mean_squared_error(y_test, preds_eNet)
    # Get coeffs and Y-int
    coeffs_eNet= model_eNet.coef_.tolist()[0]
    y_int_eNet = model_eNet.intercept_.tolist()[0]
    print("===================")
    print("=== Elastic Net ===")
    print(f"{stat}: Training Score: {training_score_eNet}")
    print(f"{stat}:Testing Score: {testing_score_eNet}")
    print(f"{stat}:MSE: {MSE_eNet}")
    print('Weight coefficients: ', coeffs_eNet)
    print('y-axis intercept: ', y_int_eNet)
    print("===================")
    
#     print("== Limited Inputs ==")
#     inputFactors2 = [p2Label, p1Label, stat]
#     X2 = mlData[inputFactors2]
#     y2 = mlData[[f1Label]]
    
#     # Split Data
#     X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=4)
    
#     #LinearModel===============
#     model_lin2 = LinearRegression()
#     model_lin2.fit(X_train2, y_train2)
#     # Calculate the R2 scores
#     training_score_lin2 = model_lin2.score(X_train2, y_train2)
#     testing_score_lin2 = model_lin2.score(X_test2, y_test2)
#     # Get coeffs and Y-int
#     coeffs_lin2 = model_lin2.coef_.tolist()[0]
#     y_int_lin2 = model_lin2.intercept_.tolist()[0]
#     print("===================")
#     print("=== Linear ===")
#     print(f"{stat}: Training Score: {training_score_lin2}")
#     print(f"{stat}:Testing Score: {testing_score_lin2}")
#     print('Weight coefficients: ', coeffs_lin2)
#     print('y-axis intercept: ', y_int_lin2)
#     print("===================")

    
    
    
#     inputFactors.append("Y-Int")
    
#     coeffs_lin.append(y_int_lin)
#     mlFormulaLin = {'Stat': stat,
#                  'Input': inputFactors,
#                  'Coeff': coeffs_lin,
#                 'TrainingScore': training_score_lin,
#                 'TestingScore': testing_score_lin}
    
#     coeffs_log.append(y_int_log)
#     mlFormulaLog = {'Stat': stat,
#                  'Input': inputFactors,
#                  'Coeff': coeffs_log,
#                 'TrainingScore': training_score_log,
#                 'TestingScore': testing_score_log}
    
#     mlFormulaDF = pd.DataFrame(rpgFormula)
    
    return (model_lin, model_ridge, model_lasso, model_eNet)

In [74]:
# Comment after Executing

# teamBatting2 = manipulateTeamDF(teamsStatsDF, 1980, "winPct", 2)

In [112]:
teamBatting2Model = run_ML_Model(teamBatting2, "winPct", ["RpG", "HpG", "HRpG", "BBpG"])

== All Inputs ==
Input Factors: ['p2-winPct', 'p1-winPct', 'winPct', 'RpG', 'HpG', 'HRpG', 'BBpG']
=== Linear ===
winPct: Training Score: 0.24124859279057254
winPct:Testing Score: 0.23584436142379883
winPct:MSE: 0.003462748106293515
Weight coefficients:  [-0.031475568606866206, 0.16124763110588788, 0.38644804327879556, -0.016006263130392513, 0.011358935547305627, 0.021184565268899183, 0.013636137999370187]
y-axis intercept:  0.14719427731191448
=== Ridge ===
winPct: Training Score: 0.2412469633883173
winPct:Testing Score: 0.23574546269851115
winPct:MSE: 0.003463196262868454
Weight coefficients:  [-0.03098626667233266, 0.16107699428679872, 0.38492774472541963, -0.015840737640179552, 0.01130970539387582, 0.02109692048628129, 0.013633865002010189]
y-axis intercept:  0.14757352381209304
=== Lasso ===
winPct: Training Score: 0.007050767558712812
winPct:Testing Score: -0.0027513418916633725
winPct:MSE: 0.004543937301422379
Weight coefficients:  0.0
y-axis intercept:  0.4909847682648249
=== E

In [76]:
# Comment after Executing

# teamBatting3 = manipulateTeamDF(teamsStatsDF, 1980, "winPct", 3)

In [None]:
# teamBatting3Model = run_ML_Model(teamBatting3, "winPct", ["RpG", "HRpG"])

In [24]:
# Comment after executing

# teamBatting5 = manipulateTeamDF(teamsStatsDF, 1980, "winPct", 5)

In [None]:
# teamBatting5Model = run_ML_Model(teamBatting5, "winPct", ["RpG", "HRpG"])

In [54]:
new5DF = teamBatting5.copy()
new5DF["prediction"] = ""
for index, row in new5DF.iterrows():
    rowSelected = new5DF.loc[[index]]
    pred = teamBatting5Model[0].predict(rowSelected[["p2-winPct", "p1-winPct", "winPct","RpG", "HpG", "HRpG", "BBpG"]])
    
#     newBBpGDF.at[index, "predRpG"] = predR
#     newBBpGDF.at[index, "predHpG"] = predH
#     newBBpGDF.at[index, "predHRpG"] = predHR
    new5DF.at[index, "prediction"] = pred[0][0]
    
new5DF.head()

Unnamed: 0,franchiseID,yearID,G,winPct,RpG,HpG,HRpG,BBpG,firstYear,lastYear,totalYears,teamYr,skip,p2-winPct,p1-winPct,f1-winPct,prediction
5,ANA,1985,162,0.555556,4.518519,8.419753,0.944444,4.0,1980,2019,40,6,0,0.432099,0.5,0.567901,0.530443
6,ANA,1986,162,0.567901,4.851852,8.561728,1.030864,4.141975,1980,2019,40,7,0,0.5,0.555556,0.462963,0.539874
7,ANA,1987,162,0.462963,4.753086,8.679012,1.061728,3.641975,1980,2019,40,8,0,0.555556,0.567901,0.462963,0.493804
8,ANA,1988,162,0.462963,4.407407,9.0,0.765432,2.895062,1980,2019,40,9,0,0.567901,0.462963,0.561728,0.468723
9,ANA,1989,162,0.561728,4.12963,8.777778,0.895062,2.648148,1980,2019,40,10,0,0.462963,0.462963,0.493827,0.513246


In [108]:
def makePredsTabel(df, inputs_list):
    newDF = df.copy()
    # Create a column to hold prediction values
    newDF["model"] = ""
    
    #Creat columns lists - for Dicts that will then be converted to DFs
    linearYears = []
    linearTeams = []
    linearActual = []
    linearModel = []
    linearType = []
    
    ridgeYears = []
    ridgeTeams = []
    ridgeActual = []
    ridgeModel = []
    ridgeType = []
    
    lassoYears = []
    lassoTeams = []
    lassoActual = []
    lassoModel = []
    lassoType = []
    
    eNetYears = []
    eNetTeams = []
    eNetActual = []
    eNetModel = []
    eNetType = []
    
    # Loop through df and make predictions and update columns lists
    inputFactors = ["p2-winPct", "p1-winPct", "winPct"]
    inputFactors.extend(inputs_list)
    for index, row in newDF.iterrows():
        yr = row["yearID"]
        team = row["franchiseID"]
        actual = row["winPct"]
        
        rowSelected = newDF.loc[[index]]

        lin_pred = teamBatting2Model[0].predict(rowSelected[inputFactors])[0][0]
        linearYears.append(yr)
        linearTeams.append(team + "-" + str(yr))
        linearActual.append(actual)
        linearModel.append(lin_pred)
        linearType.append("ML-LN-T1")
        
        ridge_pred = teamBatting2Model[1].predict(rowSelected[inputFactors])[0][0]
        ridgeYears.append(yr)
        ridgeTeams.append(team + "-" + str(yr))
        ridgeActual.append(actual)
        ridgeModel.append(ridge_pred)
        ridgeType.append("ML-RD-T1")
        
        lasso_pred = teamBatting2Model[2].predict(rowSelected[inputFactors])[0]
        lassoYears.append(yr)
        lassoTeams.append(team + "-" + str(yr))
        lassoActual.append(actual)
        lassoModel.append(lasso_pred)
        lassoType.append("ML-LS-T1")
        lasso_type = "ML-LS-T1"
        
        eNet_pred = teamBatting2Model[3].predict(rowSelected[inputFactors])[0]    
        eNetYears.append(yr)
        eNetTeams.append(team + "-" + str(yr))
        eNetActual.append(actual)
        eNetModel.append(eNet_pred)
        eNetType.append("ML-EN-T1")
        
    # Make dictionaries with the columns created
    linDict = {"yearID": linearYears,
              "teamID": linearTeams,
              "actual": linearActual,
              "model": linearModel,
              "model_type": linearType}
    linDF = pd.DataFrame.from_dict(linDict)
    
    ridgeDict = {"yearID": ridgeYears,
              "teamID": ridgeTeams,
              "actual": ridgeActual,
              "model": ridgeModel,
              "model_type": ridgeType}
    ridgeDF = pd.DataFrame.from_dict(ridgeDict)
    
    lassoDict = {"yearID": lassoYears,
              "teamID": lassoTeams,
              "actual": lassoActual,
              "model": lassoModel,
              "model_type": lassoType}
    lassoDF = pd.DataFrame.from_dict(lassoDict)
    
    eNetDict = {"yearID": eNetYears,
              "teamID": eNetTeams,
              "actual": eNetActual,
              "model": eNetModel,
              "model_type": eNetType}
    eNetDF = pd.DataFrame.from_dict(eNetDict)
    
    fullDF = linDF.append(ridgeDF)
    fullDF = fullDF.append(lassoDF)
    fullDF = fullDF.append(eNetDF)
    
    return fullDF

In [113]:
fullDFTry = makePredsTabel(teamBatting2, ["RpG", "HpG", "HRpG", "BBpG"])
fullDFTry = fullDFTry.sort_values(by=['teamID', "model_type"])
fullDFTry.head()

Unnamed: 0,yearID,teamID,actual,model,model_type
0,1982,ANA-1982,0.574074,0.520139,ML-EN-T1
0,1982,ANA-1982,0.574074,0.53295,ML-LN-T1
0,1982,ANA-1982,0.574074,0.49953,ML-LS-T1
0,1982,ANA-1982,0.574074,0.532837,ML-RD-T1
1,1983,ANA-1983,0.432099,0.493408,ML-EN-T1


In [116]:
savePath = os.path.join("..", "..", "data", "csv", "outputData", "teamPredictions.csv")
fullDFTry.to_csv(savePath)

In [115]:
teamBatting2.shape

(1048, 16)

In [60]:
compareDF = new5DF[['franchiseID', 'yearID', 'G', 'winPct', 'prediction']]
compareDF["wins"] = compareDF['winPct'] * compareDF['G']
compareDF["predWins"] = compareDF['prediction'] * compareDF['G']
compareDF["Difference"] = compareDF["wins"] - compareDF["predWins"]
compareDF["sqErr"] = compareDF["Difference"] **2
compareDF.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

Unnamed: 0,franchiseID,yearID,G,winPct,prediction,wins,predWins,Difference,sqErr
5,ANA,1985,162,0.555556,0.530443,90.0,85.9317,4.06827,16.5508
6,ANA,1986,162,0.567901,0.539874,92.0,87.4597,4.54034,20.6147
7,ANA,1987,162,0.462963,0.493804,75.0,79.9962,-4.99625,24.9625
8,ANA,1988,162,0.462963,0.468723,75.0,75.9331,-0.933087,0.870652
9,ANA,1989,162,0.561728,0.513246,91.0,83.1458,7.8542,61.6885


In [61]:

compareDF1 = compareDF.convert_dtypes()
compareDF1.dtypes
maxMiss
maxMiss = compareDF1.groupby("G").agg({"sqErr": 'mean'})
maxMiss.head(11)

Unnamed: 0_level_0,sqErr
G,Unnamed: 1_level_1
112,19.411724
113,18.688334
114,23.679196
115,17.752332
117,24.027771
143,11.83666
144,35.293769
145,15.933775
160,61.71279
161,39.153015
