# Reading in Polars and Data

In [1]:
import polars as pl
pl.Config.with_columns_kwargs = True

In [2]:
batting = pl.read_csv('Data/lahman baseball data/Batting.csv')
pitching = pl.read_csv('Data/lahman baseball data/Pitching.csv')
fielding = pl.read_csv('Data/lahman baseball data/Fielding.csv')
awards = pl.read_csv('Data/lahman baseball data/AwardsPlayers.csv')
salaries = pl.read_csv('Data/lahman baseball data/Salaries.csv')

# Gold Glove Data Set

In [3]:
'case is player, year, pos, league'

'case is player, year, pos, league'

In [4]:
fielding_awards = (fielding
.filter((pl.col('yearID') >= 2013))
.join(
     awards.rename({"notes": "POS"}) 
           .select(['playerID', 'yearID', 'awardID', 'lgID', 'POS'])  
           .filter((pl.col('awardID') == 'Gold Glove') & (pl.col('yearID') >= 2013)),
      on=['playerID', 'yearID', 'lgID'],
      how='left'
     )
.group_by(['playerID','yearID', 'lgID', 'POS', 'awardID'])
.agg(
     pl.col('InnOuts').sum().alias('InnOuts'),
     pl.col('PO').sum().alias('PO'),
     pl.col('A').sum().alias('A'),
     pl.col('E').sum().alias('E'),
     pl.col('DP').sum().alias('DP'),
     pl.col('PB').cast(int).sum().alias('PB'),
     pl.col('WP').cast(int).sum().alias('WP'),
     pl.col('SB').cast(int).sum().alias('SB'),
     pl.col('CS').cast(int).sum().alias('CS'),
     pl.col('ZR').cast(int).sum().alias('ZR') 
    )    
.with_columns(
        pl.when(pl.col('awardID').is_null())
          .then(pl.lit('No'))
          .otherwise(pl.lit('Yes'))
          .alias('Gold Glove?'),
         pl.when(pl.col('yearID') == 2023)
          .then(pl.lit('Validation'))
          .otherwise(pl.lit('Training'))
          .alias('Training-Validation')
              )
.drop(pl.col('awardID'))
                  )
# fielding_awards = fielding_awards.filter(pl.col('Gold Glove?') == 'Yes', pl.col('POS') == 'C', pl.col('yearID') == 2017)
fielding_awards.write_csv('data/fielding_awards.csv')

fielding_awards

playerID,yearID,lgID,POS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR,Gold Glove?,Training-Validation
str,i64,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str
"""rogerjo01""",2021,"""NL""","""P""",107,0,4,0,1,0,0,0,0,0,"""No""","""Training"""
"""rominau01""",2014,"""AL""","""C""",45,14,2,0,2,0,0,0,1,0,"""No""","""Training"""
"""correca01""",2023,"""AL""","""SS""",3433,166,278,6,56,0,0,0,0,0,"""No""","""Validation"""
"""gimench01""",2014,"""AL""","""1B""",165,39,1,0,5,0,0,0,0,0,"""No""","""Training"""
"""negrokr01""",2018,"""AL""","""3B""",78,2,3,0,0,0,0,0,0,0,"""No""","""Training"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""martimi02""",2017,"""AL""","""OF""",29,3,0,0,0,0,0,0,0,0,"""No""","""Training"""
"""mahtomi01""",2019,"""AL""","""OF""",177,16,0,2,0,0,0,0,0,0,"""No""","""Training"""
"""coveydy01""",2018,"""AL""","""P""",365,9,14,4,2,0,0,0,0,0,"""No""","""Training"""
"""speasal01""",2023,"""AL""","""P""",6,0,0,0,0,0,0,0,0,0,"""No""","""Validation"""


# Salary Dataset

In [5]:
'case is player year' 'salary, batting, fielding, pitching'

'case is player yearsalary, batting, fielding, pitching'

In [6]:
salaries_final = (salaries
.filter(pl.col('yearID') >= 2006)
.join(fielding
        .filter(pl.col('yearID') >= 2006),
        on=['playerID', 'yearID', 'lgID', 'teamID'],
        how='left'
     )
.join(batting
        .rename({"CS": "CS_batting", "SB": "SB_Batting"})
        .filter(pl.col('yearID') >= 2006),
        on=['playerID', 'yearID', 'teamID', 'stint', 'lgID', 'G'],
        how='left',
     )
.join(pitching
        .rename({
            "GIDP": "GIDP_pitching", "H": "H_pitching", "HR": "HR_pitching", "BB": "BB_pitching",
            "SO": "SO_pitching", "IBB": "IBB_pitching", "WP": "WP_pitching", "HBP": "HBP_pitching",
            "SH": "SH_pitching", "SF": "SF_pitching", "R": "R_pitching", "GIDP": "GIDP_pitching"
                })
        .filter(pl.col('yearID') >= 2006),
        on=['playerID', 'yearID', 'teamID', 'stint', 'lgID', 'G', 'GS'],
        how='left'
     )
.group_by(['playerID','yearID'])
.agg(
     pl.col('salary').sum().alias('salary'),
     pl.col('InnOuts').sum().alias('InnOuts'),
     pl.col('PO').sum().alias('PO'),
     pl.col('A').sum().alias('A'),
     pl.col('E').sum().alias('E'),
     pl.col('DP').sum().alias('DP'),
     pl.col('PB').cast(int).sum().alias('PB'),
     pl.col('WP').cast(int).sum().alias('WP'),
     pl.col('SB').cast(int).sum().alias('SB'),
     pl.col('CS').cast(int).sum().alias('CS'),
     pl.col('ZR').cast(int).sum().alias('ZR'),
     pl.col('G').sum().alias('G'),
     pl.col('AB').sum().alias('AB'),
     pl.col('R').sum().alias('R'),
     pl.col('H').sum().alias('H'),
     pl.col('2B').sum().alias('2B'),
     pl.col('3B').sum().alias('3B'),
     pl.col('HR').sum().alias('HR'),
     pl.col('SB_Batting').sum().alias('SB_batting'),
     pl.col('CS_batting').sum().alias('CS_batting'),
     pl.col('BB').sum().alias('BB_batting'),
     pl.col('SO').sum().alias('SO_batting'),
     pl.col('IBB').sum().alias('IBB_batting'),
     pl.col('HBP').sum().alias('HBP_batting'),
     pl.col('SH').sum().alias('Sacrifice Hits'),
     pl.col('SF').sum().alias('Sacrifice Flies'),
     pl.col('GIDP').sum().alias('GIDP'),
     pl.col('GS').sum().alias('GS'),
     pl.col('CG').sum().alias('CG'),
     pl.col('SHO').sum().alias('SHO'),
     pl.col('SV').sum().alias('SV'),
     pl.col('IPouts').sum().alias('IPOuts'),
     pl.col('H_pitching').sum().alias('H_pitching'),
     pl.col('ER').sum().alias('ER_pitching'),
     pl.col('HR_pitching').sum().alias('HR_pitching'),
     pl.col('BB_pitching').sum().alias('BB_pitching'),
     pl.col('SO_pitching').sum().alias('SO_pitching'),
     pl.col('BAOpp').sum().alias('BAOpp'),
     pl.col('ERA').sum().alias('ERA_pitching'),
     pl.col('IBB_pitching').sum().alias('IBB_pitching'),
     pl.col('WP_pitching').sum().alias('WP_pitching'),
     pl.col('HBP_pitching').sum().alias('HBP_pitching'),
     pl.col('BK').sum().alias('Balk'),
     pl.col('BFP').sum().alias('Batters Faced by Pitcher'),
     pl.col('GF').sum().alias('Games Finished'),
     pl.col('SH').sum().alias('SH_pitching'),
     pl.col('SF').sum().alias('SF_pitching'),
     pl.col('GIDP_pitching').sum().alias('GIDP_pitching')
    )   
.with_columns(
         pl.when(pl.col('yearID') == 2016)
          .then(pl.lit('Validation'))
          .otherwise(pl.lit('Training'))
          .alias('Training-Validation')
               )
)
salaries_final.write_csv('data/salaries_final.csv')
salaries_final

playerID,yearID,salary,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR,G,AB,R,H,2B,3B,HR,SB_batting,CS_batting,BB_batting,SO_batting,IBB_batting,HBP_batting,Sacrifice Hits,Sacrifice Flies,GIDP,GS,CG,SHO,SV,IPOuts,H_pitching,ER_pitching,HR_pitching,BB_pitching,SO_pitching,BAOpp,ERA_pitching,IBB_pitching,WP_pitching,HBP_pitching,Balk,Batters Faced by Pitcher,Games Finished,SH_pitching,SF_pitching,GIDP_pitching,Training-Validation
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str
"""sherrge01""",2011,1200000,108,2,6,0,1,0,0,0,0,0,51,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,108,33,12,3,12,38,0.248,3.0,1,1,1,0,149,9,0,0,3,"""Training"""
"""hamelco01""",2015,23500000,386,5,20,2,0,0,0,0,0,0,20,39,2,6,1,0,0,0,0,0,15,0,0,4,0,1,20,1,1,0,386,113,52,12,39,137,0.233,3.64,3,7,6,2,537,0,4,0,6,"""Training"""
"""andinro01""",2012,5200000,3036,195,306,13,71,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,107,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,"""Training"""
"""belliro01""",2008,6400000,2037,260,125,11,40,0,0,0,0,0,98,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,77,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,"""Training"""
"""clarkto02""",2008,900000,66,17,1,0,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,"""Training"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""matthry01""",2013,504500,106,3,3,0,1,0,0,0,0,0,37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,106,52,25,1,15,22,0.351,6.37,0,4,0,1,166,13,0,0,7,"""Training"""
"""millean01""",2012,1040000,121,1,2,1,0,0,0,0,0,0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,121,28,15,3,20,51,0.194,3.35,1,1,2,0,169,4,0,0,1,"""Training"""
"""ledezwi01""",2009,750000,17,1,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,8,6,1,4,8,0.333,9.53,0,1,0,0,30,2,0,0,0,"""Training"""
"""herrmch01""",2015,1530000,852,208,16,3,5,0,0,11,8,0,42,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,"""Training"""


# Classification Problem

In [14]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [46]:
fielding_awards

playerID,yearID,lgID,POS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR,Gold Glove?,Training-Validation
str,i64,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str
"""rogerjo01""",2021,"""NL""","""P""",107,0,4,0,1,0,0,0,0,0,"""No""","""Training"""
"""rominau01""",2014,"""AL""","""C""",45,14,2,0,2,0,0,0,1,0,"""No""","""Training"""
"""correca01""",2023,"""AL""","""SS""",3433,166,278,6,56,0,0,0,0,0,"""No""","""Validation"""
"""gimench01""",2014,"""AL""","""1B""",165,39,1,0,5,0,0,0,0,0,"""No""","""Training"""
"""negrokr01""",2018,"""AL""","""3B""",78,2,3,0,0,0,0,0,0,0,"""No""","""Training"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""martimi02""",2017,"""AL""","""OF""",29,3,0,0,0,0,0,0,0,0,"""No""","""Training"""
"""mahtomi01""",2019,"""AL""","""OF""",177,16,0,2,0,0,0,0,0,0,"""No""","""Training"""
"""coveydy01""",2018,"""AL""","""P""",365,9,14,4,2,0,0,0,0,0,"""No""","""Training"""
"""speasal01""",2023,"""AL""","""P""",6,0,0,0,0,0,0,0,0,0,"""No""","""Validation"""


In [16]:
fielding_awards.columns

['playerID',
 'yearID',
 'lgID',
 'POS',
 'InnOuts',
 'PO',
 'A',
 'E',
 'DP',
 'PB',
 'WP',
 'SB',
 'CS',
 'ZR',
 'Gold Glove?',
 'Training-Validation']

## Settting Up

In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np

param_grid = {'max_depth': [None, 1,2,3,4,5],
              'min_samples_split' : [1, 5, 10],
               'min_samples_leaf' : [1, 5, 10],
              'class_weight':[None, "balanced"]
              }


In [50]:
from sklearn.model_selection import StratifiedKFold

state = 458

cv_obj = StratifiedKFold(n_splits=10, shuffle=True, random_state=state)

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, verbose=3, cv=cv_obj)

## Performing Grid Search

In [None]:
grid_search.fit(fielding_stats_train, gold_glove_train)

In [None]:
grid_search.best_score_

In [None]:
grid_search.best_params_

In [None]:
best_model = grid_search.best_estimator_

best_model

In [58]:
#Data preparation
fielding_training = fielding_awards.filter(pl.col('Training-Validation') == 'Training').drop(['Gold Glove?', 'playerID', 'yearID', 'lgID', 'POS'])
gold_glove_train = fielding_awards.filter(pl.col('Training-Validation') == 'Training').select('Gold Glove?')

fielding_validation = fielding_awards.filter(pl.col('Training-Validation') == 'Validation').drop(['Gold Glove?', 'playerID', 'yearID', 'lgID', 'POS'])
gold_glove_validation = fielding_awards.filter(pl.col('Training-Validation') == 'Validation').select('Gold Glove?')

# Convert to pandas for sklearn
fielding_stats_train = fielding_training.to_pandas()
gold_glove_train = gold_glove_train.to_pandas().squeeze()

fielding_stats_validation = fielding_validation.to_pandas()
gold_glove_validation = gold_glove_validation.to_pandas().squeeze()

# Step 2: Decision Tree grid search
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

param_grid_tree = {'max_depth': [None, 1, 2, 3, 4, 5],
                   'min_samples_split': [2, 5, 10],
                   'min_samples_leaf': [1, 5, 10],
                   'class_weight': [None, "balanced"]}

state = 458
cv_obj = StratifiedKFold(n_splits=10, shuffle=True, random_state=state)

grid_search_tree = GridSearchCV(DecisionTreeClassifier(), param_grid_tree, verbose=3, cv=cv_obj)
grid_search_tree.fit(fielding_stats_train, gold_glove_train)

param_grid_rf = {'n_estimators': [50, 100, 200],
                 'max_depth': [None, 5, 10],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 5, 10],
                 'class_weight': [None, "balanced"]}

grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, verbose=3, cv=cv_obj)
grid_search_rf.fit(fielding_stats_train, gold_glove_train)

# Step 4: Evaluate best models
best_tree = grid_search_tree.best_estimator_
tree_score = best_tree.score(fielding_stats_validation, gold_glove_validation)

best_rf = grid_search_rf.best_estimator_
rf_score = best_rf.score(fielding_stats_validation, gold_glove_validation)

# Step 5: Print the results
print(f"Best Decision Tree accuracy on validation data: {tree_score}")
print(f"Best Random Forest accuracy on validation data: {rf_score}")


Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV 1/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=nan total time=   0.0s
[CV 2/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=nan total time=   0.0s
[CV 3/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=nan total time=   0.0s
[CV 4/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=nan total time=   0.0s
[CV 5/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=nan total time=   0.0s
[CV 6/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=nan total time=   0.0s
[CV 7/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=nan total time=   0.0s
[CV 8/10] END class_weight=None, max_depth=None, min_samples_leaf=1, min_samples_split=2;, scor

ValueError: 
All the 1080 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hp6265bz\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hp6265bz\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hp6265bz\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\tree\_classes.py", line 1009, in fit
    super()._fit(
  File "C:\Users\hp6265bz\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\tree\_classes.py", line 252, in _fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hp6265bz\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 645, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hp6265bz\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hp6265bz\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\utils\_array_api.py", line 751, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hp6265bz\AppData\Local\anaconda3\envs\polars\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Training'
