In [4]:
import polars as pl
pl.Config.with_columns_kwargs = True

In [6]:
batting = pl.read_csv('Data/lahman baseball data/Batting.csv')
pitching = pl.read_csv('Data/lahman baseball data/Pitching.csv')
fielding = pl.read_csv('Data/lahman baseball data/Fielding.csv')
awards = pl.read_csv('Data/lahman baseball data/AwardsPlayers.csv')
salaries = pl.read_csv('Data/lahman baseball data/Salaries.csv')

In [8]:
awards.columns

['playerID', 'awardID', 'yearID', 'lgID', 'tie', 'notes']

In [10]:
fielding.columns

['playerID',
 'yearID',
 'stint',
 'teamID',
 'lgID',
 'POS',
 'G',
 'GS',
 'InnOuts',
 'PO',
 'A',
 'E',
 'DP',
 'PB',
 'WP',
 'SB',
 'CS',
 'ZR']

In [14]:
'case is player, year, pos, league'

'case is player, year, pos, league'

In [189]:
awards_filter = (awards
                 .rename({"notes": "POS"})
                 .select(['playerID',
                           'yearID',
                           'awardID',
                           'lgID',
                           'POS'])
                 .filter(pl.col('awardID') == 'Gold Glove', pl.col('yearID') >= 2013))

fielding_awards = (fielding
.join(awards_filter, on=['playerID', 'yearID'], how='left')
.group_by(['playerID','yearID', 'lgID', 'awardID', 'POS'])
.agg(
     pl.col('InnOuts').sum().alias('InnOuts'),
     pl.col('PO').sum().alias('PO'),
     pl.col('A').sum().alias('A'),
     pl.col('E').sum().alias('E'),
     pl.col('DP').sum().alias('DP'),
     pl.col('PB'),
     pl.col('WP'),
     pl.col('SB'),
     pl.col('CS'),
     pl.col('ZR')
    )    
                  
 .with_columns(
        pl.when(pl.col('awardID').is_null())
          .then(pl.lit('No'))
          .otherwise(pl.lit('Yes'))
          .alias('Gold Glove?')
              )
 .with_columns(
         pl.when(pl.col('yearID') == 2023)
            .then(pl.lit('Validation'))
            .otherwise(pl.lit('Training'))
            .alias('Training-Validation')
              )
                   
                  )
fielding_awards = fielding_awards.filter(pl.col('awardID') == 'Gold Glove', pl.col('POS') == 'C')

fielding_awards

playerID,yearID,lgID,awardID,POS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR,Gold Glove?,Training-Validation
str,i64,str,str,str,i64,i64,i64,i64,i64,list[str],list[str],list[str],list[str],list[str],str,str
"""molinya01""",2018,"""NL""","""Gold Glove""","""C""",3053,966,42,2,8,"[""4""]",[null],"[""27""]","[""12""]",[null],"""Yes""","""Training"""
"""maldoma01""",2017,"""AL""","""Gold Glove""","""C""",3439,1046,65,2,2,"[""8""]",[null],"[""46""]","[""29""]",[null],"""Yes""","""Training"""
"""stallja01""",2021,"""NL""","""Gold Glove""","""C""",2676,868,49,5,2,"[""0""]",[null],"[""45""]","[""12""]",[null],"""Yes""","""Training"""
"""perezsa02""",2013,"""AL""","""Gold Glove""","""C""",3346,930,71,7,4,"[""3""]",[null],"[""46""]","[""25""]",[null],"""Yes""","""Training"""
"""barnhtu01""",2017,"""NL""","""Gold Glove""","""C""",2779,863,89,1,9,"[""4""]",[null],"[""41""]","[""32""]",[null],"""Yes""","""Training"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""perezsa02""",2015,"""AL""","""Gold Glove""","""C""",3577,974,90,4,10,"[""4""]",[null],"[""66""]","[""29""]",[null],"""Yes""","""Training"""
"""murphse01""",2021,"""AL""","""Gold Glove""","""C""",2770,873,42,6,6,"[""1""]",[null],"[""41""]","[""13""]",[null],"""Yes""","""Training"""
"""perezro02""",2020,"""AL""","""Gold Glove""","""C""",768,291,21,0,2,"[""0""]",[null],"[""4""]","[""10""]",[null],"""Yes""","""Training"""
"""molinya01""",2014,"""NL""","""Gold Glove""","""C""",2795,810,56,2,10,"[""3""]",[null],"[""23""]","[""21""]",[null],"""Yes""","""Training"""


In [191]:
duplicates = (
    fielding_awards
    .group_by('yearID')
    .agg(pl.len().alias('count'))
    .filter(pl.col('count') > 1)
    .sort('count', descending = True)
)

duplicates


yearID,count
i64,u32
2021,2
2020,2
2019,2
2023,2
2013,2
…,…
2015,2
2022,2
2017,2
2018,2
