In [1]:
import polars as pl
pl.Config.with_columns_kwargs = True

In [166]:
batting = pl.read_csv('Data/lahman baseball data/Batting.csv')
pitching = pl.read_csv('Data/lahman baseball data/Pitching.csv')
fielding = pl.read_csv('Data/lahman baseball data/Fielding.csv')
awards = pl.read_csv('Data/lahman baseball data/AwardsPlayers.csv')
salaries = pl.read_csv('Data/lahman baseball data/Salaries.csv')

In [178]:
awards.columns

['playerID', 'awardID', 'yearID', 'lgID', 'tie', 'notes']

In [177]:
fielding.columns

['playerID',
 'yearID',
 'stint',
 'teamID',
 'lgID',
 'POS',
 'G',
 'GS',
 'InnOuts',
 'PO',
 'A',
 'E',
 'DP',
 'PB',
 'WP',
 'SB',
 'CS',
 'ZR']

In [None]:
fielding.sort('yearID', descending=True)


In [None]:
'case is player, year, pos, league'

playerID,yearID,lgID,POS,awardID,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR
str,i64,str,str,str,i64,i64,i64,i64,i64,str,str,str,str,str


In [263]:
awards_filter = (awards
                 .rename({"notes": "POS"})
                 .select(['playerID',
                           'yearID',
                           'awardID',
                           'lgID',
                           'POS'])
                 .filter(pl.col('awardID') == 'Gold Glove'))

fielding_awards = (fielding
.join(awards_filter, on=['playerID', 'yearID'], how='left')
.group_by(['yearID', 'lgID', 'POS', 'awardID', 'playerID'])
.agg(
     pl.col('InnOuts').sum().alias('InnOuts'),
     pl.col('PO').sum().alias('PO'),
     pl.col('A').sum().alias('A'),
     pl.col('E').sum().alias('E'),
     pl.col('DP').sum().alias('DP'),
     pl.col('PB').sum().alias('PB'),
     pl.col('WP').sum().alias('WP'),
     pl.col('SB').sum().alias('SB'),
     pl.col('CS').sum().alias('CS'),
     pl.col('ZR').sum().alias('ZR')
    )    

 .with_columns(
        pl.when(pl.col('awardID').is_null())
          .then(pl.lit('No'))
          .otherwise(pl.lit('Yes'))
          .alias('Gold Glove?')
              )
 .with_columns(
         pl.when(pl.col('yearID') == 2023)
            .then(pl.lit('Validation'))
            .otherwise(pl.lit('Training'))
            .alias('Training-Validation')
              )
                   
                  )
# fielding_awards = fielding_awards.filter(pl.col('awardID') == 'Gold Glove', pl.col('POS') == 'C')

fielding_awards

yearID,lgID,POS,awardID,playerID,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR,Gold Glove?,Training-Validation
i64,str,str,str,str,i64,i64,i64,i64,i64,str,str,str,str,str,str,str
1895,"""NL""","""OF""",,"""stratsc01""",66,5,0,2,0,,,,,,"""No""","""Training"""
1970,"""AL""","""P""",,"""bluevi01""",117,1,7,0,0,,,,,,"""No""","""Training"""
1943,"""NL""","""P""",,"""headed01""",0,6,40,3,2,,,,,,"""No""","""Training"""
1908,"""AL""","""P""",,"""burnsbi01""",492,2,69,6,2,,,,,,"""No""","""Training"""
2006,"""AL""","""P""",,"""campsh01""",225,6,13,1,2,,,,,,"""No""","""Training"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2021,"""NL""","""P""",,"""kranima01""",116,2,5,0,0,,,,,,"""No""","""Training"""
1872,"""NA""","""P""",,"""woltery01""",225,1,8,9,1,,,,,,"""No""","""Training"""
2019,"""NL""","""P""",,"""altheaa01""",3,0,0,0,0,,,,,,"""No""","""Training"""
1990,"""NL""","""SS""",,"""harrile01""",3,0,2,0,0,,,,,,"""No""","""Training"""


In [264]:
duplicates = (
    fielding_awards
    .group_by('playerID', 'yearID')
    .agg(pl.len().alias('count'))
    .filter(pl.col('count') > 1)
    .sort('count', descending = True)# Filter only players that appear more than once
)

duplicates


playerID,yearID,count
str,i64,u32
"""lopezni01""",2023,10
"""betemwi01""",2007,9
"""lopezfe01""",2010,9
"""kellyki01""",1891,9
"""lunahe01""",2006,9
…,…,…
"""garvest01""",1972,2
"""steinte01""",1992,2
"""hoffmgl01""",1981,2
"""teufeti01""",1987,2
