In [1]:
import pandas as pd
import numpy as np
from functools import reduce

import boxball_loader as bbl
import utils

In [2]:
bat = pd.read_parquet('../data/baseballdatabank/batting.parquet')

careers = bat.groupby('player_id')[['ab', 'h', 'hr']].sum()
careers['ba'] = (careers['h']/careers['ab']).fillna(0)

careers

Unnamed: 0_level_0,ab,h,hr,ba
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
aardsda01,4,0,0,0.000000
aaronha01,12364,3771,755,0.304998
aaronto01,944,216,13,0.228814
aasedo01,5,0,0,0.000000
abadan01,21,2,0,0.095238
...,...,...,...,...
zupofr01,18,3,0,0.166667
zuvelpa01,491,109,2,0.221996
zuverge01,142,21,0,0.147887
zwilldu01,1280,364,30,0.284375


In [3]:
# Find players who match this player's career totals in all of the categories
def find_matches(row, df):
    cats = ['h', 'hr', 'ba']
    cat_matches = [df[cat]>=row[cat] for cat in cats]
    matches_all = reduce((lambda x, y: x & y), cat_matches)
    
    return df[matches_all]
    

In [4]:
find_matches({'ba': 1, 'h': 2, 'hr': 1}, careers)

Unnamed: 0_level_0,ab,h,hr,ba
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
oconnfr01,2,2,1,1.0
yanes01,2,2,1,1.0


In [5]:
def get_unmatchables_brute_force(careers):

    careers_matching = careers.apply(lambda row: len(find_matches(row, careers)), axis=1)
    unmatchables = careers[careers_matching==1]
    return unmatchables.sort_values(by='h')

In [6]:
unmatchables = get_unmatchables_brute_force(careers)
unmatchables, len(unmatchables)

(              ab     h   hr        ba
 player_id                            
 paciojo01      3     3    0  1.000000
 jansera01      5     4    0  0.800000
 brittza01      8     5    1  0.625000
 silvelu01     11     6    0  0.545455
 durhado01     14     7    2  0.500000
 dappecl01     17     8    1  0.470588
 martido02     22     9    2  0.409091
 heathsl01     25    10    2  0.400000
 pattepa01     35    14    1  0.400000
 willigl01     40    17    0  0.425000
 rhombke01     47    18    1  0.382979
 goodahe01     45    19    0  0.422222
 forstte01     78    31    0  0.397436
 hayeske01     85    32    5  0.376471
 willite01   7706  2654  521  0.344407
 ruthba01    8398  2873  714  0.342105
 simmoal01   8759  2927  307  0.334171
 hornsro01   8173  2930  301  0.358497
 bondsba01   9847  2935  762  0.298060
 gwynnto01   9288  3141  135  0.338178
 musiast01  10972  3630  475  0.330842
 aaronha01  12364  3771  755  0.304998
 cobbty01   11436  4189  117  0.366299
 rosepe01   14053  4256  

In [7]:
unmatchables['name'] = utils.get_player_names_df(unmatchables, idx_fld='player_id')
unmatchables

Unnamed: 0_level_0,ab,h,hr,ba,name
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
paciojo01,3,3,0,1.0,John Paciorek
jansera01,5,4,0,0.8,Ray Jansen
brittza01,8,5,1,0.625,Zack Britton
silvelu01,11,6,0,0.545455,Luis Silverio
durhado01,14,7,2,0.5,Don Durham
dappecl01,17,8,1,0.470588,Cliff Dapper
martido02,22,9,2,0.409091,Domingo Martinez
heathsl01,25,10,2,0.4,Slade Heathcott
pattepa01,35,14,1,0.4,Pat Patterson
willigl01,40,17,0,0.425,Glenn Williams


In [8]:
print(unmatchables[['name', 'ba', 'hr', 'h', 'ab']].to_string())

                       name        ba   hr     h     ab
player_id                                              
paciojo01     John Paciorek  1.000000    0     3      3
jansera01        Ray Jansen  0.800000    0     4      5
brittza01      Zack Britton  0.625000    1     5      8
silvelu01     Luis Silverio  0.545455    0     6     11
durhado01        Don Durham  0.500000    2     7     14
dappecl01      Cliff Dapper  0.470588    1     8     17
martido02  Domingo Martinez  0.409091    2     9     22
heathsl01   Slade Heathcott  0.400000    2    10     25
pattepa01     Pat Patterson  0.400000    1    14     35
willigl01    Glenn Williams  0.425000    0    17     40
rhombke01    Kevin Rhomberg  0.382979    1    18     47
goodahe01      Herb Goodall  0.422222    0    19     45
forstte01     Terry Forster  0.397436    0    31     78
hayeske01    Ke'Bryan Hayes  0.376471    5    32     85
willite01      Ted Williams  0.344407  521  2654   7706
ruthba01          Babe Ruth  0.342105  714  2873

In [9]:
def get_unmatchables_iterative(careers):
    cats = ['h', 'hr', 'ba']

    def run_one_iteration(unmatchables, df):
        leader = df.iloc[0]

        # Keep any player who beats our leader in at least one category
        # Discard the rest
        cat_matches = [df[cat]>leader[cat] for cat in cats]
        beats_any = reduce((lambda x, y: x | y), cat_matches)
        remain = df[beats_any]
        return unmatchables.append(leader), remain
    
    df = careers.sort_values(by=cats, ascending=False)
    unmatchables = pd.DataFrame()
    while len(df) > 0:
        unmatchables, df = run_one_iteration(unmatchables, df)
    
    # there might be some ties, so run the original definition again to filter out the real answers
    careers_matching = unmatchables.apply(lambda row: len(find_matches(row, careers)), axis=1)
    return unmatchables[careers_matching==1]

In [10]:
unmatchables = get_unmatchables_iterative(careers)
unmatchables, len(unmatchables)

(                ab        ba       h     hr
 rosepe01   14053.0  0.302853  4256.0  160.0
 cobbty01   11436.0  0.366299  4189.0  117.0
 aaronha01  12364.0  0.304998  3771.0  755.0
 musiast01  10972.0  0.330842  3630.0  475.0
 gwynnto01   9288.0  0.338178  3141.0  135.0
 bondsba01   9847.0  0.298060  2935.0  762.0
 hornsro01   8173.0  0.358497  2930.0  301.0
 simmoal01   8759.0  0.334171  2927.0  307.0
 ruthba01    8398.0  0.342105  2873.0  714.0
 willite01   7706.0  0.344407  2654.0  521.0
 hayeske01     85.0  0.376471    32.0    5.0
 forstte01     78.0  0.397436    31.0    0.0
 goodahe01     45.0  0.422222    19.0    0.0
 rhombke01     47.0  0.382979    18.0    1.0
 willigl01     40.0  0.425000    17.0    0.0
 pattepa01     35.0  0.400000    14.0    1.0
 heathsl01     25.0  0.400000    10.0    2.0
 martido02     22.0  0.409091     9.0    2.0
 dappecl01     17.0  0.470588     8.0    1.0
 durhado01     14.0  0.500000     7.0    2.0
 silvelu01     11.0  0.545455     6.0    0.0
 brittza01