## Code in v13 works.  Remove all-star stat

In [20]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity

In [139]:
# Functions

def add_allstars(df, allstar_file):
    '''
    Description: add all-star game column
    '''
    # Read in allstars file
    allstars = pd.read_csv(allstar_file)
    # Remove duplicate years
    allstars = allstars.groupby(['playerID', 'yearID']).sum().reset_index()
    # Create allstar column
    allstars['AS'] = 1
    # Merge allstars with input df
    df_merged = pd.merge(df, allstars[['playerID', 'yearID', 'AS']],\
                  how='left', on=['playerID', 'yearID'])
    # Fill NaNs with 0
    df_merged['AS'].fillna(0, inplace=True)
    return df_merged

# Function to add awards

def add_awards(df, awards_file):
    '''
    Description: Add MVP, Triple Crown, and Gold Glove awards.
    '''
    # Read in awards csv file
    awards = pd.read_csv(awards_file)
    
    # Select only MVP, Triple Crown, and Gold Glove awards
    awards_subset = awards[awards['awardID'].isin(['Most Valuable Player', 'Triple Crown', 'Gold Glove'])]
    awards_subset = pd.concat([awards_subset, pd.get_dummies(awards_subset['awardID'])], axis=1)
    awards_subset.rename(columns={'Most Valuable Player': 'MVP'}, inplace=True)
    awards_subset = awards_subset.groupby(['playerID', 'yearID']).sum().reset_index()    
    
    # Merge awards_subset with df from argument
    df_merged = pd.merge(df, awards_subset[['playerID', 'yearID', 'Gold Glove', 'MVP', 'Triple Crown']],\
                  how='left', on=['playerID', 'yearID'])
    
    # Fill NaNs with 0
    df_merged['Gold Glove'].fillna(0, inplace=True)
    df_merged['MVP'].fillna(0, inplace=True)
    df_merged['Triple Crown'].fillna(0, inplace=True)
    return df_merged

def calculate_stat_ratio(row, stat, denom):
    '''
    Description: Calculate the ratio of a player's stats in a particular category to the mean, median, or min of
    that stat for a Hall of Famer at that same position
    '''
    return ((row[stat] / row['year']) * (row['year'] + row['yrs_remain'])) / row[denom]

def add_eras_cols(df):
    # Dead Ball Era 1 (1900-1919)
    df['DBE1'] = 0
    df.ix[(df['yearID'] >= 1900) & (df['yearID'] <= 1919), 'DBE1'] = 1

    # Dead Ball Era 2 (1961-1968)
    df['DBE2'] = 0
    df.ix[(df['yearID'] >= 1961) & (df['yearID'] <= 1968), 'DBE2'] = 1

    # Steroid era (1988-2003)
    df['SE'] = 0
    df.ix[(df['yearID'] >= 1988) & (df['yearID'] <= 2003), 'SE'] = 1
    return df

def combine_stints(df):
    df2 = df.copy()
    if 'inducted' in df2.columns:
        df2.drop('inducted', axis=1, inplace=True)
    if 'stint' in df2.columns:
        df2.drop('stint', axis=1, inplace=True)
    return df2.groupby(['playerID', 'yearID']).sum().reset_index().sort_values(by=['playerID', 'yearID'])


def add_stat_ratio_cols(df, stats, denominator='mean'):
    '''
    Description: Calculate the desired stats ratios and add them as new column to df
    '''
    if denominator == 'mean':
        denom_stats = [stat + '_mean' for stat in stats]
    elif denominator == 'median':
        denom_stats = [stat + '_med' for stat in stats]
    elif denominator == 'min':
        denom_stats = [stat + '_min' for stat in stats]
        
    stats_ratio = [stat + '_ratio' for stat in stats]
    
    df2 = df.copy()
    for stat, denom, stat_ratio in zip(stats, denom_stats, stats_ratio):
        df2[stat_ratio] = df2.apply(calculate_stat_ratio, axis=1, args=(stat, denom))
    return df2

def add_yr_col(df):
    # Create 'year' variable indicating the number of years players have played in the MLB.
    player_startyr_dict = pd.DataFrame(df.groupby('playerID').min()['yearID']).to_dict()['yearID']
    df['year'] = df.apply(subtract_start_yr, axis=1, args=(player_startyr_dict,))
    return df

# def create_remain_yrs_col(df):
#     # Create 'yrs_remain' variable that estimates the remaining number of years for that player.
#     # Based on the median number of years that eligible players at that position played
#     player_totalyrs_dict = pd.DataFrame(df.groupby('playerID').max()['year']).to_dict()['year']
#     df['yrs_remain'] = df.apply(get_remaining_yrs, axis=1, args=(player_totalyrs_dict,))
#     return df

def fill_na(df, stats):
    stats_filled = []
    for stat in stats:
        if df[stat].isnull().sum() > 0:
            stat_filled = stat + '_filled'
            df[stat_filled] = 0
            df.ix[df[stat].isnull(), stat_filled] = 1
            df.ix[df[stat].isnull(), stat] = 0
            stats_filled.append(stat_filled)
    return df, stats_filled


def get_avg_position_player_stats(df, features):
    '''
    Description: Obtain the average stats for a position player
    '''

    positions = df['POS'].unique()
    position_stats_lst = []

    for position in positions:
        pos = pd.Series([position], index=['POS'])
        position_stats = df[df['POS'] == position][features].mean().append(pos)
        position_stats_lst.append(position_stats)
    nonhofer_stats_df = pd.DataFrame(position_stats_lst)
    return nonhofer_stats_df.set_index('POS')


def get_birth_year(filename):
    master = pd.read_csv(filename)
    return pd.DataFrame(master.groupby('playerID').sum()['birthYear']).reset_index()

def get_name(filename):
    master = pd.read_csv(filename)
    df = master[['playerID', 'nameFirst', 'nameLast']]
    df['name'] = df['nameFirst'].map(str) + ' ' + df['nameLast'].map(str)
    df.drop(['nameFirst', 'nameLast'], axis=1, inplace=True)
    return df
    
def get_cumulative_stats(df, stats_to_accumulate):
    # Calculate cumulative stats over the years for each player.
    return df.groupby('playerID')[stats_to_accumulate].cumsum()[stats_to_accumulate]

# def get_cumulative_stats(df):
#     # Calculate cumulative stats over the years for each player.
#     stats = ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', \
#              'IBB', 'HBP', 'SH', 'SF', 'GIDP']
#     return df.groupby('playerID')[stats].cumsum()[stats]


def get_hof_labels(filename):
    '''
    INPUT: 1 file
    OUTPUT: Pandas df

    Given Hall of Fame (HOF) data file, create HOF labels for all eligible players (inducted and not inducted)
    Returns: dataframe of all eligible HOF players with labels indicating if they were inducted or not.
    '''

    # Load HallofFame.csv file containing players who were/are eligible for election to HOF.
    hof = pd.read_csv(filename)

    # Select those who were inducted into HOF
    hof_players = hof[(hof['inducted'] == 'Y') & (hof['category'] == 'Player')][['playerID', 'inducted']]
    hof_players['inducted'] = hof_players['inducted'].map({'Y' : 1})

#    hof_player_indices = set(hof_players.index)
    hof_playerID = set(hof_players['playerID'])

    # Select all eligible players for the HOF (i.e., those who were on the ballot)
    elig = hof[(hof['category'] == 'Player')]

#    elig_indices = set(elig.index)
    elig_playerID = set(elig['playerID'])

    # Select players who were on the ballot but were not inducted into HOF
    nonhof_playerID = elig_playerID - hof_playerID
    nonhof_playerID = list(nonhof_playerID)
    nonhof_players = pd.DataFrame(nonhof_playerID, columns=['playerID'])
    nonhof_players['inducted'] = 0

    # Merge hof_players and nonhof_players
    return pd.concat([hof_players, nonhof_players])

def get_hofer_stats(df, stats, calculate='mean'):
    '''
    Description: Determine mean, median, or min of each stat for HOF players at each position
    '''
#     stats = ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', \
#              'IBB', 'HBP', 'SH', 'SF', 'GIDP']

    positions = df['POS'].unique()
    position_stats_lst = []

    for position in positions:
        pos = pd.Series([position], index=['POS'])
        if calculate == 'mean':
            stats_labels = [stat + '_mean' for stat in stats]
            stats_labels.append('POS')
            position_stats = df[(df['inducted'] == 1) & (df['POS'] == position)]\
            .groupby('playerID')[stats].max().mean().round(1).append(pos)
        elif calculate == 'median':
            stats_labels = [stat + '_med' for stat in stats]
            stats_labels.append('POS')
            position_stats = df[(df['inducted'] == 1) & (df['POS'] == position)]\
            .groupby('playerID')[stats].max().median().round(1).append(pos)
        elif calculate == 'min':
            stats_labels = [stat + '_min' for stat in stats]
            stats_labels.append('POS')
            position_stats = df[(df['inducted'] == 1) & (df['POS'] == position)]\
            .groupby('playerID')[stats].max().min().round(1).append(pos)            
        
        position_stats_lst.append(position_stats)

    hofer_stats_df = pd.DataFrame(position_stats_lst)

    hofer_stats_df.columns = stats_labels
    return hofer_stats_df


def get_positions(filename):

    # Load Fielding.csv file
    fielding = pd.read_csv(filename)

    # Set position of each player to the one at which he played the most games
    fielding_grouped = fielding.groupby(['playerID', 'POS']).sum().reset_index()
    max_game_indices = np.array(fielding_grouped.groupby('playerID')['G'].idxmax())
    player_pos = fielding_grouped.iloc[max_game_indices][['playerID', 'POS']]

    # Set all outfield positions (LF, CF, OF) to OF.
    positions_dict = {'P': 'P', 'OF': 'OF', '1B': '1B', '2B': '2B', 'C': 'C', 'SS': 'SS', \
                    '3B': '3B', 'DH': 'DH', 'CF': 'OF', 'LF': 'OF'}
    player_pos['POS'] = player_pos['POS'].map(positions_dict)

    # # Write out player position dataframe
    # with open('player_pos.pkl', 'w') as f:
    #     pickle.dump(player_pos, f)
    #
    return player_pos


def get_remaining_yrs(row, retire_age='mean'):
    if retire_age == 'mean':      
        yrs_remain = row['retire_age_mean'] - row['age']
    elif retire_age == 'median':
        yrs_remain = row['retire_age_med'] - row['age']        
    elif retire_age == 'max':
        yrs_remain = row['retire_age_max'] - row['age']        
    if yrs_remain >= 0:
        return yrs_remain
    # In case player has played longer than mean career, set to 0.
    else:
        return 0

def get_retirement_age(df, calculate='mean'):
    '''
    Description: Get mean or median retirement age of all eligible players at the different positions 
    '''    
    positions = df['POS'].unique()    
    df_age = pd.DataFrame()
    
    for position in positions:
        pos = pd.Series([position], index=['POS'])
        if calculate == 'mean':
            retirement_age = round(df[df['POS'] == position].groupby('playerID')['age'].max().mean(), 1)
            col = 'retire_age_mean'         
        elif calculate == 'median':
            retirement_age = round(df[df['POS'] == position].groupby('playerID')['age'].max().median(), 1)
            col = 'retire_age_med'
        elif calculate == 'max':
            retirement_age = round(df[df['POS'] == position].groupby('playerID')['age'].max().max(), 1)
            col = 'retire_age_max'
        
        df_age = df_age.append({'POS': position, col: retirement_age}, ignore_index=True)
    return df_age


def subtract_start_yr(row, player_dict):
    '''
    Description: Subtract start year from each player's yearID 
    '''    
    name = row['playerID']
    return row['yearID'] - player_dict[name] + 1


# # -------------------------------------------------------------------

In [22]:
allstar_file = '../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/AllstarFull.csv'
awards_file = '../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/AwardsPlayers.csv'
batting_file = '../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/Batting.csv'
fielding_file = '../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/Fielding.csv'
hall_of_fame_file = '../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/HallOfFame.csv'
master_file = '../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/Master.csv'

# 2. Get and join player positions to the df
player_pos = get_positions(fielding_file)
batting = pd.read_csv(batting_file)
all_players = batting.merge(player_pos, on='playerID')

# 3. Select only hitters (remove pitchers from df)
all_hitters = all_players[all_players['POS'] != 'P']

# 4. Some players played a stint for different teams in the same season.
# Combine the stats in those cases into one row.
combined_stints = combine_stints(all_hitters)

# 5b. Add awards
awards_df = add_awards(combined_stints, awards_file)

# 5c. Add all-star appearances
#allstars_df = add_allstars(awards_df, allstar_file)

# Select which stats to create ratios for
stats_to_ratio = ['R', 'H', 'HR', 'RBI', 'SB', 'BB']

# 5. Fill in null values that are present in the stats of interest
filled_na_df, filled_stats = fill_na(awards_df, stats_to_ratio)

# 6a. Calculate cumulative stats over the years for each player.
stats_to_accumulate = ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', \
             'IBB', 'HBP', 'SH', 'SF', 'GIDP', 'MVP', 'Triple Crown', 'Gold Glove'] 

cumulative_stats = get_cumulative_stats(filled_na_df, stats_to_accumulate)

# 6b. Combine cumulative stats with non-stats columns
cols_to_add = list(set(filled_na_df.columns) - set(cumulative_stats.columns))
all_hitters_cumstats = cumulative_stats.join(filled_na_df[cols_to_add])[filled_na_df.columns]

# 7. Get and join birth year, first and last name of each player to df and create 'age' column
birth_year = get_birth_year(master_file)
all_hitters_cumstats = all_hitters_cumstats.merge(birth_year, on='playerID')
all_hitters_cumstats['age'] = all_hitters_cumstats['yearID'] - all_hitters_cumstats['birthYear']

# 11. Create 'year' variable indicating the number of years players have played in MLB.
all_hitters_cumstats = add_yr_col(all_hitters_cumstats)

# 8. Re-join positions to df
all_hitters_cumstats = all_hitters_cumstats.merge(player_pos, on='playerID')

# Add names of players.
names = get_name(master_file)
all_hitters_cumstats = all_hitters_cumstats.merge(names, on='playerID')

# 13. Add variables corresponding to different baseball eras
all_hitters_cumstats = add_eras_cols(all_hitters_cumstats)

# --------------------------------------------------------------------------------------------

## Create dataframe of HOF eligible hitters.

# Select only players from batting table who were/are eligible for HOF.
elig_labels = get_hof_labels(hall_of_fame_file)

# Drop a few players from the list (brownwi02, irvinmo01, tennefr01, rosepe01). Willard Brown (brownwi02)
# and Monte Irvin (irvinmo01) were both inducted into the HOF, but they played the majority of their careers
# in the Negro League for which stats are not available. Fred Tenney was nominated but only one years worth
# of stat is available. Pete Rose, in all likelihood, would be in HOF if not for scandal.

omit = ['brownwi02', 'irvinmo01', 'tennefr01', 'rosepe01']
elig_labels = elig_labels[-elig_labels['playerID'].isin(omit)]

# Also, Jacque Jones' playerID is jonesja05 in the HallofFame file. This corresponds to only one year's worth of stats.
# Majority of his career stats are associated with playerID jonesja04. Replace with this playerID instead.

elig_labels.loc[elig_labels['playerID'] == 'jonesja05', 'playerID'] = 'jonesja04'


# 1. Merge hof labels with all_hitters_cumstats df
elig_hitters_cumstats = all_hitters_cumstats.merge(elig_labels, on='playerID')


# 9. Get mean, median, or min stats of different positions for HOF hitters and merge to dfs
hof_hitters_stats = get_hofer_stats(elig_hitters_cumstats, stats_to_ratio, 'mean')

all_hitters_cumstats = all_hitters_cumstats.merge(hof_hitters_stats, on='POS')
elig_hitters_cumstats = elig_hitters_cumstats.merge(hof_hitters_stats, on='POS')

# 10. Get mean or median retirement age of different positions for all eligible HOF hitters and merge to dfs
retirement_age_elig = get_retirement_age(elig_hitters_cumstats, 'mean')

all_hitters_cumstats = all_hitters_cumstats.merge(retirement_age_elig, on='POS')
elig_hitters_cumstats = elig_hitters_cumstats.merge(retirement_age_elig, on='POS')


# --------------------------------------------------------------------------------------------

# Steps 12 and 14 done for both df_all and df_hof_elig

# 12. Create 'yrs_remain' column that estimates the remaining number of years in the career of that player
# based on the mean retirement age of eligible players at that position
all_hitters_cumstats['yrs_remain'] = all_hitters_cumstats.apply(get_remaining_yrs, axis=1, \
                                                                  args=('mean',))
elig_hitters_cumstats['yrs_remain'] = elig_hitters_cumstats.apply(get_remaining_yrs, axis=1, \
                                                                  args=('mean',))

# 14. Calculate the ratio of a player's cumulative total for a particular stat to the mean, median, or min of that
# stat for players at that position who are in the HOF
all_hitters_ratios = add_stat_ratio_cols(all_hitters_cumstats, stats_to_ratio, 'mean')
elig_hitters_ratios = add_stat_ratio_cols(elig_hitters_cumstats, stats_to_ratio, 'mean')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
pd.options.display.max_columns=99

In [4]:
combined_stints.shape

(53898, 19)

In [5]:
filled_na_df.shape

(53898, 24)

In [6]:
awards_df.shape

(53898, 24)

In [7]:
allstars_df.shape

NameError: name 'allstars_df' is not defined

In [8]:
hof_hitters_stats.head(25)

Unnamed: 0,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,POS
0,1468.0,2569.9,250.3,1304.2,268.1,959.9,OF
1,1210.2,2327.9,210.7,1184.5,109.1,838.3,3B
2,1342.2,2431.8,154.3,1087.6,285.2,906.8,2B
3,1357.6,2439.3,298.5,1453.2,147.7,948.6,1B
4,1227.9,2323.0,98.1,1027.9,283.0,833.7,SS
5,911.3,1798.1,214.2,1035.9,86.9,709.1,C
6,1782.0,3319.0,234.0,1307.0,504.0,1094.0,DH


In [11]:
all_hitters_cumstats.head(25)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,Gold Glove,MVP,Triple Crown,allstar,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,allstar_mean,retire_age_mean,yrs_remain
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0.0,0.0,0.0,0.0,0,0,1934.0,20.0,1,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,17.3
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0.0,0.0,0.0,1.0,0,0,1934.0,21.0,2,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,16.3
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0.0,0.0,0.0,2.0,0,0,1934.0,22.0,3,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,15.3
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0.0,1.0,0.0,3.0,0,0,1934.0,23.0,4,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,14.3
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,1.0,1.0,0.0,4.0,0,0,1934.0,24.0,5,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,13.3
5,aaronha01,1959,886,3524.0,612.0,1137.0,205.0,46.0,179.0,617.0,20.0,9.0,281.0,315.0,59.0,13.0,18.0,30.0,107.0,2.0,1.0,0.0,5.0,0,0,1934.0,25.0,6,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,12.3
6,aaronha01,1960,1039,4114.0,714.0,1309.0,225.0,57.0,219.0,743.0,36.0,16.0,341.0,378.0,72.0,15.0,18.0,42.0,115.0,3.0,1.0,0.0,6.0,0,0,1934.0,26.0,7,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,11.3
7,aaronha01,1961,1194,4717.0,829.0,1506.0,264.0,67.0,253.0,863.0,57.0,25.0,397.0,442.0,92.0,17.0,19.0,51.0,131.0,3.0,1.0,0.0,7.0,0,0,1934.0,27.0,8,OF,Hank Aaron,0,1,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,10.3
8,aaronha01,1962,1350,5309.0,956.0,1697.0,292.0,73.0,298.0,991.0,72.0,32.0,463.0,515.0,106.0,20.0,19.0,57.0,145.0,3.0,1.0,0.0,8.0,0,0,1934.0,28.0,9,OF,Hank Aaron,0,1,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,9.3
9,aaronha01,1963,1511,5940.0,1077.0,1898.0,321.0,77.0,342.0,1121.0,103.0,37.0,541.0,609.0,124.0,20.0,19.0,62.0,156.0,3.0,1.0,0.0,9.0,0,0,1934.0,29.0,10,OF,Hank Aaron,0,1,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,8.3


In [13]:
elig_hitters_ratios.head(30)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,Gold Glove,MVP,Triple Crown,allstar,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,inducted,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,allstar_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,allstar_ratio
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0.0,0.0,0.0,0.0,0,0,1934.0,20.0,1,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,17.3,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806,0.0
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0.0,0.0,0.0,1.0,0,0,1934.0,21.0,2,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,16.3,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983,1.605263
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0.0,0.0,0.0,2.0,0,0,1934.0,22.0,3,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,15.3,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445,2.140351
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0.0,1.0,0.0,3.0,0,0,1934.0,23.0,4,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,14.3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007,2.407895
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,1.0,1.0,0.0,4.0,0,0,1934.0,24.0,5,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,13.3,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966,2.568421
5,aaronha01,1959,886,3524.0,612.0,1137.0,205.0,46.0,179.0,617.0,20.0,9.0,281.0,315.0,59.0,13.0,18.0,30.0,107.0,2.0,1.0,0.0,5.0,0,0,1934.0,25.0,6,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,12.3,1.271526,1.34941,2.181183,1.442915,0.227527,0.892853,2.675439
6,aaronha01,1960,1039,4114.0,714.0,1309.0,225.0,57.0,219.0,743.0,36.0,16.0,341.0,378.0,72.0,15.0,18.0,42.0,115.0,3.0,1.0,0.0,6.0,0,0,1934.0,26.0,7,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,11.3,1.271526,1.331608,2.287369,1.489353,0.351042,0.928713,2.75188
7,aaronha01,1961,1194,4717.0,829.0,1506.0,264.0,67.0,253.0,863.0,57.0,25.0,397.0,442.0,92.0,17.0,19.0,51.0,131.0,3.0,1.0,0.0,7.0,0,0,1934.0,27.0,8,OF,Hank Aaron,0,1,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,10.3,1.291783,1.340509,2.312175,1.513658,0.486339,0.946075,2.809211
8,aaronha01,1962,1350,5309.0,956.0,1697.0,292.0,73.0,298.0,991.0,72.0,32.0,463.0,515.0,106.0,20.0,19.0,57.0,145.0,3.0,1.0,0.0,8.0,0,0,1934.0,28.0,9,OF,Hank Aaron,0,1,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,9.3,1.32416,1.342685,2.420828,1.545034,0.546065,0.980762,2.853801
9,aaronha01,1963,1511,5940.0,1077.0,1898.0,321.0,77.0,342.0,1121.0,103.0,37.0,541.0,609.0,124.0,20.0,19.0,62.0,156.0,3.0,1.0,0.0,9.0,0,0,1934.0,29.0,10,OF,Hank Aaron,0,1,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,8.3,1.342582,1.351547,2.500439,1.572941,0.703059,1.031389,2.889474


In [94]:
get_retirement_age_test(elig_hitters_cumstats, 'median')

Unnamed: 0,POS,retire_age_med
0,OF,37.0
1,3B,36.0
2,2B,36.0
3,1B,37.0
4,SS,37.0
5,C,37.0
6,DH,41.0


In [143]:
get_retirement_age(elig_hitters_cumstats, 'mean')

Unnamed: 0,POS,retire_age_mean
0,OF,37.3
1,3B,36.6
2,2B,36.4
3,1B,37.4
4,SS,37.4
5,C,37.4
6,DH,40.2


In [5]:
fielding = pd.read_csv('../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/Fielding.csv')

In [6]:
batting = pd.read_csv('../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/Batting.csv')

In [7]:
len(fielding['playerID'].unique())

18465

In [10]:
fielding['POS'].unique()

array(['SS', '2B', 'OF', 'C', '1B', '3B', 'P', 'LF', 'RF', 'CF', 'DH'], dtype=object)

In [8]:
len(batting['playerID'].unique())

18659

In [20]:
birth_year

Unnamed: 0,playerID,birthYear
0,aardsda01,1981.0
1,aaronha01,1934.0
2,aaronto01,1939.0
3,aasedo01,1954.0
4,abadan01,1972.0
5,abadfe01,1985.0
6,abadijo01,1854.0
7,abbated01,1877.0
8,abbeybe01,1869.0
9,abbeych01,1866.0


In [53]:
set(batting['playerID'].unique()) - set(fielding['playerID'].unique())

{'aragoja01',
 'atkinle01',
 'banisje01',
 'barbare01',
 'bassdo01',
 'batscbi01',
 'belnovi01',
 'biglepe01',
 'bluhmre01',
 'bowmael01',
 'brovijo01',
 'brownde01',
 'bubseha01',
 'burkach01',
 'burnscb01',
 'byrdji01',
 'campbji02',
 'casimca01',
 'cassija01',
 'clarkgl01',
 'cobbjo01',
 'connejo01',
 'corrijo01',
 'cortaje01',
 'coseyra01',
 'cotepe01',
 'creedco01',
 'daubeha01',
 'daughbo01',
 'daughdo01',
 'davisot01',
 'decasyu01',
 'delarje01',
 'diazel01',
 'dickepa01',
 'duffpa01',
 'dwyerdo01',
 'eastojo01',
 'echoljo01',
 'eversjo02',
 'falloch01',
 'falsepe01',
 'fautsjo01',
 'fetzewi01',
 'fialane01',
 'fishewi01',
 'fitzbch01',
 'fitzgra01',
 'fletcfr01',
 'foleyra01',
 'fostere01',
 'fritzla01',
 'gablega01',
 'gaedeed01',
 'gaglira01',
 'galviji01',
 'garboal01',
 'garrice01',
 'genovge01',
 'gentisa01',
 'gentrha01',
 'gleasro01',
 'godwity01',
 'goletst01',
 'gormahe01',
 'greenad01',
 'greenjo02',
 'hajduch01',
 'hamribe01',
 'hansedo01',
 'hardgpa01',
 'hargiga01'

In [22]:
pd.options.display.max_columns = 999

In [23]:
all_hitters_cumstats.head(30)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,1934.0,20.0,1,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,17.3,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0,0,1934.0,21.0,2,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,16.3,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0,0,1934.0,22.0,3,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,15.3,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0,0,1934.0,23.0,4,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,0,0,1934.0,24.0,5,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966
5,aaronha01,1959,886,3524.0,612.0,1137.0,205.0,46.0,179.0,617.0,20.0,9.0,281.0,315.0,59.0,13.0,18.0,30.0,107.0,0,0,1934.0,25.0,6,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,12.3,1.271526,1.34941,2.181183,1.442915,0.227527,0.892853
6,aaronha01,1960,1039,4114.0,714.0,1309.0,225.0,57.0,219.0,743.0,36.0,16.0,341.0,378.0,72.0,15.0,18.0,42.0,115.0,0,0,1934.0,26.0,7,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,11.3,1.271526,1.331608,2.287369,1.489353,0.351042,0.928713
7,aaronha01,1961,1194,4717.0,829.0,1506.0,264.0,67.0,253.0,863.0,57.0,25.0,397.0,442.0,92.0,17.0,19.0,51.0,131.0,0,0,1934.0,27.0,8,OF,0,1,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,10.3,1.291783,1.340509,2.312175,1.513658,0.486339,0.946075
8,aaronha01,1962,1350,5309.0,956.0,1697.0,292.0,73.0,298.0,991.0,72.0,32.0,463.0,515.0,106.0,20.0,19.0,57.0,145.0,0,0,1934.0,28.0,9,OF,0,1,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,9.3,1.32416,1.342685,2.420828,1.545034,0.546065,0.980762
9,aaronha01,1963,1511,5940.0,1077.0,1898.0,321.0,77.0,342.0,1121.0,103.0,37.0,541.0,609.0,124.0,20.0,19.0,62.0,156.0,0,0,1934.0,29.0,10,OF,0,1,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,8.3,1.342582,1.351547,2.500439,1.572941,0.703059,1.031389


In [141]:

# --------------------------------------------------------------------------------------------

# 15. Filter all_hitters_ratios df for just active players and those still on the HOF ballot

# Get most current year of database
most_current_yr = all_hitters_ratios[['playerID', 'yearID']].groupby('playerID').max()['yearID'].max()

# Get the last year in which a player has played
all_hitters_ratios_last_yr = all_hitters_ratios[['playerID', 'yearID']].groupby('playerID').max().reset_index()

# Select only for hitters that are currently still in MLB
active_hitters = all_hitters_ratios_last_yr[all_hitters_ratios_last_yr['yearID'] == most_current_yr]['playerID'].values
active_hitters_ratios = all_hitters_ratios[all_hitters_ratios['playerID'].isin(active_hitters)]

with open('active_hitters.pkl', 'w') as f:
    pickle.dump(active_hitters_ratios[['playerID', 'name', 'yearID', 'year']], f)

# Select for hitters that have recently been in MLB (in the past 10 years)
recent_hitters = all_hitters_ratios_last_yr[(all_hitters_ratios_last_yr['yearID'] >= (most_current_yr - 10)) & \
                                           (all_hitters_ratios_last_yr['yearID'] < most_current_yr)]['playerID'].values
recent_hitters_ratios = all_hitters_ratios[all_hitters_ratios['playerID'].isin(recent_hitters)]

with open('recent_hitters.pkl', 'w') as f:
    pickle.dump(recent_hitters_ratios[['playerID', 'name', 'yearID']], f)

# Select the stat ratio columns as feature set on which to train model and write out.

eras = ['DBE1', 'DBE2', 'SE']
# awards_allstar = ['MVP', 'Triple Crown', 'Gold Glove', 'AS']
awards = ['MVP', 'Triple Crown', 'Gold Glove']

# Feature set 1
features = [stat + '_ratio' for stat in stats_to_ratio] + filled_stats
X1 = elig_hitters_ratios[features]
with open('eligible_hitters_X1.pkl', 'w') as f:
    pickle.dump(X1, f)

# Feature set 2
features2 = features = [stat + '_ratio' for stat in stats_to_ratio] + ['year'] + filled_stats
X2 = elig_hitters_ratios[features2]
with open('eligible_hitters_X2.pkl', 'w') as f:
    pickle.dump(X2, f)

# Feature set 3
features3 = features = [stat + '_ratio' for stat in stats_to_ratio] + ['year', 'yearID'] + filled_stats
X3 = elig_hitters_ratios[features3]
with open('eligible_hitters_X3.pkl', 'w') as f:
    pickle.dump(X3, f)

# Feature set 4
features4 = features = [stat + '_ratio' for stat in stats_to_ratio] + ['year', 'yearID'] + eras + filled_stats
X4 = elig_hitters_ratios[features4]
with open('eligible_hitters_X4.pkl', 'w') as f:
    pickle.dump(X4, f)

# Feature set 5
features5 = [stat + '_ratio' for stat in stats_to_ratio] + ['year'] + eras + filled_stats
X5 = elig_hitters_ratios[features5]
with open('eligible_hitters_X5.pkl', 'w') as f:
    pickle.dump(X5, f)

# Feature set 6
features6 = [stat + '_ratio' for stat in stats_to_ratio] + eras + filled_stats
X6 = elig_hitters_ratios[features6]
with open('eligible_hitters_X6.pkl', 'w') as f:
    pickle.dump(X6, f)

# Feature set 7
# features7 = [stat + '_ratio' for stat in stats_to_ratio] + eras + filled_stats + awards_allstar
# features7.remove('AS_ratio')
# X7 = elig_hitters_ratios[features7]
# with open('eligible_hitters_X7.pkl', 'w') as f:
#     pickle.dump(X7, f)

# Feature set 8 (included AS_ratio)
features8 = [stat + '_ratio' for stat in stats_to_ratio] + eras + filled_stats + awards
X8 = elig_hitters_ratios[features8]
with open('eligible_hitters_X8.pkl', 'w') as f:
    pickle.dump(X8, f)

# Feature set 9 (AS is not included in feature set)
features9 = [stat + '_ratio' for stat in stats_to_ratio] + eras + filled_stats + awards
X9 = elig_hitters_ratios[features9]
with open('eligible_hitters_X9.pkl', 'w') as f:
    pickle.dump(X9, f)

    
# Select 'inducted' column as target variable (1 = inducted into HOF, 0 = not inducted into HOF)
y = elig_hitters_ratios['inducted']
with open('eligible_hitters_y.pkl', 'w') as f:
    pickle.dump(y, f)


# Select the stat ratio columns as feature set for the active players
active1 = active_hitters_ratios[features]
with open('active_hitters_X1.pkl', 'w') as f:
    pickle.dump(active1, f)

active2 = active_hitters_ratios[features2]
with open('active_hitters_X2.pkl', 'w') as f:
    pickle.dump(active2, f)

active3 = active_hitters_ratios[features3]
with open('active_hitters_X3.pkl', 'w') as f:
    pickle.dump(active3, f)

active4 = active_hitters_ratios[features4]
with open('active_hitters_X4.pkl', 'w') as f:
    pickle.dump(active4, f)

active6 = active_hitters_ratios[features6]
with open('active_hitters_X6.pkl', 'w') as f:
    pickle.dump(active6, f)

# active7 = active_hitters_ratios[features7]
# with open('active_hitters_X7.pkl', 'w') as f:
#     pickle.dump(active7, f)

active8 = active_hitters_ratios[features8]
with open('active_hitters_X8.pkl', 'w') as f:
    pickle.dump(active8, f)

active9 = active_hitters_ratios[features9]
with open('active_hitters_X9.pkl', 'w') as f:
    pickle.dump(active9, f)

recent3 = recent_hitters_ratios[features3]
with open('recent_hitters_X3.pkl', 'w') as f:
    pickle.dump(recent3, f)

recent9 = recent_hitters_ratios[features9]
with open('recent_hitters_X9.pkl', 'w') as f:
    pickle.dump(recent9, f)

# Determine the stats of the average player in the league who is not a HOFer
# Select the cumulative stats of the last year of each player

#idx_max = nonhof_hitters_ratios.groupby('name')['year'].transform(max) == nonhof_hitters_ratios['year']
#nonhof_hitters_ratios_max = nonhof_hitters_ratios[idx_max].groupby('name').max()
# Get the average stats for every position
average9 = get_avg_position_player_stats(nonhof_hitters_ratios_max, features9)

act

In [13]:
X9

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,DBE1,DBE2,SE,RBI_filled,SB_filled,MVP,Triple Crown,Gold Glove
0,0.723025,0.932838,0.950459,0.968180,0.136516,0.533806,0,0,0,0,0,0.0,0.0,0.0
1,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983,0,0,0,0,0,0.0,0.0,0.0
2,1.117779,1.234289,1.608470,1.248812,0.159269,0.724450,0,0,0,0,0,0.0,0.0,0.0
3,1.206080,1.278201,2.010587,1.399651,0.136516,0.815007,0,0,0,0,0,1.0,0.0,0.0
4,1.236621,1.301700,2.047143,1.386321,0.163819,0.876966,0,0,0,0,0,1.0,0.0,1.0
5,1.271526,1.349410,2.181183,1.442915,0.227527,0.892853,0,0,0,0,0,1.0,0.0,2.0
6,1.271526,1.331608,2.287369,1.489353,0.351042,0.928713,0,0,0,0,0,1.0,0.0,3.0
7,1.291783,1.340509,2.312175,1.513658,0.486339,0.946075,0,1,0,0,0,1.0,0.0,3.0
8,1.324160,1.342685,2.420828,1.545034,0.546065,0.980762,0,1,0,0,0,1.0,0.0,3.0
9,1.342582,1.351547,2.500439,1.572941,0.703059,1.031389,0,1,0,0,0,1.0,0.0,3.0


In [31]:
elig_hitters_ratios[elig_hitters_ratios['inducted'] == 1]

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,Gold Glove,MVP,Triple Crown,AS,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,inducted,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,AS_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,AS_ratio
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0.0,0.0,0.0,0.0,0,0,1934.0,20.0,1,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,17.3,0.723025,0.932838,0.950459,0.968180,0.136516,0.533806,0.000000
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0.0,0.0,0.0,1.0,0,0,1934.0,21.0,2,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,16.3,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983,1.605263
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0.0,0.0,0.0,2.0,0,0,1934.0,22.0,3,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,15.3,1.117779,1.234289,1.608470,1.248812,0.159269,0.724450,2.140351
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0.0,1.0,0.0,3.0,0,0,1934.0,23.0,4,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,14.3,1.206080,1.278201,2.010587,1.399651,0.136516,0.815007,2.407895
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,1.0,1.0,0.0,4.0,0,0,1934.0,24.0,5,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,13.3,1.236621,1.301700,2.047143,1.386321,0.163819,0.876966,2.568421
5,aaronha01,1959,886,3524.0,612.0,1137.0,205.0,46.0,179.0,617.0,20.0,9.0,281.0,315.0,59.0,13.0,18.0,30.0,107.0,2.0,1.0,0.0,5.0,0,0,1934.0,25.0,6,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,12.3,1.271526,1.349410,2.181183,1.442915,0.227527,0.892853,2.675439
6,aaronha01,1960,1039,4114.0,714.0,1309.0,225.0,57.0,219.0,743.0,36.0,16.0,341.0,378.0,72.0,15.0,18.0,42.0,115.0,3.0,1.0,0.0,6.0,0,0,1934.0,26.0,7,OF,Hank Aaron,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,11.3,1.271526,1.331608,2.287369,1.489353,0.351042,0.928713,2.751880
7,aaronha01,1961,1194,4717.0,829.0,1506.0,264.0,67.0,253.0,863.0,57.0,25.0,397.0,442.0,92.0,17.0,19.0,51.0,131.0,3.0,1.0,0.0,7.0,0,0,1934.0,27.0,8,OF,Hank Aaron,0,1,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,10.3,1.291783,1.340509,2.312175,1.513658,0.486339,0.946075,2.809211
8,aaronha01,1962,1350,5309.0,956.0,1697.0,292.0,73.0,298.0,991.0,72.0,32.0,463.0,515.0,106.0,20.0,19.0,57.0,145.0,3.0,1.0,0.0,8.0,0,0,1934.0,28.0,9,OF,Hank Aaron,0,1,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,9.3,1.324160,1.342685,2.420828,1.545034,0.546065,0.980762,2.853801
9,aaronha01,1963,1511,5940.0,1077.0,1898.0,321.0,77.0,342.0,1121.0,103.0,37.0,541.0,609.0,124.0,20.0,19.0,62.0,156.0,3.0,1.0,0.0,9.0,0,0,1934.0,29.0,10,OF,Hank Aaron,0,1,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,5.7,37.3,8.3,1.342582,1.351547,2.500439,1.572941,0.703059,1.031389,2.889474


In [62]:
all_hitters_ratios[['playerID', 'yearID']].groupby('playerID').max()['yearID'].max()

2015

In [63]:
most_current_yr = all_hitters_ratios[['playerID', 'yearID']].groupby('playerID').max()['yearID'].max()

In [64]:
most_current_yr

2015

In [18]:
pd.options.display.max_columns=99

In [19]:
all_hitters_ratios.head(5)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,1934.0,20.0,1,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,17.3,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0,0,1934.0,21.0,2,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,16.3,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0,0,1934.0,22.0,3,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,15.3,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0,0,1934.0,23.0,4,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,0,0,1934.0,24.0,5,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966


In [70]:
all_hitters_ratios_last_yr = all_hitters_ratios[['playerID', 'yearID']].groupby('playerID').max().reset_index()

In [71]:
all_hitters_ratios_last_yr

Unnamed: 0,playerID,yearID
0,aaronha01,1976
1,aaronto01,1971
2,abadan01,2006
3,abadijo01,1875
4,abbated01,1910
5,abbeych01,1897
6,abbotfr01,1905
7,abbotje01,2001
8,abbotku01,2001
9,abbotod01,1910


In [83]:
active_hitters = all_hitters_ratios_last_yr[all_hitters_ratios_last_yr['yearID'] == most_current_yr]['playerID'].values

In [85]:
active_hitters_ratios = all_hitters_ratios[all_hitters_ratios['playerID'].isin(active_players)]

In [90]:
len(active_hitters_ratios['playerID'].unique())

633

In [87]:
active_hitters_ratios[active_hitters_ratios['playerID'] == 'mccutan01']

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
11685,mccutan01,2009,108,433.0,74.0,124.0,26.0,9.0,12.0,54.0,22.0,5.0,54.0,83.0,2.0,2.0,0.0,4.0,3.0,0,0,1986.0,23.0,1,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,0.771253,0.738239,0.73352,0.633492,1.255502,0.860715
11686,mccutan01,2010,262,1003.0,168.0,287.0,61.0,14.0,28.0,110.0,55.0,15.0,124.0,172.0,3.0,7.0,1.0,11.0,9.0,0,0,1986.0,24.0,2,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,0.875477,0.854333,0.855773,0.645223,1.569377,0.988228
11687,mccutan01,2011,420,1575.0,255.0,435.0,95.0,19.0,51.0,199.0,78.0,25.0,213.0,298.0,6.0,16.0,3.0,17.0,16.0,0,0,1986.0,25.0,3,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,12.3,0.885899,0.863263,1.039153,0.778178,1.483775,1.13168
11688,mccutan01,2012,577,2168.0,362.0,629.0,124.0,25.0,82.0,295.0,98.0,37.0,283.0,430.0,19.0,21.0,3.0,22.0,25.0,0,0,1986.0,26.0,4,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,11.3,0.943222,0.936194,1.253096,0.865186,1.398172,1.127696
11689,mccutan01,2013,734,2751.0,459.0,814.0,162.0,30.0,103.0,379.0,125.0,47.0,361.0,531.0,31.0,30.0,3.0,26.0,38.0,0,0,1986.0,27.0,5,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,10.3,0.956771,0.969236,1.259209,0.889235,1.426706,1.150807
11690,mccutan01,2014,880,3299.0,548.0,986.0,200.0,36.0,128.0,462.0,143.0,50.0,445.0,646.0,39.0,40.0,3.0,32.0,47.0,0,0,1986.0,28.0,6,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,9.3,0.951907,0.978365,1.304035,0.903312,1.360127,1.182154
11691,mccutan01,2015,1037,3865.0,639.0,1151.0,236.0,39.0,151.0,558.0,154.0,55.0,543.0,779.0,51.0,52.0,3.0,41.0,56.0,0,0,1986.0,29.0,7,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,8.3,0.951411,0.978932,1.318589,0.935155,1.255502,1.236423


In [88]:
active_hitters_ratios[active_hitters_ratios['playerID'] == 'troutmi01']

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
17912,troutmi01,2011,40,123.0,20.0,27.0,6.0,0.0,5.0,16.0,4.0,0.0,9.0,30.0,0.0,2.0,0.0,1.0,2.0,0,0,1991.0,20.0,1,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,17.3,0.249319,0.192264,0.365561,0.224505,0.273032,0.17158
17913,troutmi01,2012,179,682.0,149.0,209.0,33.0,8.0,35.0,99.0,53.0,5.0,76.0,169.0,4.0,8.0,0.0,8.0,9.0,0,0,1991.0,21.0,2,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,16.3,0.928713,0.744134,1.279465,0.694564,1.80884,0.72445
17914,troutmi01,2013,336,1271.0,258.0,399.0,72.0,17.0,62.0,196.0,86.0,12.0,186.0,305.0,14.0,17.0,0.0,16.0,17.0,0,0,1991.0,22.0,3,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,15.3,1.072071,0.94708,1.510987,0.916731,1.956733,1.181998
17915,troutmi01,2014,493,1873.0,373.0,572.0,111.0,26.0,98.0,307.0,102.0,14.0,269.0,489.0,20.0,27.0,0.0,26.0,23.0,0,0,1991.0,23.0,4,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,1.162449,1.018289,1.79125,1.076925,1.740582,1.282087
17916,troutmi01,2015,652,2448.0,477.0,744.0,143.0,32.0,139.0,397.0,113.0,21.0,361.0,647.0,34.0,37.0,0.0,31.0,34.0,0,0,1991.0,24.0,5,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,1.189251,1.05959,2.032521,1.114108,1.542633,1.376456


In [89]:
active_hitters_ratios[active_hitters_ratios['playerID'] == 'pujolal01'] 

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
24255,pujolal01,2001,161,590.0,112.0,194.0,47.0,4.0,37.0,130.0,1.0,3.0,69.0,93.0,6.0,9.0,1.0,7.0,21.0,0,0,1980.0,21.0,1,1B,0,0,1,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,16.4,1.435474,1.38384,2.156784,1.556565,0.117806,1.265655
24256,pujolal01,2002,318,1180.0,230.0,379.0,87.0,6.0,71.0,257.0,3.0,7.0,141.0,162.0,19.0,18.0,1.0,11.0,41.0,0,0,1980.0,22.0,2,1B,0,0,1,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,15.4,1.473925,1.35174,2.069347,1.538604,0.17671,1.293169
24257,pujolal01,2003,475,1771.0,367.0,591.0,138.0,7.0,114.0,381.0,8.0,8.0,220.0,227.0,31.0,28.0,1.0,16.0,54.0,0,0,1980.0,23.0,3,1B,0,0,1,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,14.4,1.567914,1.405239,2.215075,1.520644,0.31415,1.34514
24258,pujolal01,2004,629,2363.0,500.0,787.0,189.0,9.0,160.0,504.0,13.0,13.0,304.0,279.0,43.0,35.0,1.0,25.0,75.0,0,0,1980.0,24.0,4,1B,0,0,0,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,13.4,1.602092,1.403456,2.331658,1.508671,0.382871,1.394054
24259,pujolal01,2005,790,2954.0,629.0,982.0,227.0,11.0,201.0,621.0,29.0,15.0,401.0,344.0,70.0,44.0,1.0,28.0,94.0,0,0,1980.0,25.0,5,1B,0,0,0,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,12.4,1.612345,1.400959,2.343317,1.487118,0.683277,1.471094
24260,pujolal01,2006,933,3489.0,748.0,1159.0,260.0,12.0,250.0,758.0,36.0,17.0,493.0,394.0,98.0,48.0,1.0,31.0,114.0,0,0,1980.0,26.0,6,1B,0,0,0,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,11.4,1.59782,1.377895,2.428811,1.512662,0.706838,1.507168
24261,pujolal01,2007,1091,4054.0,847.0,1344.0,298.0,13.0,282.0,861.0,38.0,23.0,592.0,452.0,120.0,55.0,1.0,39.0,141.0,0,0,1980.0,27.0,7,1B,0,0,0,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,10.4,1.550825,1.369573,2.348313,1.47275,0.63952,1.551279
24262,pujolal01,2008,1239,4578.0,947.0,1531.0,342.0,13.0,319.0,977.0,45.0,26.0,696.0,506.0,154.0,60.0,1.0,47.0,157.0,0,0,1980.0,28.0,8,1B,0,0,0,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,9.4,1.517181,1.365115,2.324372,1.462273,0.662661,1.595825
24263,pujolal01,2009,1399,5146.0,1071.0,1717.0,387.0,14.0,366.0,1112.0,61.0,30.0,811.0,570.0,198.0,69.0,1.0,55.0,180.0,0,0,1980.0,29.0,9,1B,0,0,0,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,8.4,1.525192,1.360855,2.370519,1.479402,0.798465,1.652892
24264,pujolal01,2010,1558,5733.0,1186.0,1900.0,426.0,15.0,408.0,1230.0,75.0,34.0,914.0,646.0,236.0,73.0,1.0,61.0,203.0,0,0,1980.0,30.0,10,1B,0,0,0,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,7.4,1.520065,1.355307,2.378291,1.47275,0.883548,1.676534


In [91]:
elig_hitters_ratios.head(5)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,DBE1,DBE2,SE,inducted,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,1934.0,20.0,1,OF,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,17.3,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0,0,1934.0,21.0,2,OF,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,16.3,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0,0,1934.0,22.0,3,OF,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,15.3,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0,0,1934.0,23.0,4,OF,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,0,0,1934.0,24.0,5,OF,0,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966


In [94]:
set(elig_hitters_ratios.columns) - set(active_hitters_ratios.columns)

{'inducted'}

In [93]:
len(active_hitters_ratios.columns)

42

In [104]:
with open('active_hitters.pkl', 'w') as f:
    pickle.dump(active_hitters_ratios['playerID'], f)


In [111]:
len(active_hitters)

633

In [112]:
len(recent_hitters)

1172

In [124]:
all_hitters_ratios.head(5)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,1934.0,20.0,1,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,17.3,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0,0,1934.0,21.0,2,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,16.3,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0,0,1934.0,22.0,3,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,15.3,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0,0,1934.0,23.0,4,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,0,0,1934.0,24.0,5,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966


In [123]:
all_hitters_ratios[all_hitters_ratios['playerID' == 'suzukic01']]

KeyError: False

In [125]:
batting[batting['playerID'] == 'suzukic01']

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
81799,suzukic01,2001,1,SEA,AL,157,692.0,127.0,242.0,34.0,8.0,8.0,69.0,56.0,14.0,30.0,53.0,10.0,8.0,4.0,4.0,3.0
83127,suzukic01,2002,1,SEA,AL,157,647.0,111.0,208.0,27.0,8.0,8.0,51.0,31.0,15.0,68.0,62.0,27.0,5.0,3.0,5.0,8.0
84457,suzukic01,2003,1,SEA,AL,159,679.0,111.0,212.0,29.0,8.0,13.0,62.0,34.0,8.0,36.0,69.0,7.0,6.0,3.0,1.0,3.0
85812,suzukic01,2004,1,SEA,AL,161,704.0,101.0,262.0,24.0,5.0,8.0,60.0,36.0,11.0,49.0,63.0,19.0,4.0,2.0,3.0,6.0
87135,suzukic01,2005,1,SEA,AL,162,679.0,111.0,206.0,21.0,12.0,15.0,68.0,33.0,8.0,48.0,66.0,23.0,4.0,2.0,6.0,5.0
88513,suzukic01,2006,1,SEA,AL,161,695.0,110.0,224.0,20.0,9.0,9.0,49.0,45.0,2.0,49.0,71.0,16.0,5.0,1.0,2.0,2.0
89899,suzukic01,2007,1,SEA,AL,161,678.0,111.0,238.0,22.0,7.0,6.0,68.0,37.0,8.0,49.0,77.0,13.0,3.0,4.0,2.0,7.0
91292,suzukic01,2008,1,SEA,AL,162,686.0,103.0,213.0,20.0,7.0,6.0,42.0,43.0,4.0,51.0,65.0,12.0,5.0,3.0,4.0,8.0
92676,suzukic01,2009,1,SEA,AL,146,639.0,88.0,225.0,31.0,4.0,11.0,46.0,26.0,9.0,32.0,71.0,15.0,4.0,2.0,1.0,1.0
94038,suzukic01,2010,1,SEA,AL,162,680.0,74.0,214.0,30.0,3.0,6.0,43.0,42.0,9.0,45.0,86.0,13.0,3.0,3.0,1.0,3.0


In [126]:
fielding[fielding['playerID'] == 'suzukic01']

Unnamed: 0,playerID,yearID,stint,teamID,lgID,POS,G,GS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR
136917,suzukic01,2001,1,SEA,AL,OF,152,,3942.0,335.0,8.0,1.0,2.0,,,,,
136918,suzukic01,2001,1,SEA,AL,RF,152,,,,,,,,,,,
139260,suzukic01,2002,1,SEA,AL,CF,3,,,,,,,,,,,
139261,suzukic01,2002,1,SEA,AL,OF,152,,3924.0,333.0,8.0,3.0,0.0,,,,,
139262,suzukic01,2002,1,SEA,AL,RF,150,,,,,,,,,,,
141573,suzukic01,2003,1,SEA,AL,OF,159,,4101.0,337.0,12.0,2.0,4.0,,,,,
141574,suzukic01,2003,1,SEA,AL,RF,159,,,,,,,,,,,
143905,suzukic01,2004,1,SEA,AL,OF,158,,4215.0,372.0,12.0,3.0,2.0,,,,,
143906,suzukic01,2004,1,SEA,AL,RF,158,,,,,,,,,,,
146164,suzukic01,2005,1,SEA,AL,OF,158,,4164.0,381.0,10.0,2.0,2.0,,,,,


In [127]:
all_hitters_cumstats[all_hitters_cumstats['playerID'] == 'suzukic01']

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
17270,suzukic01,2001,157,692.0,127.0,242.0,34.0,8.0,8.0,69.0,56.0,14.0,30.0,53.0,10.0,8.0,4.0,4.0,3.0,0,0,1973.0,28.0,1,OF,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,9.3,0.891076,0.969921,0.329205,0.544932,2.151436,0.321909
17271,suzukic01,2002,314,1339.0,238.0,450.0,61.0,16.0,16.0,120.0,87.0,29.0,98.0,115.0,37.0,13.0,7.0,9.0,11.0,0,0,1973.0,29.0,2,OF,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,8.3,0.834946,0.901786,0.329205,0.473854,1.671205,0.525784
17272,suzukic01,2003,473,2018.0,349.0,662.0,90.0,24.0,29.0,182.0,121.0,37.0,134.0,184.0,44.0,19.0,10.0,10.0,14.0,0,0,1973.0,30.0,3,OF,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,7.3,0.816235,0.884418,0.397789,0.479119,1.549546,0.479286
17273,suzukic01,2004,634,2722.0,450.0,924.0,114.0,29.0,37.0,242.0,157.0,48.0,183.0,247.0,63.0,23.0,12.0,13.0,20.0,0,0,1973.0,31.0,4,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,6.3,0.789339,0.925834,0.380643,0.477802,1.507926,0.490911
17274,suzukic01,2005,796,3401.0,561.0,1130.0,135.0,41.0,52.0,310.0,190.0,56.0,231.0,313.0,86.0,27.0,14.0,19.0,25.0,0,0,1973.0,32.0,5,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,5.3,0.787234,0.905794,0.427966,0.489649,1.459903,0.495739
17275,suzukic01,2006,957,4096.0,671.0,1354.0,155.0,50.0,61.0,359.0,235.0,58.0,280.0,384.0,102.0,32.0,15.0,21.0,27.0,0,0,1973.0,33.0,6,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,4.3,0.784662,0.904458,0.418365,0.472537,1.504725,0.500747
17276,suzukic01,2007,1118,4774.0,782.0,1592.0,177.0,57.0,67.0,427.0,272.0,66.0,329.0,461.0,115.0,35.0,19.0,23.0,34.0,0,0,1973.0,34.0,7,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,3.3,0.783826,0.91152,0.39387,0.481751,1.492833,0.504323
17277,suzukic01,2008,1280,5460.0,885.0,1805.0,197.0,64.0,73.0,469.0,315.0,70.0,380.0,526.0,127.0,40.0,22.0,27.0,42.0,0,0,1973.0,35.0,8,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,2.3,0.776184,0.904291,0.375499,0.462995,1.512728,0.509689
17278,suzukic01,2009,1426,6099.0,973.0,2030.0,228.0,68.0,84.0,515.0,341.0,79.0,412.0,597.0,142.0,44.0,24.0,28.0,43.0,0,0,1973.0,36.0,9,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,1.3,0.758545,0.904013,0.384072,0.451916,1.455634,0.491209
17279,suzukic01,2010,1588,6779.0,1047.0,2244.0,258.0,71.0,90.0,558.0,383.0,88.0,457.0,683.0,155.0,47.0,27.0,29.0,46.0,0,0,1973.0,37.0,10,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,0.3,0.734612,0.899381,0.370356,0.440684,1.471429,0.490374


In [128]:
all_hitters_ratios[all_hitters_ratios['playerID'] == 'suzukic01']

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
17270,suzukic01,2001,157,692.0,127.0,242.0,34.0,8.0,8.0,69.0,56.0,14.0,30.0,53.0,10.0,8.0,4.0,4.0,3.0,0,0,1973.0,28.0,1,OF,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,9.3,0.891076,0.969921,0.329205,0.544932,2.151436,0.321909
17271,suzukic01,2002,314,1339.0,238.0,450.0,61.0,16.0,16.0,120.0,87.0,29.0,98.0,115.0,37.0,13.0,7.0,9.0,11.0,0,0,1973.0,29.0,2,OF,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,8.3,0.834946,0.901786,0.329205,0.473854,1.671205,0.525784
17272,suzukic01,2003,473,2018.0,349.0,662.0,90.0,24.0,29.0,182.0,121.0,37.0,134.0,184.0,44.0,19.0,10.0,10.0,14.0,0,0,1973.0,30.0,3,OF,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,7.3,0.816235,0.884418,0.397789,0.479119,1.549546,0.479286
17273,suzukic01,2004,634,2722.0,450.0,924.0,114.0,29.0,37.0,242.0,157.0,48.0,183.0,247.0,63.0,23.0,12.0,13.0,20.0,0,0,1973.0,31.0,4,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,6.3,0.789339,0.925834,0.380643,0.477802,1.507926,0.490911
17274,suzukic01,2005,796,3401.0,561.0,1130.0,135.0,41.0,52.0,310.0,190.0,56.0,231.0,313.0,86.0,27.0,14.0,19.0,25.0,0,0,1973.0,32.0,5,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,5.3,0.787234,0.905794,0.427966,0.489649,1.459903,0.495739
17275,suzukic01,2006,957,4096.0,671.0,1354.0,155.0,50.0,61.0,359.0,235.0,58.0,280.0,384.0,102.0,32.0,15.0,21.0,27.0,0,0,1973.0,33.0,6,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,4.3,0.784662,0.904458,0.418365,0.472537,1.504725,0.500747
17276,suzukic01,2007,1118,4774.0,782.0,1592.0,177.0,57.0,67.0,427.0,272.0,66.0,329.0,461.0,115.0,35.0,19.0,23.0,34.0,0,0,1973.0,34.0,7,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,3.3,0.783826,0.91152,0.39387,0.481751,1.492833,0.504323
17277,suzukic01,2008,1280,5460.0,885.0,1805.0,197.0,64.0,73.0,469.0,315.0,70.0,380.0,526.0,127.0,40.0,22.0,27.0,42.0,0,0,1973.0,35.0,8,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,2.3,0.776184,0.904291,0.375499,0.462995,1.512728,0.509689
17278,suzukic01,2009,1426,6099.0,973.0,2030.0,228.0,68.0,84.0,515.0,341.0,79.0,412.0,597.0,142.0,44.0,24.0,28.0,43.0,0,0,1973.0,36.0,9,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,1.3,0.758545,0.904013,0.384072,0.451916,1.455634,0.491209
17279,suzukic01,2010,1588,6779.0,1047.0,2244.0,258.0,71.0,90.0,558.0,383.0,88.0,457.0,683.0,155.0,47.0,27.0,29.0,46.0,0,0,1973.0,37.0,10,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,0.3,0.734612,0.899381,0.370356,0.440684,1.471429,0.490374


In [9]:
all_hitters_ratios[all_hitters_ratios['playerID'] == 'teixema01']

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,...,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
25038,teixema01,2003,146,529.0,66.0,137.0,29.0,5.0,26.0,84.0,...,147.7,948.6,37.4,14.4,0.748674,0.86492,1.341374,0.890173,0.104265,0.714316
25039,teixema01,2004,291,1074.0,167.0,290.0,63.0,7.0,64.0,196.0,...,147.7,948.6,37.4,13.4,0.947186,0.915427,1.650921,1.038536,0.260664,0.909129
25040,teixema01,2005,453,1718.0,279.0,484.0,104.0,10.0,107.0,340.0,...,147.7,948.6,37.4,12.4,1.05495,1.018544,1.840089,1.201028,0.312796,0.995713
25041,teixema01,2006,615,2346.0,378.0,661.0,149.0,11.0,140.0,450.0,...,147.7,948.6,37.4,11.4,1.071965,1.043271,1.805695,1.192197,0.28673,1.108001
25042,teixema01,2007,747,2840.0,464.0,812.0,182.0,13.0,170.0,555.0,...,147.7,948.6,37.4,10.4,1.052681,1.025278,1.754104,1.176301,0.229384,1.120177
25043,teixema01,2008,904,3414.0,566.0,989.0,223.0,13.0,203.0,676.0,...,147.7,948.6,37.4,9.4,1.070075,1.04064,1.745505,1.193963,0.225908,1.195938
25044,teixema01,2009,1060,4023.0,669.0,1167.0,266.0,16.0,242.0,798.0,...,147.7,948.6,37.4,8.4,1.084119,1.052515,1.783585,1.208092,0.223426,1.212945
25045,teixema01,2010,1218,4624.0,782.0,1321.0,302.0,16.0,275.0,906.0,...,147.7,948.6,37.4,7.4,1.108832,1.042481,1.773451,1.200145,0.195498,1.250053
25046,teixema01,2011,1374,5213.0,872.0,1467.0,328.0,17.0,314.0,1017.0,...,147.7,948.6,37.4,6.4,1.099064,1.029066,1.799963,1.197495,0.220116,1.248249
25047,teixema01,2012,1497,5664.0,938.0,1580.0,355.0,18.0,338.0,1101.0,...,147.7,948.6,37.4,5.4,1.064025,0.997499,1.743786,1.166763,0.218957,1.21109


In [13]:
all_hitters_ratios[all_hitters_ratios['playerID'] == 'teixema01'][['playerID', 'yearID', 'year', 'R_ratio', 'H_ratio', 'HR_ratio', 'RBI_ratio', 'SB_ratio', 'BB_ratio' ]]

Unnamed: 0,playerID,yearID,year,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
25038,teixema01,2003,1,0.748674,0.86492,1.341374,0.890173,0.104265,0.714316
25039,teixema01,2004,2,0.947186,0.915427,1.650921,1.038536,0.260664,0.909129
25040,teixema01,2005,3,1.05495,1.018544,1.840089,1.201028,0.312796,0.995713
25041,teixema01,2006,4,1.071965,1.043271,1.805695,1.192197,0.28673,1.108001
25042,teixema01,2007,5,1.052681,1.025278,1.754104,1.176301,0.229384,1.120177
25043,teixema01,2008,6,1.070075,1.04064,1.745505,1.193963,0.225908,1.195938
25044,teixema01,2009,7,1.084119,1.052515,1.783585,1.208092,0.223426,1.212945
25045,teixema01,2010,8,1.108832,1.042481,1.773451,1.200145,0.195498,1.250053
25046,teixema01,2011,9,1.099064,1.029066,1.799963,1.197495,0.220116,1.248249
25047,teixema01,2012,10,1.064025,0.997499,1.743786,1.166763,0.218957,1.21109


In [12]:
print [str(stat) for stat in all_hitters_ratios.columns]

['playerID', 'yearID', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP', 'RBI_filled', 'SB_filled', 'birthYear', 'age', 'year', 'POS', 'DBE1', 'DBE2', 'SE', 'R_mean', 'H_mean', 'HR_mean', 'RBI_mean', 'SB_mean', 'BB_mean', 'retire_age_mean', 'yrs_remain', 'R_ratio', 'H_ratio', 'HR_ratio', 'RBI_ratio', 'SB_ratio', 'BB_ratio']


In [46]:
temp = master[['playerID', 'nameFirst', 'nameLast']]
temp['name'] = temp[['nameFirst', 'nameLast']].apply(lambda x: ' '.join(x), axis=1)

TypeError: ('sequence item 0: expected string, float found', u'occurred at index 1502')

In [51]:
temp['temp'] = temp['nameFirst'].map(str) + ' ' + temp['nameLast'].map(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [45]:
temp.head(5)

Unnamed: 0,playerID,nameFirst,nameLast,name
0,aardsda01,David,Aardsma,n a m e F i r s t D a v i d \n n a...
1,aaronha01,Hank,Aaron,n a m e F i r s t H a n k \n n a m e...
2,aaronto01,Tommie,Aaron,n a m e F i r s t T o m m i e \n n a m...
3,aasedo01,Don,Aase,n a m e F i r s t D o n \n n a m e L...
4,abadan01,Andy,Abad,n a m e F i r s t A n d y \n n a m e L...


In [36]:
temp['name'] = temp[['nameFirst', 'nameLast']].apply(lambda x: ' '.join(x), axis=1)

In [37]:
temp.head(5)

Unnamed: 0,playerID,nameFirst,nameLast,name
0,aardsda01,David,Aardsma,David Aardsma
1,aaronha01,Hank,Aaron,Hank Aaron
2,aaronto01,Tommie,Aaron,Tommie Aaron
3,aasedo01,Don,Aase,Don Aase
4,abadan01,Andy,Abad,Andy Abad


In [39]:
temp.drop(['nameFirst', 'nameLast'], axis=1)

Unnamed: 0,playerID,name
0,aardsda01,David Aardsma
1,aaronha01,Hank Aaron
2,aaronto01,Tommie Aaron
3,aasedo01,Don Aase
4,abadan01,Andy Abad


In [87]:
all_hitters_ratios[all_hitters_ratios['name'] == 'Starlin Castro']

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
42466,castrst01,2010,125,463.0,53.0,139.0,31.0,5.0,3.0,41.0,10.0,8.0,29.0,71.0,7.0,6.0,4.0,4.0,14.0,0,0,1990.0,20.0,1,SS,Starlin Castro,0,0,0,1227.9,2323.0,98.1,1027.9,283.0,833.7,37.4,17.4,0.794201,1.10099,0.562691,0.733924,0.650177,0.640038
42467,castrst01,2011,283,1137.0,144.0,346.0,67.0,14.0,13.0,107.0,32.0,17.0,64.0,167.0,9.0,8.0,4.0,8.0,34.0,0,0,1990.0,21.0,2,SS,Starlin Castro,0,0,0,1227.9,2323.0,98.1,1027.9,283.0,833.7,37.4,16.4,1.078915,1.370297,1.219164,0.957681,1.040283,0.706249
42468,castrst01,2012,445,1783.0,222.0,529.0,96.0,26.0,27.0,185.0,57.0,30.0,100.0,267.0,14.0,12.0,4.0,13.0,49.0,0,0,1990.0,22.0,3,SS,Starlin Castro,0,0,0,1227.9,2323.0,98.1,1027.9,283.0,833.7,37.4,15.4,1.108885,1.3967,1.688073,1.103869,1.235336,0.735676
42469,castrst01,2013,606,2449.0,281.0,692.0,130.0,28.0,37.0,229.0,66.0,36.0,130.0,396.0,14.0,19.0,5.0,14.0,70.0,0,0,1990.0,23.0,4,SS,Starlin Castro,0,0,0,1227.9,2323.0,98.1,1027.9,283.0,833.7,37.4,14.4,1.052692,1.370297,1.734964,1.024808,1.072792,0.717284
42470,castrst01,2014,740,2977.0,339.0,846.0,163.0,29.0,51.0,294.0,70.0,40.0,165.0,496.0,18.0,23.0,5.0,16.0,88.0,0,0,1990.0,24.0,5,SS,Starlin Castro,0,0,0,1227.9,2323.0,98.1,1027.9,283.0,833.7,37.4,13.4,1.015978,1.340198,1.91315,1.052554,0.910247,0.72832
42471,castrst01,2015,891,3524.0,391.0,991.0,186.0,31.0,62.0,363.0,75.0,45.0,186.0,587.0,24.0,28.0,6.0,20.0,106.0,0,0,1990.0,25.0,6,SS,Starlin Castro,0,0,0,1227.9,2323.0,98.1,1027.9,283.0,833.7,37.4,12.4,0.976518,1.308251,1.938158,1.082985,0.812721,0.684179


In [88]:
hof_hitters_stats

Unnamed: 0,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,POS
0,1468.0,2569.9,250.3,1304.2,268.1,959.9,OF
1,1210.2,2327.9,210.7,1184.5,109.1,838.3,3B
2,1342.2,2431.8,154.3,1087.6,285.2,906.8,2B
3,1357.6,2439.3,298.5,1453.2,147.7,948.6,1B
4,1227.9,2323.0,98.1,1027.9,283.0,833.7,SS
5,911.3,1798.1,214.2,1035.9,86.9,709.1,C
6,1782.0,3319.0,234.0,1307.0,504.0,1094.0,DH


In [91]:
features6

['R_ratio',
 'H_ratio',
 'HR_ratio',
 'RBI_ratio',
 'SB_ratio',
 'BB_ratio',
 'DBE1',
 'DBE2',
 'SE',
 'RBI_filled',
 'SB_filled']

In [174]:
age24_OF_hofers = elig_hitters_ratios[(elig_hitters_ratios['inducted'] == 1) & (elig_hitters_ratios['age'] == 24) \
                          & (elig_hitters_ratios['POS'] == 'OF')][features6]

In [175]:
age24_OF_hofers

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,DBE1,DBE2,SE,RBI_filled,SB_filled
4,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966,0,0,0,0,0
134,0.995811,1.250428,0.155513,0.600234,1.355091,1.040812,0,0,0,0,0
375,0.566281,0.54124,0.390731,0.299954,0.810643,0.379241,0,1,0,0,0
463,0.795754,0.710378,0.217073,0.579078,1.03357,0.696218,0,0,0,0,0
612,0.917493,0.863053,0.233959,0.536007,2.334427,0.84265,1,0,0,0,0
715,0.91049,0.934485,0.434146,0.8707,2.006341,0.577352,0,0,0,0,0
755,0.662892,0.932351,0.380144,0.620131,0.258113,0.382983,0,0,0,0,0
775,1.03205,1.200352,0.359835,1.169069,3.047495,0.666858,1,0,0,0,0
885,0.904961,1.028873,0.424091,1.043283,1.043827,0.613241,1,0,0,0,0
929,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0


In [111]:
np.array(test)

array([[ 1.23662125,  1.30170046,  2.04714343, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.01584469,  1.23326199,  0.13823412, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.84826975,  0.84951165,  1.01126648, ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 1.00814221,  1.04590401,  1.7210084 , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.63399539,  0.87390023,  0.03436041, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.91151515,  0.85037662,  0.73846154, ...,  0.        ,
         0.        ,  0.        ]])

In [153]:
trout2015 = all_hitters_ratios[(all_hitters_ratios['name'] == 'Mike Trout') & (all_hitters_ratios['age'] == 24)][features6]

In [154]:
trout2015

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,DBE1,DBE2,SE,RBI_filled,SB_filled
17916,1.189251,1.05959,2.032521,1.114108,1.542633,1.376456,0,0,0,0,0


In [146]:
trout2015

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,DBE1,DBE2,SE,RBI_filled,SB_filled
17916,1.189251,1.05959,2.032521,1.114108,1.542633,1.376456,0,0,0,0,0


In [110]:
np.array(trout2015)

array([[ 1.18925068,  1.05958987,  2.03252097,  1.11410827,  1.54263335,
         1.37645588,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ]])

In [179]:
age24_OF_hofers_cossim = cosine_similarity(np.array(trout2015), np.array(age24_OF_hofers))

In [11]:
age24_OF_hofers_cossim[0]

array([ 0.90138869,  0.83050785,  0.72897986,  0.87236396,  0.74498474,
        0.84579241,  0.83319549,  0.7577144 ,  0.80365463,  0.        ,
        0.97262552,  0.78345416,  0.89720601,  0.64782082,  0.81396872,
        0.8357541 ,  0.87520189,  0.94388091,  0.84746329,  0.92639857,
        0.66059328,  0.67392072,  0.7067076 ,  0.72192161,  0.98910124,
        0.94760186,  0.79117835,  0.86531271,  0.50383282,  0.91883212,
        0.85983383,  0.94476493,  0.90312804,  0.97010734,  0.74239018,
        0.8674797 ,  0.89465069,  0.76917185,  0.93583072,  0.73073929,
        0.93122866,  0.9653834 ,  0.67573031,  0.72245467,  0.91293708,
        0.89325977,  0.97829347,  0.8069534 ,  0.62773892,  0.77175024,
        0.85203517,  0.74875931,  0.77989169,  0.90249549,  0.91632469,
        0.98653747,  0.8508316 ,  0.85483831])

In [9]:
age24_OF_hofers.index

58

In [117]:
test.index

Int64Index([    4,   135,   150,   377,   465,   612,   717,   756,   774,
              841,
            ...
            10228, 10366, 10412, 10444, 10461, 10576, 10719, 10975, 11105,
            11574],
           dtype='int64', length=146)

In [12]:
age24_OF_hofers_cossim_df = pd.DataFrame(age24_OF_hofers_cossim[0], index=age24_OF_hofers.index, columns=['cossim'])

In [182]:
cossim_df.head(15)

Unnamed: 0,cossim
4,0.901389
134,0.830508
375,0.72898
463,0.872364
612,0.744985
715,0.845792
755,0.833195
775,0.757714
885,0.803655
929,0.0


In [122]:
cossim.shape

(1, 146)

In [129]:
len(cossim_df)

146

In [184]:
elig_hitters_ratios_cossim = elig_hitters_ratios.merge(age24_OF_hofers_cossim_df, left_index=True, right_index=True)

In [186]:
elig_hitters_ratios_cossim[['name', 'cossim']].sort_values(by='cossim', ascending=False).head(10)

Unnamed: 0,name,cossim
1893,Reggie Jackson,0.989101
4408,Dave Winfield,0.986537
3655,Duke Snider,0.978293
1018,Andre Dawson,0.972626
2462,Willie Mays,0.970107
3310,Frank Robinson,0.965383
2020,Al Kaline,0.947602
2377,Mickey Mantle,0.944765
1543,Ken Griffey,0.943881
3015,Mel Ott,0.935831


### Cosine similarity of current players (Trout and Arenado)

In [13]:
age24_OF_hofers = elig_hitters_ratios[(elig_hitters_ratios['inducted'] == 1) & (elig_hitters_ratios['age'] == 24) \
                          & (elig_hitters_ratios['POS'] == 'OF')][features6]

trout2015 = all_hitters_ratios[(all_hitters_ratios['name'] == 'Mike Trout') & (all_hitters_ratios['age'] == 24)][features6]
age24_OF_hofers_cossim = cosine_similarity(np.array(trout2015), np.array(age24_OF_hofers))

age24_OF_hofers_cossim_df = pd.DataFrame(age24_OF_hofers_cossim[0], index=age24_OF_hofers.index, columns=['cossim'])

elig_hitters_ratios_cossim = elig_hitters_ratios.merge(age24_OF_hofers_cossim_df, left_index=True, right_index=True)
elig_hitters_ratios_cossim[['name', 'cossim']].sort_values(by='cossim', ascending=False).head(10)


Unnamed: 0,name,cossim
1893,Reggie Jackson,0.989101
4408,Dave Winfield,0.986537
3655,Duke Snider,0.978293
1018,Andre Dawson,0.972626
2462,Willie Mays,0.970107
3310,Frank Robinson,0.965383
2020,Al Kaline,0.947602
2377,Mickey Mantle,0.944765
1543,Ken Griffey,0.943881
3015,Mel Ott,0.935831


In [14]:
age24_3B_hofers = elig_hitters_ratios[(elig_hitters_ratios['inducted'] == 1) & (elig_hitters_ratios['age'] == 24) \
                          & (elig_hitters_ratios['POS'] == '3B')][features6]

arenado2015 = all_hitters_ratios[(all_hitters_ratios['name'] == 'Nolan Arenado') & (all_hitters_ratios['age'] == 24)][features6]
age24_3B_hofers_cossim = cosine_similarity(np.array(arenado2015), np.array(age24_3B_hofers))

age24_3B_hofers_cossim_df = pd.DataFrame(age24_3B_hofers_cossim[0], index=age24_3B_hofers.index, columns=['cossim'])

elig_hitters_ratios_cossim = elig_hitters_ratios.merge(age24_3B_hofers_cossim_df, left_index=True, right_index=True)
elig_hitters_ratios_cossim[['name', 'cossim']].sort_values(by='cossim', ascending=False).head(10)


Unnamed: 0,name,cossim
5408,Eddie Mathews,0.953571
5614,Ron Santo,0.918046
5637,Mike Schmidt,0.853749
4650,Wade Boggs,0.850889
5355,Freddie Lindstrom,0.792827
5258,George Kell,0.77395
4745,George Brett,0.740936
5729,Deacon White,0.715415
5583,Brooks Robinson,0.695572
5670,Pie Traynor,0.595989


In [21]:
elig_hitters_ratios[elig_hitters_ratios['inducted'] == 1].groupby('name').max()['year'].mean()

18.64

In [146]:
# df1 = elig_hitters_ratios
# df2 = all_hitters_ratios

def find_similar_players(name, age, position, df1, df2, features, num=10):
    '''
    Description: Using cosine similarity, find HOF players that are most similar to player in question at a
    particular age.
    
    INPUT:
    name, age, and position of player of interest
    df1: the dataframe containing the stats of all HOF eligible players
    df2: the dataframe containing the stats of all hitters
    features: the stats by which to compare players
    num: the number of most similar HOFers to return
    
    OUTPUT: dataframe that lists the most similar HOF players using cosine similarity
    '''
    # Get the stats of all HOF players at the same position and same age as player of interest
    hofers = df1[(df1['inducted'] == 1) & (df1['age'] == age) & (df1['POS'] == position)][features]
    
    # Get the stats of player of interest
    player_stats = df2[(df2['name'] == name) & (df2['age'] == age)][features]
    
    # Calculate cosine similarities between player of interest and HOF players
    cossim = cosine_similarity(np.array(player_stats), np.array(hofers))
    cossim_df = pd.DataFrame(cossim[0], index=hofers.index, columns=['cossim'])

    # 
    cossim_merged = df1.merge(cossim_df, left_index=True, right_index=True)
    return cossim_merged[['name', 'yearID', 'cossim']].sort_values(by='cossim', ascending=False).head(num)


In [159]:
find_similar_players('Mike Trout', 24, 'OF', elig_hitters_ratios, all_hitters_ratios, features9, 10)

Unnamed: 0,name,yearID,cossim
2462,Willie Mays,1955,0.96743
1893,Reggie Jackson,1970,0.95085
4384,Dave Winfield,1975,0.948385
3631,Duke Snider,1950,0.94046
1018,Andre Dawson,1978,0.935011
2377,Mickey Mantle,1955,0.908228
3015,Mel Ott,1933,0.899639
3234,Jim Rice,1977,0.895215
2815,Stan Musial,1944,0.893079
1630,Chick Hafey,1927,0.890572


In [148]:
find_similar_players('Bryce Harper', 23, 'OF', elig_hitters_ratios, all_hitters_ratios, features9, 10)

Unnamed: 0,name,yearID,cossim
2461,Willie Mays,1954,0.976989
3,Hank Aaron,1957,0.953523
1892,Reggie Jackson,1969,0.936237
3014,Mel Ott,1932,0.929045
2376,Mickey Mantle,1954,0.928081
2814,Stan Musial,1943,0.924105
4383,Dave Winfield,1974,0.921204
3630,Duke Snider,1949,0.917028
4334,Ted Williams,1941,0.914489
3233,Jim Rice,1976,0.894595


In [151]:
df1[(df1['inducted'] == 1) & (df1['age'] == 29) & (df1['POS'] == 'C')][features9]

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,DBE1,DBE2,SE,RBI_filled,SB_filled,MVP,Triple Crown,Gold Glove
10000,1.528454,1.386575,2.199066,1.650121,1.122209,1.549119,0,0,0,0,0,2.0,0.0,9.0
10050,1.370496,1.263371,1.633676,1.474402,0.355965,0.905185,0,0,0,0,0,2.0,0.0,0.0
10112,0.949523,0.831989,0.15873,0.590791,2.73878,1.090819,1,0,0,0,0,0.0,0.0,0.0
10140,0.696368,0.676269,1.099907,0.792354,0.306099,0.846707,0,0,0,0,0,0.0,0.0,0.0
10180,1.227609,1.21773,1.614939,1.222048,0.67756,1.258497,0,0,0,0,0,0.0,0.0,3.0
10207,1.617415,1.344169,0.890056,1.226952,0.990794,1.46284,0,0,0,0,0,1.0,0.0,0.0
10346,1.090457,1.199933,0.884532,1.214982,0.444956,0.74705,0,0,0,0,0,0.0,0.0,0.0
10392,1.260178,0.858017,0.333956,0.628954,2.158036,0.406243,0,0,0,0,0,0.0,0.0,0.0
10421,0.75321,0.862243,0.123249,0.713582,0.359033,1.137216,0,0,0,0,0,0.0,0.0,0.0
10440,0.686108,0.61679,0.842204,0.542234,0.778481,0.569525,0,0,0,0,0,0.0,0.0,1.0


In [154]:
find_similar_players('Buster Posey', 28, 'C', elig_hitters_ratios, all_hitters_ratios, features9, 10)

Unnamed: 0,name,yearID,cossim
10049,Yogi Berra,1953,0.984413
10206,Mickey Cochrane,1931,0.929969
10139,Roy Campanella,1949,0.896952
10696,Ernie Lombardi,1936,0.896913
10554,Gabby Hartnett,1928,0.89554
10345,Bill Dickey,1935,0.892332
10951,Mike Piazza,1996,0.838908
10420,Rick Ferrell,1933,0.807616
10391,Buck Ewing,1887,0.717545
10439,Carlton Fisk,1975,0.704153


In [155]:
trout23 = df2[(df2['name'] == 'Mike Trout') & (df2['age'] == 23)][features9]

In [156]:
harper23 = df2[(df2['name'] == 'Bryce Harper') & (df2['age'] == 23)][features9]

In [157]:
cosine_similarity(np.array(trout23), np.array(harper23))

array([[ 0.95155603]])

### Determine the probability of the average position player to be inducted into HOF

In [24]:
hof_hitters = elig_hitters_cumstats[elig_hitters_cumstats['inducted'] == 1]['playerID'].unique()

In [25]:
nonhof_hitters_cumstats = all_hitters_cumstats[-all_hitters_cumstats['playerID'].isin(hof_hitters)]

In [65]:
nonhof_hitters_cumstats.groupby('playerID')[stats_of_interest].max().mean().round(1)

R      166.9
H      331.2
HR      25.2
RBI    149.9
SB      26.6
BB     116.8
dtype: float64

In [96]:
def get_nonhofer_stats(df, stats_of_interest, calculate='mean'):
    '''
    Description: Determine mean, median, or min of each stat for HOF players at each position
    '''
#     stats = ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', \
#              'IBB', 'HBP', 'SH', 'SF', 'GIDP']

    positions = df['POS'].unique()
    position_stats_lst = []

    for position in positions:
        pos = pd.Series([position], index=['POS'])
        if calculate == 'mean':
            stats_labels = [stat + '_mean' for stat in stats_of_interest]
            stats_labels.append('POS')
            position_stats = df[df['POS'] == position]\
            .groupby('playerID')[stats_of_interest].max().mean().round(1).append(pos)
            print position_stats
        elif calculate == 'median':
            stats_labels = [stat + '_mean' for stat in stats_of_interest]
            stats_labels.append('POS')
            position_stats = df[df['POS'] == position]\
            .groupby('playerID')[stats_of_interest].max().median().round(1).append(pos)
            print position_stats
        elif calculate == 'min':
            stats_labels = [stat + '_mean' for stat in stats_of_interest]
            stats_labels.append('POS')
            position_stats = df[df['POS'] == position]\
            .groupby('playerID')[stats_of_interest].max().min().round(1).append(pos)
        
        position_stats_lst.append(pos)
    
    nonhofer_stats_df = pd.DataFrame(position_stats_lst)

#    nonhofer_stats_df.columns = stats_labels
    return nonhofer_stats_df, stats_labels, position_stats_lst

                                  

In [85]:
nonhof_hitters_stats, stats_labels, position_stats_lst = get_nonhofer_stats(nonhof_hitters_cumstats, stats_of_interest)

R      188.7
H      355.1
HR      29.8
RBI      162
SB      36.2
BB     126.1
POS       OF
dtype: object
R      193.9
H        389
HR      40.5
RBI    205.3
SB      19.2
BB       147
POS       1B
dtype: object
R      173.3
H      341.7
HR      15.6
RBI    131.2
SB      32.8
BB     115.9
POS       2B
dtype: object
R       91.3
H      213.4
HR      15.5
RBI    100.7
SB       7.5
BB      75.9
POS        C
dtype: object
R      177.7
H      360.6
HR      16.7
RBI      140
SB      30.5
BB     115.3
POS       SS
dtype: object
R      176.5
H      352.7
HR      28.6
RBI    166.9
SB      23.7
BB     125.9
POS       3B
dtype: object
R      173.6
H        338
HR      48.1
RBI    192.3
SB      13.9
BB     153.1
POS       DH
dtype: object


In [97]:
nonhof_hitters_stats, stats_labels, position_stats_lst = get_nonhofer_stats(nonhof_hitters_cumstats, stats_of_interest, 'median')

R      46
H      88
HR      3
RBI    35
SB      5
BB     28
POS    OF
dtype: object
R      38
H      78
HR      4
RBI    36
SB      2
BB     26
POS    1B
dtype: object
R       50
H      102
HR       2
RBI     37
SB       5
BB      32
POS     2B
dtype: object
R      20
H      49
HR      1
RBI    20
SB      1
BB     16
POS     C
dtype: object
R        47
H        95
HR        2
RBI    30.5
SB        5
BB     30.5
POS      SS
dtype: object
R        40
H      89.5
HR        3
RBI      38
SB        4
BB       30
POS      3B
dtype: object
R      18
H      31
HR      5
RBI    16
SB      1
BB     14
POS    DH
dtype: object


In [95]:
nonhof_hitters_stats

Unnamed: 0,POS
0,OF
1,1B
2,2B
3,C
4,SS
5,3B
6,DH


In [83]:
position_stats_lst

[POS    OF
 dtype: object, POS    1B
 dtype: object, POS    2B
 dtype: object, POS    C
 dtype: object, POS    SS
 dtype: object, POS    3B
 dtype: object, POS    DH
 dtype: object]

In [86]:
hof_hitters_stats

Unnamed: 0,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,POS
0,1468.0,2569.9,250.3,1304.2,268.1,959.9,OF
1,1210.2,2327.9,210.7,1184.5,109.1,838.3,3B
2,1342.2,2431.8,154.3,1087.6,285.2,906.8,2B
3,1357.6,2439.3,298.5,1453.2,147.7,948.6,1B
4,1227.9,2323.0,98.1,1027.9,283.0,833.7,SS
5,911.3,1798.1,214.2,1035.9,86.9,709.1,C
6,1782.0,3319.0,234.0,1307.0,504.0,1094.0,DH


In [89]:
nonhof_hitters_cumstats[nonhof_hitters_cumstats['POS'] == 'OF']\
            .groupby('playerID')[stats_of_interest].max()

Unnamed: 0_level_0,R,H,HR,RBI,SB,BB
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
abbeych01,307.0,492.0,19.0,280.0,93.0,167.0
abbotje01,82.0,157.0,18.0,83.0,6.0,38.0
abbotod01,2.0,13.0,0.0,6.0,3.0,6.0
abercre01,65.0,86.0,9.0,34.0,18.0,21.0
aberscl01,25.0,45.0,5.0,26.0,0.0,25.0
abnersh01,89.0,191.0,11.0,71.0,6.0,43.0
abramca01,257.0,433.0,32.0,138.0,12.0,304.0
abreubo01,1453.0,2470.0,288.0,1363.0,400.0,1476.0
acostme01,56.0,111.0,0.0,37.0,17.0,63.0
adamsbu01,282.0,532.0,50.0,249.0,12.0,234.0


In [90]:
nonhof_hitters_cumstats[nonhof_hitters_cumstats['playerID'] == 'zoccope01']

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,...,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain
19912,zoccope01,2003,20,37.0,0.0,4.0,1.0,0.0,0.0,3.0,...,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,11.3


In [92]:
all_hitters_cumstats[all_hitters_cumstats['playerID'] == 'zoccope01']

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,...,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain
19912,zoccope01,2003,20,37.0,0.0,4.0,1.0,0.0,0.0,3.0,...,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,11.3


In [93]:
all_hitters[all_hitters['playerID'] == 'zoccope01']

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,POS
90529,zoccope01,2003,1,MIL,NL,20,37.0,0.0,4.0,1.0,...,0.0,0.0,2.0,13.0,0.0,0.0,0.0,0.0,1.0,OF


In [29]:
nonhof_hitters_cumstats[nonhof_hitters_cumstats['POS'] == 'OF']\
            .groupby('playerID').max()[features9]

KeyError: "['R_ratio' 'H_ratio' 'HR_ratio' 'RBI_ratio' 'SB_ratio' 'BB_ratio'] not in index"

In [30]:
nonhof_hitters_ratios = all_hitters_ratios[-all_hitters_ratios['playerID'].isin(hof_hitters)]

In [100]:
stats_ratios = [stat + '_ratio' for stat in stats_of_interest]

In [109]:
nonhof_hitters_ratios[nonhof_hitters_ratios['POS'] == 'OF'].mean()

yearID             1961.369994
G                   460.241238
AB                 1548.443706
R                   229.493326
H                   427.914269
2B                   73.213211
3B                   17.033239
HR                   37.864383
RBI                 196.628942
SB                   45.943998
CS                   19.491528
BB                  149.795405
SO                  227.255730
IBB                  14.604837
HBP                  11.836051
SH                   16.802392
SF                   13.447121
GIDP                 32.606563
RBI_filled            0.004999
SB_filled             0.017444
birthYear          1933.695466
age                  27.996849
year                  5.503111
DBE1                  0.104345
DBE2                  0.057491
SE                    0.184917
R_mean             1468.000000
H_mean             2569.900000
HR_mean             250.300000
RBI_mean           1304.200000
SB_mean             268.100000
BB_mean             959.900000
retire_a

In [106]:
nonhof_hitters_ratios[nonhof_hitters_ratios['POS'] == '3B'].mean()

yearID             1959.275416
G                   463.668969
AB                 1577.914276
R                   214.783564
H                   420.390188
2B                   73.678711
3B                   13.564825
HR                   35.752568
RBI                 199.979632
SB                   29.806766
CS                   13.197094
BB                  150.500000
SO                  219.232317
IBB                  14.188691
HBP                  12.187558
SH                   21.072971
SF                   15.264988
GIDP                 38.750324
RBI_filled            0.006376
SB_filled             0.019837
birthYear          1931.454142
age                  27.943410
year                  5.542331
DBE1                  0.116366
DBE2                  0.045696
SE                    0.186149
R_mean             1210.200000
H_mean             2327.900000
HR_mean             210.700000
RBI_mean           1184.500000
SB_mean             109.100000
BB_mean             838.300000
retire_a

In [110]:
nonhof_hitters_ratios[(nonhof_hitters_ratios['POS'] == '3B') & (nonhof_hitters_ratios['year'] >= 10)].mean()

yearID             1965.392892
G                  1228.585390
AB                 4244.516288
R                   592.107601
H                  1145.208292
2B                  201.253702
3B                   34.844028
HR                  102.684107
RBI                 556.080948
SB                   80.788746
CS                   33.778894
BB                  428.334650
SO                  552.332981
IBB                  40.422427
HBP                  31.606640
SH                   51.965227
SF                   40.227829
GIDP                100.001348
RBI_filled            0.004936
SB_filled             0.011846
birthYear          1931.565647
age                  33.827246
year                 12.500494
DBE1                  0.078973
DBE2                  0.040474
SE                    0.205331
R_mean             1210.200000
H_mean             2327.900000
HR_mean             210.700000
RBI_mean           1184.500000
SB_mean             109.100000
BB_mean             838.300000
retire_a

In [None]:
nonhof_hitters_ratio

In [59]:
position = '3B'
pos = pd.Series([position], index=['POS'])

In [85]:
pos = pd.DataFrame([position], index=['POS']).transpose()

In [86]:
pos

Unnamed: 0,POS
0,3B


In [66]:
features_pos = features9 + ['POS']

In [67]:
features_pos

['R_ratio',
 'H_ratio',
 'HR_ratio',
 'RBI_ratio',
 'SB_ratio',
 'BB_ratio',
 'DBE1',
 'DBE2',
 'SE',
 'RBI_filled',
 'SB_filled',
 'MVP',
 'Triple Crown',
 'Gold Glove',
 'POS']

In [68]:
pd.DataFrame(nonhof_hitters_ratios[(nonhof_hitters_ratios['POS'] == '3B') & (nonhof_hitters_ratios['year'] >= 10)].groupby('playerID').max().mean()[features_pos]).transpose()

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,DBE1,DBE2,SE,RBI_filled,SB_filled,MVP,Triple Crown,Gold Glove,POS
0,0.527924,0.536016,0.502164,0.504462,0.833922,0.53347,0.111524,0.063197,0.226766,0.018587,0.011152,0.022305,0.0,0.271375,


In [39]:
X9.head()

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,DBE1,DBE2,SE,RBI_filled,SB_filled,MVP,Triple Crown,Gold Glove
0,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806,0,0,0,0,0,0.0,0.0,0.0
1,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983,0,0,0,0,0,0.0,0.0,0.0
2,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445,0,0,0,0,0,0.0,0.0,0.0
3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007,0,0,0,0,0,1.0,0.0,0.0
4,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966,0,0,0,0,0,1.0,0.0,1.0


In [None]:
nonhof_hitters_ratios

In [41]:
idx = nonhof_hitters_ratios.groupby('name')['year'].transform(max) == nonhof_hitters_ratios['year']

KeyError: 'prob'

In [42]:
nonhof_hitters_ratios_max = nonhof_hitters_ratios[idx].groupby('name').max()


In [44]:
nonhof_hitters_ratios_max.head(10)

Unnamed: 0_level_0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,Gold Glove,MVP,Triple Crown,RBI_filled,SB_filled,birthYear,age,year,POS,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
A. J. Ellis,ellisaj01,2015,491,1483.0,147.0,358.0,61.0,3.0,35.0,178.0,0.0,3.0,223.0,317.0,19.0,19.0,21.0,15.0,54.0,0.0,0.0,0.0,0,0,1981.0,34.0,8,C,0,0,0,911.3,1798.1,214.2,1035.9,86.9,709.1,37.4,3.4,0.229864,0.283716,0.232843,0.24486,0.0,0.448138
A. J. Hinch,hinchaj01,2004,350,953.0,104.0,209.0,28.0,3.0,32.0,112.0,13.0,6.0,71.0,214.0,1.0,14.0,26.0,11.0,19.0,0.0,0.0,0.0,0,0,1974.0,30.0,7,C,0,0,0,911.3,1798.1,214.2,1035.9,86.9,709.1,37.4,7.4,0.234767,0.23911,0.307323,0.222415,0.307743,0.205975
A. J. Pierzynski,pierzaj01,2015,1978,7043.0,792.0,1989.0,392.0,24.0,186.0,886.0,14.0,23.0,302.0,866.0,67.0,126.0,28.0,55.0,228.0,0.0,0.0,0.0,0,0,1976.0,39.0,18,C,0,0,0,911.3,1798.1,214.2,1035.9,86.9,709.1,37.4,0.0,0.869088,1.106168,0.868347,0.855295,0.161105,0.425892
A. J. Pollock,polloaj01,2015,400,1398.0,224.0,411.0,90.0,18.0,37.0,146.0,66.0,15.0,114.0,228.0,2.0,6.0,5.0,12.0,30.0,1.0,0.0,0.0,0,0,1987.0,28.0,4,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,9.3,0.507357,0.531762,0.49151,0.372221,0.818538,0.394885
Aaron Altherr,altheaa01,2015,41,142.0,25.0,33.0,11.0,4.0,5.0,22.0,6.0,2.0,16.0,43.0,0.0,5.0,1.0,2.0,3.0,0.0,0.0,0.0,0,0,1991.0,24.0,2,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,0.130279,0.098233,0.152817,0.129045,0.171205,0.127513
Aaron Bates,batesaa01,2009,5,11.0,2.0,4.0,2.0,0.0,0.0,2.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1984.0,25.0,1,1B,0,0,0,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,12.4,0.019741,0.021974,0.0,0.018442,0.0,0.014126
Aaron Boone,booneaa01,2009,1152,3871.0,519.0,1017.0,216.0,17.0,126.0,555.0,107.0,30.0,303.0,707.0,20.0,80.0,39.0,36.0,74.0,0.0,0.0,0.0,0,0,1973.0,36.0,13,3B,0,0,0,1210.2,2327.9,210.7,1184.5,109.1,838.3,36.6,0.6,0.448648,0.457038,0.625607,0.490178,1.026017,0.378128
Aaron Clapp,clappaa01,1879,36,146.0,24.0,39.0,9.0,3.0,0.0,18.0,0.0,,6.0,10.0,,,,,,0.0,0.0,0.0,0,1,1856.0,23.0,1,1B,0,0,0,1357.6,2439.3,298.5,1453.2,147.7,948.6,37.4,14.4,0.272245,0.246218,0.0,0.190751,0.0,0.097407
Aaron Cunningham,cunniaa01,2012,222,452.0,47.0,99.0,31.0,3.0,7.0,51.0,4.0,6.0,34.0,110.0,3.0,6.0,5.0,4.0,9.0,0.0,0.0,0.0,0,0,1986.0,26.0,5,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,11.3,0.104373,0.125585,0.091171,0.12748,0.048639,0.11547
Aaron Guiel,guielaa01,2006,307,970.0,151.0,239.0,58.0,0.0,35.0,128.0,8.0,12.0,83.0,218.0,2.0,30.0,5.0,11.0,15.0,0.0,0.0,0.0,0,0,1972.0,34.0,5,OF,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,3.3,0.170749,0.15438,0.232121,0.16292,0.049534,0.143536


In [71]:
pos

POS    3B
dtype: object

In [74]:
hof_hitters_stats

Unnamed: 0,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,POS
0,1468.0,2569.9,250.3,1304.2,268.1,959.9,OF
1,1210.2,2327.9,210.7,1184.5,109.1,838.3,3B
2,1342.2,2431.8,154.3,1087.6,285.2,906.8,2B
3,1357.6,2439.3,298.5,1453.2,147.7,948.6,1B
4,1227.9,2323.0,98.1,1027.9,283.0,833.7,SS
5,911.3,1798.1,214.2,1035.9,86.9,709.1,C
6,1782.0,3319.0,234.0,1307.0,504.0,1094.0,DH


In [77]:
temp = pd.DataFrame(nonhof_hitters_ratios_max[nonhof_hitters_ratios_max['POS'] == '3B'][features9].mean()).transpose()

In [93]:
temp

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,DBE1,DBE2,SE,RBI_filled,SB_filled,MVP,Triple Crown,Gold Glove
0,0.229113,0.241084,0.201284,0.21903,0.346584,0.234581,0.165725,0.031073,0.126177,0.017891,0.011299,0.006591,0.0,0.077213


In [99]:
temp.merge(pos, left_index=True, right_index=True)

Unnamed: 0_level_0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,DBE1,DBE2,SE,RBI_filled,SB_filled,MVP,Triple Crown,Gold Glove
POS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3B,0.229113,0.241084,0.201284,0.21903,0.346584,0.234581,0.165725,0.031073,0.126177,0.017891,0.011299,0.006591,0.0,0.077213


In [82]:
type(pos)

pandas.core.series.Series

In [None]:
def get_hofer_stats(df, stats, calculate='mean'):
    '''
    Description: Determine mean, median, or min of each stat for HOF players at each position
    '''
#     stats = ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', \
#              'IBB', 'HBP', 'SH', 'SF', 'GIDP']

    positions = df['POS'].unique()
    position_stats_lst = []

    for position in positions:
        pos = pd.Series([position], index=['POS'])
        if calculate == 'mean':
            stats_labels = [stat + '_mean' for stat in stats]
            stats_labels.append('POS')
            position_stats = df[(df['inducted'] == 1) & (df['POS'] == position)]\
            .groupby('playerID')[stats].max().mean().round(1).append(pos)
        elif calculate == 'median':
            stats_labels = [stat + '_med' for stat in stats]
            stats_labels.append('POS')
            position_stats = df[(df['inducted'] == 1) & (df['POS'] == position)]\
            .groupby('playerID')[stats].max().median().round(1).append(pos)
        elif calculate == 'min':
            stats_labels = [stat + '_min' for stat in stats]
            stats_labels.append('POS')
            position_stats = df[(df['inducted'] == 1) & (df['POS'] == position)]\
            .groupby('playerID')[stats].max().min().round(1).append(pos)            
        
        position_stats_lst.append(position_stats)

    hofer_stats_df = pd.DataFrame(position_stats_lst)

    hofer_stats_df.columns = stats_labels
    return hofer_stats_df

In [129]:
pos = pd.Series([position], index=['POS'])

In [131]:
elig_hitters_cumstats[(elig_hitters_cumstats['inducted'] == 1) & (elig_hitters_cumstats['POS'] == position)]\
            .groupby('playerID')[stats_to_ratio].max().mean().round(1).append(pos)

R      1210.2
H      2327.9
HR      210.7
RBI    1184.5
SB      109.1
BB      838.3
POS        3B
dtype: object

In [136]:
# df = nonhof_hitters_ratios_max

def get_avg_position_player_stats(df, features):
    '''
    Description: Obtain the average stats for a position player
    '''

    positions = df['POS'].unique()
    position_stats_lst = []

#     df_stats = pd.DataFrame(df[df['POS'] == '3B'][features].mean()).transpose()
#     df_stats.merge(pos, left_index=True, right_index=True)
#     df[df['POS'] == '3B'].groupby('playerID')[features].mean()
    
    for position in positions:
#        pos = pd.DataFrame([position], index=['POS']).transpose()
        pos = pd.Series([position], index=['POS'])
        position_stats = df[df['POS'] == position][features].mean().append(pos)
        position_stats_lst.append(position_stats)
#        print position_stats_lst
    nonhofer_stats_df = pd.DataFrame(position_stats_lst)
#    print nonhofer_stats_df
#    nonhofer_stats_df.columns = features
    return nonhofer_stats_df.set_index('POS')
        
    #        print 'pos: ', pos
#        df_stats = pd.DataFrame(df[df['POS'] == position][features].mean()).transpose()

        #        print 'df_stats: ', df_stats
    #    position_stats = df_stats.merge(pos, left_index=True, right_index=True)
    


In [140]:
idx_max = nonhof_hitters_ratios.groupby('name')['year'].transform(max) == nonhof_hitters_ratios['year']
nonhof_hitters_ratios_max = nonhof_hitters_ratios[idx_max].groupby('name').max()
avg9 = get_avg_position_player_stats(nonhof_hitters_ratios_max, features9)

In [114]:
pd.options.display.max_columns=99

In [33]:
pd.DataFrame(nonhof_hitters_ratios[(nonhof_hitters_ratios['POS'] == '3B') & (nonhof_hitters_ratios['year'] >= 10)].groupby('playerID').max().mean())

Unnamed: 0,0
yearID,1964.624535
G,1135.799257
AB,3833.858736
R,519.003717
H,1020.479554
2B,178.635688
3B,31.115242
HR,86.375465
RBI,490.226766
SB,69.446097


## Add Awards won

In [124]:
awards = pd.read_csv('../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/AwardsPlayers.csv')

In [129]:
awards.head(25)

Unnamed: 0,playerID,awardID,yearID,lgID,tie,notes
0,bondto01,Pitching Triple Crown,1877,NL,,
1,hinespa01,Triple Crown,1878,NL,,
2,heckegu01,Pitching Triple Crown,1884,AA,,
3,radboch01,Pitching Triple Crown,1884,NL,,
4,oneilti01,Triple Crown,1887,AA,,
5,keefeti01,Pitching Triple Crown,1888,NL,,
6,clarkjo01,Pitching Triple Crown,1889,NL,,
7,rusieam01,Pitching Triple Crown,1894,NL,,
8,duffyhu01,Triple Crown,1894,NL,,
9,youngcy01,Pitching Triple Crown,1901,AL,,


In [127]:
all_hitters_ratios.head()

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,1934.0,20.0,1,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,17.3,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0,0,1934.0,21.0,2,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,16.3,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0,0,1934.0,22.0,3,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,15.3,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0,0,1934.0,23.0,4,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,0,0,1934.0,24.0,5,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966


In [130]:
awards['awardID'].unique()

array(['Pitching Triple Crown', 'Triple Crown',
       'Baseball Magazine All-Star', 'Most Valuable Player',
       'TSN All-Star', 'TSN Guide MVP',
       'TSN Major League Player of the Year', 'TSN Pitcher of the Year',
       'TSN Player of the Year', 'Rookie of the Year', 'Babe Ruth Award',
       'Lou Gehrig Memorial Award', 'World Series MVP', 'Cy Young Award',
       'Gold Glove', 'TSN Fireman of the Year', 'All-Star Game MVP',
       'Hutch Award', 'Roberto Clemente Award', 'Rolaids Relief Man Award',
       'NLCS MVP', 'ALCS MVP', 'Silver Slugger', 'Branch Rickey Award',
       'Hank Aaron Award', 'TSN Reliever of the Year',
       'Comeback Player of the Year'], dtype=object)

In [144]:
awards_subset = awards[awards['awardID'].isin(['Most Valuable Player', 'Triple Crown', 'Gold Glove'])]

In [148]:
all_hitters_cumstats

Index([u'playerID', u'yearID', u'G', u'AB', u'R', u'H', u'2B', u'3B', u'HR',
       u'RBI', u'SB', u'CS', u'BB', u'SO', u'IBB', u'HBP', u'SH', u'SF',
       u'GIDP', u'RBI_filled', u'SB_filled', u'birthYear', u'age', u'year',
       u'POS', u'name', u'DBE1', u'DBE2', u'SE', u'R_mean', u'H_mean',
       u'HR_mean', u'RBI_mean', u'SB_mean', u'BB_mean', u'retire_age_mean',
       u'yrs_remain'],
      dtype='object')

In [171]:
awards_subset = pd.concat([awards_subset, pd.get_dummies(awards_subset['awardID'])], axis=1)

In [172]:
awards_subset.rename(columns={'Most Valuable Player': 'MVP'}, inplace=True)

In [173]:
awards_subset.head(5)

Unnamed: 0,playerID,awardID,yearID,lgID,tie,notes,Gold Glove,MVP,Triple Crown
1,hinespa01,Triple Crown,1878,NL,,,0.0,0.0,1.0
4,oneilti01,Triple Crown,1887,AA,,,0.0,0.0,1.0
8,duffyhu01,Triple Crown,1894,NL,,,0.0,0.0,1.0
10,lajoina01,Triple Crown,1901,AL,,,0.0,0.0,1.0
102,cobbty01,Triple Crown,1909,AL,,,0.0,0.0,1.0


In [188]:
s1 = pd.merge(all_hitters_cumstats, awards_subset[['playerID', 'yearID', 'Gold Glove', 'MVP', 'Triple Crown']], how='left', on=['playerID', 'yearID'])

In [189]:
s1.head(5)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,Gold Glove,MVP,Triple Crown
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,1934.0,20.0,1,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,17.3,,,
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0,0,1934.0,21.0,2,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,16.3,,,
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0,0,1934.0,22.0,3,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,15.3,,,
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0,0,1934.0,23.0,4,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,0.0,1.0,0.0
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,0,0,1934.0,24.0,5,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,1.0,0.0,0.0


In [192]:
s1['Gold Glove'].fillna(0, inplace=True)

In [194]:
s1['MVP'].fillna(0, inplace=True)

In [196]:
s1['Triple Crown'].fillna(0, inplace=True)

In [197]:
s1.head(5)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,Gold Glove,MVP,Triple Crown
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,1934.0,20.0,1,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,17.3,0.0,0.0,0.0
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0,0,1934.0,21.0,2,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,16.3,0.0,0.0,0.0
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0,0,1934.0,22.0,3,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,15.3,0.0,0.0,0.0
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0,0,1934.0,23.0,4,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,0.0,1.0,0.0
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,0,0,1934.0,24.0,5,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,1.0,0.0,0.0


In [110]:
# Function to add awards

def add_awards(df, awards_file):
    '''
    Description: Add MVP, Triple Crown, and Gold Glove awards.
    '''
    # Read in awards csv file
    awards = pd.read_csv(awards_file)
    
    # Select only MVP, Triple Crown, and Gold Glove awards
    awards_subset = awards[awards['awardID'].isin(['Most Valuable Player', 'Triple Crown', 'Gold Glove'])]
    awards_subset = pd.concat([awards_subset, pd.get_dummies(awards_subset['awardID'])], axis=1)
    awards_subset.rename(columns={'Most Valuable Player': 'MVP'}, inplace=True)
    awards_subset = awards_subset.groupby(['playerID', 'yearID']).sum().reset_index()    
    
    # Merge awards_subset with df from argument
    df_merged = pd.merge(df, awards_subset[['playerID', 'yearID', 'Gold Glove', 'MVP', 'Triple Crown']],\
                  how='left', on=['playerID', 'yearID'])
    
    # Fill NaNs with 0
    df_merged['Gold Glove'].fillna(0, inplace=True)
    df_merged['MVP'].fillna(0, inplace=True)
    df_merged['Triple Crown'].fillna(0, inplace=True)
    return df_merged

In [98]:
awards = pd.read_csv('../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/AwardsPlayers.csv')

In [99]:
awards_subset = awards[awards['awardID'].isin(['Most Valuable Player', 'Triple Crown', 'Gold Glove'])]

In [100]:
awards_subset[awards_subset['playerID'] == 'aaronha01']

Unnamed: 0,playerID,awardID,yearID,lgID,tie,notes
2116,aaronha01,Most Valuable Player,1957,NL,,
2154,aaronha01,Gold Glove,1958,NL,,RF
2196,aaronha01,Gold Glove,1959,NL,,RF
2238,aaronha01,Gold Glove,1960,NL,,RF


In [105]:
awards_subset = awards[awards['awardID'].isin(['Most Valuable Player', 'Triple Crown', 'Gold Glove'])]
awards_subset = pd.concat([awards_subset, pd.get_dummies(awards_subset['awardID'])], axis=1)
awards_subset.rename(columns={'Most Valuable Player': 'MVP'}, inplace=True)
awards_subset = awards_subset.groupby(['playerID', 'yearID']).sum().reset_index()

In [106]:
len(awards_subset)

1264

In [109]:
len(awards_subset.groupby(['playerID', 'yearID']).sum().reset_index())

1213

In [111]:
filled_na_df.shape

(53898, 21)

In [17]:
filled_na_df.head()

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,...,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,...,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0
1,aaronha01,1955,153,602.0,105.0,189.0,37.0,9.0,27.0,106.0,...,1.0,49.0,61.0,5.0,3.0,7.0,4.0,20.0,0,0
2,aaronha01,1956,153,609.0,106.0,200.0,34.0,14.0,26.0,92.0,...,4.0,37.0,54.0,6.0,2.0,5.0,7.0,21.0,0,0
3,aaronha01,1957,151,615.0,118.0,198.0,27.0,6.0,44.0,132.0,...,1.0,57.0,58.0,15.0,0.0,0.0,3.0,13.0,0,0
4,aaronha01,1958,153,601.0,109.0,196.0,34.0,4.0,30.0,95.0,...,1.0,59.0,49.0,16.0,1.0,0.0,3.0,21.0,0,0


In [114]:
add_awards(filled_na_df, awards_file)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,Gold Glove,MVP,Triple Crown
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,0.0,0.0,0.0
1,aaronha01,1955,153,602.0,105.0,189.0,37.0,9.0,27.0,106.0,3.0,1.0,49.0,61.0,5.0,3.0,7.0,4.0,20.0,0,0,0.0,0.0,0.0
2,aaronha01,1956,153,609.0,106.0,200.0,34.0,14.0,26.0,92.0,2.0,4.0,37.0,54.0,6.0,2.0,5.0,7.0,21.0,0,0,0.0,0.0,0.0
3,aaronha01,1957,151,615.0,118.0,198.0,27.0,6.0,44.0,132.0,1.0,1.0,57.0,58.0,15.0,0.0,0.0,3.0,13.0,0,0,0.0,1.0,0.0
4,aaronha01,1958,153,601.0,109.0,196.0,34.0,4.0,30.0,95.0,4.0,1.0,59.0,49.0,16.0,1.0,0.0,3.0,21.0,0,0,1.0,0.0,0.0
5,aaronha01,1959,154,629.0,116.0,223.0,46.0,7.0,39.0,123.0,8.0,0.0,51.0,54.0,17.0,4.0,0.0,9.0,19.0,0,0,1.0,0.0,0.0
6,aaronha01,1960,153,590.0,102.0,172.0,20.0,11.0,40.0,126.0,16.0,7.0,60.0,63.0,13.0,2.0,0.0,12.0,8.0,0,0,1.0,0.0,0.0
7,aaronha01,1961,155,603.0,115.0,197.0,39.0,10.0,34.0,120.0,21.0,9.0,56.0,64.0,20.0,2.0,1.0,9.0,16.0,0,0,0.0,0.0,0.0
8,aaronha01,1962,156,592.0,127.0,191.0,28.0,6.0,45.0,128.0,15.0,7.0,66.0,73.0,14.0,3.0,0.0,6.0,14.0,0,0,0.0,0.0,0.0
9,aaronha01,1963,161,631.0,121.0,201.0,29.0,4.0,44.0,130.0,31.0,5.0,78.0,94.0,18.0,0.0,0.0,5.0,11.0,0,0,0.0,0.0,0.0


## Add All-Star appearances

In [198]:
allstars = pd.read_csv('../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/AllstarFull.csv')

In [199]:
allstars['allstar'] = 1

In [200]:
allstars.head(5)

Unnamed: 0,playerID,yearID,gameNum,gameID,teamID,lgID,GP,startingPos,allstar
0,gomezle01,1933,0,ALS193307060,NYA,AL,1.0,1.0,1
1,ferreri01,1933,0,ALS193307060,BOS,AL,1.0,2.0,1
2,gehrilo01,1933,0,ALS193307060,NYA,AL,1.0,3.0,1
3,gehrich01,1933,0,ALS193307060,DET,AL,1.0,4.0,1
4,dykesji01,1933,0,ALS193307060,CHA,AL,1.0,5.0,1


In [201]:
all_hitters_cumstats.head(5)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,1934.0,20.0,1,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,17.3
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0,0,1934.0,21.0,2,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,16.3
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0,0,1934.0,22.0,3,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,15.3
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0,0,1934.0,23.0,4,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,0,0,1934.0,24.0,5,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3


In [204]:
s2 = pd.merge(all_hitters_cumstats, allstars[['playerID', 'yearID', 'allstar']], how='left', on=['playerID', 'yearID'])

In [207]:
s2.head(10)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,allstar
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,1934.0,20.0,1,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,17.3,
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0,0,1934.0,21.0,2,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,16.3,1.0
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0,0,1934.0,22.0,3,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,15.3,1.0
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0,0,1934.0,23.0,4,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,1.0
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,0,0,1934.0,24.0,5,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,1.0
5,aaronha01,1959,886,3524.0,612.0,1137.0,205.0,46.0,179.0,617.0,20.0,9.0,281.0,315.0,59.0,13.0,18.0,30.0,107.0,0,0,1934.0,25.0,6,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,12.3,1.0
6,aaronha01,1959,886,3524.0,612.0,1137.0,205.0,46.0,179.0,617.0,20.0,9.0,281.0,315.0,59.0,13.0,18.0,30.0,107.0,0,0,1934.0,25.0,6,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,12.3,1.0
7,aaronha01,1960,1039,4114.0,714.0,1309.0,225.0,57.0,219.0,743.0,36.0,16.0,341.0,378.0,72.0,15.0,18.0,42.0,115.0,0,0,1934.0,26.0,7,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,11.3,1.0
8,aaronha01,1960,1039,4114.0,714.0,1309.0,225.0,57.0,219.0,743.0,36.0,16.0,341.0,378.0,72.0,15.0,18.0,42.0,115.0,0,0,1934.0,26.0,7,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,11.3,1.0
9,aaronha01,1961,1194,4717.0,829.0,1506.0,264.0,67.0,253.0,863.0,57.0,25.0,397.0,442.0,92.0,17.0,19.0,51.0,131.0,0,0,1934.0,27.0,8,OF,Hank Aaron,0,1,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,10.3,1.0


In [208]:
s2['allstar'].fillna(0, inplace=True)

In [209]:
s2.head(5)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,allstar
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,1934.0,20.0,1,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,17.3,0.0
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,5.0,3.0,77.0,100.0,5.0,6.0,13.0,8.0,33.0,0,0,1934.0,21.0,2,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,16.3,1.0
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,7.0,7.0,114.0,154.0,11.0,8.0,18.0,15.0,54.0,0,0,1934.0,22.0,3,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,15.3,1.0
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,8.0,8.0,171.0,212.0,26.0,8.0,18.0,18.0,67.0,0,0,1934.0,23.0,4,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,1.0
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,12.0,9.0,230.0,261.0,42.0,9.0,18.0,21.0,88.0,0,0,1934.0,24.0,5,OF,Hank Aaron,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,13.3,1.0


In [77]:
# Function to add all-star games

def add_allstars(df, allstar_file):
    '''
    Description: add all-star game column
    '''
    allstars = pd.read_csv(allstar_file)
    allstars = allstars.groupby(['playerID', 'yearID']).sum().reset_index()
    allstars['allstar'] = 1
    df_merged = pd.merge(df, allstars[['playerID', 'yearID', 'allstar']],\
                  how='left', on=['playerID', 'yearID'])
    df_merged['allstar'].fillna(0, inplace=True)
    return df_merged

In [21]:
add_allstars(filled_na_df, allstar_file)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,...,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,allstar
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,...,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,0.0
1,aaronha01,1955,153,602.0,105.0,189.0,37.0,9.0,27.0,106.0,...,49.0,61.0,5.0,3.0,7.0,4.0,20.0,0,0,1.0
2,aaronha01,1956,153,609.0,106.0,200.0,34.0,14.0,26.0,92.0,...,37.0,54.0,6.0,2.0,5.0,7.0,21.0,0,0,1.0
3,aaronha01,1957,151,615.0,118.0,198.0,27.0,6.0,44.0,132.0,...,57.0,58.0,15.0,0.0,0.0,3.0,13.0,0,0,1.0
4,aaronha01,1958,153,601.0,109.0,196.0,34.0,4.0,30.0,95.0,...,59.0,49.0,16.0,1.0,0.0,3.0,21.0,0,0,1.0
5,aaronha01,1959,154,629.0,116.0,223.0,46.0,7.0,39.0,123.0,...,51.0,54.0,17.0,4.0,0.0,9.0,19.0,0,0,1.0
6,aaronha01,1959,154,629.0,116.0,223.0,46.0,7.0,39.0,123.0,...,51.0,54.0,17.0,4.0,0.0,9.0,19.0,0,0,1.0
7,aaronha01,1960,153,590.0,102.0,172.0,20.0,11.0,40.0,126.0,...,60.0,63.0,13.0,2.0,0.0,12.0,8.0,0,0,1.0
8,aaronha01,1960,153,590.0,102.0,172.0,20.0,11.0,40.0,126.0,...,60.0,63.0,13.0,2.0,0.0,12.0,8.0,0,0,1.0
9,aaronha01,1961,155,603.0,115.0,197.0,39.0,10.0,34.0,120.0,...,56.0,64.0,20.0,2.0,1.0,9.0,16.0,0,0,1.0


In [68]:
awards_df.head(25)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,Gold Glove,MVP,Triple Crown
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,0.0,0.0,0.0
1,aaronha01,1955,153,602.0,105.0,189.0,37.0,9.0,27.0,106.0,3.0,1.0,49.0,61.0,5.0,3.0,7.0,4.0,20.0,0,0,0.0,0.0,0.0
2,aaronha01,1956,153,609.0,106.0,200.0,34.0,14.0,26.0,92.0,2.0,4.0,37.0,54.0,6.0,2.0,5.0,7.0,21.0,0,0,0.0,0.0,0.0
3,aaronha01,1957,151,615.0,118.0,198.0,27.0,6.0,44.0,132.0,1.0,1.0,57.0,58.0,15.0,0.0,0.0,3.0,13.0,0,0,0.0,1.0,0.0
4,aaronha01,1958,153,601.0,109.0,196.0,34.0,4.0,30.0,95.0,4.0,1.0,59.0,49.0,16.0,1.0,0.0,3.0,21.0,0,0,1.0,0.0,0.0
5,aaronha01,1959,154,629.0,116.0,223.0,46.0,7.0,39.0,123.0,8.0,0.0,51.0,54.0,17.0,4.0,0.0,9.0,19.0,0,0,1.0,0.0,0.0
6,aaronha01,1960,153,590.0,102.0,172.0,20.0,11.0,40.0,126.0,16.0,7.0,60.0,63.0,13.0,2.0,0.0,12.0,8.0,0,0,1.0,0.0,0.0
7,aaronha01,1961,155,603.0,115.0,197.0,39.0,10.0,34.0,120.0,21.0,9.0,56.0,64.0,20.0,2.0,1.0,9.0,16.0,0,0,0.0,0.0,0.0
8,aaronha01,1962,156,592.0,127.0,191.0,28.0,6.0,45.0,128.0,15.0,7.0,66.0,73.0,14.0,3.0,0.0,6.0,14.0,0,0,0.0,0.0,0.0
9,aaronha01,1963,161,631.0,121.0,201.0,29.0,4.0,44.0,130.0,31.0,5.0,78.0,94.0,18.0,0.0,0.0,5.0,11.0,0,0,0.0,0.0,0.0


In [69]:
add_allstars(awards_df, awards_file)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,Gold Glove,MVP,Triple Crown,allstar
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,0.0,0.0,0.0,0.0
1,aaronha01,1955,153,602.0,105.0,189.0,37.0,9.0,27.0,106.0,3.0,1.0,49.0,61.0,5.0,3.0,7.0,4.0,20.0,0,0,0.0,0.0,0.0,0.0
2,aaronha01,1956,153,609.0,106.0,200.0,34.0,14.0,26.0,92.0,2.0,4.0,37.0,54.0,6.0,2.0,5.0,7.0,21.0,0,0,0.0,0.0,0.0,1.0
3,aaronha01,1957,151,615.0,118.0,198.0,27.0,6.0,44.0,132.0,1.0,1.0,57.0,58.0,15.0,0.0,0.0,3.0,13.0,0,0,0.0,1.0,0.0,1.0
4,aaronha01,1958,153,601.0,109.0,196.0,34.0,4.0,30.0,95.0,4.0,1.0,59.0,49.0,16.0,1.0,0.0,3.0,21.0,0,0,1.0,0.0,0.0,1.0
5,aaronha01,1959,154,629.0,116.0,223.0,46.0,7.0,39.0,123.0,8.0,0.0,51.0,54.0,17.0,4.0,0.0,9.0,19.0,0,0,1.0,0.0,0.0,1.0
6,aaronha01,1960,153,590.0,102.0,172.0,20.0,11.0,40.0,126.0,16.0,7.0,60.0,63.0,13.0,2.0,0.0,12.0,8.0,0,0,1.0,0.0,0.0,1.0
7,aaronha01,1961,155,603.0,115.0,197.0,39.0,10.0,34.0,120.0,21.0,9.0,56.0,64.0,20.0,2.0,1.0,9.0,16.0,0,0,0.0,0.0,0.0,0.0
8,aaronha01,1962,156,592.0,127.0,191.0,28.0,6.0,45.0,128.0,15.0,7.0,66.0,73.0,14.0,3.0,0.0,6.0,14.0,0,0,0.0,0.0,0.0,0.0
9,aaronha01,1963,161,631.0,121.0,201.0,29.0,4.0,44.0,130.0,31.0,5.0,78.0,94.0,18.0,0.0,0.0,5.0,11.0,0,0,0.0,0.0,0.0,1.0


In [78]:
add_allstars(awards_df, allstar_file)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,Gold Glove,MVP,Triple Crown,allstar
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,0.0,0.0,0.0,0.0
1,aaronha01,1955,153,602.0,105.0,189.0,37.0,9.0,27.0,106.0,3.0,1.0,49.0,61.0,5.0,3.0,7.0,4.0,20.0,0,0,0.0,0.0,0.0,1.0
2,aaronha01,1956,153,609.0,106.0,200.0,34.0,14.0,26.0,92.0,2.0,4.0,37.0,54.0,6.0,2.0,5.0,7.0,21.0,0,0,0.0,0.0,0.0,1.0
3,aaronha01,1957,151,615.0,118.0,198.0,27.0,6.0,44.0,132.0,1.0,1.0,57.0,58.0,15.0,0.0,0.0,3.0,13.0,0,0,0.0,1.0,0.0,1.0
4,aaronha01,1958,153,601.0,109.0,196.0,34.0,4.0,30.0,95.0,4.0,1.0,59.0,49.0,16.0,1.0,0.0,3.0,21.0,0,0,1.0,0.0,0.0,1.0
5,aaronha01,1959,154,629.0,116.0,223.0,46.0,7.0,39.0,123.0,8.0,0.0,51.0,54.0,17.0,4.0,0.0,9.0,19.0,0,0,1.0,0.0,0.0,1.0
6,aaronha01,1960,153,590.0,102.0,172.0,20.0,11.0,40.0,126.0,16.0,7.0,60.0,63.0,13.0,2.0,0.0,12.0,8.0,0,0,1.0,0.0,0.0,1.0
7,aaronha01,1961,155,603.0,115.0,197.0,39.0,10.0,34.0,120.0,21.0,9.0,56.0,64.0,20.0,2.0,1.0,9.0,16.0,0,0,0.0,0.0,0.0,1.0
8,aaronha01,1962,156,592.0,127.0,191.0,28.0,6.0,45.0,128.0,15.0,7.0,66.0,73.0,14.0,3.0,0.0,6.0,14.0,0,0,0.0,0.0,0.0,1.0
9,aaronha01,1963,161,631.0,121.0,201.0,29.0,4.0,44.0,130.0,31.0,5.0,78.0,94.0,18.0,0.0,0.0,5.0,11.0,0,0,0.0,0.0,0.0,1.0


In [70]:
allstars = pd.read_csv(allstar_file)


In [71]:
allstars[allstars['playerID'] == 'aaronha01']

Unnamed: 0,playerID,yearID,gameNum,gameID,teamID,lgID,GP,startingPos
1126,aaronha01,1955,0,NLS195507120,ML1,NL,1.0,
1178,aaronha01,1956,0,ALS195607100,ML1,NL,1.0,
1228,aaronha01,1957,0,NLS195707090,ML1,NL,1.0,9.0
1278,aaronha01,1958,0,ALS195807080,ML1,NL,1.0,9.0
1369,aaronha01,1959,1,NLS195907070,ML1,NL,1.0,9.0
1370,aaronha01,1959,2,NLS195908030,ML1,NL,1.0,9.0
1487,aaronha01,1960,1,ALS196007110,ML1,NL,1.0,9.0
1488,aaronha01,1960,2,ALS196007130,ML1,NL,1.0,9.0
1603,aaronha01,1961,1,NLS196107110,ML1,NL,1.0,
1604,aaronha01,1961,2,ALS196107310,ML1,NL,1.0,


In [52]:
allstars[allstars['playerID'] == 'aaronha01'].groupby(['playerID', 'yearID']).sum().reset_index()

Unnamed: 0,playerID,yearID,gameNum,GP,startingPos
0,aaronha01,1955,0,1.0,
1,aaronha01,1956,0,1.0,
2,aaronha01,1957,0,1.0,9.0
3,aaronha01,1958,0,1.0,9.0
4,aaronha01,1959,3,2.0,18.0
5,aaronha01,1960,3,2.0,18.0
6,aaronha01,1961,3,2.0,
7,aaronha01,1962,3,1.0,
8,aaronha01,1963,0,1.0,9.0
9,aaronha01,1964,0,1.0,


In [None]:
allstars['allstar'] = 1

In [55]:
allstars = pd.read_csv(allstar_file)
allstars = allstars.groupby(['playerID', 'yearID']).sum().reset_index()
allstars['allstar'] = 1
# df_merged = pd.merge(df, allstars[['playerID', 'yearID', 'allstar']],\
#               how='left', on=['playerID', 'yearID'])
# df_merged['allstar'].fillna(0, inplace=True)


In [62]:
s1 = pd.merge(awards_df, allstars[['playerID', 'yearID', 'allstar']], how='left', on=['playerID', 'yearID'])

In [63]:
s1.head(25)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,Gold Glove,MVP,Triple Crown,allstar
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,0.0,0.0,0.0,
1,aaronha01,1955,153,602.0,105.0,189.0,37.0,9.0,27.0,106.0,3.0,1.0,49.0,61.0,5.0,3.0,7.0,4.0,20.0,0,0,0.0,0.0,0.0,1.0
2,aaronha01,1956,153,609.0,106.0,200.0,34.0,14.0,26.0,92.0,2.0,4.0,37.0,54.0,6.0,2.0,5.0,7.0,21.0,0,0,0.0,0.0,0.0,1.0
3,aaronha01,1957,151,615.0,118.0,198.0,27.0,6.0,44.0,132.0,1.0,1.0,57.0,58.0,15.0,0.0,0.0,3.0,13.0,0,0,0.0,1.0,0.0,1.0
4,aaronha01,1958,153,601.0,109.0,196.0,34.0,4.0,30.0,95.0,4.0,1.0,59.0,49.0,16.0,1.0,0.0,3.0,21.0,0,0,1.0,0.0,0.0,1.0
5,aaronha01,1959,154,629.0,116.0,223.0,46.0,7.0,39.0,123.0,8.0,0.0,51.0,54.0,17.0,4.0,0.0,9.0,19.0,0,0,1.0,0.0,0.0,1.0
6,aaronha01,1960,153,590.0,102.0,172.0,20.0,11.0,40.0,126.0,16.0,7.0,60.0,63.0,13.0,2.0,0.0,12.0,8.0,0,0,1.0,0.0,0.0,1.0
7,aaronha01,1961,155,603.0,115.0,197.0,39.0,10.0,34.0,120.0,21.0,9.0,56.0,64.0,20.0,2.0,1.0,9.0,16.0,0,0,0.0,0.0,0.0,1.0
8,aaronha01,1962,156,592.0,127.0,191.0,28.0,6.0,45.0,128.0,15.0,7.0,66.0,73.0,14.0,3.0,0.0,6.0,14.0,0,0,0.0,0.0,0.0,1.0
9,aaronha01,1963,161,631.0,121.0,201.0,29.0,4.0,44.0,130.0,31.0,5.0,78.0,94.0,18.0,0.0,0.0,5.0,11.0,0,0,0.0,0.0,0.0,1.0


In [72]:
allstars.groupby(['playerID', 'yearID']).sum().reset_index()

Unnamed: 0,playerID,yearID,gameNum,GP,startingPos
0,aaronha01,1955,0,1.0,
1,aaronha01,1956,0,1.0,
2,aaronha01,1957,0,1.0,9.0
3,aaronha01,1958,0,1.0,9.0
4,aaronha01,1959,3,2.0,18.0
5,aaronha01,1960,3,2.0,18.0
6,aaronha01,1961,3,2.0,
7,aaronha01,1962,3,1.0,
8,aaronha01,1963,0,1.0,9.0
9,aaronha01,1964,0,1.0,


In [56]:
allstars.head(25)

Unnamed: 0,playerID,yearID,gameNum,GP,startingPos,allstar
0,aaronha01,1955,0,1.0,,1
1,aaronha01,1956,0,1.0,,1
2,aaronha01,1957,0,1.0,9.0,1
3,aaronha01,1958,0,1.0,9.0,1
4,aaronha01,1959,3,2.0,18.0,1
5,aaronha01,1960,3,2.0,18.0,1
6,aaronha01,1961,3,2.0,,1
7,aaronha01,1962,3,1.0,,1
8,aaronha01,1963,0,1.0,9.0,1
9,aaronha01,1964,0,1.0,,1


In [65]:
s1['allstar'].fillna(0, inplace=True)

In [66]:
s1.head(25)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,RBI_filled,SB_filled,Gold Glove,MVP,Triple Crown,allstar
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,2.0,2.0,28.0,39.0,,3.0,6.0,4.0,13.0,0,0,0.0,0.0,0.0,0.0
1,aaronha01,1955,153,602.0,105.0,189.0,37.0,9.0,27.0,106.0,3.0,1.0,49.0,61.0,5.0,3.0,7.0,4.0,20.0,0,0,0.0,0.0,0.0,1.0
2,aaronha01,1956,153,609.0,106.0,200.0,34.0,14.0,26.0,92.0,2.0,4.0,37.0,54.0,6.0,2.0,5.0,7.0,21.0,0,0,0.0,0.0,0.0,1.0
3,aaronha01,1957,151,615.0,118.0,198.0,27.0,6.0,44.0,132.0,1.0,1.0,57.0,58.0,15.0,0.0,0.0,3.0,13.0,0,0,0.0,1.0,0.0,1.0
4,aaronha01,1958,153,601.0,109.0,196.0,34.0,4.0,30.0,95.0,4.0,1.0,59.0,49.0,16.0,1.0,0.0,3.0,21.0,0,0,1.0,0.0,0.0,1.0
5,aaronha01,1959,154,629.0,116.0,223.0,46.0,7.0,39.0,123.0,8.0,0.0,51.0,54.0,17.0,4.0,0.0,9.0,19.0,0,0,1.0,0.0,0.0,1.0
6,aaronha01,1960,153,590.0,102.0,172.0,20.0,11.0,40.0,126.0,16.0,7.0,60.0,63.0,13.0,2.0,0.0,12.0,8.0,0,0,1.0,0.0,0.0,1.0
7,aaronha01,1961,155,603.0,115.0,197.0,39.0,10.0,34.0,120.0,21.0,9.0,56.0,64.0,20.0,2.0,1.0,9.0,16.0,0,0,0.0,0.0,0.0,1.0
8,aaronha01,1962,156,592.0,127.0,191.0,28.0,6.0,45.0,128.0,15.0,7.0,66.0,73.0,14.0,3.0,0.0,6.0,14.0,0,0,0.0,0.0,0.0,1.0
9,aaronha01,1963,161,631.0,121.0,201.0,29.0,4.0,44.0,130.0,31.0,5.0,78.0,94.0,18.0,0.0,0.0,5.0,11.0,0,0,0.0,0.0,0.0,1.0


In [123]:
X7.head()

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,DBE1,DBE2,SE,RBI_filled,SB_filled,MVP,Triple Crown,Gold Glove,allstar
0,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806,0,0,0,0,0,0.0,0.0,0.0,0.0
1,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983,0,0,0,0,0,0.0,0.0,0.0,1.0
2,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445,0,0,0,0,0,0.0,0.0,0.0,2.0
3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007,0,0,0,0,0,1.0,0.0,0.0,3.0
4,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966,0,0,0,0,0,1.0,0.0,1.0,4.0


In [125]:
len(elig_hitters_cumstats[elig_hitters_cumstats['playerID'] == 'rosepe01'])

24

In [15]:
pd.options.display.max_columns=99

In [19]:
all_hitters_ratios[all_hitters_ratios['name'] == 'Ichiro Suzuki']

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,Gold Glove,MVP,Triple Crown,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
17270,suzukic01,2001,157,692.0,127.0,242.0,34.0,8.0,8.0,69.0,56.0,14.0,30.0,53.0,10.0,8.0,4.0,4.0,3.0,1.0,1.0,0.0,0,0,1973.0,28.0,1,OF,Ichiro Suzuki,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,9.3,0.891076,0.969921,0.329205,0.544932,2.151436,0.321909
17271,suzukic01,2002,314,1339.0,238.0,450.0,61.0,16.0,16.0,120.0,87.0,29.0,98.0,115.0,37.0,13.0,7.0,9.0,11.0,2.0,1.0,0.0,0,0,1973.0,29.0,2,OF,Ichiro Suzuki,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,8.3,0.834946,0.901786,0.329205,0.473854,1.671205,0.525784
17272,suzukic01,2003,473,2018.0,349.0,662.0,90.0,24.0,29.0,182.0,121.0,37.0,134.0,184.0,44.0,19.0,10.0,10.0,14.0,3.0,1.0,0.0,0,0,1973.0,30.0,3,OF,Ichiro Suzuki,0,0,1,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,7.3,0.816235,0.884418,0.397789,0.479119,1.549546,0.479286
17273,suzukic01,2004,634,2722.0,450.0,924.0,114.0,29.0,37.0,242.0,157.0,48.0,183.0,247.0,63.0,23.0,12.0,13.0,20.0,4.0,1.0,0.0,0,0,1973.0,31.0,4,OF,Ichiro Suzuki,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,6.3,0.789339,0.925834,0.380643,0.477802,1.507926,0.490911
17274,suzukic01,2005,796,3401.0,561.0,1130.0,135.0,41.0,52.0,310.0,190.0,56.0,231.0,313.0,86.0,27.0,14.0,19.0,25.0,5.0,1.0,0.0,0,0,1973.0,32.0,5,OF,Ichiro Suzuki,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,5.3,0.787234,0.905794,0.427966,0.489649,1.459903,0.495739
17275,suzukic01,2006,957,4096.0,671.0,1354.0,155.0,50.0,61.0,359.0,235.0,58.0,280.0,384.0,102.0,32.0,15.0,21.0,27.0,6.0,1.0,0.0,0,0,1973.0,33.0,6,OF,Ichiro Suzuki,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,4.3,0.784662,0.904458,0.418365,0.472537,1.504725,0.500747
17276,suzukic01,2007,1118,4774.0,782.0,1592.0,177.0,57.0,67.0,427.0,272.0,66.0,329.0,461.0,115.0,35.0,19.0,23.0,34.0,7.0,1.0,0.0,0,0,1973.0,34.0,7,OF,Ichiro Suzuki,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,3.3,0.783826,0.91152,0.39387,0.481751,1.492833,0.504323
17277,suzukic01,2008,1280,5460.0,885.0,1805.0,197.0,64.0,73.0,469.0,315.0,70.0,380.0,526.0,127.0,40.0,22.0,27.0,42.0,8.0,1.0,0.0,0,0,1973.0,35.0,8,OF,Ichiro Suzuki,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,2.3,0.776184,0.904291,0.375499,0.462995,1.512728,0.509689
17278,suzukic01,2009,1426,6099.0,973.0,2030.0,228.0,68.0,84.0,515.0,341.0,79.0,412.0,597.0,142.0,44.0,24.0,28.0,43.0,9.0,1.0,0.0,0,0,1973.0,36.0,9,OF,Ichiro Suzuki,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,1.3,0.758545,0.904013,0.384072,0.451916,1.455634,0.491209
17279,suzukic01,2010,1588,6779.0,1047.0,2244.0,258.0,71.0,90.0,558.0,383.0,88.0,457.0,683.0,155.0,47.0,27.0,29.0,46.0,10.0,1.0,0.0,0,0,1973.0,37.0,10,OF,Ichiro Suzuki,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,0.3,0.734612,0.899381,0.370356,0.440684,1.471429,0.490374


In [18]:
all_hitters_ratios[all_hitters_ratios['name'] == 'Ramon Flores']

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,Gold Glove,MVP,Triple Crown,RBI_filled,SB_filled,birthYear,age,year,POS,name,DBE1,DBE2,SE,R_mean,H_mean,HR_mean,RBI_mean,SB_mean,BB_mean,retire_age_mean,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
5746,florera02,2015,12,32.0,3.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,1992.0,23.0,1,OF,Ramon Flores,0,0,0,1468.0,2569.9,250.3,1304.2,268.1,959.9,37.3,14.3,0.031267,0.041675,0.0,0.0,0.0,0.0


In [160]:
features9

['R_ratio',
 'H_ratio',
 'HR_ratio',
 'RBI_ratio',
 'SB_ratio',
 'BB_ratio',
 'DBE1',
 'DBE2',
 'SE',
 'RBI_filled',
 'SB_filled',
 'MVP',
 'Triple Crown',
 'Gold Glove']