### Go back to using the mean retirement age of only HOF-eligible players throughout history to estimate remaining career years.

In [2]:
import pandas as pd
import numpy as np
import pickle




In [29]:
def calculate_stat_ratio(row, stat, denom):
    '''
    Description: Project a player's running total for `stat` over a full career and express it as a
    fraction of the benchmark value stored in row[denom] (in this file, a per-position Hall of Fame
    mean, median, or min column).

    The per-year rate so far (row[stat] / row['year']) is scaled by the expected career length
    (row['year'] + row['yrs_remain']) and the result is divided by row[denom].
    '''
    per_year_rate = row[stat] / row['year']
    expected_career_years = row['year'] + row['yrs_remain']
    projected_total = per_year_rate * expected_career_years
    return projected_total / row[denom]


def create_eras_cols(df):
    '''
    Description: Add 0/1 indicator columns flagging which baseball era each row's 'yearID' falls in.

    Columns added to df (in place); all year ranges are inclusive:
        'DBE1' - Dead Ball Era 1 (1900-1919)
        'DBE2' - Dead Ball Era 2 (1961-1968)
        'SE'   - Steroid era (1988-2003)

    Returns the same df object that was passed in.
    '''
    # (column name, first year, last year)
    eras = [('DBE1', 1900, 1919), ('DBE2', 1961, 1968), ('SE', 1988, 2003)]
    for col, first_year, last_year in eras:
        # Build the mask from df's own 'yearID' so the function works on any frame it is given.
        in_era = (df['yearID'] >= first_year) & (df['yearID'] <= last_year)
        df[col] = in_era.astype(int)
    return df


def combine_stints(df, hof_elig_labels):
    '''
    Description: Collapse multiple stints a player had in the same season into one row by summing
    the numeric stats per (playerID, yearID), then re-attach the HOF labels.

    df:              per-stint stats; must contain 'playerID' and 'yearID'. The 'inducted' and
                     'stint' columns, if present, are removed before summing so they are not added up.
    hof_elig_labels: frame with a 'playerID' column (and the 'inducted' label) merged back on.

    Returns a new frame sorted by playerID and yearID; df itself is not modified.
    '''
    redundant = [col for col in ('inducted', 'stint') if col in df.columns]
    per_stint = df.drop(redundant, axis=1)

    # numeric_only=True: text columns (team, league, position, ...) cannot be meaningfully summed.
    season_totals = (per_stint.groupby(['playerID', 'yearID'])
                     .sum(numeric_only=True)
                     .reset_index()
                     .sort_values(by=['playerID', 'yearID']))

    # Add hof 'inducted' column back to df, using the labels passed in by the caller.
    return season_totals.merge(hof_elig_labels, on='playerID')

#def create_stat_ratio_cols(df, stats_of_interest, denominator='mean'):

# def combine_stints(df):
#     return df.groupby(['playerID', 'yearID']).sum().reset_index().sort_values(by=['playerID', 'yearID'])


def create_stat_ratio_cols(df, stats_of_interest, denominator='mean'):
    '''
    Description: Calculate the desired stats ratios and add them as new columns to df.

    For every stat in stats_of_interest a column '<stat>_ratio' is added, computed row by row with
    calculate_stat_ratio against the benchmark column selected by `denominator`:
        'mean'   -> '<stat>_mean'
        'median' -> '<stat>_med'
        'min'    -> '<stat>_min'

    Raises ValueError if `denominator` is not one of the three options above.
    Returns the same df object (modified in place).
    '''
    suffix_by_denominator = {'mean': '_mean', 'median': '_med', 'min': '_min'}
    if denominator not in suffix_by_denominator:
        raise ValueError("denominator must be 'mean', 'median' or 'min', got %r" % (denominator,))
    suffix = suffix_by_denominator[denominator]

    for stat in stats_of_interest:
        df[stat + '_ratio'] = df.apply(calculate_stat_ratio, axis=1, args=(stat, stat + suffix))
    return df

def create_yr_col(df):
    '''
    Description: Create 'year' variable indicating the number of years players have played in the MLB.

    'year' is 1 in the earliest 'yearID' found for that 'playerID' in df and grows by one per
    calendar year after that (yearID - first yearID + 1). Returns the same df (modified in place).
    '''
    # transform('min') broadcasts each player's first season back onto that player's rows, so only
    # the 'yearID' column is aggregated and no per-row Python call is needed.
    first_season = df.groupby('playerID')['yearID'].transform('min')
    df['year'] = df['yearID'] - first_season + 1
    return df

# def create_remain_yrs_col(df):
#     # Create 'yrs_remain' variable that estimates the remaining number of years for that player.
#     # Based on the median number of years that eligible players at that position played
#     player_totalyrs_dict = pd.DataFrame(df.groupby('playerID').max()['year']).to_dict()['year']
#     df['yrs_remain'] = df.apply(get_remaining_yrs, axis=1, args=(player_totalyrs_dict,))
#     return df

def fill_na(df, stats_of_interest):
    '''
    Description: Replace missing values in the given stat columns with 0 and record where that happened.

    For each stat in stats_of_interest that contains at least one null, a 0/1 column
    '<stat>_filled' is added (1 where the original value was null) and the nulls in the stat
    column are set to 0. Stats without nulls are left untouched and get no flag column.

    Returns (df, stats_filled): the same df (modified in place) and the list of flag column names
    that were created, in the order of stats_of_interest.
    '''
    stats_filled = []
    for stat in stats_of_interest:
        was_null = df[stat].isnull()
        if not was_null.any():
            continue
        flag_col = stat + '_filled'
        # Record the mask before overwriting the nulls, otherwise the information is lost.
        df[flag_col] = was_null.astype(int)
        df.loc[was_null, stat] = 0
        stats_filled.append(flag_col)
    return df, stats_filled

def get_birth_year(filename):
    '''
    Description: Load the master player file and return each player's birth year.

    filename: path (or file-like object) readable by pd.read_csv; must contain 'playerID' and
              'birthYear' columns.

    Returns a df with the columns 'playerID' and 'birthYear', one row per playerID.
    '''
    master = pd.read_csv(filename)
    # Aggregate only 'birthYear'; the other (mostly text) columns are not needed here.
    return master.groupby('playerID')['birthYear'].sum().reset_index()

def get_cumulative_stats(df):
    '''
    Description: Running (cumulative) totals of the batting counting stats within each playerID,
    accumulated in the row order of df.

    Returns a df holding only the stat columns listed below, aligned on df's index.
    '''
    counting_stats = [
        'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO',
        'IBB', 'HBP', 'SH', 'SF', 'GIDP',
    ]
    running_totals = df.groupby('playerID')[counting_stats].cumsum()
    return running_totals[counting_stats]

# def get_cumulative_stats(df):
#     # Calculate cumulative stats over the years for each player.
#     stats = ['G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', \
#              'IBB', 'HBP', 'SH', 'SF', 'GIDP']
#     return df.groupby('playerID')[stats].cumsum()[stats]


def get_hof_labels(filename):
    '''
    INPUT: 1 file
    OUTPUT: Pandas df

    Given Hall of Fame (HOF) data file, create HOF labels for all eligible players (inducted and not inducted)
    Returns: dataframe of all eligible HOF players with labels indicating if they were inducted or not.

    Only rows with category == 'Player' are considered. The result has the columns 'playerID' and
    'inducted' (1 = inducted, 0 = on the ballot but never inducted): inducted players come first,
    in file order, followed by the non-inducted players sorted by playerID. The index is a fresh
    0..n-1 range.
    '''

    # Load HallofFame.csv file containing players who were/are eligible for election to HOF.
    hof = pd.read_csv(filename)

    # Select all eligible players for the HOF (i.e., those who were on the ballot)
    players = hof[hof['category'] == 'Player']

    # Select those who were inducted into HOF. .copy() so adding the label column does not write
    # into a view of `hof`.
    hof_players = players.loc[players['inducted'] == 'Y', ['playerID']].copy()
    hof_players['inducted'] = 1

    # Select players who were on the ballot but were not inducted into HOF.
    # Sorted so the row order is reproducible from run to run (set iteration order is not).
    nonhof_playerID = sorted(set(players['playerID']) - set(hof_players['playerID']))
    nonhof_players = pd.DataFrame(nonhof_playerID, columns=['playerID'])
    nonhof_players['inducted'] = 0

    # Merge hof_players and nonhof_players
    return pd.concat([hof_players, nonhof_players], ignore_index=True)

def get_hofer_stats(df, stats_of_interest, calculate='mean'):
    '''
    Description: Determine mean, median, or min of each stat for HOF players at each position

    For every position in df['POS'], take the rows with inducted == 1, reduce each player to the
    maximum value of every stat (the career total when the stats are cumulative), then summarise
    across players with `calculate` and round to one decimal.

    calculate: 'mean', 'median' or 'min'; selects both the summary and the column suffix
               ('_mean', '_med', '_min').

    Returns a df with one row per position and the columns '<stat><suffix>' for each stat in
    stats_of_interest followed by 'POS'. Positions without any inducted player get NaN stats.
    Raises ValueError for an unknown `calculate`.
    '''
    suffix_by_calculate = {'mean': '_mean', 'median': '_med', 'min': '_min'}
    if calculate not in suffix_by_calculate:
        raise ValueError("calculate must be 'mean', 'median' or 'min', got %r" % (calculate,))

    stats = list(stats_of_interest)
    stats_labels = [stat + suffix_by_calculate[calculate] for stat in stats]

    hofers = df[df['inducted'] == 1]
    records = []
    for position in df['POS'].unique():
        career_totals = hofers[hofers['POS'] == position].groupby('playerID')[stats].max()
        # `calculate` doubles as the name of the pandas reduction method (mean / median / min).
        summary = getattr(career_totals, calculate)().round(1)
        record = dict(zip(stats_labels, summary))
        record['POS'] = position
        records.append(record)

    return pd.DataFrame(records, columns=stats_labels + ['POS'])


def get_positions(filename):
    '''
    Description: Assign every player the single position at which he played the most games.

    filename: path (or file-like object) readable by pd.read_csv; must contain the columns
              'playerID', 'POS' and 'G'.

    Returns a df with the columns 'playerID' and 'POS', one row per player. The individual
    outfield positions (LF, CF, RF) are reported as 'OF'; position codes missing from the mapping
    below become NaN.
    '''

    # Load Fielding.csv file
    fielding = pd.read_csv(filename)

    # Set position of each player to the one at which he played the most games.
    # Only 'G' is summed; none of the other fielding columns are used here.
    games_by_pos = fielding.groupby(['playerID', 'POS'])['G'].sum().reset_index()
    # idxmax returns index labels, so select with .loc; .copy() so the remapping below does not
    # write into a view of games_by_pos.
    most_played = games_by_pos.groupby('playerID')['G'].idxmax()
    player_pos = games_by_pos.loc[most_played, ['playerID', 'POS']].copy()

    # Set all outfield positions (LF, CF, RF, OF) to OF.
    positions_dict = {'P': 'P', 'OF': 'OF', '1B': '1B', '2B': '2B', 'C': 'C', 'SS': 'SS',
                      '3B': '3B', 'DH': 'DH', 'CF': 'OF', 'LF': 'OF', 'RF': 'OF'}
    player_pos['POS'] = player_pos['POS'].map(positions_dict)

    return player_pos


def get_remaining_yrs(row, retire_age='mean'):
    '''
    Description: Estimate how many more years a player will play: the chosen retirement-age
    benchmark in the row minus the player's current age, floored at 0.

    retire_age: which benchmark column of the row to use:
        'mean'   -> row['retire_age_mean']
        'median' -> row['retire_age_med']
        'max'    -> row['retire_age_max']

    Raises ValueError for any other `retire_age`.
    '''
    col_by_retire_age = {'mean': 'retire_age_mean', 'median': 'retire_age_med',
                         'max': 'retire_age_max'}
    if retire_age not in col_by_retire_age:
        raise ValueError("retire_age must be 'mean', 'median' or 'max', got %r" % (retire_age,))

    yrs_remain = row[col_by_retire_age[retire_age]] - row['age']
    # In case player has played longer than the benchmark career, set to 0. A NaN difference
    # fails the comparison and therefore also yields 0.
    return yrs_remain if yrs_remain >= 0 else 0

def get_retirement_age(df, calculate='mean'):
    '''
    Description: Get mean, median, or max retirement age of the players in df at the different positions

    A player's retirement age is taken as the largest 'age' among his rows. For each position in
    df['POS'] (in order of first appearance) these ages are summarised with `calculate` and
    rounded to one decimal.

    calculate: 'mean'   -> result column 'retire_age_mean'
               'median' -> result column 'retire_age_med'
               'max'    -> result column 'retire_age_max'

    Returns a df with the columns 'POS' and the result column, one row per position.
    Raises ValueError for an unknown `calculate`.
    '''
    col_by_calculate = {'mean': 'retire_age_mean', 'median': 'retire_age_med',
                        'max': 'retire_age_max'}
    if calculate not in col_by_calculate:
        raise ValueError("calculate must be 'mean', 'median' or 'max', got %r" % (calculate,))
    col = col_by_calculate[calculate]

    records = []
    for position in df['POS'].unique():
        final_ages = df[df['POS'] == position].groupby('playerID')['age'].max()
        # `calculate` doubles as the name of the pandas reduction method (mean / median / max).
        retirement_age = round(getattr(final_ages, calculate)(), 1)
        records.append({'POS': position, col: retirement_age})

    # Build the frame once at the end instead of growing it row by row.
    return pd.DataFrame(records, columns=['POS', col])


def subtract_start_yr(row, player_dict):
    '''
    Description: Return the 1-based season number of a row: its yearID minus the start year that
    player_dict stores for the row's playerID, plus one.
    '''
    debut_year = player_dict[row['playerID']]
    seasons_since_debut = row['yearID'] - debut_year
    return seasons_since_debut + 1


# # -------------------------------------------------------------------

# if __name__ == '__main__':

# Build the hitter feature sets (X, X2, X3, X4) and the HOF label vector (y) and pickle them.

# Directory holding the Sean Lahman database CSV files used below.
DATA_DIR = '../BaseballHOF-repo/data/SeanLahmanBaseballDB/baseballdatabank-master/core/'

# Select only players from batting table who were/are eligible for HOF.
elig_labels = get_hof_labels(DATA_DIR + 'HallOfFame.csv')

# Drop a few players from the list (brownwi02, irvinmo01, tennefr01). Willard Brown (brownwi02) and Monte Irvin
# (irvinmo01) were both inducted into the HOF, but they played the majority of their careers in the Negro League
# for which stats are not available. Fred Tenney was nominated but only one year's worth of stats is available.

omit = ['brownwi02', 'irvinmo01', 'tennefr01']
# `~` is the element-wise NOT for a boolean mask. .copy() gives an independent frame so the
# playerID correction below does not write into a slice of the original labels.
elig_labels = elig_labels[~elig_labels['playerID'].isin(omit)].copy()

# Also, Jacque Jones' playerID is jonesja05 in the HallofFame file. This corresponds to only one year's worth of stats.
# Majority of his career stats are associated with playerID jonesja04. Replace with this playerID instead.

elig_labels.loc[elig_labels['playerID'] == 'jonesja05', 'playerID'] = 'jonesja04'

# Merge hof labels with batting stats
batting = pd.read_csv(DATA_DIR + 'Batting.csv')
elig_players = batting.merge(elig_labels, on='playerID')

# Get and join player positions to the df
player_pos = get_positions(DATA_DIR + 'Fielding.csv')
elig_players = elig_players.merge(player_pos, on='playerID')

# Select only hitters (remove pitchers from df)
elig_hitters = elig_players[elig_players['POS'] != 'P']

# Select which stats to include in model
stats_of_interest = ['R', 'H', 'HR', 'RBI', 'SB', 'BB']

# Some players played a stint for different teams in the same season.
# Combine the stats in those cases into one row.
combined_stints = combine_stints(elig_hitters, elig_labels)

# Fill in null values that are present in the stats of interest
filled_na_df, filled_stats = fill_na(combined_stints, stats_of_interest)

# Calculate cumulative stats over the years for each player.
cumulative_stats = get_cumulative_stats(filled_na_df)

# Combine cumulative stats with non-stats columns
# cols_to_add = list(set(combined_stints.columns) - set(cumulative_stats.columns))
cols_to_add = list(set(filled_na_df.columns) - set(cumulative_stats.columns))
elig_hitters_cumstats = cumulative_stats.join(filled_na_df[cols_to_add])[filled_na_df.columns]

# Get and join birth year of each player to df and create 'age' column
birth_year = get_birth_year(DATA_DIR + 'Master.csv')
elig_hitters_cumstats = elig_hitters_cumstats.merge(birth_year, on='playerID')
elig_hitters_cumstats['age'] = elig_hitters_cumstats['yearID'] - elig_hitters_cumstats['birthYear']

# Re-join positions to df
elig_hitters_cumstats = elig_hitters_cumstats.merge(player_pos, on='playerID')

# Get mean, median, or min stats of different positions for HOF hitters and merge to df
hof_hitters_stats = get_hofer_stats(elig_hitters_cumstats, stats_of_interest, 'mean')
elig_hitters_cumstats = elig_hitters_cumstats.merge(hof_hitters_stats, on='POS')

# Get mean or median retirement age of different positions for all eligible hitters and merge to df
retirement_age_elig = get_retirement_age(elig_hitters_cumstats, 'mean')
elig_hitters_cumstats = elig_hitters_cumstats.merge(retirement_age_elig, on='POS')

# Get mean or median retirement age of different positions for all MLB players throughout history and merge to df
# batting_position = batting.merge(player_pos, on='playerID').sort('playerID')
# batting_position_birthyr = batting_position.merge(birth_year, on='playerID')
# batting_position_birthyr['age'] = batting_position_birthyr['yearID'] - batting_position_birthyr['birthYear']

# retirement_age_allMLB = get_retirement_age(batting_position_birthyr, 'max')
# elig_hitters_cumstats = elig_hitters_cumstats.merge(retirement_age_allMLB, on='POS')

# Create 'year' variable indicating the number of years players have played in MLB.
elig_hitters_cumstats = create_yr_col(elig_hitters_cumstats)

# Create 'yrs_remain' column that estimates the remaining number of years in the career of that player
# based on the mean retirement age of eligible players at that position
elig_hitters_cumstats['yrs_remain'] = elig_hitters_cumstats.apply(get_remaining_yrs, axis=1,
                                                                  args=('mean',))

# Add variables corresponding to different baseball eras
elig_hitters_cumstats = create_eras_cols(elig_hitters_cumstats)

# Calculate the ratio of a player's cumulative total for a particular stat to the mean, median, or min of that
# stat for players at that position who are in the HOF
elig_hitters_ratios = create_stat_ratio_cols(elig_hitters_cumstats, stats_of_interest, 'mean')


# Select the stat ratio columns as feature set on which to train model.
# Each featuresN list is bound to its own name only, so `features` keeps describing X.
ratio_cols = [stat + '_ratio' for stat in stats_of_interest]

features = ratio_cols + filled_stats
X = elig_hitters_ratios[features]

features2 = ratio_cols + ['year'] + filled_stats
X2 = elig_hitters_ratios[features2]

features3 = ratio_cols + ['year', 'yearID'] + filled_stats
X3 = elig_hitters_ratios[features3]

features4 = ratio_cols + ['year', 'yearID', 'DBE1', 'DBE2', 'SE'] + filled_stats
X4 = elig_hitters_ratios[features4]


# Select 'inducted' column as target variable (1 = inducted into HOF, 0 = not inducted into HOF)
y = elig_hitters_ratios['inducted']

# Write out feature and label data. Pickle output is binary, so the files are opened in 'wb' mode.
pickle_outputs = [
    ('X_features_hitters.pkl', X),
    ('X2_features_hitters.pkl', X2),
    ('X3_features_hitters.pkl', X3),
    ('X4_features_hitters.pkl', X4),
    ('y_labels_hitter.pkl', y),
]
for pickle_path, pickle_obj in pickle_outputs:
    with open(pickle_path, 'wb') as f:
        pickle.dump(pickle_obj, f)


In [84]:
def get_retirement_age_test(df, calculate='mean'):
    '''
    Description: Get mean, median, or max retirement age of the players in df at the different positions

    A player's retirement age is taken as the largest 'age' among his rows. For each position in
    df['POS'] (in order of first appearance) these ages are summarised with `calculate` and
    rounded to one decimal.

    calculate: 'mean'   -> result column 'retire_age_mean'
               'median' -> result column 'retire_age_med'
               'max'    -> result column 'retire_age_max'

    Returns a df with the columns 'POS' and the result column, one row per position.
    Raises ValueError for an unknown `calculate`.
    '''
    col_by_calculate = {'mean': 'retire_age_mean', 'median': 'retire_age_med',
                        'max': 'retire_age_max'}
    if calculate not in col_by_calculate:
        raise ValueError("calculate must be 'mean', 'median' or 'max', got %r" % (calculate,))
    col = col_by_calculate[calculate]

    records = []
    for position in df['POS'].unique():
        final_ages = df[df['POS'] == position].groupby('playerID')['age'].max()
        # `calculate` doubles as the name of the pandas reduction method (mean / median / max).
        retirement_age = round(getattr(final_ages, calculate)(), 1)
        records.append({'POS': position, col: retirement_age})

    # Build the frame once at the end instead of growing it row by row.
    return pd.DataFrame(records, columns=['POS', col])

In [94]:
# Median, per position, of each eligible hitter's highest recorded age.
get_retirement_age_test(elig_hitters_cumstats, 'median')

Unnamed: 0,POS,retire_age_med
0,OF,37.0
1,3B,36.0
2,2B,36.0
3,1B,37.0
4,SS,37.0
5,C,37.0
6,DH,41.0


In [95]:
# Mean, per position, of each eligible hitter's highest recorded age.
get_retirement_age_test(elig_hitters_cumstats, 'mean')

Unnamed: 0,POS,retire_age_mean
0,OF,37.3
1,3B,36.6
2,2B,36.4
3,1B,37.4
4,SS,37.4
5,C,37.4
6,DH,40.2


In [7]:
# Preview the first 25 rows of the cumulative-stats frame.
elig_hitters_cumstats.head(25)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,...,BB_mean,retire_age_mean,year,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,...,959.9,37.3,1,17.3,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,...,959.9,37.3,2,16.3,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,...,959.9,37.3,3,15.3,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,...,959.9,37.3,4,14.3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,...,959.9,37.3,5,13.3,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966
5,aaronha01,1959,886,3524.0,612.0,1137.0,205.0,46.0,179.0,617.0,...,959.9,37.3,6,12.3,1.271526,1.34941,2.181183,1.442915,0.227527,0.892853
6,aaronha01,1960,1039,4114.0,714.0,1309.0,225.0,57.0,219.0,743.0,...,959.9,37.3,7,11.3,1.271526,1.331608,2.287369,1.489353,0.351042,0.928713
7,aaronha01,1961,1194,4717.0,829.0,1506.0,264.0,67.0,253.0,863.0,...,959.9,37.3,8,10.3,1.291783,1.340509,2.312175,1.513658,0.486339,0.946075
8,aaronha01,1962,1350,5309.0,956.0,1697.0,292.0,73.0,298.0,991.0,...,959.9,37.3,9,9.3,1.32416,1.342685,2.420828,1.545034,0.546065,0.980762
9,aaronha01,1963,1511,5940.0,1077.0,1898.0,321.0,77.0,342.0,1121.0,...,959.9,37.3,10,8.3,1.342582,1.351547,2.500439,1.572941,0.703059,1.031389


In [10]:
# Class balance of the target: number of rows per label (1 = inducted, 0 = not inducted).
y.value_counts()

0    8927
1    2690
Name: inducted, dtype: int64

In [11]:
# Preview the X3 feature set (stat ratios plus 'year', 'yearID' and the *_filled flags).
X3.head()

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,year,yearID,RBI_filled,SB_filled
0,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806,1,1954,0,0
1,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983,2,1955,0,0
2,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445,3,1956,0,0
3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007,4,1957,0,0
4,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966,5,1958,0,0


### Add columns corresponding to Dead Ball Era 1 (1900-1919), Dead Ball Era 2 (1961-1968), and Steroid Era (1988-2003)

In [12]:
# Preview the first 10 rows of the cumulative-stats frame before adding the era columns.
elig_hitters_cumstats.head(10)

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,...,BB_mean,retire_age_mean,year,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,...,959.9,37.3,1,17.3,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,...,959.9,37.3,2,16.3,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,...,959.9,37.3,3,15.3,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,...,959.9,37.3,4,14.3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,...,959.9,37.3,5,13.3,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966
5,aaronha01,1959,886,3524.0,612.0,1137.0,205.0,46.0,179.0,617.0,...,959.9,37.3,6,12.3,1.271526,1.34941,2.181183,1.442915,0.227527,0.892853
6,aaronha01,1960,1039,4114.0,714.0,1309.0,225.0,57.0,219.0,743.0,...,959.9,37.3,7,11.3,1.271526,1.331608,2.287369,1.489353,0.351042,0.928713
7,aaronha01,1961,1194,4717.0,829.0,1506.0,264.0,67.0,253.0,863.0,...,959.9,37.3,8,10.3,1.291783,1.340509,2.312175,1.513658,0.486339,0.946075
8,aaronha01,1962,1350,5309.0,956.0,1697.0,292.0,73.0,298.0,991.0,...,959.9,37.3,9,9.3,1.32416,1.342685,2.420828,1.545034,0.546065,0.980762
9,aaronha01,1963,1511,5940.0,1077.0,1898.0,321.0,77.0,342.0,1121.0,...,959.9,37.3,10,8.3,1.342582,1.351547,2.500439,1.572941,0.703059,1.031389


In [22]:
# Scratch copy for trying out the era columns without touching elig_hitters_cumstats.
temp = elig_hitters_cumstats.copy()

In [21]:
def create_eras_cols(df):
    '''
    Description: Add 0/1 indicator columns flagging which baseball era each row's 'yearID' falls in.

    Columns added to df (in place); all year ranges are inclusive:
        'DBE1' - Dead Ball Era 1 (1900-1919)
        'DBE2' - Dead Ball Era 2 (1961-1968)
        'SE'   - Steroid era (1988-2003)

    Returns the same df object that was passed in.
    '''
    # (column name, first year, last year)
    eras = [('DBE1', 1900, 1919), ('DBE2', 1961, 1968), ('SE', 1988, 2003)]
    for col, first_year, last_year in eras:
        # Build the mask from df's own 'yearID' so the function works on any frame it is given.
        in_era = (df['yearID'] >= first_year) & (df['yearID'] <= last_year)
        df[col] = in_era.astype(int)
    return df


In [18]:
# NOTE(review): create_steroid_era is not defined anywhere in the visible part of this notebook;
# presumably it was an earlier helper still alive in the kernel when this cell ran -- confirm
# before re-running.
temp2 = create_steroid_era(temp)

In [20]:
# Sanity check: list the distinct seasons that were flagged as Steroid era (SE == 1).
temp2[temp2['SE'] == 1]['yearID'].unique()

array([1990, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 2000, 2001, 2002,
       2003, 1988, 1989, 1991, 1999])

In [23]:
# Add the DBE1 / DBE2 / SE era flags to the scratch copy and display the result.
create_eras_cols(temp)
    

Unnamed: 0,playerID,yearID,G,AB,R,H,2B,3B,HR,RBI,...,yrs_remain,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,DBE1,DBE2,SE
0,aaronha01,1954,122,468.0,58.0,131.0,27.0,6.0,13.0,69.0,...,17.3,0.723025,0.932838,0.950459,0.968180,0.136516,0.533806,0,0,0
1,aaronha01,1955,275,1070.0,163.0,320.0,64.0,15.0,40.0,175.0,...,16.3,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983,0,0,0
2,aaronha01,1956,428,1679.0,269.0,520.0,98.0,29.0,66.0,267.0,...,15.3,1.117779,1.234289,1.608470,1.248812,0.159269,0.724450,0,0,0
3,aaronha01,1957,579,2294.0,387.0,718.0,125.0,35.0,110.0,399.0,...,14.3,1.206080,1.278201,2.010587,1.399651,0.136516,0.815007,0,0,0
4,aaronha01,1958,732,2895.0,496.0,914.0,159.0,39.0,140.0,494.0,...,13.3,1.236621,1.301700,2.047143,1.386321,0.163819,0.876966,0,0,0
5,aaronha01,1959,886,3524.0,612.0,1137.0,205.0,46.0,179.0,617.0,...,12.3,1.271526,1.349410,2.181183,1.442915,0.227527,0.892853,0,0,0
6,aaronha01,1960,1039,4114.0,714.0,1309.0,225.0,57.0,219.0,743.0,...,11.3,1.271526,1.331608,2.287369,1.489353,0.351042,0.928713,0,0,0
7,aaronha01,1961,1194,4717.0,829.0,1506.0,264.0,67.0,253.0,863.0,...,10.3,1.291783,1.340509,2.312175,1.513658,0.486339,0.946075,0,1,0
8,aaronha01,1962,1350,5309.0,956.0,1697.0,292.0,73.0,298.0,991.0,...,9.3,1.324160,1.342685,2.420828,1.545034,0.546065,0.980762,0,1,0
9,aaronha01,1963,1511,5940.0,1077.0,1898.0,321.0,77.0,342.0,1121.0,...,8.3,1.342582,1.351547,2.500439,1.572941,0.703059,1.031389,0,1,0


In [26]:
# Preview the first 15 rows of the X4 feature set (ratios, 'year', 'yearID', era flags, *_filled flags).
X4.head(15)

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,year,yearID,DBE1,DBE2,SE,RBI_filled,SB_filled
0,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806,1,1954,0,0,0,0,0
1,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983,2,1955,0,0,0,0,0
2,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445,3,1956,0,0,0,0,0
3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007,4,1957,0,0,0,0,0
4,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966,5,1958,0,0,0,0,0
5,1.271526,1.34941,2.181183,1.442915,0.227527,0.892853,6,1959,0,0,0,0,0
6,1.271526,1.331608,2.287369,1.489353,0.351042,0.928713,7,1960,0,0,0,0,0
7,1.291783,1.340509,2.312175,1.513658,0.486339,0.946075,8,1961,0,1,0,0,0
8,1.32416,1.342685,2.420828,1.545034,0.546065,0.980762,9,1962,0,1,0,0,0
9,1.342582,1.351547,2.500439,1.572941,0.703059,1.031389,10,1963,0,1,0,0,0


In [28]:
# Preview the first 15 rows of X4 again.
# NOTE(review): the output recorded for this cell has no 'yearID' column, so it appears to come
# from a run where features4 was defined without it -- confirm which definition is intended.
X4.head(15)

Unnamed: 0,R_ratio,H_ratio,HR_ratio,RBI_ratio,SB_ratio,BB_ratio,year,DBE1,DBE2,SE,RBI_filled,SB_filled
0,0.723025,0.932838,0.950459,0.96818,0.136516,0.533806,1,0,0,0,0,0
1,1.015974,1.139344,1.462245,1.227764,0.170645,0.733983,2,0,0,0,0,0
2,1.117779,1.234289,1.60847,1.248812,0.159269,0.72445,3,0,0,0,0,0
3,1.20608,1.278201,2.010587,1.399651,0.136516,0.815007,4,0,0,0,0,0
4,1.236621,1.3017,2.047143,1.386321,0.163819,0.876966,5,0,0,0,0,0
5,1.271526,1.34941,2.181183,1.442915,0.227527,0.892853,6,0,0,0,0,0
6,1.271526,1.331608,2.287369,1.489353,0.351042,0.928713,7,0,0,0,0,0
7,1.291783,1.340509,2.312175,1.513658,0.486339,0.946075,8,0,1,0,0,0
8,1.32416,1.342685,2.420828,1.545034,0.546065,0.980762,9,0,1,0,0,0
9,1.342582,1.351547,2.500439,1.572941,0.703059,1.031389,10,0,1,0,0,0
