# Final Project

## Import code needed in this notebook

In [313]:
import numpy as np
import pandas
import torch

import time
import os

import matplotlib.pyplot as plt
%matplotlib inline

In [309]:
# Directory holding the core CSV files of the baseball databank. Defined once
# so that a relocated data set only needs a single edit.
DATA_DIR = os.path.join('baseballdatabank-2019.2', 'baseballdatabank-2019.2', 'core')

# Get a list of all players
# Note: We only need the 'playerID' column from this data set
df_players = pandas.read_csv(os.path.join(DATA_DIR, 'People.csv'))['playerID']

# Get batting information for all players
# Note: For records with missing data, pandas inserts 'NaN'; replace
#       these values with a 0 for downstream processing.
df_batting = pandas.read_csv(os.path.join(DATA_DIR, 'Batting.csv')).fillna(0)

# Get fielding information for all players
# Note: Missing values are replaced with 0, as for the batting data.
# Note: There may be more than one entry for a specific combination of
#       'playerID' and 'yearID'.  This occurs if a player played more than
#       one position
df_fielding = pandas.read_csv(os.path.join(DATA_DIR, 'Fielding.csv')).fillna(0)

# Get Hall of Fame voting information
# The db has records of anyone (players & coaches) who received votes.
# We only want players who were voted in, so keep the rows marked as inducted
# in the 'Player' category and retain only their 'playerID'.
df_hof_votes = pandas.read_csv(os.path.join(DATA_DIR, 'HallOfFame.csv')).fillna(0)
is_inducted_player = (df_hof_votes['inducted'] == 'Y') & (df_hof_votes['category'] == 'Player')
df_hof = df_hof_votes.loc[is_inducted_player, 'playerID']

In [317]:
# File used to cache the career table so it only needs to be generated once
CAREER_CACHE = 'df_career.csv'


def build_career_fielding(fielding, player_ids):
    """Aggregate per-season fielding records into one career row per player.

    Parameters
    ----------
    fielding : pandas.DataFrame
        Fielding records containing at least 'playerID' and 'yearID'.
    player_ids : iterable of str
        Players to report. A player without any fielding record still gets a
        row, filled with zeros.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``player_ids`` with the summed numeric fielding
        columns, 'playerID', and 'stint' (number of distinct years played).
        Values are floats and columns are sorted by name.
    """
    # Remove the data points we don't care about before summing. 'stint' is
    # dropped here as well because it is recomputed below.
    numeric_stats = fielding.drop(
        columns=['yearID', 'stint', 'teamID', 'lgID', 'POS'], errors='ignore')

    # One pass over the table instead of one full scan per player
    totals = numeric_stats.groupby('playerID').sum(numeric_only=True)

    # This might not be necessary, but we set the 'stint' field to the total
    # number of years played (a.k.a. number of unique years)
    totals['stint'] = fielding.groupby('playerID')['yearID'].nunique()

    career = (
        totals
        .reindex(pandas.Index(player_ids, name='playerID'), fill_value=0)
        .astype(float)
        .reset_index()
    )

    # Sorted column order keeps a fresh computation laid out the same way on
    # every run (and matches the column-name ordering shown for the cached table).
    return career[sorted(career.columns)]


if os.path.isfile(CAREER_CACHE):
    # A cache written with the DataFrame index carries an extra 'Unnamed: 0'
    # column; drop it so cached and freshly computed tables have the same shape.
    df_career = pandas.read_csv(CAREER_CACHE).drop(columns=['Unnamed: 0'], errors='ignore')
else:
    df_career = build_career_fielding(df_fielding, df_players)

    # Save to file so we only need to generate once (without the index column)
    df_career.to_csv(CAREER_CACHE, index=False)

In [316]:
# Number of rows (players) in the career table. len() counts rows directly,
# rather than the non-null entries of whichever column happens to be first.
len(df_career)

19617

In [301]:
# Number of Hall of Fame entries. len() works whether df_hof is a Series or a
# DataFrame, whereas indexing the result of count() fails on a Series.
len(df_hof)

256

In [306]:
# Show the entry with index label 1. Left as the last expression so the
# notebook renders it instead of printing plain text.
df_hof.loc[[1]]

   playerID  yearID votedBy  ballots  needed  votes inducted category  \
1  ruthba01    1936   BBWAA    226.0   170.0  215.0        Y   Player   

  needed_note  
1           0  


In [311]:
# Display the career table (long frames are truncated when rendered)
df_career

Unnamed: 0,A,CS,DP,E,G,GS,InnOuts,PB,PO,SB,WP,ZR,playerID,stint
0,29.0,0.0,2.0,3.0,331.0,0.0,1011.0,0.0,11.0,0.0,0.0,0.0,aardsda01,9.0
1,429.0,0.0,218.0,144.0,3020.0,2977.0,78414.0,0.0,7436.0,0.0,0.0,0.0,aaronha01,23.0
2,113.0,0.0,124.0,22.0,387.0,206.0,6472.0,0.0,1317.0,0.0,0.0,0.0,aaronto01,7.0
3,135.0,0.0,10.0,13.0,448.0,91.0,3328.0,0.0,67.0,0.0,0.0,0.0,aasedo01,13.0
4,1.0,0.0,3.0,1.0,9.0,4.0,138.0,0.0,37.0,0.0,0.0,0.0,abadan01,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19612,1.0,1.0,0.0,2.0,13.0,1.0,114.0,1.0,31.0,2.0,1.0,0.0,zupofr01,3.0
19613,415.0,0.0,84.0,23.0,201.0,136.0,3844.0,0.0,267.0,0.0,0.0,0.0,zuvelpa01,9.0
19614,145.0,0.0,10.0,7.0,265.0,31.0,1847.0,0.0,45.0,0.0,0.0,0.0,zuverge01,8.0
19615,37.0,0.0,10.0,25.0,342.0,27.0,654.0,0.0,755.0,0.0,0.0,0.0,zwilldu01,4.0


In [314]:
# Report whether the cached career table is present in the working directory
print("True" if os.path.isfile('df_career.csv') else "false")

True


In [285]:
# Keep every fielding record that belongs to one example player
foo = df_fielding[df_fielding['playerID'].eq('yelicch01')]

In [176]:
# Summing a string column concatenates its values, so the player's ID is
# repeated once per record. Select the column before summing so the other
# columns are not summed needlessly.
df_fielding.loc[df_fielding['playerID'] == 'yelicch01', 'playerID'].sum()

'yelicch01yelicch01yelicch01yelicch01yelicch01yelicch01'

In [177]:
# Sum the single column at position 3 down the rows; string values are
# concatenated rather than added.
foo.iloc[:, [3]].sum(axis='index')

teamID    MIAMIAMIAMIAMIAMIL
dtype: object