In [15]:
import duckdb
import pandas as pd

In [16]:
#download data
exit_velocity_data = pd.read_csv('data/mlb-batter-exit-velocity.csv')

# run sql file to create tables
con = duckdb.connect('baseball_data.db').execute(open('baseball_data.sql').read())

# Create a DataFrame from a query
batting = con.execute("SELECT * FROM batting").df()

# Close the connection to the database
con.close()


In [17]:
exit_velocity_data['name'] = exit_velocity_data['player'].str.split(', ').str[1] + ' ' + exit_velocity_data['player'].str.split(', ').str[0].str.strip()

In [18]:
batting['name'] = batting['name'].str.strip()

In [19]:
# join the two dataframes on name and year
df = batting.merge(exit_velocity_data, how='inner', left_on=['name', 'Year'], right_on=['name', 'year'])
df

Unnamed: 0,Rk,name,Age,Team,Lg,G,PA,AB,R,H,...,ground_ball_ev,max_distance,average_distance,average_homerun,hard_hit_95mph+,hard_hit_percentage,hard_hit_swing_percentage,total_barrels,barrels_batted_balls_percentage,barrels_plate_appearance_percentage
0,3,José Abreu,28,CHW,AL,154,668,613,88,178,...,89.5,447,166,396.0,198,41.8,16.1,44,9.3,6.6
1,13,Nick Ahmed,25,ARI,NL,134,459,421,49,95,...,84.9,402,171,387.0,78,22.4,9.9,6,1.7,1.3
2,22,Jose Altuve,25,HOU,AL,154,689,638,86,200,...,84.0,410,159,386.0,166,28.6,14.4,18,3.1,2.6
3,34,Elvis Andrus,26,TEX,AL,160,661,596,69,154,...,87.3,423,153,397.0,145,27.1,13.8,8,1.5,1.2
4,40,Nolan Arenado,24,COL,NL,157,665,616,97,177,...,87.8,456,200,411.0,208,40.2,16.9,43,8.3,6.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903,1276,Andrew Vaughn,23,CHW,AL,127,469,417,56,98,...,88.2,447,169,408.0,151,47.3,18.4,35,10.9,7.5
904,1277,Christian Vázquez,30,BOS,AL,138,498,458,51,118,...,83.9,409,157,390.0,118,31.1,12.9,10,2.6,2.0
905,1297,Christian Walker,30,ARI,NL,115,445,401,55,98,...,86.7,440,177,404.0,122,41.1,12.8,19,6.4,4.3
906,1343,Patrick Wisdom,29,CHC,NL,106,375,338,54,78,...,83.8,453,204,410.0,96,51.9,11.8,30,16.2,8.0


In [20]:
#total bases
df['TB'] = df['H'] + df['B'] + 2 * df['B_1'] + 3 * df['HR']

#Bat Avg
df['AVG'] = df['H'].divide(df['AB'], fill_value=0) #, where=df['AB'] != 0

#on base
numerator_obp = df['H'] + df['BB'] + df['HBP']
denominator_obp = df['AB'] + df['BB'] + df['HBP'] + df['SF']
df['OBP'] = numerator_obp.divide(denominator_obp, fill_value=0)# , where=denominator_obp != 0

#slugging
df['SLG'] = df['TB'].divide(df['AB'])# , where=df['AB'] != 0

#on base + slugging 
df['OPS'] = df['OBP'] + df['SLG']

print("DataFrame with new stats columns (AVG, OBP, SLG, OPS):")
print(df[['name', 'Year', 'AB', 'H', 'TB', 'AVG', 'OBP', 'SLG', 'OPS']].head())

DataFrame with new stats columns (AVG, OBP, SLG, OPS):
            name  Year   AB    H   TB       AVG       OBP       SLG       OPS
0     José Abreu  2015  613  178  276  0.290375  0.347305  0.450245  0.797550
1     Nick Ahmed  2015  421   95  130  0.225653  0.275330  0.308789  0.584119
2    Jose Altuve  2015  638  200  253  0.313480  0.352770  0.396552  0.749321
3   Elvis Andrus  2015  596  154  183  0.258389  0.309342  0.307047  0.616388
4  Nolan Arenado  2015  616  177  311  0.287338  0.323308  0.504870  0.828178


In [25]:
# batting = con.execute("SELECT * FROM batting").df()

stats_to_pivot = ['G', 'PA', 'AB', 'R', 'H', 'HR', 'RBI']


# - index: The column(s) to remain as rows (Player/Name)
# - columns: The column whose values will become the new column names (Year)
# - values: The columns whose values will populate the new wide columns (Selected Stats)
wide_batting_df = batting.pivot_table(
    index=['name'],           # one row per player
    columns='Year',           # Use Year values  as new columns
    values=stats_to_pivot,    # stats to pivot
    aggfunc='sum'             # Use 'sum' to aggregate if a player has multiple entries in a year,
).reset_index()

wide_batting_df['name'] = wide_batting_df['name'].str.strip()
wide_batting_df['name'] = wide_batting_df['name'].str.replace('*', '', regex=False)
wide_batting_df['name'] = wide_batting_df['name'].str.replace('#', '', regex=False)
# if duplicate names exist sum all their stats
wide_batting_df = wide_batting_df.groupby('name', as_index=False).sum(numeric_only=False)

wide_batting_df.to_csv('wide_batting_stats.csv')


  wide_batting_df = wide_batting_df.groupby('name', as_index=False).sum(numeric_only=False)
