In [6]:
### Import Packages

import pandas as pd
import numpy as np
import os
import pickle

In [7]:
### Read in all batting data

# Playoff teams to process. The loop further down reads the 'Year', 'Team'
# and 'Level of Success' columns from each row of this frame.
teams = pd.read_csv('playoff_teams.csv')

# Per-row batting records. Later cells read the 'team', 'date', 'gametype',
# 'id' and 'b_*' columns from this frame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the saved cell output shows a warning raised on this line,
# presumably a DtypeWarning about mixed-type columns - confirm it is gone.
all_batting_stats = pd.read_csv('baseball_data/batting.csv', low_memory=False)

  all_batting_stats = pd.read_csv('baseball_data/batting.csv')


In [11]:
### Select features

# Raw columns pulled from the batting data before feature engineering.
# 'id' is the grouping key; every other column is summed per player.
batting_features = [
    'id',
    'b_pa', 'b_ab', 'b_r', 'b_h',
    'b_d', 'b_t', 'b_hr', 'b_rbi',
    'b_sh', 'b_sf', 'b_hbp',
    'b_w', 'b_iw', 'b_k',
    'b_sb', 'b_cs', 'b_gdp',
    'b_xi', 'b_roe',
]

# Columns kept in the final per-player feature vector, in output order.
# 'b_ab' and 'b_h' are left out; the engineered 'ba' and 'ops' are appended.
batting_features_final = [
    'b_pa', 'b_r',
    'b_d', 'b_t', 'b_hr', 'b_rbi',
    'b_sh', 'b_sf', 'b_hbp',
    'b_w', 'b_iw', 'b_k',
    'b_sb', 'b_cs', 'b_gdp',
    'b_xi', 'b_roe',
    'ba', 'ops',
]

In [None]:
### Loop through all playoff teams and save data

# Define number of players in list
max_players = 25
num_features = len(batting_features_final)


def _safe_rate(numerator, denominator):
    """Divide two Series element-wise, mapping non-finite results to 0.0.

    A player with zero at-bats would otherwise yield NaN (0/0) or inf (x/0),
    which would propagate into the saved feature array.
    """
    rate = numerator / denominator
    return rate.replace([np.inf, -np.inf], np.nan).fillna(0.0)


def _build_team_matrix(player_rows, max_players, feature_columns):
    """Aggregate one team's batting rows into a fixed-size player matrix.

    Parameters
    ----------
    player_rows : pd.DataFrame
        Rows for a single team/season containing 'id' and the raw 'b_*'
        columns listed in ``batting_features``.
    max_players : int
        Number of player rows in the output; extra players are dropped and
        missing players are zero-padded.
    feature_columns : list of str
        Columns (and their order) to keep in the output.

    Returns
    -------
    np.ndarray
        Float array of shape ``(max_players, len(feature_columns))``.
    """
    # One row per player: sum every counting stat.
    totals = player_rows.groupby(by='id').sum()
    at_bats = totals['b_ab']

    ## Feature engineering
    # Total number of hits / Total number of at-bats
    totals['ba'] = _safe_rate(totals['b_h'], at_bats)

    # Total number of trips to first base / Total at-bats
    # NOTE(review): this is the notebook's own definition, not the standard
    # OBP formula, and 'b_iw' may already be counted inside 'b_w' - confirm.
    times_on_base = totals[['b_h', 'b_w', 'b_hbp', 'b_iw', 'b_roe', 'b_xi']].sum(axis=1)
    on_base = _safe_rate(times_on_base, at_bats)

    # Total bases / Total at-bats
    # singles + 2*doubles + 3*triples + 4*HR == hits + doubles + 2*triples + 3*HR
    total_bases = totals['b_h'] + totals['b_d'] + 2 * totals['b_t'] + 3 * totals['b_hr']
    slugging = _safe_rate(total_bases, at_bats)

    # OBP + SLG
    totals['ops'] = on_base + slugging

    # Sort players by total number of plate appearances and then by batting average.
    # This places the most-used players at the top and breaks ties in favour
    # of the better-performing player.
    ranked = totals.sort_values(by=['b_pa', 'ba'], ascending=[False, False])

    # Select the output columns explicitly so the feature order is guaranteed.
    values = ranked[feature_columns].to_numpy(dtype=float)

    # Zero-pad teams with fewer than max_players batters; truncate the rest.
    matrix = np.zeros((max_players, len(feature_columns)))
    kept = min(max_players, values.shape[0])
    matrix[:kept] = values[:kept]
    return matrix


# Loop-invariant pieces of the row filter, computed once instead of per team.
is_regular_season = all_batting_stats['gametype'] == 'regular'
game_dates = all_batting_stats['date'].astype(str)

season_matrices = []

## Start loop
for index, team in teams.iterrows():
    print(f"Index: {index}, Year: {team['Year']} Team: {team['Team']}, Level of Success: {team['Level of Success']}")
    # Filter all stats to appropriate team with a single combined mask
    # (chained df[m1][m2][m3] forces pandas to reindex each mask and warns).
    team_mask = (
        is_regular_season
        & (all_batting_stats['team'] == team['Team'])
        & game_dates.str.startswith(str(team['Year']))
    )
    team_rows = all_batting_stats.loc[team_mask, batting_features]

    season_matrices.append(_build_team_matrix(team_rows, max_players, batting_features_final))

# Stack once at the end; growing an array with vstack inside the loop is quadratic.
if season_matrices:
    all_seasons = np.stack(season_matrices)
else:
    all_seasons = np.empty([0, max_players, num_features])

assert all_seasons.shape[1:] == (max_players, num_features)

# Save data to .pkl file
with open('all_batting_data.pkl', 'wb') as file:
    pickle.dump(all_seasons, file)

print("Data saved to 'all_batting_data.pkl'")