In [48]:
# Import Packages

import pandas as pd
import numpy as np
import os
import pickle

In [49]:
### Load the input tables

# Pitching rows for every team; the processing loop later filters this table by
# its 'team', 'date' and 'gametype' columns.
all_pitching_stats = pd.read_csv(filepath_or_buffer='baseball_data/pitching.csv')
# Playoff teams; the processing loop iterates over these rows and reads the
# 'Year', 'Team' and 'Level of Success' columns.
teams = pd.read_csv(filepath_or_buffer='playoff_teams.csv')

In [61]:
### Select features

# Columns pulled from the raw pitching table before feature engineering.
# 'id' is the player key the rows are grouped by; the rest are summed per player.
pitching_features = [
    'id', 'p_ipouts', 'p_h', 'p_d', 'p_t', 'p_hr', 'p_r', 'p_er', 'p_w', 'p_k',
    'p_hbp', 'p_wp', 'p_bk', 'p_sh', 'p_sf', 'wp', 'lp', 'save', 'p_cg',
]

# Final feature selection: the raw columns that survive feature engineering
# (everything except 'id', 'p_ipouts', 'p_h', 'p_er' and 'p_w') followed by the
# engineered 'whip' and 'era' columns. The order mirrors the column order of the
# per-team matrix built in the processing loop, and the length of this list
# defines the size of the feature axis.
# Fixed: the sacrifice-hit column was misspelled 'psh'; the real column is 'p_sh'.
pitching_features_final = [
    'p_d', 'p_t', 'p_hr', 'p_r', 'p_k', 'p_hbp', 'p_wp', 'p_bk', 'p_sh', 'p_sf',
    'wp', 'lp', 'save', 'p_cg', 'whip', 'era',
]

In [None]:
### Loop through all playoff teams and save data
#
# Builds a (number of teams, max_players, num_features) array with one slice per
# row of `teams`, then pickles it to 'all_pitching_data.pkl'.

# Define number of players in list
max_players = 25
num_features = len(pitching_features_final)

# Loop-invariant work, done once instead of once per team:
# keep only regular-season rows and render their dates as strings.
regular_season = all_pitching_stats[all_pitching_stats['gametype'] == 'regular']
regular_season_dates = regular_season['date'].astype(str)

# Preallocate the result. Rows left at zero act as padding for teams that used
# fewer than max_players pitchers (same values np.pad(mode='constant') produced).
all_seasons = np.zeros((len(teams), max_players, num_features))

## Start loop
for position, (index, team) in enumerate(teams.iterrows()):
    print(f"Index: {index}, Year: {team['Year']} Team: {team['Team']}, Level of Success: {team['Level of Success']}")
    # Filter all stats to the appropriate team and season with ONE combined mask.
    # (Chaining df[mask1][mask2] with masks built from the unfiltered frame makes
    # pandas reindex each mask and emit a UserWarning.)
    team_season_mask = (
        (regular_season['team'] == team['Team'])
        & regular_season_dates.str.startswith(str(team['Year']))
    )
    temp_stats = regular_season.loc[team_season_mask, pitching_features]
    # One row per player, every stat summed over the season
    temp_stats = temp_stats.groupby(by='id').sum()

    ## Feature engineering
    # p_ipouts counts outs, and three outs make one inning, so convert before
    # using it as the "innings pitched" denominator (dividing by outs directly
    # made WHIP and ERA three times too small).
    # NOTE(review): a pitcher with zero outs recorded yields inf/NaN here; such
    # players sort last (fewest outs) but can still land in the output for teams
    # with fewer than max_players pitchers - decide how downstream code should
    # treat them.
    innings_pitched = temp_stats['p_ipouts'] / 3
    # (Total number of walks + Total number of hits) / Total number of innings pitched
    temp_stats['whip'] = (temp_stats['p_w'] + temp_stats['p_h']) / innings_pitched
    # (Earned Runs / Innings Pitched) x 9
    temp_stats['era'] = (temp_stats['p_er'] / innings_pitched) * 9

    # Sort players by total number of outs pitched and then by era
    # This will place the most-used players at the top of the list and then the best performing players on top in case of a tie.
    temp_stats = temp_stats.sort_values(by=['p_ipouts', 'era'], ascending=[False, True])
    # Drop the raw columns that were only needed to derive whip / era and the sort order
    temp_stats = temp_stats.drop(columns=['p_ipouts', 'p_er', 'p_h', 'p_w'])

    # Keep at most max_players pitchers (the most-used ones, given the sort above)
    player_rows = temp_stats.to_numpy(dtype=float)[:max_players]
    if player_rows.shape[1] != num_features:
        raise ValueError(
            f"Expected {num_features} feature columns but got {player_rows.shape[1]}"
        )

    # Teams which used fewer than max_players pitchers throughout the season
    # leave the remaining rows of their slice as zero padding
    all_seasons[position, :player_rows.shape[0], :] = player_rows

# Save data to .pkl file
with open('all_pitching_data.pkl', 'wb') as file:
    pickle.dump(all_seasons, file)

print("Data saved to 'all_pitching_data.pkl'")