In [None]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats # Use this library to download the baseball stats

In [None]:
# First and last seasons (inclusive bounds passed to batting_stats)
START, END = 2002, 2022

In [None]:
# Download batting stats for seasons START through END via pybaseball.
# qual is the minimum number of plate appearances a batter needs to be included.
batting = batting_stats(START, END, qual=200)

In [None]:
# Save the downloaded batting data to "batting.csv" in the working directory
batting.to_csv("batting.csv")

In [None]:
# Group the rows by player ("IDfg") and keep only the groups with more than
# one row, i.e. players who have at least two seasons of data.
batting = batting.groupby("IDfg", group_keys=False).filter(lambda seasons: len(seasons) > 1)

In [None]:
batting  # Display the current batting DataFrame

In [None]:
# Build the ML target column
def next_season(player):
    """Return one player's rows sorted by season with a "Next_WAR" target column.

    Parameters
    ----------
    player : pandas.DataFrame
        Rows for a single player; must contain "Season" and "WAR" columns.

    Returns
    -------
    pandas.DataFrame
        A new frame ordered by "Season" in which "Next_WAR" holds the "WAR"
        of the following row. The last row has no following row, so its
        "Next_WAR" is NaN. "Following" means the next row after sorting,
        which is not necessarily the next calendar year.
    """
    ordered = player.sort_values(by="Season")
    # shift(-1) pulls each later row's WAR up onto the row before it
    return ordered.assign(Next_WAR=ordered["WAR"].shift(periods=-1))

# Run next_season on each player's rows (grouped by "IDfg") and recombine the results
batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [None]:
# Preview each season's WAR next to its Next_WAR target
batting.loc[:, ["Name", "Season", "WAR", "Next_WAR"]]

In [None]:
# Number of missing values in each column
null_count = batting.isna().sum(axis=0)

In [None]:
null_count  # Display the per-column count of missing values

In [None]:
# Names of the columns that have no missing values at all
complete_cols = batting.columns[null_count == 0].tolist()
# Keep those columns plus the "Next_WAR" target; copy so the result is an independent frame
batting = batting.loc[:, complete_cols + ["Next_WAR"]].copy()

In [None]:
batting  # Display the current batting DataFrame

In [None]:
# List the columns whose dtype is "object"
batting.dtypes.loc[batting.dtypes == "object"]

In [None]:
# Remove the string columns that are not needed; only player name and team should remain.
# errors="ignore" keeps this cell safe to re-run and tolerant of either column
# already being absent (a plain `del` raises KeyError in both cases).
batting = batting.drop(columns=["Age Rng", "Dol"], errors="ignore")

In [None]:
# Encode each team abbreviation as a number: cast "Team" to a pandas
# categorical and store its integer category codes in "team_code"
batting = batting.assign(team_code=lambda frame: frame["Team"].astype("category").cat.codes)

In [None]:
# Keep a full copy first: the rows with an empty "Next_WAR" are about to be
# dropped, but they could still be useful for predicting future seasons
batting_full = batting.copy(deep=True)
# Discard every row that contains any missing value
batting = batting.dropna(axis=0, how="any")

In [None]:
# Going to run feature selection to optimize model's accuracy
from sklearn.linear_model import Ridge  # Ridge regression model
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

# alpha is the regularization strength (the penalty often written as lambda)
rr = Ridge(alpha=1)

# Split data into 3 parts and make predictions for each part in a time-series aware way
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the rows
# are in chronological order when the splitter is used - confirm before fitting.
split = TimeSeriesSplit(n_splits=3)
