In [1]:
import json
import logging
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4 import Comment

In [96]:
# Configure file logging for this notebook.
# logging.basicConfig opens the log file immediately, so the parent directory
# is created first; without it a fresh checkout fails with FileNotFoundError.
LOG_PATH = "./logs/playerAnalysis.log"
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)

# filemode='w' truncates the log file when the handler is installed.
logging.basicConfig(filename=LOG_PATH,
                    format='%(asctime)s %(message)s',
                    filemode='w')

# Use the root logger so the handler installed by basicConfig applies.
logger = logging.getLogger()

# Setting the threshold of logger to DEBUG
logger.setLevel(logging.DEBUG)

In [97]:
# Load the static inputs.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) instead of
# relying on the platform default, which is not UTF-8 on every OS and would
# mis-decode any non-ASCII characters such as accented player names.
with open('./static_data/players.json', 'r', encoding='utf-8') as file:
    players = json.load(file)

# Consumed by the flattening step as {player: {key: value, ...}}.
with open("./static_data/basic_player_data.json", 'r', encoding='utf-8') as file:
    player_stats = json.load(file)

In [None]:
# with open("./static_data/all_player_data.json", 'r') as file:
#     next_batch = json.load(file)

# print(len(player_stats))
# player_stats.update(next_batch)
# print(len(player_stats))

# with open("./static_data/all_player_data_copy.json", "w") as file:
#     json.dump(player_stats, file, indent=4)

1829
1830


In [98]:
# Target Python type for each stat key. The flattening step looks keys up
# with .get(key, str), so anything not listed here is kept as a string.
key_data_types = dict(
    # General player info
    age=int,
    team_name_abbr=str,
    comp_name_abbr=str,
    pos=str,
    awards=str,
    salary_info=float,

    # Pitching stats
    p_war=float,
    p_w=int,
    p_l=int,
    p_win_loss_perc=float,
    p_earned_run_avg=float,
    p_g=int,
    p_gs=int,
    p_gf=int,
    p_cg=int,
    p_sho=int,
    p_sv=int,
    p_ip=float,
    p_h=float,
    p_r=float,
    p_er=float,
    p_hr=int,
    p_bb=int,
    p_ibb=int,
    p_so=int,
    p_hbp=int,
    p_bk=int,
    p_wp=int,
    p_bfp=int,
    p_earned_run_avg_plus=int,
    p_fip=float,
    p_whip=float,
    p_hits_per_nine=float,
    p_hr_per_nine=float,
    p_bb_per_nine=float,
    p_so_per_nine=float,
    p_strikeouts_per_base_on_balls=float,
    p_g_per_dollar=float,
    p_war_per_dollar=float,
    p_w_per_dollar=float,
    p_ip_per_dollar=float,
    p_so_per_dollar=float,
    p_so_per_nine_per_dollar=float,
    p_earned_run_avg_plus_per_dollar=float,

    # Batting stats
    b_war=float,
    b_games=int,
    b_pa=int,
    b_ab=int,
    b_r=int,
    b_h=int,
    b_doubles=int,
    b_triples=int,
    b_hr=int,
    b_rbi=int,
    b_sb=int,
    b_cs=int,
    b_bb=int,
    b_so=int,
    b_batting_avg=float,
    b_onbase_perc=float,
    b_slugging_perc=float,
    b_onbase_plus_slugging=float,
    b_onbase_plus_slugging_plus=int,
    b_roba=float,
    b_rbat_plus=int,
    b_tb=int,
    b_gidp=int,
    b_hbp=int,
    b_sh=int,
    b_sf=int,
    b_ibb=int,
    b_war_per_dollar=float,
    b_h_per_dollar=float,
    b_hr_per_dollar=float,
    b_doubles_per_dollar=float,
    b_triples_per_dollar=float,
    b_sb_per_dollar=float,
    b_bb_per_dollar=float,
    b_tb_per_dollar=float,
    b_games_per_dollar=float,
    ops_plus_per_dollar=float,
)

In [158]:
# Step 1: Flatten the nested dictionary into one row per player and year.
# Each player's mapping is read positionally: the LAST entry's value is taken
# as the position and every earlier entry is treated as (year -> stats dict).
pitcher_rows = []
batter_rows = []
for player, years in player_stats.items():
    entries = list(years.items())
    if not entries:
        # Nothing recorded for this player; indexing entries[-1] would raise IndexError.
        continue

    position = entries[-1][1]
    for year, stats in entries[:-1]:
        row = {'player': player, 'year': year, 'position': position}
        for key, value in stats.items():
            # Get the target data type
            target_type = key_data_types.get(key, str)  # Default to string if type not found
            # Convert value to the target type; None / "null" are stored as missing.
            try:
                row[key] = target_type(value) if value not in [None, "null"] else None
            except (ValueError, TypeError):
                # ValueError: unparsable text (e.g. "" or "5.0" for an int key).
                # TypeError: non-scalar value (e.g. a list or dict passed to float()).
                # Either way the key is left out of the row, which pandas fills
                # as missing, instead of aborting the whole flatten.
                logger.warning(f"Error converting key '{key}' with value '{value}' to {target_type}")

        # Rows whose position is neither "pitching" nor "batting" are not kept.
        if (position == "pitching"):
            pitcher_rows.append(row)
        elif (position == "batting"):
            batter_rows.append(row)

# Convert to DataFrame
pitcher_df = pd.DataFrame(pitcher_rows)
batter_df = pd.DataFrame(batter_rows)

# Step 2: Select numeric columns
# numeric_cols = df.select_dtypes(include=['float', 'int']).columns

# Step 3: Group by player and year, calculate mean and standard deviation
# grouped = df.groupby(['player', 'year'])[numeric_cols].agg(['mean', 'std']).reset_index()

# Output
# print(grouped)

In [159]:
# Step 2: per-year mean and standard deviation of every numeric column.
# Batters first: aggregate, move "year" back to a column, then collapse the
# two-level (column, statistic) header into "<column>_<statistic>" names.
numeric_cols = batter_df.select_dtypes(include=['float', 'int']).columns
batter_yearly = batter_df.groupby(['year'])[numeric_cols].agg(['mean', 'std'])
aggregate_batter_df = batter_yearly.reset_index()
aggregate_batter_df.columns = [
    "_".join(part for part in col if part)  # drops the empty second level of ("year", "")
    for col in aggregate_batter_df.columns
]

# Same treatment for pitchers; numeric_cols is rebound to the pitcher columns.
numeric_cols = pitcher_df.select_dtypes(include=['float', 'int']).columns
pitcher_yearly = pitcher_df.groupby(['year'])[numeric_cols].agg(['mean', 'std'])
aggregate_pitcher_df = pitcher_yearly.reset_index()
aggregate_pitcher_df.columns = [
    "_".join(part for part in col if part)
    for col in aggregate_pitcher_df.columns
]

In [160]:
# Index both aggregate tables by year.
# Guarded and reassigned (no inplace=True) so the cell can be re-run: once
# "year" is already the index, a second set_index("year") raises KeyError.
if aggregate_batter_df.index.name != "year":
    aggregate_batter_df = aggregate_batter_df.set_index("year")
if aggregate_pitcher_df.index.name != "year":
    aggregate_pitcher_df = aggregate_pitcher_df.set_index("year")

In [161]:
# Serialise each aggregate table to a pretty-printed JSON string;
# orient="index" keys the output by the frame's index labels.
aggregate_batter_json, aggregate_pitcher_json = (
    table.to_json(orient="index", indent=4)
    for table in (aggregate_batter_df, aggregate_pitcher_df)
)

In [116]:
# Batting metrics that get a "<name>_score" z-score column, in order.
batting_stat_names = """
    b_war b_games b_pa b_ab b_r b_h b_doubles b_triples b_hr b_rbi
    b_sb b_cs b_bb b_so
    b_batting_avg b_onbase_perc b_slugging_perc b_onbase_plus_slugging
    b_onbase_plus_slugging_plus b_roba b_rbat_plus
    b_tb b_gidp b_hbp b_sh b_sf b_ibb
    b_war_per_dollar b_h_per_dollar b_hr_per_dollar b_doubles_per_dollar
    b_triples_per_dollar b_sb_per_dollar b_bb_per_dollar b_tb_per_dollar
    b_games_per_dollar ops_plus_per_dollar
""".split()

# Pitching metrics that get a "<name>_score" z-score column, in order.
pitching_stat_names = """
    p_war p_w p_l p_win_loss_perc p_earned_run_avg
    p_g p_gs p_gf p_cg p_sho p_sv p_ip
    p_h p_r p_er p_hr p_bb p_ibb p_so p_hbp p_bk p_wp p_bfp
    p_earned_run_avg_plus p_fip p_whip
    p_hits_per_nine p_hr_per_nine p_bb_per_nine p_so_per_nine
    p_strikeouts_per_base_on_balls
    p_g_per_dollar p_war_per_dollar p_w_per_dollar p_ip_per_dollar
    p_so_per_dollar p_so_per_nine_per_dollar p_earned_run_avg_plus_per_dollar
""".split()

In [162]:
# Attach each year's mean/std columns to every player row of that year.
# Aggregate columns left over from an earlier run of this cell are dropped
# first: merging twice would otherwise create "_x"/"_y" suffixed duplicates,
# and the z-score cells would then fail with KeyError on "<stat>_mean".
# "year" is excluded from the drop so the join key always survives.
batter_agg_cols = [col for col in aggregate_batter_df.columns if col != "year"]
batter_df = (
    batter_df
    .drop(columns=batter_agg_cols, errors="ignore")
    .merge(aggregate_batter_df, on="year")
)

pitcher_agg_cols = [col for col in aggregate_pitcher_df.columns if col != "year"]
pitcher_df = (
    pitcher_df
    .drop(columns=pitcher_agg_cols, errors="ignore")
    .merge(aggregate_pitcher_df, on="year")
)

In [163]:
# Z-score of every batting metric against its year: (value - mean) / std.
for batting_stat in batting_stat_names:
    observed = batter_df[batting_stat]
    yearly_mean = batter_df[f"{batting_stat}_mean"]
    yearly_std = batter_df[f"{batting_stat}_std"]
    batter_df[f"{batting_stat}_score"] = observed.sub(yearly_mean).div(yearly_std)

In [164]:
# Z-score of every pitching metric against its year: (value - mean) / std.
for pitching_stat in pitching_stat_names:
    observed = pitcher_df[pitching_stat]
    yearly_mean = pitcher_df[f"{pitching_stat}_mean"]
    yearly_std = pitcher_df[f"{pitching_stat}_std"]
    pitcher_df[f"{pitching_stat}_score"] = observed.sub(yearly_mean).div(yearly_std)

In [174]:
# Start every player found in either table with its own empty per-year mapping.
all_players = {}
for player_name in batter_df["player"].to_list() + pitcher_df["player"].to_list():
    all_players[player_name] = {}

In [175]:
# Copy every row into all_players[player][year]. Batters are processed
# before pitchers. The positional slice [2:] leaves out the first two
# columns of the row, which are read above it as "player" and "year".
for stats_frame in (batter_df, pitcher_df):
    for i in range(len(stats_frame)):
        row = stats_frame.iloc[i]  # fetch the row once and reuse it
        all_players[row["player"]][row["year"]] = row[2:].to_dict()

In [176]:
# Persist the nested player -> year -> stats mapping, indented by 4 spaces.
# NOTE(review): missing values are written with json's default NaN handling,
# which is not strict JSON — confirm that every consumer accepts it.
with open("./static_data/all_player_data.json", "w") as out_file:
    out_file.write(json.dumps(all_players, indent=4))

In [182]:
def _top_ten(frame, year, score_column):
    """Return the first ten rows of `frame` for `year`, sorted by `score_column` descending."""
    season_rows = frame[frame["year"] == year]
    ranked = season_rows.sort_values(by=score_column, ascending=False)
    return ranked[:10]


# Ten best WAR-per-dollar z-scores per season; years are compared as strings.
batting_leaders_2022 = _top_ten(batter_df, "2022", "b_war_per_dollar_score")
batting_leaders_2023 = _top_ten(batter_df, "2023", "b_war_per_dollar_score")
batting_leaders_2024 = _top_ten(batter_df, "2024", "b_war_per_dollar_score")

pitching_leaders_2022 = _top_ten(pitcher_df, "2022", "p_war_per_dollar_score")
pitching_leaders_2023 = _top_ten(pitcher_df, "2023", "p_war_per_dollar_score")
pitching_leaders_2024 = _top_ten(pitcher_df, "2024", "p_war_per_dollar_score")

In [185]:
# json.dump cannot serialise a DataFrame: it raises TypeError after open(..., "w")
# has already truncated the target, leaving an empty file behind. Let pandas
# write the JSON instead, as a list with one object per leader row.
with open("./static_data/batting_leaders_2022.json", "w") as file:
    batting_leaders_2022.to_json(file, orient="records", indent=4)

Unnamed: 0,player,year,position,age,team_name_abbr,comp_name_abbr,p_war,p_w,p_l,p_win_loss_perc,...,p_bb_per_nine_score,p_so_per_nine_score,p_strikeouts_per_base_on_balls_score,p_g_per_dollar_score,p_war_per_dollar_score,p_w_per_dollar_score,p_ip_per_dollar_score,p_so_per_dollar_score,p_so_per_nine_per_dollar_score,p_earned_run_avg_plus_per_dollar_score
1771,paul skenes,2024,pitching,22.0,PIT,NL,5.9,11.0,3.0,0.786,...,-0.428882,1.058594,1.251056,0.44214,7.272163,3.590589,2.711539,3.817497,1.33667,1.884113
1564,cole ragans,2024,pitching,26.0,KCR,AL,4.9,11.0,9.0,0.55,...,-0.168606,0.818093,0.230017,0.934571,5.86266,3.513357,4.027397,5.144488,1.111093,0.477659
193,ronel blanco,2024,pitching,30.0,HOU,AL,4.5,13.0,6.0,0.684,...,-0.038468,0.165304,-0.228935,0.828765,5.384674,4.294962,3.556459,3.650181,0.653583,0.585434
412,garrett crochet,2024,pitching,25.0,CHW,AL,4.1,6.0,12.0,0.333,...,-0.480938,1.539596,1.777045,0.828303,4.545373,1.492967,2.765809,4.450049,1.445433,0.115857
871,tanner houck,2024,pitching,28.0,BOS,AL,3.5,9.0,10.0,0.474,...,-0.376827,-0.212626,0.168136,0.783319,3.990487,2.68586,3.724331,3.220877,0.329791,0.498797
1304,bryce miller,2024,pitching,25.0,SEA,AL,3.4,12.0,8.0,0.6,...,-0.428882,0.027875,0.472385,0.87985,3.963887,3.898149,3.878607,3.766977,0.546459,0.304938
1686,cristopher sánchez,2024,pitching,27.0,PHI,NL,3.4,11.0,9.0,0.55,...,-0.428882,-0.281341,0.307368,0.87772,3.958721,3.514736,3.901607,3.282133,0.321676,0.252387
1,andrew abbott,2024,pitching,25.0,CIN,NL,3.3,10.0,10.0,0.5,...,-0.116551,-0.350055,-0.357854,0.554234,3.893775,3.192745,2.831605,2.288418,0.299357,0.187636
673,luis gil,2024,pitching,26.0,NYY,AL,3.1,15.0,7.0,0.682,...,0.19578,0.577592,-0.342384,0.793198,3.649381,5.130129,3.193353,3.844616,0.984605,0.171046
865,tyler holton,2024,pitching,28.0,DET,AL,3.1,7.0,2.0,0.778,...,-0.585048,-0.384413,0.848828,2.864858,3.552586,1.986031,1.621611,1.245028,0.236173,1.352966
