# In [None]:
import statsapi
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Spectral11
import os
import requests
from bs4 import BeautifulSoup
import statsapi


# Read the four input tables from CSV files in the current working directory.
batting, pitching, teams, payroll = (
    pd.read_csv(csv_name)
    for csv_name in ('Batting.csv', 'Pitching.csv', 'Teams.csv', 'Team_Payroll.csv')
)


# Restrict the season-keyed tables to the inclusive [min_year, max_year]
# window. The payroll table is not filtered at this point.
min_year, max_year = 2011, 2024
batting = batting.loc[(batting['yearID'] >= min_year) & (batting['yearID'] <= max_year)]
pitching = pitching.loc[(pitching['yearID'] >= min_year) & (pitching['yearID'] <= max_year)]
teams = teams.loc[(teams['yearID'] >= min_year) & (teams['yearID'] <= max_year)]


# Season totals of the basic hitting counts for every (yearID, teamID) pair.
team_batting = (
    batting
    .groupby(['yearID', 'teamID'])
    .agg(dict.fromkeys(['H', 'HR', 'BB', 'SO', 'AB', 'RBI', 'R'], 'sum'))
    .reset_index()
)


# Attach wins (W) and games (G) from the teams table; rows are matched on
# the two key columns with merge's default inner join.
team_stats = team_batting.merge(
    teams[['yearID', 'teamID', 'W', 'G']],
    on=['yearID', 'teamID'],
)


# Pairwise correlations between all numeric columns; the 'W' column is
# printed in descending order of correlation.
correlation_batting = team_stats.corr(numeric_only=True)
print("\nCorrelation with Wins (W) - Batting Stats:")
print(correlation_batting['W'].sort_values(ascending=False))

# The same matrix drawn as an annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_batting,
    fmt='.2f',
    cmap='coolwarm',
    annot=True,
)
plt.title('Correlation Matrix of Batting Stats vs Wins')
plt.tight_layout()
plt.show()


# Team-season pitching totals: earned runs (ER) and outs recorded (IPouts).
team_pitching = (
    pitching
    .groupby(['yearID', 'teamID'])
    .agg(ER=('ER', 'sum'), IPouts=('IPouts', 'sum'))
    .reset_index()
)
# Innings pitched are outs / 3; ERA is earned runs scaled to nine innings.
team_pitching['ERA'] = (team_pitching['ER'] * 9) / (team_pitching['IPouts'] / 3)
# Only the join keys and the derived ERA are kept.
team_pitching = team_pitching[['yearID', 'teamID', 'ERA']]


# Selected team-level columns joined with the computed ERA on
# (yearID, teamID); any row containing a missing value is discarded.
team_features = (
    teams[['yearID', 'teamID', 'W', 'G', 'R', 'RA', 'HR', 'H', 'SO', 'SV', 'attendance']]
    .merge(team_pitching, on=['yearID', 'teamID'])
    .dropna()
)

# Correlation of every numeric feature with every other; the 'W' column is
# printed in descending order.
correlation_teams = team_features.corr(numeric_only=True)
print("\nCorrelation with Wins (W) - Full Team Stats:")
print(correlation_teams['W'].sort_values(ascending=False))

# Annotated heatmap of the full matrix.
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_teams,
    fmt='.2f',
    cmap='coolwarm',
    annot=True,
)
plt.title('Correlation Matrix of Full Team Stats vs Wins')
plt.tight_layout()
plt.show()


# Rename the payroll sheet's columns to the names used by the other tables so
# it can be joined on (yearID, teamID). After this step `teamID` still holds
# the full club name; it is converted to a code below.
payroll = payroll.rename(columns={
    'Total Payroll Allocations': 'Payroll',
    'Team Name': 'teamID',
    'Year': 'yearID'
})

# Full club name -> Baseball-Reference style abbreviation.
team_name_to_id = {
    "Arizona Diamondbacks": "ARI", "Atlanta Braves": "ATL", "Baltimore Orioles": "BAL", "Boston Red Sox": "BOS",
    "Chicago White Sox": "CHW", "Chicago Cubs": "CHC", "Cincinnati Reds": "CIN", "Cleveland Guardians": "CLE",
    "Colorado Rockies": "COL", "Detroit Tigers": "DET", "Houston Astros": "HOU", "Kansas City Royals": "KCR",
    "Los Angeles Angels": "LAA", "Los Angeles Dodgers": "LAD", "Miami Marlins": "MIA", "Milwaukee Brewers": "MIL",
    "Minnesota Twins": "MIN", "New York Mets": "NYM", "New York Yankees": "NYY", "Oakland Athletics": "OAK",
    "Philadelphia Phillies": "PHI", "Pittsburgh Pirates": "PIT", "San Diego Padres": "SDP", "San Francisco Giants": "SFG",
    "Seattle Mariners": "SEA", "St. Louis Cardinals": "STL", "Tampa Bay Rays": "TBR", "Texas Rangers": "TEX",
    "Toronto Blue Jays": "TOR", "Washington Nationals": "WSN"
}

# Baseball-Reference abbreviation -> Lahman Baseball Database `teamID`, for
# the clubs where the two code sets differ.
br_to_lahman_team_id = {
    "CHW": "CHA", "CHC": "CHN", "KCR": "KCA", "LAD": "LAN", "NYM": "NYN", "NYY": "NYA",
    "SDP": "SDN", "SFG": "SFN", "STL": "SLN", "TBR": "TBA", "WSN": "WAS",
}

# Report club names that have no entry in the mapping rather than losing
# their rows without a trace (the rows are still dropped below, as before).
payroll_team_codes = payroll['teamID'].map(team_name_to_id)
unmapped_team_names = sorted(
    {str(name) for name in payroll.loc[payroll_team_codes.isna(), 'teamID'].dropna()}
)
if unmapped_team_names:
    print(f"Payroll rows dropped for unmapped team names: {unmapped_team_names}")

payroll['teamID'] = payroll_team_codes
# .copy() yields an independent frame, so the column assignments below cannot
# trigger pandas' SettingWithCopyWarning.
payroll = payroll.dropna(subset=['teamID']).copy()

# BUG FIX: the merge below is an inner join on `teamID`. Teams.csv looks like
# a Lahman table (yearID/teamID keys) -- NOTE(review): confirm -- and Lahman
# codes such as NYA/LAN/SLN never equal the Baseball-Reference codes produced
# above, so every season of eleven clubs was silently dropped from `combined`.
# Translate only when the team table contains none of the Baseball-Reference
# specific codes, so data that already uses those codes is left untouched.
if not set(br_to_lahman_team_id).intersection(team_features['teamID']):
    payroll['teamID'] = payroll['teamID'].map(
        lambda code: br_to_lahman_team_id.get(code, code)
    )

# Strip '$' and thousands separators, then convert the amounts to float.
payroll['Payroll'] = payroll['Payroll'].str.replace(r'[$,]', '', regex=True).astype(float)

# Inner join: only team-seasons present in both tables survive.
combined = pd.merge(team_features, payroll, on=['yearID', 'teamID'])


# Response is wins; predictors are the eight team-level columns listed below.
target = combined['W']
features = combined[['R', 'RA', 'ERA', 'HR', 'SO', 'SV', 'attendance', 'Payroll']]

# Hold out 20% of the rows for evaluation; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Ordinary least-squares fit on the training rows, scored on the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

print("\nRegression Results:")
print(f"R^2 Score: {r2_score(y_test, y_pred):.2f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred):.2f}")


# Bubble chart: payroll on x, wins on y, marker size taken from attendance
# and marker colour from ERA; hovering also lists teamID and yearID.
fig = px.scatter(
    combined,
    x='Payroll',
    y='W',
    color='ERA',
    size='attendance',
    hover_data=['teamID', 'yearID'],
    title='Team Payroll vs Wins (Bubble = Attendance, Color = ERA)',
)
fig.show()



def get_batting_average_leaders(year, limit=10):
    """Fetch the batting-average leaderboard for one season as a DataFrame.

    Args:
        year: Season forwarded to ``statsapi.league_leader_data`` as
            ``season``.
        limit: Maximum number of leaders requested (defaults to 10).

    Returns:
        pandas.DataFrame built from the rows returned by the API call, with
        the columns ``rank``, ``name``, ``team`` and ``average``.
    """
    # Request the 'hitting' stat group and the 'qualified' player pool.
    query = {
        'season': year,
        'statGroup': 'hitting',
        'playerPool': 'qualified',
        'limit': limit,
    }
    rows = statsapi.league_leader_data('battingAverage', **query)
    return pd.DataFrame(rows, columns=["rank", "name", "team", "average"])


# Fetch the 2024 leaderboard (default limit of 10) and print it under a heading.
df = get_batting_average_leaders(2024)
print("Top 10 Batting Leaders by Average (2024):", df, sep="\n")




# Make sure the ./data output directory exists before anything is written.
os.makedirs('data', exist_ok=True)

# Each entry: frame to export, destination path, confirmation message.
# The first file goes to the working directory, the second into ./data.
for export_frame, export_path, export_notice in (
    (
        team_stats,
        'team_performance_summary.csv',
        "\nCleaned team-level dataset saved as 'team_performance_summary.csv'",
    ),
    (
        team_features,
        './data/team_features.csv',
        "Team features data saved to './data/team_features.csv'",
    ),
):
    export_frame.to_csv(export_path, index=False)
    print(export_notice)


# Fit a regression tree (seed fixed at 42) on the existing X_train / y_train
# split and score it on the same held-out rows as the linear model.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

print("\nDecision Tree Results:")
print(f"R^2 Score: {r2_score(y_test, y_pred_dt):.2f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred_dt):.2f}")


# Direct Bokeh output to an HTML file.
output_file("team_performance_bokeh.html")

# Payroll-vs-wins scatter backed by the merged `combined` table.
p = figure(
    title="Team Payroll vs Wins",
    x_axis_label='Payroll',
    y_axis_label='Wins',
    height=400,
    width=700,
)
source = ColumnDataSource(combined)
p.scatter(
    'Payroll',
    'W',
    source=source,
    size=8,
    color='blue',
    legend_label="Teams",
)

# Hover tooltip showing the team code, wins and payroll of the point.
hover_fields = [('Team', '@teamID'), ('Wins', '@W'), ('Payroll', '@Payroll')]
p.add_tools(HoverTool(tooltips=hover_fields))

show(p)
