In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Step 1: Load the dataset.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists.
file_path = 'C:/Users/RaymondCarpenter/Documents/GitHub/14thstreetanalytics/baseball/ws_winners_2024_teams.csv'
data = pd.read_csv(file_path)

# Step 2: Split into target and candidate features.
# The target is the 'ws_winner' column. Features are the float64/int64 columns
# of `data` with 'ws_winner' removed; every other dtype is dropped.
y = data['ws_winner']
numeric_data = (
    data
    .select_dtypes(include=['float64', 'int64'])
    .drop(columns=['ws_winner'])
)

# Step 3: Report how many feature cells are missing before imputation.
print(f"Total missing values: {numeric_data.isna().sum().sum()}")

# Step 4: Fill each column's gaps with that column's median.
# `numeric_data` keeps the raw (unfilled) values; `X` is the imputed copy.
X = numeric_data.fillna(numeric_data.median())

# Step 5: Standardise the features. The fitted scaler is kept so the same
# transformation can be re-applied to other rows later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Step 6: Fit a 100-tree random forest on every row of `data`
# (random_state is fixed so repeated runs build the same forest).
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_scaled, y)

# Pair each feature column with the importance score the fitted forest gave it,
# ordered from most to least important. The original positional index is kept,
# so the printed index shows each feature's column position in X.
feature_importances = rf_clf.feature_importances_
important_features = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Show only the ten highest-ranked features.
print("Top 10 important features:")
print(important_features.iloc[:10])

# Score the 2024 teams with the fitted model and rank them.
#
# NOTE(review): rf_clf was fit on every row of `data`, which includes the 2024
# rows selected here, so these probabilities are in-sample. Confirm that is
# intended before reading them as forecasts.

# Select rows whose team label contains the literal text "2024".
# - na=False: a missing 'Tm' value counts as "no match" instead of putting NaN
#   into the mask (which makes boolean indexing raise).
# - regex=False: plain substring match; "2024" is not meant as a pattern.
# - .copy(): take an independent frame so that adding a column below does not
#   trigger pandas' SettingWithCopyWarning.
team_2024 = data[data['Tm'].str.contains("2024", na=False, regex=False)].copy()

# Use exactly the feature columns the model was trained on, and impute gaps the
# same way the training matrix was imputed. X has already been median-filled,
# and filling with the median does not move the median, so X.median() equals
# the per-column medians used at training time.
team_2024_important = team_2024[X.columns].fillna(X.median())

# Re-apply the scaler fitted on the training features (transform only, no refit).
team_2024_important_scaled = scaler.transform(team_2024_important)

# Column 1 of predict_proba corresponds to the second entry of rf_clf.classes_
# (presumably ws_winner == 1; verify against the label encoding).
predictions = rf_clf.predict_proba(team_2024_important_scaled)[:, 1]

team_2024['Win_Probability'] = predictions

# Highest probability first.
ranked_teams = team_2024[['Tm', 'Win_Probability']].sort_values(by='Win_Probability', ascending=False)

print("\n2024 Teams Ranked by World Series Win Probability:\n", ranked_teams)


Total missing values: 4
Top 10 important features:
   Feature  Importance
0     #Bat    0.079689
72       A    0.065120
8       2B    0.064309
24      SH    0.048978
26     IBB    0.046263
70      Ch    0.041278
63    #Fld    0.038649
56    WHIP    0.035667
38      CG    0.034242
28      #P    0.028252

2024 Teams Ranked by World Series Win Probability:
                             Tm  Win_Probability
14  2024 Philadelphia Phillies             0.26
15    2024 Los Angeles Dodgers             0.21
18         2024 Detroit Tigers             0.16
23       2024 New York Yankees             0.12
22     2024 Kansas City Royals             0.12
21      2024 Baltimore Orioles             0.11
20    2024 Cleveland Guardians             0.11
17       2024 San Diego Padres             0.10
19         2024 Houston Astros             0.10
16      2024 Milwaukee Brewers             0.06


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_2024['Win_Probability'] = predictions
