In [None]:
#Step 1: Add new game results
#Step 2: Update all attributes including L10 games 
#Step 4: Hyperparameter tune this (below)
#Step 5: Add hyperparameter tuning to other notebook
#Step 7: Run the Bracket Code with the seedings put in to both cells
#Step 8: Make a small sample game results bracket once done


In [None]:
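# Hyperparameter tuning, kept commented out until the data is finalized.
# To run it, first execute the training cell below so that `xgb`, `X` and `y` exist, then uncomment.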
#from sklearn.model_selection import GridSearchCV

 #Define the hyperparameter grid
#param_grid = {
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 5, 7],
#     'n_estimators': [50, 100, 200],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0]
# }

# Initialize the XGBoost classifier
#xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Perform grid search
#grid_search = GridSearchCV(
#     estimator=xgb_model,
#     param_grid=param_grid,
#     scoring='accuracy',
#     cv=5,  # 5-fold cross-validation
#     verbose=2,
#     n_jobs=-1
 #)

# Fit the grid search to the data
#grid_search.fit(X, y)

#Print the best parameters and score
#print("Best Parameters:", grid_search.best_params_)
#print("Best Score:", grid_search.best_score_)



In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import shap

# Load data
# Both CSVs live in the same folder; change DATA_DIR to run the notebook on another machine.
DATA_DIR = '/Users/a.j.sager/Desktop/March Madness Code'
game_results = pd.read_csv(f'{DATA_DIR}/game_results_2025.csv')
team_stats = pd.read_csv(f'{DATA_DIR}/2024-2025 March Madness Data.csv', encoding="ISO-8859-1")

# Merge datasets on team 1 and team 2.
# After both merges, team_1's stats keep their plain column names and
# team_2's stats carry the "_team2" suffix.
merged = game_results.merge(
    team_stats, left_on="team_1", right_on="Team", how="left", suffixes=("", "_team1")
).merge(
    team_stats, left_on="team_2", right_on="Team", how="left", suffixes=("", "_team2")
)

# Drop redundant 'Team' columns
merged = merged.drop(columns=["Team", "Team_team2"])

# Calculate feature differences (team_1 minus team_2)
# Rest of Features are redacted
stat_columns = [
    "PPG", "OPPG", "TO/Game"
]
for col in stat_columns:
    merged[f"{col}_diff"] = merged[f"{col}"] - merged[f"{col}_team2"]

# Drop rows where the 'outcome' label is missing; .copy() so the column
# assignments below write to an independent frame.
merged = merged.dropna(subset=['outcome']).copy()

merged['game_date'] = pd.to_datetime(merged['game'])

# Compute time-based sample weights: 1.0 for the earliest game, rising
# linearly to 2.0 for the latest one.
merged['days_since_start'] = (merged['game_date'] - merged['game_date'].min()).dt.days
season_length_days = merged['days_since_start'].max()
if season_length_days > 0:
    merged['game_weight'] = 1 + (merged['days_since_start'] / season_length_days)
else:
    # All games share one date (or no date is known): dividing by zero would
    # turn every weight into NaN, so fall back to uniform weights.
    merged['game_weight'] = 1.0
# Games without a usable date get the baseline weight instead of NaN.
merged['game_weight'] = merged['game_weight'].fillna(1.0)

# Define features and target
X = merged[[f"{col}_diff" for col in stat_columns]]
y = merged["outcome"]
weights = merged["game_weight"]  # Use these weights for training

# Split data into train and test sets. The weights are split alongside X and y
# so each training row keeps its own weight; the row split itself depends only
# on the number of rows and random_state.
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(
    X, y, weights, test_size=0.2, random_state=42
)

# Set the best hyperparameters obtained from grid search
best_params = {
    'learning_rate': 0.1,
    'max_depth': 5,
    'n_estimators': 50,
    'subsample': 0.6,
    'colsample_bytree': 1.0
}

# Initialize the XGBoost classifier with the best parameters
model = xgb.XGBClassifier(**best_params, use_label_encoder=False, eval_metric="logloss")

# Fit the model on the training data, giving more recent games more influence.
# (Previously the weights were computed but never passed to fit.)
model.fit(X_train, y_train, sample_weight=weights_train)

# Predict on the test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Perform 5-fold cross-validation and evaluate model.
# NOTE(review): these folds are fitted WITHOUT sample weights. Forwarding them
# needs `fit_params=` or `params=` depending on the installed scikit-learn
# version, so it is left out here - confirm the version and add it if the CV
# score should describe the weighted model.
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.2f}")
print(f"Standard Deviation of CV Accuracy: {cv_scores.std():.2f}")

# Feature importance plot. Pass the axes explicitly: without `ax=`,
# plot_importance draws on a figure of its own and the sized figure stays empty.
fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(model, ax=ax, max_num_features=10, importance_type='gain', title='Feature Importance')
plt.show()

# SHAP Visualization (after fitting the model)
# Initialize SHAP explainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Summary plot
shap.summary_plot(shap_values, X, plot_type="bar")

# Detailed force plot for the first prediction
# NOTE(review): this call is not the last expression of the cell, so its return
# value is discarded and the interactive plot is presumably never rendered -
# move it to its own cell or wrap it in display(...) to see it.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0], X.iloc[0])
# Function to predict winner and assign spread

# Win-probability thresholds (checked from highest to lowest) and the spread
# label quoted for team_1 at or above each threshold.
# NOTE(review): the table is not symmetric around 0.50 (e.g. 0.80 -> "-10.5"
# but 0.20 -> "+8.5"), so A-vs-B and B-vs-A can quote different magnitudes.
# Kept exactly as it was; confirm whether that is intended.
_SPREAD_TABLE = (
    (0.95, "-15.5"), (0.92, "-14.5"), (0.90, "-13.5"), (0.875, "-12.5"),
    (0.825, "-11.5"), (0.80, "-10.5"), (0.75, "-9.5"), (0.725, "-8.5"),
    (0.70, "-7.5"), (0.675, "-6.5"), (0.65, "-5.5"), (0.625, "-4.5"),
    (0.60, "-3.5"), (0.575, "-2.5"), (0.55, "-1.5"), (0.525, "-0.5"),
    (0.50, "Even"),
    (0.475, "+0.5"), (0.45, "+1.5"), (0.425, "+2.5"), (0.40, "+3.5"),
    (0.375, "+4.5"), (0.35, "+5.5"), (0.30, "+6.5"), (0.25, "+7.5"),
    (0.20, "+8.5"), (0.15, "+9.5"), (0.10, "+10.5"), (0.05, "+11.5"),
)
_SPREAD_FLOOR = "+12.5"  # For probabilities closer to 0


def _team_row(team_name):
    """Return the first `team_stats` row whose 'Team' equals `team_name`."""
    rows = team_stats[team_stats["Team"] == team_name]
    if rows.empty:
        # Same exception type the bare `.iloc[0]` used to raise, but with a
        # message that says which name failed.
        raise IndexError(
            f"No stats found for team {team_name!r}; check the spelling against the 'Team' column"
        )
    return rows.iloc[0]


def _spread_label(prob):
    """Map team_1's win probability to its spread label via _SPREAD_TABLE."""
    for threshold, label in _SPREAD_TABLE:
        if prob >= threshold:
            return label
    return _SPREAD_FLOOR


def predict_winner_with_spread(team_1, team_2):
    """Print team_1's probability of beating team_2 and the matching spread.

    The model is always queried with the team that has the lower (better)
    "NET Ranking" first; the result is then converted back so that both
    printed lines refer to `team_1` as given by the caller.

    Parameters:
        team_1: name of the first team, as it appears in team_stats["Team"].
        team_2: name of the second team, as it appears in team_stats["Team"].

    Returns:
        None. Two lines are printed.

    Raises:
        IndexError: if either name is not present in team_stats["Team"].
    """
    # Get stats for both teams
    team_1_stats = _team_row(team_1)
    team_2_stats = _team_row(team_2)

    # Put the better-ranked team first (lower NET Ranking is better)
    swapped = not (team_1_stats["NET Ranking"] <= team_2_stats["NET Ranking"])
    if swapped:
        team_1_stats, team_2_stats = team_2_stats, team_1_stats

    # Compute feature differences, labelled with the same "<stat>_diff" column
    # names the model was trained on, so no feature-name mismatch can occur.
    features = pd.DataFrame(
        [[team_1_stats[col] - team_2_stats[col] for col in stat_columns]],
        columns=[f"{col}_diff" for col in stat_columns],
    )

    # Probability that the first-listed (better-ranked) team wins
    prob = model.predict_proba(features)[0][1]

    # Ensure prob is within the expected range [0, 1]
    prob = max(0, min(prob, 1))

    # Convert back to the probability that the caller's team_1 wins
    if swapped:
        prob = 1 - prob

    # The label already carries the correct sign for team_1, so no further
    # adjustment is applied. The former post-processing step only ever changed
    # one case: it turned "Even" into "-ven" for swapped match-ups.
    spread = _spread_label(prob)

    # Always print in the original input order
    print(f"The chance of {team_1} beating {team_2} is {prob:.2%}")
    print(f"Spread: {team_1} {spread}")



In [None]:
# Get input and call the function.
# Surrounding whitespace is removed because the team lookup is an exact string
# match, so a stray space in the typed name would otherwise fail it.
team_1 = input("Enter Team 1: ").strip()
team_2 = input("Enter Team 2: ").strip()

predict_winner_with_spread(team_1, team_2)

In [None]:
# Normalise the team table in one pass:
#   1. strip leading/trailing whitespace from every team name,
#   2. discard rows that have no team name,
#   3. keep only the first row for each team name.
# NOTE(review): in top-to-bottom order this cell runs after the merge/training
# cell, so the merge above saw the un-stripped names - confirm that is intended.
team_stats = (
    team_stats
    .assign(Team=team_stats['Team'].str.strip())
    .dropna(subset=['Team'])
    .drop_duplicates(subset=['Team'])
)



In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Count, for every team, how many of the other teams the model picks it to beat.
#
# All ordered pairs are scored in ONE predict_proba call instead of one call
# per pair, and no loop variables named team_1 / team_2 are left behind to
# overwrite the values typed into the input cell.

# One stats row per team name: rows without a name are ignored and, if a name
# appears more than once, its first row is used.
ranked_teams = team_stats.dropna(subset=["Team"]).drop_duplicates(subset=["Team"])
team_names = ranked_teams["Team"].tolist()
stat_matrix = ranked_teams[stat_columns].to_numpy(dtype=float)
n_teams = len(team_names)

# pairwise_diffs[i, j, :] = stats(team i) - stats(team j)
pairwise_diffs = stat_matrix[:, np.newaxis, :] - stat_matrix[np.newaxis, :, :]

# Flatten to one row per ordered pair, labelled with the "<stat>_diff" column
# names the model was trained on.
pair_features = pd.DataFrame(
    pairwise_diffs.reshape(n_teams * n_teams, len(stat_columns)),
    columns=[f"{col}_diff" for col in stat_columns],
)

if n_teams:
    # win_probability[i, j] = predicted probability that team i beats team j
    win_probability = model.predict_proba(pair_features)[:, 1].reshape(n_teams, n_teams)
else:
    win_probability = np.zeros((0, 0))

# A "win" is a predicted probability strictly above 0.5; a team never plays itself.
predicted_win = win_probability > 0.5
np.fill_diagonal(predicted_win, False)

# Dictionary of wins per team, in team_stats order
team_wins = dict(zip(team_names, predicted_win.sum(axis=1).tolist()))

# Create a DataFrame from the dictionary
win_counts = pd.DataFrame(list(team_wins.items()), columns=["Team", "Wins"])
win_counts = win_counts.sort_values(by="Wins", ascending=False)




In [None]:
# Display the first 20 rows of win_counts (sorted by "Wins", highest first, in the cell that builds it).
win_counts.head(20)