In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

import joblib

# Load the game log. Presumably one row per team per game with the opponent's
# name and score on the same row -- TODO confirm against the CSV.
df = pd.read_csv("/kaggle/input/dataset-match/basketball_matches_with_opponents.csv")

# Encode team names. The encoder is fitted on the union of both name columns so
# that an opponent which never appears in the "team" column cannot make
# transform() raise on an unseen label.
team_encoder = LabelEncoder()
team_encoder.fit(pd.concat([df["team"], df["opponent_team"]], ignore_index=True))
df["team_encoded"] = team_encoder.transform(df["team"])
df["opponent_team_encoded"] = team_encoder.transform(df["opponent_team"])

drop_cols = ["game_id", "team", "opponent_team", "notD1_incomplete"]
df = df.drop(columns=drop_cols, errors="ignore")

# Fill missing values with the column median. numeric_only=True is required:
# without it pandas >= 2.0 raises TypeError when a non-numeric column is present.
df = df.fillna(df.median(numeric_only=True))

# Feature Engineering: Compute key basketball metrics
df["FG2_PCT"] = df["FGM_2"] / df["FGA_2"]
df["FG3_PCT"] = df["FGM_3"] / df["FGA_3"]
df["FT_PCT"] = df["FTM"] / df["FTA"]
df["AST_TO_RATIO"] = df["AST"] / df["TOV"]
df["DREB_RATE"] = df["DREB"] / (df["DREB"] + df["OREB"])
df["OREB_RATE"] = df["OREB"] / (df["DREB"] + df["OREB"])
df["TURNOVER_RATE"] = df["TOV"] / (df["FGA_2"] + df["FGA_3"] + df["FTA"])
df["MARGIN_VICTORY"] = df["team_score"] - df["opponent_team_score"]

# Division by zero yields +/-inf (x / 0) as well as NaN (0 / 0). Map both to 0
# so the estimators below never receive non-finite values.
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Define target variable (1 = Win, 0 = Loss)
df["Win"] = (df["team_score"] > df["opponent_team_score"]).astype(int)

# Prepare training data: drop the target, the scores it is derived from, and the
# raw counts that the engineered ratios above replace.
# NOTE(review): MARGIN_VICTORY stays in X, and Win is exactly MARGIN_VICTORY > 0,
# so the accuracy / AUC printed below are inflated by target leakage. It is kept
# because the later prediction cells feed a forecast MARGIN_VICTORY to this model.
X = df.drop(
    columns=[
        "Win", "team_score", "opponent_team_score", "result",
        "FGM_2", "FGA_2", "FGM_3", "FGA_3", "FTM", "FTA",
        "AST", "TOV", "DREB", "OREB",
    ],
    errors="ignore",
)
y = df["Win"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Save the trained model
joblib.dump(rf_model, "random_forest_model.pkl")
print("Model saved successfully as 'random_forest_model.pkl'")

# Evaluate the model on the held-out 20%
y_pred_probs = rf_model.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, (y_pred_probs > 0.5).astype(int))
auc_roc = roc_auc_score(y_test, y_pred_probs)

print(f"Model Accuracy: {accuracy:.4f}")
print(f"AUC-ROC Score: {auc_roc:.4f}")


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/dataset-match/basketball_matches_with_opponents.csv'

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

# Engineered per-game statistics that the LSTM consumes and forecasts
stat_features = [
    "FG2_PCT",
    "FG3_PCT",
    "FT_PCT",
    "AST_TO_RATIO",
    "DREB_RATE",
    "OREB_RATE",
    "TURNOVER_RATE",
    "MARGIN_VICTORY",
]
sequence_length = 5  # number of consecutive rows fed to the LSTM per sample

# Min-max scale the stat columns into a copy of df; df itself keeps the original scale.
scaler = MinMaxScaler().fit(df[stat_features])
df_scaled = df.copy()
df_scaled[stat_features] = scaler.transform(df[stat_features])

# Function to create sequences for LSTM
def create_sequences(team_df, stat_features, sequence_length):
    """Build sliding-window samples for next-row prediction.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` of the selected
    columns and its target is row ``i + sequence_length``.

    Args:
        team_df: DataFrame whose rows are in the order the windows should follow.
        stat_features: Column names to extract.
        sequence_length: Number of consecutive rows per input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with shapes
        ``(n_windows, sequence_length, n_features)`` and ``(n_windows, n_features)``,
        where ``n_windows = len(team_df) - sequence_length``. When there are not
        enough rows for a single window, both arrays are empty but keep their
        trailing dimensions so they can still be concatenated with other results.
    """
    # Extract the columns once instead of re-slicing the DataFrame on every iteration.
    values = team_df[stat_features].to_numpy()
    n_windows = len(values) - sequence_length

    if n_windows <= 0:
        feature_shape = values.shape[1:]
        return (
            np.empty((0, sequence_length) + feature_shape, dtype=values.dtype),
            np.empty((0,) + feature_shape, dtype=values.dtype),
        )

    X = np.array([values[i:i + sequence_length] for i in range(n_windows)])
    y = np.array([values[i + sequence_length] for i in range(n_windows)])
    return X, y

# Create dataset for LSTM.
# Windows are built one team at a time so that no sequence mixes rows from two
# different teams; this matches inference, which feeds the model the last
# `sequence_length` rows of a single team.
# NOTE(review): assumes each team's rows are already in chronological order -- confirm.
team_windows = [
    create_sequences(team_games, stat_features, sequence_length)
    for _, team_games in df_scaled.groupby("team_encoded", sort=False)
]
# Teams with too few rows contribute no windows; drop them before concatenating.
team_windows = [(seqs, targets) for seqs, targets in team_windows if len(seqs) > 0]
if not team_windows:
    raise ValueError(
        f"No team has more than {sequence_length} rows; cannot build LSTM sequences."
    )

X_lstm = np.concatenate([seqs for seqs, _ in team_windows])
y_lstm = np.concatenate([targets for _, targets in team_windows])

# Split into training/testing sets (first 80% of the windows train, the rest test)
split = int(len(X_lstm) * 0.8)
X_train_lstm, X_test_lstm = X_lstm[:split], X_lstm[split:]
y_train_lstm, y_test_lstm = y_lstm[:split], y_lstm[split:]

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Two stacked LSTM layers followed by a linear layer that emits one value per stat.
lstm_model = Sequential()
lstm_model.add(
    LSTM(
        64,
        activation="relu",
        return_sequences=True,  # the second LSTM needs the full sequence, not just the last step
        input_shape=(sequence_length, len(stat_features)),
    )
)
lstm_model.add(LSTM(32, activation="relu"))
lstm_model.add(Dense(len(stat_features)))

lstm_model.compile(optimizer="adam", loss="mean_squared_error")

# Train the model; the held-out windows double as the validation set.
lstm_model.fit(
    X_train_lstm,
    y_train_lstm,
    epochs=50,
    batch_size=16,
    validation_data=(X_test_lstm, y_test_lstm),
)

# Predict on test data
y_pred_lstm = lstm_model.predict(X_test_lstm)

# Calculate MSE and MAE (in scaled units, since the targets come from df_scaled)
mse = mean_squared_error(y_test_lstm, y_pred_lstm)
mae = mean_absolute_error(y_test_lstm, y_pred_lstm)

print("LSTM Model Evaluation:")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Save trained model
lstm_model.save("lstm_stat_predictor.h5")


Win probability based only on the LSTM-predicted stats

In [None]:
import numpy as np
import tensorflow as tf

# Load trained LSTM model from the HDF5 file written by lstm_model.save(...) in the training cell
lstm_model = tf.keras.models.load_model("lstm_stat_predictor.h5")

def predict_future_stats(team_encoded):
    """Forecast a team's next row of stats with the LSTM.

    Takes the team's last `sequence_length` rows of `df_scaled`, feeds their
    `stat_features` columns to `lstm_model`, and maps the output back to the
    original scale with `scaler.inverse_transform`.

    Args:
        team_encoded: Encoded team id, compared against df_scaled["team_encoded"].

    Returns:
        dict mapping each name in `stat_features` to its predicted value, or
        None (after printing a message) when the team has fewer than
        `sequence_length` rows.
    """
    recent_games = df_scaled.loc[df_scaled["team_encoded"] == team_encoded].tail(sequence_length)
    if len(recent_games) < sequence_length:
        print(f"Not enough historical data for team {team_encoded}")
        return None

    # Add a leading batch axis so the model receives a single sample.
    window = recent_games[stat_features].values[np.newaxis, ...]
    scaled_forecast = lstm_model.predict(window)[0]

    # Convert back to original scale
    forecast = scaler.inverse_transform(scaled_forecast.reshape(1, -1))[0]
    return {stat: value for stat, value in zip(stat_features, forecast)}

def predict_win_probability(team_name, opponent_name):
    """Print the Random Forest win probability for team_name against opponent_name.

    Both teams' stats are forecast with `predict_future_stats`, written into a
    one-row copy of `X`, and scored with `rf_model`. Nothing is returned; the
    probability and both forecasts are printed. An error is printed and the
    function returns early when a name is unknown to `team_encoder` or when
    either forecast is unavailable.
    """
    known_teams = team_encoder.classes_
    if not (team_name in known_teams and opponent_name in known_teams):
        print("Error: One or both team names are not in the dataset.")
        return

    team_encoded, opponent_encoded = team_encoder.transform([team_name, opponent_name])

    # Predict future stats using LSTM
    team_predicted_stats = predict_future_stats(team_encoded)
    opponent_predicted_stats = predict_future_stats(opponent_encoded)
    if any(stats is None for stats in (team_predicted_stats, opponent_predicted_stats)):
        print("Error: Insufficient historical data for prediction.")
        return

    # Use the first training row as a template so every fitted column is present.
    # NOTE(review): columns not overwritten below keep that row's values -- confirm intended.
    input_features = X.head(1).copy()
    for stat in stat_features:
        input_features[stat] = team_predicted_stats[stat]  # Team 1 stats
        input_features[f"opponent_{stat}"] = opponent_predicted_stats[stat]  # Opponent stats
    input_features["team_encoded"] = team_encoded
    input_features["opponent_team_encoded"] = opponent_encoded

    # Keep exactly the training columns, in training order. Any "opponent_*"
    # column that X does not contain is discarded by this selection.
    input_features = input_features.loc[:, X.columns]

    # Probability of class 1 (win) for the single input row
    win_probability = rf_model.predict_proba(input_features)[0, 1]

    # Print results
    print("\n=======================")
    print(f"Matchup: {team_name} vs {opponent_name}")
    print("=======================")
    print(f"Win Probability for {team_name}: {win_probability:.4f}\n")

    sections = (
        (f"=== Predicted Stats for {team_name} ===", team_predicted_stats),
        (f"\n=== Predicted Stats for {opponent_name} ===", opponent_predicted_stats),
    )
    for header, stats in sections:
        print(header)
        for stat, value in stats.items():
            print(f"{stat}: {value:.4f}")

# Example matchup; both names must be present in team_encoder.classes_
predict_win_probability("oklahoma_sooners","baylor_bears")


Win probability from a blend of each team's overall mean stats and the LSTM-predicted stats

In [None]:
import numpy as np
import tensorflow as tf
import joblib

# Reload both trained models from the files written by the training cells
lstm_model = tf.keras.models.load_model("lstm_stat_predictor.h5")
rf_model = joblib.load("random_forest_model.pkl")
def get_mean_stats(team_name):
    """Average the `stat_features` columns of `df` over all rows of one team.

    Args:
        team_name: Team name as known to `team_encoder`.

    Returns:
        dict mapping each stat name to its mean, or None (after printing a
        message) when the name is unknown or the team has no rows in `df`.
    """
    # Check if team exists in encoding
    if team_name not in team_encoder.classes_:
        print(f" Team '{team_name}' not found in team_encoder.")
        return None

    # Rows are matched on the encoded id because the name column is not used for filtering
    (team_code,) = team_encoder.transform([team_name])
    is_team_row = df["team_encoded"] == team_code

    if not is_team_row.any():
        print(f" No historical data available for {team_name}")
        return None

    return df.loc[is_team_row, stat_features].mean().to_dict()

def predict_future_stats(team_encoded):
    """Forecast a team's next row of stats with the LSTM.

    Uses the team's last `sequence_length` rows of `df_scaled` as model input
    and converts the output back to the original scale via `scaler`.

    Args:
        team_encoded: Encoded team id, compared against df_scaled["team_encoded"].

    Returns:
        dict of stat name -> predicted value, or None (after printing a
        message) when fewer than `sequence_length` rows exist for the team.
    """
    history = df_scaled.loc[df_scaled["team_encoded"] == team_encoded].tail(sequence_length)
    if len(history) < sequence_length:
        print(f" Not enough historical data for team {team_encoded}")
        return None

    # Add a leading batch axis so the model receives a single sample.
    model_input = history[stat_features].values[np.newaxis, ...]
    scaled_output = lstm_model.predict(model_input)[0]

    # Undo the min-max scaling
    restored = scaler.inverse_transform(scaled_output.reshape(1, -1))[0]
    return {name: value for name, value in zip(stat_features, restored)}

def predict_win_probability(team_name, opponent_name):
    """Print the win probability of team_name vs opponent_name from blended stats.

    Each stat fed to `rf_model` is 0.6 * the team's overall mean (from
    `get_mean_stats`) plus 0.4 * the LSTM forecast (from `predict_future_stats`).
    Nothing is returned; the probability and all four stat sets are printed.
    An error is printed and the function returns early when a name is unknown
    to `team_encoder` or any of the four stat sets is unavailable.
    """
    known_teams = team_encoder.classes_
    if not (team_name in known_teams and opponent_name in known_teams):
        print(" Error: One or both team names are not in the dataset.")
        return

    team_encoded, opponent_encoded = team_encoder.transform([team_name, opponent_name])

    # Compute mean stats
    team_mean_stats = get_mean_stats(team_name)
    opponent_mean_stats = get_mean_stats(opponent_name)

    # Predict future stats using LSTM
    team_predicted_stats = predict_future_stats(team_encoded)
    opponent_predicted_stats = predict_future_stats(opponent_encoded)

    stat_sets = (team_mean_stats, opponent_mean_stats, team_predicted_stats, opponent_predicted_stats)
    if any(stats is None for stats in stat_sets):
        print(" Error: Insufficient historical data for prediction.")
        return

    def blend(mean_value, predicted_value):
        # Weighted mix: 60% historical mean, 40% LSTM forecast
        return (0.6 * mean_value) + (0.4 * predicted_value)

    # Use the first training row as a template so every fitted column is present.
    # NOTE(review): columns not overwritten below keep that row's values -- confirm intended.
    input_features = X.head(1).copy()
    for stat in stat_features:
        input_features[stat] = blend(team_mean_stats[stat], team_predicted_stats[stat])
        input_features[f"opponent_{stat}"] = blend(opponent_mean_stats[stat], opponent_predicted_stats[stat])
    input_features["team_encoded"] = team_encoded
    input_features["opponent_team_encoded"] = opponent_encoded

    # Keep exactly the training columns, in training order. Any "opponent_*"
    # column that X does not contain is discarded by this selection.
    input_features = input_features.loc[:, X.columns]

    # Probability of class 1 (win) for the single input row
    win_probability = rf_model.predict_proba(input_features)[0, 1]

    # Print results
    print("\n=======================")
    print(f"Matchup: {team_name} vs {opponent_name}")
    print("=======================")
    print(f"Win Probability for {team_name}: {win_probability:.4f}\n")

    sections = (
        (f" Mean Stats for {team_name}:", team_mean_stats),
        (f"\n LSTM Predicted Stats for {team_name}:", team_predicted_stats),
        (f"\n Mean Stats for {opponent_name}:", opponent_mean_stats),
        (f"\n LSTM Predicted Stats for {opponent_name}:", opponent_predicted_stats),
    )
    for header, stats in sections:
        print(header)
        for stat, value in stats.items():
            print(f"{stat}: {value:.4f}")

# Example matchup using the 0.6 mean / 0.4 forecast blend defined in this cell
predict_win_probability("oklahoma_sooners","baylor_bears")


Only the mean of all stats; the prediction is made solely from direct calculation (no model)

In [None]:
import numpy as np
import tensorflow as tf
import joblib

# Reload the LSTM saved by the training cell.
# NOTE(review): neither function defined in this cell calls lstm_model.
lstm_model = tf.keras.models.load_model("lstm_stat_predictor.h5")

def get_mean_stats(team_name):
    """Return the per-stat mean of `stat_features` over every row of one team in `df`.

    Args:
        team_name: Team name as known to `team_encoder`.

    Returns:
        dict of stat name -> mean value, or None (after printing a message)
        when the name is unknown or no rows exist for the team.
    """
    if team_name not in team_encoder.classes_:
        print(f" Team '{team_name}' not found in team_encoder.")
        return None

    (team_code,) = team_encoder.transform([team_name])
    belongs_to_team = df["team_encoded"] == team_code

    if not belongs_to_team.any():
        print(f" No historical data available for {team_name}")
        return None

    return df.loc[belongs_to_team, stat_features].mean().to_dict()

def predict_margin_victory(team_name, opponent_name):
    """Print a predicted margin of victory for team_name based on historical means only.

    The margin is the team's mean points scored minus the opponent's mean
    points scored, both taken from df["team_score"]. The previous version read
    a "points" key from the mean-stats dicts; those dicts only contain
    `stat_features`, so the margin was always 0 and the opponent was always
    reported as the winner.

    Args:
        team_name: Name of the team whose margin is reported.
        opponent_name: Name of the opposing team.

    Returns:
        None. Results are printed. An error is printed and the function
        returns early when a name is unknown to `team_encoder` or a team has
        no rows in `df`.
    """
    if team_name not in team_encoder.classes_ or opponent_name not in team_encoder.classes_:
        print(" Error: One or both team names are not in the dataset.")
        return

    # Compute mean stats only (used for the report and to confirm both teams have data)
    team_mean_stats = get_mean_stats(team_name)
    opponent_mean_stats = get_mean_stats(opponent_name)

    if team_mean_stats is None or opponent_mean_stats is None:
        print(" Error: Insufficient historical data for prediction.")
        return

    def _mean_points(name):
        # Mean of team_score over the rows where this team is the "team" side
        code = team_encoder.transform([name])[0]
        return df.loc[df["team_encoded"] == code, "team_score"].mean()

    # Calculate margin of victory from mean points scored
    margin_victory = _mean_points(team_name) - _mean_points(opponent_name)

    # Determine the winner (an exact tie is reported for the opponent, as before)
    winner = team_name if margin_victory > 0 else opponent_name

    # Print results
    print("\n=======================")
    print(f"Matchup: {team_name} vs {opponent_name}")
    print("=======================")
    print(f"Predicted Margin of Victory for {team_name}: {margin_victory:.2f} points\n")
    print(f"Winner: {winner}\n")

    print(f" Mean Stats for {team_name}:")
    for stat, value in team_mean_stats.items():
        print(f"{stat}: {value:.4f}")

    print(f"\n Mean Stats for {opponent_name}:")
    for stat, value in opponent_mean_stats.items():
        print(f"{stat}: {value:.4f}")

# Example matchup; both names must be present in team_encoder.classes_
predict_margin_victory("oklahoma_sooners", "baylor_bears")


Win probability from a blend of head-to-head mean stats and the LSTM-predicted stats

In [None]:
import numpy as np
import tensorflow as tf
import joblib

# Load trained models from the files written by the two training cells
lstm_model = tf.keras.models.load_model("lstm_stat_predictor.h5")
rf_model = joblib.load("random_forest_model.pkl")

def get_matchup_mean_stats(team_name, opponent_name):
    """Average team_name's `stat_features` over the rows of `df` played against opponent_name.

    Args:
        team_name: Team whose stats are averaged (matched on "team_encoded").
        opponent_name: Opponent to restrict to (matched on "opponent_team_encoded").

    Returns:
        dict of stat name -> mean value, or None (after printing a message)
        when either name is unknown or the two teams share no rows.
    """
    known_teams = team_encoder.classes_
    if not (team_name in known_teams and opponent_name in known_teams):
        print(f"Error: One or both teams ('{team_name}', '{opponent_name}') not found in team_encoder.")
        return None

    # Get encoded values for both teams
    team_code, opponent_code = team_encoder.transform([team_name, opponent_name])

    # Rows where the team is on the "team" side and the opponent on the "opponent" side
    is_matchup = (df["team_encoded"] == team_code) & (df["opponent_team_encoded"] == opponent_code)

    if not is_matchup.any():
        print(f"No historical data available for {team_name} vs {opponent_name}.")
        return None

    # Compute mean stats for this specific matchup
    return df.loc[is_matchup, stat_features].mean().to_dict()

def predict_future_stats(team_encoded):
    """Forecast a team's next row of stats with the LSTM.

    The last `sequence_length` rows of the team in `df_scaled` form the model
    input; the output is mapped back to the original scale through `scaler`.

    Args:
        team_encoded: Encoded team id, compared against df_scaled["team_encoded"].

    Returns:
        dict of stat name -> predicted value, or None (after printing a
        message) when the team has fewer than `sequence_length` rows.
    """
    last_rows = df_scaled.loc[df_scaled["team_encoded"] == team_encoded].tail(sequence_length)
    if len(last_rows) < sequence_length:
        print(f"Not enough historical data for team {team_encoded}")
        return None

    # Add a leading batch axis so the model receives a single sample.
    batch = last_rows[stat_features].values[np.newaxis, ...]
    scaled_prediction = lstm_model.predict(batch)[0]

    # Undo the min-max scaling
    prediction = scaler.inverse_transform(scaled_prediction.reshape(1, -1))[0]
    return {name: value for name, value in zip(stat_features, prediction)}

def predict_win_probability(team_name, opponent_name):
    """Print the win probability of team_name vs opponent_name from head-to-head blended stats.

    Each stat fed to `rf_model` is 0.8 * the head-to-head mean (from
    `get_matchup_mean_stats`) plus 0.2 * the LSTM forecast (from
    `predict_future_stats`). Nothing is returned; the probability and all four
    stat sets are printed. An error is printed and the function returns early
    when a name is unknown to `team_encoder` or any stat set is unavailable.
    """
    known_teams = team_encoder.classes_
    if not (team_name in known_teams and opponent_name in known_teams):
        print("Error: One or both team names are not in the dataset.")
        return

    team_encoded, opponent_encoded = team_encoder.transform([team_name, opponent_name])

    # Compute matchup-specific mean stats (each side seen from its own rows)
    team_mean_stats = get_matchup_mean_stats(team_name, opponent_name)
    opponent_mean_stats = get_matchup_mean_stats(opponent_name, team_name)

    # Predict future stats using LSTM
    team_predicted_stats = predict_future_stats(team_encoded)
    opponent_predicted_stats = predict_future_stats(opponent_encoded)

    stat_sets = (team_mean_stats, opponent_mean_stats, team_predicted_stats, opponent_predicted_stats)
    if any(stats is None for stats in stat_sets):
        print("Error: Insufficient historical data for prediction.")
        return

    def blend(mean_value, predicted_value):
        # Weighted mix: 80% head-to-head mean, 20% LSTM forecast
        return (0.8 * mean_value) + (0.2 * predicted_value)

    # Use the first training row as a template so every fitted column is present.
    # NOTE(review): columns not overwritten below keep that row's values -- confirm intended.
    input_features = X.head(1).copy()
    for stat in stat_features:
        input_features[stat] = blend(team_mean_stats[stat], team_predicted_stats[stat])
        input_features[f"opponent_{stat}"] = blend(opponent_mean_stats[stat], opponent_predicted_stats[stat])
    input_features["team_encoded"] = team_encoded
    input_features["opponent_team_encoded"] = opponent_encoded

    # Keep exactly the training columns, in training order. Any "opponent_*"
    # column that X does not contain is discarded by this selection.
    input_features = input_features.loc[:, X.columns]

    # Probability of class 1 (win) for the single input row
    win_probability = rf_model.predict_proba(input_features)[0, 1]

    # Print results
    print("\n=======================")
    print(f"Matchup: {team_name} vs {opponent_name}")
    print("=======================")
    print(f"Win Probability for {team_name}: {win_probability:.4f}\n")

    sections = (
        (f"Mean Stats for {team_name} (against {opponent_name}):", team_mean_stats),
        (f"\nLSTM Predicted Stats for {team_name}:", team_predicted_stats),
        (f"\nMean Stats for {opponent_name} (against {team_name}):", opponent_mean_stats),
        (f"\nLSTM Predicted Stats for {opponent_name}:", opponent_predicted_stats),
    )
    for header, stats in sections:
        print(header)
        for stat, value in stats.items():
            print(f"{stat}: {value:.4f}")

# Example matchup using the 0.8 head-to-head mean / 0.2 forecast blend defined in this cell
predict_win_probability("oklahoma_sooners","baylor_bears")


Win probability from head-to-head mean stats only (no LSTM forecast)

In [None]:
import numpy as np
import tensorflow as tf
import joblib

# Reload both trained models from the files written by the training cells.
# NOTE(review): the functions defined in this cell only call rf_model; lstm_model is not used here.
lstm_model = tf.keras.models.load_model("lstm_stat_predictor.h5")
rf_model = joblib.load("random_forest_model.pkl")

def get_matchup_mean_stats(team_name, opponent_name):
    """Return team_name's mean `stat_features` over the rows of `df` played against opponent_name.

    Args:
        team_name: Team whose stats are averaged (matched on "team_encoded").
        opponent_name: Opponent to restrict to (matched on "opponent_team_encoded").

    Returns:
        dict of stat name -> mean value, or None (after printing a message)
        when either name is unknown or no such rows exist.
    """
    encoder_classes = team_encoder.classes_
    if not (team_name in encoder_classes and opponent_name in encoder_classes):
        print(f"Error: One or both teams ('{team_name}', '{opponent_name}') not found in team_encoder.")
        return None

    team_code, opponent_code = team_encoder.transform([team_name, opponent_name])

    same_team = df["team_encoded"] == team_code
    same_opponent = df["opponent_team_encoded"] == opponent_code
    head_to_head = same_team & same_opponent

    if not head_to_head.any():
        print(f"No historical data available for {team_name} vs {opponent_name}.")
        return None

    return df.loc[head_to_head, stat_features].mean().to_dict()

def predict_win_probability(team_name, opponent_name):
    """Print the win probability of team_name vs opponent_name from head-to-head means only.

    Each side's head-to-head mean stats (from `get_matchup_mean_stats`) are
    written into a one-row copy of `X` and scored with `rf_model`. Nothing is
    returned; the probability and both stat sets are printed. An error is
    printed and the function returns early when a name is unknown to
    `team_encoder` or either stat set is unavailable.
    """
    known_teams = team_encoder.classes_
    if not (team_name in known_teams and opponent_name in known_teams):
        print("Error: One or both team names are not in the dataset.")
        return

    team_encoded, opponent_encoded = team_encoder.transform([team_name, opponent_name])

    team_mean_stats = get_matchup_mean_stats(team_name, opponent_name)
    opponent_mean_stats = get_matchup_mean_stats(opponent_name, team_name)

    if any(stats is None for stats in (team_mean_stats, opponent_mean_stats)):
        print("Error: Insufficient historical data for prediction.")
        return

    # Use the first training row as a template so every fitted column is present.
    # NOTE(review): columns not overwritten below keep that row's values -- confirm intended.
    input_features = X.head(1).copy()
    for stat in stat_features:
        input_features[stat] = team_mean_stats[stat]
        input_features[f"opponent_{stat}"] = opponent_mean_stats[stat]
    input_features["team_encoded"] = team_encoded
    input_features["opponent_team_encoded"] = opponent_encoded

    # Keep exactly the training columns, in training order. Any "opponent_*"
    # column that X does not contain is discarded by this selection.
    input_features = input_features.loc[:, X.columns]

    # Probability of class 1 (win) for the single input row
    win_probability = rf_model.predict_proba(input_features)[0, 1]

    # Print results
    print("\n=======================")
    print(f"Matchup: {team_name} vs {opponent_name}")
    print("=======================")
    print(f"Win Probability for {team_name}: {win_probability:.4f}\n")

    sections = (
        (f"Mean Stats for {team_name} (against {opponent_name}):", team_mean_stats),
        (f"\nMean Stats for {opponent_name} (against {team_name}):", opponent_mean_stats),
    )
    for header, stats in sections:
        print(header)
        for stat, value in stats.items():
            print(f"{stat}: {value:.4f}")

# Example prediction using head-to-head means only; both names must be in team_encoder.classes_
predict_win_probability("oklahoma_sooners", "baylor_bears")


Win probability for all ordered team pairings (permutations), using the head-to-head mean blended with the LSTM-predicted stats

In [None]:
import pandas as pd
import itertools
import numpy as np
import tensorflow as tf
import joblib

# Load trained models from the files written by the two training cells
lstm_model = tf.keras.models.load_model("lstm_stat_predictor.h5")
rf_model = joblib.load("random_forest_model.pkl")

def get_unique_teams(csv_file):
    """Return the distinct values of the "team" column of a CSV file.

    Args:
        csv_file: Path (or any source accepted by pandas.read_csv) of a CSV
            that has a "team" column.

    Returns:
        NumPy array of the unique team values, in order of first appearance.
    """
    team_column = pd.read_csv(csv_file)["team"]
    return team_column.unique()

def generate_matchups(teams):
    """Return every ordered pair of two different positions in `teams`.

    Both (a, b) and (b, a) are produced. Pairs are formed by position, so a
    value that appears twice in `teams` is paired with its duplicate.

    Args:
        teams: Iterable of team identifiers.

    Returns:
        List of 2-tuples, ordered by first position and then by second position.
    """
    pool = tuple(teams)
    return [
        (first, second)
        for i, first in enumerate(pool)
        for j, second in enumerate(pool)
        if i != j
    ]

def predict_win_probability(team_name, opponent_name):
    """Return the win probability of team_name against opponent_name.

    Each stat fed to `rf_model` is 0.8 * the head-to-head mean (from
    `get_matchup_mean_stats`) plus 0.2 * the LSTM forecast (from
    `predict_future_stats`); both helpers must already be defined by an
    earlier cell.

    Returns:
        The probability of class 1 from `rf_model`, or None when a name is
        unknown to `team_encoder` or any of the four stat sets is unavailable.
    """
    known_teams = team_encoder.classes_
    if not (team_name in known_teams and opponent_name in known_teams):
        return None

    team_encoded, opponent_encoded = team_encoder.transform([team_name, opponent_name])

    team_mean_stats = get_matchup_mean_stats(team_name, opponent_name)
    opponent_mean_stats = get_matchup_mean_stats(opponent_name, team_name)

    team_predicted_stats = predict_future_stats(team_encoded)
    opponent_predicted_stats = predict_future_stats(opponent_encoded)

    stat_sets = (team_mean_stats, opponent_mean_stats, team_predicted_stats, opponent_predicted_stats)
    if any(stats is None for stats in stat_sets):
        return None

    def blend(mean_value, predicted_value):
        # Weighted mix: 80% head-to-head mean, 20% LSTM forecast
        return (0.8 * mean_value) + (0.2 * predicted_value)

    # Use the first training row as a template so every fitted column is present.
    # NOTE(review): columns not overwritten below keep that row's values -- confirm intended.
    input_features = X.head(1).copy()
    for stat in stat_features:
        input_features[stat] = blend(team_mean_stats[stat], team_predicted_stats[stat])
        input_features[f"opponent_{stat}"] = blend(opponent_mean_stats[stat], opponent_predicted_stats[stat])
    input_features["team_encoded"] = team_encoded
    input_features["opponent_team_encoded"] = opponent_encoded

    # Keep exactly the training columns, in training order. Any "opponent_*"
    # column that X does not contain is discarded by this selection.
    input_features = input_features.loc[:, X.columns]

    return rf_model.predict_proba(input_features)[0, 1]

def main(
    csv_file="/kaggle/input/ranked-teams/teamwise_stats_east_xgboost_2_ranking.csv",
    output_file="predicted_matchup_probabilities_east.csv",
):
    """Score every ordered pairing of the teams listed in a CSV and save the results.

    Args:
        csv_file: CSV with a "team" column listing the teams to pair up.
        output_file: Destination CSV for the predictions.

    The output has the columns team_1, team_2 and win_probability (probability
    that team_1 wins). Pairings for which `predict_win_probability` returns
    None are skipped.
    """
    teams = get_unique_teams(csv_file)
    matchups = generate_matchups(teams)

    results = []
    for team_1, team_2 in matchups:
        win_prob = predict_win_probability(team_1, team_2)
        if win_prob is not None:
            results.append({"team_1": team_1, "team_2": team_2, "win_probability": win_prob})

    # Explicit columns keep the header row even when no matchup could be scored.
    output_df = pd.DataFrame(results, columns=["team_1", "team_2", "win_probability"])
    output_df.to_csv(output_file, index=False)
    # Report the file that was actually written (the old message named a different file).
    print(f"Predictions saved to {output_file}")

# Run the full matchup sweep when this cell/script is executed as __main__
if __name__ == "__main__":
    main()
