## Dependencies

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

## Function to preprocess data for each sport

This function handles preprocessing of each sport's dataset:

    Fills missing values.
    Encodes categorical features like Team 1 and Team 2 or Player 1 and Player 2.
    Creates a binary target variable (match_result) for win/loss based on Score 1 and Score 2.

In [2]:
def preprocess_data(df, sport):
    # Handle missing values
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
    
    # Ensure 'Score 1' and 'Score 2' are numeric
    if 'Score 1' in df.columns and 'Score 2' in df.columns:
        df['Score 1'] = pd.to_numeric(df['Score 1'], errors='coerce')
        df['Score 2'] = pd.to_numeric(df['Score 2'], errors='coerce')
    
    # Convert categorical columns to numerical (if applicable)
    if 'Team 1' in df.columns and 'Team 2' in df.columns:
        label_encoder = LabelEncoder()
        df['Team 1'] = label_encoder.fit_transform(df['Team 1'])
        df['Team 2'] = label_encoder.fit_transform(df['Team 2'])
    
    if 'Player 1' in df.columns and 'Player 2' in df.columns:
        label_encoder = LabelEncoder()
        df['Player 1'] = label_encoder.fit_transform(df['Player 1'])
        df['Player 2'] = label_encoder.fit_transform(df['Player 2'])

    # Create a new target column for binary classification (win/loss)
    if 'Score 1' in df.columns and 'Score 2' in df.columns:
        df['match_result'] = np.where(df['Score 1'] > df['Score 2'], 1, 0)
        df.drop(['Score 1', 'Score 2'], axis=1, inplace=True)

    return df


## Function to train models and compare results

This function trains the models (XGBoost, Random Forest, and Logistic Regression) on the preprocessed data and evaluates each model's performance:

    XGBoost is tuned using GridSearchCV.
    Random Forest and Logistic Regression are trained with default settings.
    For each model, accuracy and classification reports are printed.

In [5]:
def train_and_compare(df, sport):
    print(f"\n--- Training Models for {sport} ---")
    
    # Define features and target
    if 'Team 1' in df.columns:
        X = df[['Team 1', 'Team 2']]
    else:
        X = df[['Player 1', 'Player 2']]
    
    y = df['match_result']
    
    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Check the distribution of classes in y_train and y_test
    print(f"Class distribution in y_train for {sport}:")
    print(y_train.value_counts())
    
    print(f"Class distribution in y_test for {sport}:")
    print(y_test.value_counts())
    
    # Proceed with the models only if there are at least two classes in y_train
    if len(y_train.unique()) < 2:
        print(f"Skipping {sport} due to only one class in y_train.")
        return None  # Skip this sport
        
    # Dictionary to store results
    results = {}
    
    # XGBoost Model with Hyperparameter Tuning
    xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)
    param_grid = {
        'max_depth': [3, 7, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [100, 200, 300],
        'colsample_bytree': [0.3, 0.7]
    }
    grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=3, verbose=1)
    grid_search.fit(X_train, y_train)
    
    # Best model from Grid Search
    best_xgb_model = grid_search.best_estimator_
    
    # Make Predictions
    y_pred_xgb = best_xgb_model.predict(X_test)
    
    # Evaluate XGBoost Model
    xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
    print(f"XGBoost Accuracy: {xgb_accuracy * 100:.2f}%")
    results['XGBoost'] = xgb_accuracy
    
    # Random Forest Model
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    
    # Evaluate Random Forest Model
    rf_accuracy = accuracy_score(y_test, y_pred_rf)
    print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
    results['Random Forest'] = rf_accuracy
    
    # Logistic Regression Model
    lr_model = LogisticRegression(max_iter=1000, random_state=42)
    lr_model.fit(X_train, y_train)
    y_pred_lr = lr_model.predict(X_test)
    
    # Evaluate Logistic Regression Model
    lr_accuracy = accuracy_score(y_test, y_pred_lr)
    print(f"Logistic Regression Accuracy: {lr_accuracy * 100:.2f}%")
    results['Logistic Regression'] = lr_accuracy
    
    # Classification Report for each model
    print("XGBoost Classification Report:")
    print(classification_report(y_test, y_pred_xgb))
    
    print("Random Forest Classification Report:")
    print(classification_report(y_test, y_pred_rf))
    
    print("Logistic Regression Classification Report:")
    print(classification_report(y_test, y_pred_lr))
    
    return results


## Loading Datasets and Applying Models

Here we loop through the datasets for all 12 sports, apply the preprocessing, and train the models for each sport. We store the results for later comparison.

In [12]:
# List of sports datasets (replace with actual file paths)
sports_datasets = {
    'Basketball': 'basketball.csv',
    'Cricket': 'cricket.csv',
    'Hockey': 'hockey.csv',
    'Handball': 'handball.csv',
    'Ice Hockey': 'ice_hockey.csv',
    'Lacrosse': 'lacrosse.csv',
    'Roller Hockey': 'roller_hockey.csv',
    'Rugby': 'rugby.csv',
    'Soccer': 'soccer.csv',
    'Volleyball': 'volleyball.csv',
    'Water Polo': 'water_polo.csv',
    'Futsal': 'futsal.csv'
}

# Dictionary to store results for all sports
all_sports_results = {}

# Loop through each sport, preprocess data, and train models
for sport, file in sports_datasets.items():
    df = pd.read_csv(file)
    df = preprocess_data(df, sport)
    results = train_and_compare(df, sport)
    all_sports_results[sport] = results



--- Training Models for Basketball ---
Class distribution in y_train for Basketball:
match_result
1    322
0    246
Name: count, dtype: int64
Class distribution in y_test for Basketball:
match_result
1    72
0    70
Name: count, dtype: int64
Fitting 3 folds for each of 54 candidates, totalling 162 fits
XGBoost Accuracy: 73.94%
Random Forest Accuracy: 72.54%
Logistic Regression Accuracy: 58.45%
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.71      0.73        70
           1       0.73      0.76      0.75        72

    accuracy                           0.74       142
   macro avg       0.74      0.74      0.74       142
weighted avg       0.74      0.74      0.74       142

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.69      0.71        70
           1       0.71      0.76      0.74        72

    accuracy                           0.73

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        50
           1       0.61      1.00      0.76        79

    accuracy                           0.61       129
   macro avg       0.31      0.50      0.38       129
weighted avg       0.38      0.61      0.47       129


--- Training Models for Handball ---
Class distribution in y_train for Handball:
match_result
1    206
0    123
Name: count, dtype: int64
Class distribution in y_test for Handball:
match_result
1    47
0    36
Name: count, dtype: int64
Fitting 3 folds for each of 54 candidates, totalling 162 fits
XGBoost Accuracy: 66.27%
Random Forest Accuracy: 60.24%
Logistic Regression Accuracy: 56.63%
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.47      0.55        36
           1       0.67      0.81      0.73        47

    accuracy                           0.66        83
   macro avg       0.66      0.6

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        46
           1       0.69      1.00      0.82       104

    accuracy                           0.69       150
   macro avg       0.35      0.50      0.41       150
weighted avg       0.48      0.69      0.57       150


--- Training Models for Lacrosse ---
Class distribution in y_train for Lacrosse:
match_result
1    182
0     62
Name: count, dtype: int64
Class distribution in y_test for Lacrosse:
match_result
1    44
0    18
Name: count, dtype: int64
Fitting 3 folds for each of 54 candidates, totalling 162 fits
XGBoost Accuracy: 74.19%
Random Forest Accuracy: 77.42%
Logistic Regression Accuracy: 70.97%
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.11      0.20        18
           1       0.73      1.00      0.85        44

    accuracy                           0.74        62
   macro avg       0.87      0.5

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        18
           1       0.71      1.00      0.83        44

    accuracy                           0.71        62
   macro avg       0.35      0.50      0.42        62
weighted avg       0.50      0.71      0.59        62


--- Training Models for Roller Hockey ---
Class distribution in y_train for Roller Hockey:
match_result
1    122
0     83
Name: count, dtype: int64
Class distribution in y_test for Roller Hockey:
match_result
0    27
1    25
Name: count, dtype: int64
Fitting 3 folds for each of 54 candidates, totalling 162 fits
XGBoost Accuracy: 76.92%
Random Forest Accuracy: 59.62%
Logistic Regression Accuracy: 48.08%
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.81      0.79        27
           1       0.78      0.72      0.75        25

    accuracy                           0.77        52
   macro avg     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        27
           1       0.48      1.00      0.65        25

    accuracy                           0.48        52
   macro avg       0.24      0.50      0.32        52
weighted avg       0.23      0.48      0.31        52


--- Training Models for Rugby ---
Class distribution in y_train for Rugby:
match_result
1    189
0     71
Name: count, dtype: int64
Class distribution in y_test for Rugby:
match_result
1    49
0    16
Name: count, dtype: int64
Fitting 3 folds for each of 54 candidates, totalling 162 fits
XGBoost Accuracy: 76.92%
Random Forest Accuracy: 73.85%
Logistic Regression Accuracy: 75.38%
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.56      0.55        16
           1       0.85      0.84      0.85        49

    accuracy                           0.77        65
   macro avg       0.69      0.70      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.75      1.00      0.86        49

    accuracy                           0.75        65
   macro avg       0.38      0.50      0.43        65
weighted avg       0.57      0.75      0.65        65


--- Training Models for Soccer ---
Class distribution in y_train for Soccer:
match_result
1    402
0    279
Name: count, dtype: int64
Class distribution in y_test for Soccer:
match_result
1    101
0     70
Name: count, dtype: int64
Fitting 3 folds for each of 54 candidates, totalling 162 fits
XGBoost Accuracy: 60.23%
Random Forest Accuracy: 57.89%
Logistic Regression Accuracy: 57.31%
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.11      0.19        70
           1       0.61      0.94      0.74       101

    accuracy                           0.60       171
   macro avg       0.59      0.53   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        23
           1       0.84      1.00      0.91       119

    accuracy                           0.84       142
   macro avg       0.42      0.50      0.46       142
weighted avg       0.70      0.84      0.76       142


--- Training Models for Water Polo ---
Class distribution in y_train for Water Polo:
match_result
0    113
1     99
Name: count, dtype: int64
Class distribution in y_test for Water Polo:
match_result
0    37
1    17
Name: count, dtype: int64
Fitting 3 folds for each of 54 candidates, totalling 162 fits
XGBoost Accuracy: 53.70%
Random Forest Accuracy: 55.56%
Logistic Regression Accuracy: 48.15%
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.51      0.60        37
           1       0.36      0.59      0.44        17

    accuracy                           0.54        54
   macro avg       0.54   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.70      1.00      0.82        58

    accuracy                           0.70        83
   macro avg       0.35      0.50      0.41        83
weighted avg       0.49      0.70      0.57        83



## Model Comparison Across All Sports

Finally, we compare the accuracy of XGBoost, Random Forest, and Logistic Regression for each sport and print a summary.

In [13]:
# Displaying results comparison for all sports
print("\n--- Comparison of Model Results Across Sports ---")
for sport, results in all_sports_results.items():
    if results is not None:  # Ensure that results are not None
        print(f"\nResults for {sport}:")
        for model, accuracy in results.items():
            print(f"{model}: {accuracy * 100:.2f}%")
    else:
        print(f"\nSkipping {sport} due to insufficient class variety.")



--- Comparison of Model Results Across Sports ---

Results for Basketball:
XGBoost: 73.94%
Random Forest: 72.54%
Logistic Regression: 58.45%

Results for Cricket:
XGBoost: 51.58%
Random Forest: 48.42%
Logistic Regression: 64.21%

Results for Hockey:
XGBoost: 72.87%
Random Forest: 58.14%
Logistic Regression: 61.24%

Results for Handball:
XGBoost: 66.27%
Random Forest: 60.24%
Logistic Regression: 56.63%

Results for Ice Hockey:
XGBoost: 72.00%
Random Forest: 69.33%
Logistic Regression: 69.33%

Results for Lacrosse:
XGBoost: 74.19%
Random Forest: 77.42%
Logistic Regression: 70.97%

Results for Roller Hockey:
XGBoost: 76.92%
Random Forest: 59.62%
Logistic Regression: 48.08%

Results for Rugby:
XGBoost: 76.92%
Random Forest: 73.85%
Logistic Regression: 75.38%

Results for Soccer:
XGBoost: 60.23%
Random Forest: 57.89%
Logistic Regression: 57.31%

Results for Volleyball:
XGBoost: 87.32%
Random Forest: 80.99%
Logistic Regression: 83.80%

Results for Water Polo:
XGBoost: 53.70%
Random Forest: 