# UFC Predictor

Train an AI model to predict UFC fights and method of victory.

Notebook for:
- Feature Selection
- Feature encoding/normalization
- Model Selection/Training/Validation

In [15]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import xgboost as xgb
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

import sys

# Print out system info
print(f"XGBoost version: {xgb.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

XGBoost version: 3.0.3
PyTorch version: 2.8.0
CUDA available: False


In [16]:
df = pd.read_csv('../datasets/ufc-clean.csv')

# Dataset shape and basic info
print("UFC dataframe shape:" , df.shape)          
df.info()  

UFC dataframe shape: (6528, 83)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6528 entries, 0 to 6527
Data columns (total 83 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   RedFighter              6528 non-null   object 
 1   BlueFighter             6528 non-null   object 
 2   RedExpectedValue        6528 non-null   float64
 3   BlueExpectedValue       6528 non-null   float64
 4   Date                    6528 non-null   object 
 5   Location                6528 non-null   object 
 6   Country                 6528 non-null   object 
 7   Winner                  6528 non-null   int64  
 8   TitleBout               6528 non-null   int64  
 9   WeightClass             6528 non-null   object 
 10  Gender                  6528 non-null   object 
 11  NumberOfRounds          6528 non-null   int64  
 12  BlueCurrentLoseStreak   6528 non-null   int64  
 13  BlueCurrentWinStreak    6528 non-null   int64  
 14  BlueDraw

In [None]:
categorical_cols = ['WeightClass', 'BlueStance', 'RedStance', 'Finish']

# Encode any categorical)
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

In [18]:
numeric_df = df.select_dtypes(include=['number'])

to_remove = set(["Winner", "NumberOfRounds", "ExpectedValueDiff", "UFC_DebutDiff", "CurrELODiff", "RedWeightLbs", "BlueWeightLbs"])
feature_cols = set(numeric_df.columns.tolist())
for col in to_remove:
    feature_cols.remove(col)
feature_cols = list(feature_cols)

# Split into train/test (time-based)
split_idx = int(len(df) * 0.8)
train_df = numeric_df.iloc[:split_idx]
test_df  = numeric_df.iloc[split_idx:]

X_train, y_train = train_df[feature_cols], train_df['Winner']
X_test,  y_test  = test_df[feature_cols],  test_df['Winner']


In [19]:
# Use a model to recursively eliminate features
estimator = RandomForestClassifier(
    random_state=42,
    n_estimators=100,  
    max_depth=10       
)

selector = RFE(estimator, n_features_to_select=25, step=1)
selector.fit(X_train, y_train)

selected_features = X_train.columns[selector.support_]
print(selected_features)

X_train = selector.transform(X_train)
X_test = selector.transform(X_test)


Index(['RedAvgSubAtt', 'RedAvgTDPct', 'AvgSigStrLandedDiff', 'AvgTDLandedDiff',
       'BlueAvgTDLanded', 'RedCurrELO', 'AgeDiff', 'BlueDaysSinceLastFight',
       'BlueAge', 'RedAge', 'TotalRoundsFoughtDiff', 'RedExpectedValue',
       'BlueAvgTDPct', 'BlueAvgSigStrPct', 'BlueCurrELO', 'TotalFightTimeSecs',
       'BlueExpectedValue', 'RedAvgTDLanded', 'LossesDiff',
       'DaysSinceLastFightDiff', 'RedAvgSigStrLanded', 'RedAvgSigStrPct',
       'RedDaysSinceLastFight', 'AvgSubAttDiff', 'BlueAvgSigStrLanded'],
      dtype='object')


In [20]:


# Train binary classifier for money line winner
model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')
model.fit(X_train, y_train)

# Predict probabilities and class labels
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

# Evaluate
acc = accuracy_score(y_test, y_pred)
ll  = log_loss(y_test, y_prob)

print(f"Accuracy: {acc:.3f}, Log Loss: {ll:.3f}")

Accuracy: 0.557, Log Loss: 0.756
