# UFC Predictor

Train an AI model to predict UFC fights and method of victory.

Notebook for:
- Feature Selection
- Feature encoding/normalization
- Model Selection/Training/Validation

In [51]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, log_loss

import sys

# Print out system info
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

PyTorch version: 2.7.1
CUDA available: False


In [52]:
df = pd.read_csv('../datasets/ufc-clean.csv')

# Dataset shape and basic info
print("UFC dataframe shape:" , df.shape)  

# TODO: encode this columns before exporting dataset in preprocess
# Encode any categorical)
categorical_cols = ['WeightClass', 'BlueStance', 'RedStance', 'Finish']
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

UFC dataframe shape: (6528, 83)


In [53]:
numeric_df = df.select_dtypes(include=['number'])

to_drop = ['BlueAvgSigStrLanded', 'BlueAvgSubAtt', 'BlueAvgTDLanded', 'BlueTotalRoundsFought',
           'BlueHeightCms', 'BlueReachCms', 'BlueWeightLbs', 'BlueAge', 'FinishRound', 'TotalFightTimeSecs', 
           'UFC_DebutDiff', 'CurrELODiff', 'ExpectedValueDiff']

N = len(to_drop)
for i in range(N):
    if to_drop[i][:4] == "Blue":
        to_drop.append("Red" + to_drop[i][4:])

numeric_df.drop(columns=to_drop, inplace=True)

## Feature Selection

Despite our wide feature set to choose from, one pitfall from selecting too many can be overfitting. To prevent this, we can use methods like Recrusive Feature Elimination (RFE) provided by sklearn to help us select a set of features best for predicting UFC outcomes.

In [54]:
X = numeric_df.drop(columns=['Winner'], axis=1)
y = numeric_df['Winner']

estimator = RandomForestClassifier(
    random_state=42, 
    n_estimators=10,  
    max_depth=5       
)

rfe = RFE(estimator, n_features_to_select=25)
rfe.fit(X, y)

# Get the selected features
selected_features = X.columns[rfe.support_]
X_selected = X[selected_features]
print(selected_features)

X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42
)

Index(['RedExpectedValue', 'BlueExpectedValue', 'BlueCurrentWinStreak',
       'BlueAvgSigStrPct', 'BlueAvgTDPct', 'BlueWins', 'RedCurrentWinStreak',
       'RedAvgSigStrPct', 'RedAvgTDPct', 'RedLosses', 'RedDaysSinceLastFight',
       'BlueDaysSinceLastFight', 'RedCurrELO', 'BlueCurrELO',
       'CurrentWinStreakDiff', 'AvgSigStrLandedDiff', 'AvgSubAttDiff',
       'AvgTDLandedDiff', 'LossesDiff', 'TotalRoundsFoughtDiff', 'WinsDiff',
       'ReachCmsDiff', 'AgeDiff', 'WinsByKOTKODiff', 'DaysSinceLastFightDiff'],
      dtype='object')


## Training a Neural Network

In [55]:
# First, let's normalize our data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Define our Neural Network
class NN(nn.Module):
    def __init__(self, input_len):
        super(NN, self).__init__()

        self.layer1 = nn.Linear(input_len, 64)
        self.layer2 = nn.Linear(64, 32)
        self.output_layer = nn.Linear(32, 2) # binary classification

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.output_layer(x)
    

# Init model
nn_model = NN(X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(nn_model.parameters(), lr=0.001)

# Training loop
for epoch in range(20):
    nn_model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = nn_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# Evaluation
nn_model.eval()
correct, total = 0, 0

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = nn_model(batch_X)

        _, pred = torch.max(outputs.data, 1)
        total += batch_y.size(0)
        correct += (pred == batch_y).sum().item()

nn_acc = correct / total
print(f"Neural Network Accuracy: {nn_acc:4f}")


Epoch 1, Loss: 0.6116
Epoch 2, Loss: 0.7233
Epoch 3, Loss: 0.6619
Epoch 4, Loss: 0.6473
Epoch 5, Loss: 0.6690
Epoch 6, Loss: 0.5855
Epoch 7, Loss: 0.6829
Epoch 8, Loss: 0.6147
Epoch 9, Loss: 0.5731
Epoch 10, Loss: 0.8684
Epoch 11, Loss: 1.0993
Epoch 12, Loss: 0.6512
Epoch 13, Loss: 0.6055
Epoch 14, Loss: 0.8193
Epoch 15, Loss: 0.4970
Epoch 16, Loss: 0.8329
Epoch 17, Loss: 0.4930
Epoch 18, Loss: 0.6428
Epoch 19, Loss: 0.8089
Epoch 20, Loss: 0.7106
Neural Network Accuracy: 0.600306
