# UFC Predictor 

Train an AI UFC sports-betting model

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# import torchvision.transforms as transforms
# from torchvision.datasets import ImageFolder
# import timm

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from datetime import datetime, timedelta
from collections import defaultdict
import sys

# Print out system info
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Numpy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")

Python version: 3.13.2 (v3.13.2:4f8bb3947cf, Feb  4 2025, 11:51:10) [Clang 15.0.0 (clang-1500.3.9.4)]
PyTorch version: 2.7.1
CUDA available: False
Numpy version: 2.3.1
Pandas version: 2.3.1


In [14]:
df = pd.read_csv('../datasets/ufc-master.csv')

# Dataset shape and basic info
print("UFC dataframe shape:" , df.shape)   
print("\n--------")           
df.info()  
print("--------")                  

# df.columns.tolist() # Column information

UFC dataframe shape: (6528, 118)

--------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6528 entries, 0 to 6527
Columns: 118 entries, RedFighter to BKOOdds
dtypes: bool(1), float64(60), int64(43), object(14)
memory usage: 5.8+ MB
--------


In [15]:
# Sample 5 random rows and display first 10 columsn
df.sample(5).iloc[:, :10] 

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner
6002,Chris Camozzi,Dustin Jacoby,-145.0,135.0,68.9655,135.0,2012-01-28,"Chicago, Illinois, USA",USA,Red
3663,Anthony Pettis,Jim Miller,-235.0,195.0,42.5532,195.0,2017-07-08,"Las Vegas, Nevada, USA",USA,Red
4894,Caio Magalhaes,Trevor Smith,-280.0,255.0,35.7143,255.0,2014-11-08,"Uberlandia, Minas Gerais, Brazil",Brazil,Red
1123,Cory Sandhagen,Song Yadong,-205.0,175.0,48.7805,175.0,2022-09-17,"Las Vegas, Nevada, USA",USA,Red
4202,Demian Maia,Matt Brown,-350.0,290.0,28.5714,290.0,2016-05-14,"Curitiba, Parana, Brazil",Brazil,Red


## Data Preprocessing 

Let's begin by cleaning our dataset, removing any noise/outliers, exploring the potential features to train a predictive model on, and curating some of our own data (feature engineering)

### Dropping UFC Rankings

The reason for dropping the official UFC rankings are because they're both limited and political/subjective in nature. See the following:
1. Only the top 15 fighters per division are ranked
2. Rankings are infrequent and do not update after every fight
3. These often involve media voting or influenced by marketability of the fighter
4. Fighters must be also be active within the last 12 months or have an upcoming fight booked to be considered in the rankings

A better alternative which we will see below would be to include a pre-fight ELO score for each fighter. This will serve as a normalized/relative ranking metric that is less biased and better serves as a qualitative metric on fighter's ranking/skills at the time of their fight.

### Dropping UFC Betting Odds

We don't want to include betting odds since bookmakers already make their predictions based on available data such as injuries, training camps, and sourcing people's opinions/pre-fight wagers on who will win. This sort of circular logic could result in target leakage. We are not trying to improve upon or build on top of Vegas odds.

### Dropping other features (misc.)

TODO: 


In [16]:
# Drop all official UFC rankings
rank_cols=df.filter(regex='.*Rank$', axis=1).columns
df = df.drop(columns=rank_cols)
print(f"DataFrame after dropping UFC rankings: {df.shape}") # dropped 29 columns on 'Rank'

# Drop all UFC betting odds
betting_odds=df.filter(regex='.*Odds$', axis=1).columns
df = df.drop(columns=betting_odds)
print(f"DataFrame after dropping betting odds: {df.shape}") # dropped 8 columns on 'Odds'

# Drop some other cols (unuseful for either fighter stats or actual model training)
drop_cols = ["EmptyArena", "FinishDetails", "RedExpectedValue", "BlueExpectedValue"]
df = df.drop(columns=drop_cols)
print(f"DataFrame after dropping other misc cols: {df.shape}") 


DataFrame after dropping UFC rankings: (6528, 89)
DataFrame after dropping betting odds: (6528, 81)
DataFrame after dropping other misc cols: (6528, 77)


### Data Exploration

Let's see how much missing data (i.e. NaN cells) exists in our dataset and for which columns. Then for each case, we'll consider some approaches on how to handle cleaning these values. We want to avoid dropping rows since we only have about 6500 fights in-total to train on.

In [17]:
# List all columns and their count of missing values
missing_cols = df.isnull().any().to_list()
missing_cnt = df.isnull().sum().to_list()

def print_cols_with_missing_data(missing_cols, missing_cnt):
    for i, null_pair in enumerate(zip(missing_cols, missing_cnt)):
        if null_pair[0]:
            print(f"{df.columns[i]}: {null_pair[1]}")

print_cols_with_missing_data(missing_cols, missing_cnt)


BlueAvgSigStrLanded: 930
BlueAvgSigStrPct: 765
BlueAvgSubAtt: 832
BlueAvgTDLanded: 833
BlueAvgTDPct: 842
BlueStance: 3
RedAvgSigStrLanded: 455
RedAvgSigStrPct: 357
RedAvgSubAtt: 357
RedAvgTDLanded: 357
RedAvgTDPct: 367
Finish: 238
FinishRound: 622
FinishRoundTime: 622
TotalFightTimeSecs: 622


In [18]:
# One metric that we are going to add is ELO scores, but before we do this,
# let's see how draws/no-contest are handled in our dataset...
winner_vals = df['Winner'].unique()
finish_vals = df['Finish'].unique()
print("Winner outcomes: ", winner_vals)
print("Finish outcomes: ", finish_vals)

# Notice there are about ~200 NaN values which need to be verified, but out of laziness, 
# we'll assume that there was no draw in these cases and that the winner column has the correct result
finish_nan_indices = df.index[df['Finish'].isnull()].tolist()
print("Missing finish outcomes: ", finish_nan_indices[:5], len(finish_nan_indices))

# The 'Date' column can be converted to "datetime" objects for ease of use
print(f"\nCurrent type of \'Date\': {df['Date'].dtype}")
df['Date'] = pd.to_datetime(df['Date'])
print(f"New \'Date\' dtype: {df['Date'].dtype}")

Winner outcomes:  ['Red' 'Blue']
Finish outcomes:  ['SUB' 'U-DEC' 'S-DEC' 'KO/TKO' 'M-DEC' 'DQ' nan 'Overturned']
Missing finish outcomes:  [2160, 2202, 2203, 2204, 2205] 238

Current type of 'Date': object
New 'Date' dtype: datetime64[ns]


In [27]:
def split_rows_by_fighter():
    '''
    Reshape dataset such that stats are split by fighter (instead of by fight)
    '''
    R_cols = ['RedFighter', 'Date', 'WeightClass', 'RedAvgSigStrLanded', 'RedAvgSigStrPct', 'RedAvgSubAtt', 'RedAvgTDLanded', 'RedAvgTDPct']
    B_cols = ['BlueFighter', 'Date', 'WeightClass', 'BlueAvgSigStrLanded', 'BlueAvgSigStrPct', 'BlueAvgSubAtt', 'BlueAvgTDLanded', 'BlueAvgTDPct']

    renameTo = {}
    for rCol, bCol in zip(R_cols, B_cols):
        if rCol == "Date" or rCol == "WeightClass":
                continue      
        renameTo[rCol] = rCol[3:] # Ignore 'Red'
        renameTo[bCol] = bCol[4:] # Ignore 'Blue'

    red = df[R_cols].rename(columns=renameTo)
    blue = df[B_cols].rename(columns=renameTo)
    return pd.concat([red, blue])

all_fights = split_rows_by_fighter() 

# Now let's aggregate some statistics by WeightClass
wc_stats = all_fights.groupby('WeightClass').agg(
    Med_AvgSigStrLanded=("AvgSigStrLanded", "median"),
    Med_AvgSigStrPct=("AvgSigStrPct", "median"),
    Med_AvgSubAtt=("AvgSubAtt", "median"),
    Med_AvgTDLanded=("AvgTDLanded", "median"),
    Med_AvgTDPct=("AvgTDPct", "median")
)
wc_stats


Unnamed: 0_level_0,Med_AvgSigStrLanded,Med_AvgSigStrPct,Med_AvgSubAtt,Med_AvgTDLanded,Med_AvgTDPct
WeightClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bantamweight,6.28,0.437,0.3,1.0,0.32
Catch Weight,4.46,0.46,0.3333,1.19,0.37
Featherweight,7.22,0.44,0.4,1.0,0.333
Flyweight,5.28,0.44,0.5,1.2792,0.33
Heavyweight,6.55,0.49,0.0625,0.5,0.25
Light Heavyweight,8.57,0.4895,0.1667,0.6429,0.28
Lightweight,18.0,0.433,0.2857,1.0,0.32
Middleweight,6.817737,0.477,0.2,1.0,0.308
Welterweight,17.0,0.45,0.25,0.965,0.32
Women's Bantamweight,6.01,0.47,0.29285,0.93,0.33


### Feature Engineering

TODO: explanation

In [32]:
class UFC_TimeSeries():
    def __init__(self, fight_data: pd.DataFrame, weight_class_stats: pd.DataFrame):
        self.df = fight_data
        self.wc = weight_class_stats

        self.hist = defaultdict(dict)
        self.timeseries_cols = self._init_cols()

    def _init_cols(self) -> list[str]:
        # Timeseries columns which we'll be creating
        new_cols = ["UFC_Debut", "DaysSinceLastFight", "WonLastFight", "CurrELO", "ExpectedValue"]
        # Columns which we'll overwrite w/ running averages (i.e. turn into timerseries data)
        overwrite = ["AvgSigStrLanded", "AvgSigStrPct", "AvgSubAtt", "AvgTDLanded", "AvgTDPct"]
        
        res = []
        for col in new_cols:
            r, b = "Red" + col, "Blue" + col
            res.append(r)
            res.append(b)
            # init new columns
            self.df[r], self.df[b] = None, None
            
        for col in overwrite:
            r, b = "Red" + col, "Blue" + col
            res.append(r)
            res.append(b)
        
        return res

    def isDebut(self, fighter: str) -> bool:
        '''
        Check if this is fighter's first appearance in the UFC

        NOTE: this would be far 'safer' if we compared the date of the current record 
              against the fighter's first recorded fight date
        '''
        return True if fighter not in self.hist else False

    def _createELO(self, row, red, blue, rDebut, bDebut, K=32, default_ELO=1500):

        # Retrieve both fighter's ELO scores and assign it to the respective column
        Rr = default_ELO if rDebut else self.hist[red]["CurrELO"]
        Rb = default_ELO if bDebut else self.hist[blue]["CurrELO"]

        self.df.loc[row, 'RedCurrELO'], self.df.loc[row, 'BlueCurrELO'] = Rr, Rb

        # Calculate the expected win probabilities of each fighter
        expR = 1 / (1 + (10 ** ((Rb - Rr) / 400)))
        expB = 1 - expR

        self.df.loc[row, 'RedExpectedValue'], self.df.loc[row, 'BlueExpectedValue'] = expR, expB
        
        # Based on the decision of this fight, save the post-fight ELOs to our ELO cache/hashmap
        red_score, blue_score = 0, 0
        if self.df.loc[row, 'Finish'] == "DQ" or self.df.loc[row, 'Finish'] == "Overturned":
            red_score, blue_score = 0.5, 0.5
        elif self.df.loc[row, 'Winner'].lower() == "red":
            red_score = 1
        else:
            blue_score = 1
        
        new_Rr = Rr + K * (red_score - expR)
        new_Rb = Rb + K * (blue_score - expB)

        self.hist[red]["CurrELO"] = new_Rr
        self.hist[blue]["CurrELO"] = new_Rb
    
    
    def _createHistoricalFightRecords(self, row, red, blue, rDebut, bDebut, avgDaysBetweenFights=60):
        curr_date = self.df.loc[row, 'Date']

        def updateIndividualRecord(fighter, isDebut, corner):
            if isDebut:
                self.df.loc[row, corner + "UFC_Debut"]          = True
                self.df.loc[row, corner + "WonLastFight"]       = True
                self.df.loc[row, corner + "DaysSinceLastFight"] = avgDaysBetweenFights
            else:
                self.df.loc[row, corner + "UFC_Debut"]          = False
                self.df.loc[row, corner + "WonLastFight"]       = self.hist[fighter]["WonLastFight"]
                self.df.loc[row, corner + "DaysSinceLastFight"] = (curr_date - self.hist[fighter]["LastFightDate"]).days

            # Update running historical record
            self.hist[fighter]["LastFightDate"] = curr_date
            
            if (self.df.loc[row, 'Finish'] == "DQ"          or 
                self.df.loc[row, 'Finish'] == "Overturned"  or
                self.df.loc[row, 'Winner'].lower() != corner.lower()
            ):
                self.hist[fighter]["WonLastFight"] = False
            else:
                self.hist[fighter]["WonLastFight"] = True
            
        updateIndividualRecord(red, rDebut, "Red")
        updateIndividualRecord(blue, bDebut, "Blue")

    def _convertFightStats(self, row, red, blue, rDebut, bDebut):
        '''
        Convert fight stats into timeseries data (running average)
        '''
        stats = ["AvgSigStrLanded", "AvgSigStrPct", "AvgSubAtt", "AvgTDLanded", "AvgTDPct"]
        
        # Get the median of every statistic by weight class to impute on NaN cells
        weight_class = self.df.loc[row, "WeightClass"]
        wc_med = [self.wc.query(f"WeightClass == \"{weight_class}\"")["Med_" + stat].iloc[0] for stat in stats]
        
        def updateIndividualStats(fighter, isDebut, corner):
            for i, stat in enumerate(stats):
                original_val = df.loc[row, corner + stat]
                
                if pd.isnull(original_val):
                    original_val = wc_med[i]

                if isDebut:
                    df.loc[row, corner + stat] = wc_med[i]
                    self.hist[fighter]["runsum_" + stat] = original_val
                    self.hist[fighter]["TotalFights"] = 1
                else:
                    df.loc[row, corner + stat] = (self.hist[fighter]["runsum_" + stat] / self.hist[fighter]["TotalFights"])
                    self.hist[fighter]["runsum_" + stat] += original_val
                    self.hist[fighter]["TotalFights"] += 1

        updateIndividualStats(red, rDebut, "Red")
        updateIndividualStats(blue, bDebut, "Blue")
    
    def build_timerseries_data(self):
        # Iterate through dataset in reverse order (since older data at the end)
        values = self.df.to_numpy()
        for row in range(len(values)-1, -1, -1):
            red, blue = self.df.loc[row, "RedFighter"], self.df.loc[row, "BlueFighter"]
            rDebut, bDebut = self.isDebut(red), self.isDebut(blue)

            self._createELO(row, red, blue, rDebut, bDebut)
            self._createHistoricalFightRecords(row, red, blue, rDebut, bDebut)
            self._convertFightStats(row, red, blue, rDebut, bDebut)

builder = UFC_TimeSeries(df, wc_stats)
builder.build_timerseries_data()

In [33]:
# Verify that the new features have been added successfully
feature_subset = df[builder.timeseries_cols]
feature_subset.sample(4)

Unnamed: 0,RedUFC_Debut,BlueUFC_Debut,RedDaysSinceLastFight,BlueDaysSinceLastFight,RedWonLastFight,BlueWonLastFight,RedCurrELO,BlueCurrELO,RedExpectedValue,BlueExpectedValue,RedAvgSigStrLanded,BlueAvgSigStrLanded,RedAvgSigStrPct,BlueAvgSigStrPct,RedAvgSubAtt,BlueAvgSubAtt,RedAvgTDLanded,BlueAvgTDLanded,RedAvgTDPct,BlueAvgTDPct
4457,False,False,105,147,False,True,1540.200748,1516.0,0.534771,0.465229,5.816716,17.0,0.089409,0.225,0.099274,0.083333,0.486113,0.24125,0.07536,0.064
3254,False,False,195,181,True,True,1569.342243,1560.78623,0.512311,0.487689,5.526913,8.708331,0.067803,0.120882,0.142554,0.015872,0.387991,0.184211,0.042932,0.11235
4944,False,False,164,70,True,True,1585.343583,1549.946779,0.550765,0.449235,4.100451,5.929091,0.05036,0.125833,0.064597,0.030769,0.114468,0.214286,0.028362,0.048867
4980,False,False,90,61,False,False,1519.014142,1512.551594,0.509299,0.490701,5.063652,6.934545,0.089795,0.15125,0.086955,0.038462,0.182292,0.519943,0.063984,0.108
