# UFC Predictor 

Train an AI UFC sports-betting model

In [62]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# import torchvision.transforms as transforms
# from torchvision.datasets import ImageFolder
# import timm

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from datetime import datetime
from collections import defaultdict
import sys

# Report the interpreter and library versions in use, one line per component
print("\n".join([
    f"Python version: {sys.version}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    f"Numpy version: {np.__version__}",
    f"Pandas version: {pd.__version__}",
]))

Python version: 3.13.2 (v3.13.2:4f8bb3947cf, Feb  4 2025, 11:51:10) [Clang 15.0.0 (clang-1500.3.9.4)]
PyTorch version: 2.7.1
CUDA available: False
Numpy version: 2.3.1
Pandas version: 2.3.1


In [63]:
# Load the fight-level dataset; the path is relative to the notebook's working directory
df = pd.read_csv('datasets/ufc-master.csv')

# Dataset shape and basic info.
# df.info() prints its report itself, so it is called bare between the two rule lines.
print("UFC dataframe shape:" , df.shape)
print("\n--------")
df.info()
print("--------")

# df.columns.tolist() # Column information

UFC dataframe shape: (6528, 118)

--------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6528 entries, 0 to 6527
Columns: 118 entries, RedFighter to BKOOdds
dtypes: bool(1), float64(60), int64(43), object(14)
memory usage: 5.8+ MB
--------


In [64]:
# Sample 5 random rows and display the first 10 columns.
# No random_state is passed, so a different set of rows is shown on every run.
df.sample(5).iloc[:, :10]

Unnamed: 0,RedFighter,BlueFighter,RedOdds,BlueOdds,RedExpectedValue,BlueExpectedValue,Date,Location,Country,Winner
995,Punahele Soriano,Roman Kopylov,-140.0,120.0,71.4286,120.0,2023-01-14,"Las Vegas, Nevada, USA",USA,Blue
3005,Yushin Okami,Aleksei Kunchenko,250.0,-300.0,250.0,33.3333,2018-12-01,"Adelaide, South Australia, Australia",Australia,Blue
5284,Donald Cerrone,Adriano Martins,-170.0,160.0,58.8235,160.0,2014-01-25,"Chicago, Illinois, USA",USA,Red
5125,Paulo Thiago,Gasan Umalatov,-175.0,165.0,57.1429,165.0,2014-05-31,"Sao Paulo, Brazil",Brazil,Blue
4833,Rafael Dos Anjos,Nate Diaz,-275.0,250.0,36.3636,250.0,2014-12-13,"Phoenix, Arizona, USA",USA,Red


## Data Preprocessing / Feature Engineering

At this stage, we want to explore our ingested UFC dataset, drop any outliers and/or remove noise, and start curating a set of features that will help us best predict UFC fighting outcomes.

### Dropping Official UFC Rankings

The official UFC rankings are dropped because they are both limited and political/subjective in nature. See the following:
1. Only the top 15 fighters per division are ranked
2. Rankings are infrequent and do not update after every fight
3. Rankings often involve media voting or are influenced by the marketability of the fighter
4. Fighters must also be active within the last 12 months or have an upcoming fight booked to be considered in the rankings

A better alternative, which we will see below, is to include a pre-fight ELO score for each fighter. This serves as a normalized/relative ranking metric that is less biased and works better as a quantitative measure of a fighter's ranking/skill at the time of their fight.

### Dropping UFC Betting Odds

We don't want to include betting odds since bookmakers already make their predictions based on available data such as injuries, training camps, and sourcing people's opinions/pre-fight wagers on who will win. This sort of circular logic could result in target leakage. We are not trying to improve upon or build on top of Vegas odds.


In [65]:
# Official UFC rankings: every column whose name ends in "Rank"
rank_cols = df.columns[df.columns.str.contains('.*Rank$')]
df = df.drop(columns=rank_cols)
print(f"DataFrame after dropping UFC rankings: {df.shape}") # dropped 29 columns on 'Rank'

# Bookmaker odds: every column whose name ends in "Odds"
betting_odds = df.columns[df.columns.str.contains('.*Odds$')]
df = df.drop(columns=betting_odds)
print(f"DataFrame after dropping betting odds: {df.shape}") # dropped 8 columns on 'Odds'


DataFrame after dropping UFC rankings: (6528, 89)
DataFrame after dropping betting odds: (6528, 81)


In [66]:
# ELO scores are one of the features added further down. Before computing them,
# list the outcomes the dataset records to see how draws/no-contests show up.
winner_vals = pd.unique(df['Winner'])
finish_vals = pd.unique(df['Finish'])
print(winner_vals, finish_vals, sep="\n")

# A number of rows have no recorded 'Finish'. They are not verified here: the notebook
# assumes none of them was a draw and that the 'Winner' column holds the correct result.
finish_nan_indices = df[df['Finish'].isna()].index.tolist()
print(finish_nan_indices[:5], len(finish_nan_indices))

# Parse 'Date' into datetime objects so that date arithmetic works later on
print(f"\n\nCurrent type of \'Date\': {df['Date'].dtype}")
df['Date'] = pd.to_datetime(df['Date'])
print(f"New \'Date\' dtype: {df['Date'].dtype}")

['Red' 'Blue']
['SUB' 'U-DEC' 'S-DEC' 'KO/TKO' 'M-DEC' 'DQ' nan 'Overturned']
[2160, 2202, 2203, 2204, 2205] 238


Current type of 'Date': object
New 'Date' dtype: datetime64[ns]


In [None]:
class TimeSeriesBuilder():
    """Add pre-fight ELO ratings and days-since-last-fight columns to a fight table.

    The frame passed in is modified in place. The builder reads the columns
    'RedFighter', 'BlueFighter', 'Winner', 'Finish' and 'Date'; the 'Date' values
    must support subtraction yielding an object with a ``.days`` attribute
    (e.g. pandas Timestamps).
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        # Plain dicts: a missing fighter must be detected explicitly, never
        # silently materialised with a meaningless default value.
        self.currELO = {}    # fighter name -> latest (post-fight) ELO rating
        self.lastFight = {}  # fighter name -> date of the most recent fight seen

    def _update_ELO_stats(self, row, RFighter, BFighter, K=32, default_ELO=1500) -> None:
        """Record both fighters' pre-fight ELO for `row`, then update the rating cache.

        Args:
            row: index label of the fight in ``self.df``.
            RFighter: name of the red-corner fighter.
            BFighter: name of the blue-corner fighter.
            K: ELO update step size.
            default_ELO: rating given to a fighter who has no earlier fight in the data.
        """
        # Pre-fight ratings: unseen fighters start at the default
        Rr = self.currELO.get(RFighter, default_ELO)
        Rb = self.currELO.get(BFighter, default_ELO)

        self.df.loc[row, 'RCurrElo'], self.df.loc[row, 'BCurrElo'] = Rr, Rb

        # Expected win probability of each fighter under the ELO model
        expR = 1 / (1 + (10 ** ((Rb - Rr) / 400)))
        expB = 1 - expR

        self.df.loc[row, 'RedExpectedValue'], self.df.loc[row, 'BlueExpectedValue'] = expR, expB

        # Score the fight: "DQ" and "Overturned" count as a draw (0.5 each),
        # otherwise the fighter named in 'Winner' gets 1 and the other 0.
        red_score, blue_score = 0, 0
        finish = self.df.loc[row, 'Finish']
        if finish == "DQ" or finish == "Overturned":
            red_score, blue_score = 0.5, 0.5
        elif self.df.loc[row, 'Winner'].lower() == "red":
            red_score = 1
        else:
            blue_score = 1

        # Store the post-fight ratings for use by each fighter's next bout
        self.currELO[RFighter] = Rr + K * (red_score - expR)
        self.currELO[BFighter] = Rb + K * (blue_score - expB)

    def _update_time_since_last_fight(self, row: int, RFighter: str, BFighter: str) -> None:
        """Write the number of days since each fighter's previous fight for `row`.

        A fighter with no earlier fight keeps whatever value the column already holds.
        """
        curr_fight_date = self.df.loc[row, "Date"]

        if RFighter in self.lastFight:
            rdiff = (curr_fight_date - self.lastFight[RFighter]).days
            self.df.loc[row, "RTimeSinceLastFight"] = rdiff

        if BFighter in self.lastFight:
            bdiff = (curr_fight_date - self.lastFight[BFighter]).days
            self.df.loc[row, "BTimeSinceLastFight"] = bdiff

        self.lastFight[RFighter] = self.lastFight[BFighter] = curr_fight_date

    def build_timeseries_data(self):
        """Create (or overwrite) the ELO, expected-value and time-since-last-fight columns.

        Fights are processed from oldest to newest. The method can be called
        repeatedly on the same builder and yields the same columns every time.
        """
        # Start from a clean slate: ratings and dates left over from an earlier
        # call would otherwise be treated as each fighter's history.
        self.currELO.clear()
        self.lastFight.clear()

        # Create or overwrite existing columns
        self.df['RCurrElo']             = 0.0
        self.df['BCurrElo']             = 0.0
        self.df['RedExpectedValue']     = 0.0 # Overwrite this column w/ our own Exp Prob based on ELO
        self.df['BlueExpectedValue']    = 0.0 # Overwrite this column w/ our own Exp Prob based on ELO

        # For a fighter's first appearance in the UFC, they have likely been fighting multiple times
        # a year, so we'll set an approximate default value of 90 days since last fight
        self.df['RTimeSinceLastFight']  = np.int64(90)
        self.df['BTimeSinceLastFight']  = np.int64(90)

        # Visit fights oldest-first. Reversing before a stable sort keeps the
        # bottom-to-top row order among fights on the same date, so a frame that is
        # already stored newest-first is traversed exactly as a plain reverse scan.
        chronological = self.df["Date"].iloc[::-1].sort_values(kind="stable").index
        for r in chronological:
            red_fighter, blue_fighter = self.df.loc[r, "RedFighter"], self.df.loc[r, "BlueFighter"]

            # Create the ELO scores
            self._update_ELO_stats(r, red_fighter, blue_fighter)
            # Create TimeSinceLastFight data
            self._update_time_since_last_fight(r, red_fighter, blue_fighter)

# The builder keeps a reference to `df` and writes the new ELO / expected-value /
# time-since-last-fight columns straight into it, so nothing needs to be assigned back.
builder = TimeSeriesBuilder(df)
builder.build_timeseries_data()
# stats = builder.group_stats_by_fighter()

In [60]:
# Spot-check the engineered red-corner columns on a few randomly chosen fights
feature_subset = df.loc[:, ["RedFighter", "RCurrElo", "RedExpectedValue", "RTimeSinceLastFight"]]
feature_subset.sample(4)

Unnamed: 0,RedFighter,RCurrElo,RedExpectedValue,RTimeSinceLastFight
4844,Carla Esparza,1500.0,0.5,90
5718,Mike Pierce,1555.597569,0.511832,70
6126,Takeya Mizugaki,1500.0,0.52301,84
850,Iasmin Lucindo,1484.0,0.5,252


In [73]:
def group_stats_by_fighter(fights_df=None) -> pd.DataFrame:
    '''
    Group the fight table by fighter and return one summary row per fighter.

    Args:
        fights_df: frame with the columns 'RedFighter', 'BlueFighter', 'RCurrElo',
            'BCurrElo' and 'Date'. When omitted, the notebook-level `df` is used,
            so the zero-argument call keeps working.

    Returns:
        DataFrame with the columns Fighter, fights, firstFight, lastFight,
        minElo, maxElo, avgElo and daysActive (days between first and last fight).
    '''
    if fights_df is None:
        fights_df = df  # notebook-level frame

    # Stack the red and blue corners into one long table: one row per (fighter, fight)
    red = fights_df[['RedFighter', 'RCurrElo', 'Date']].rename(
        columns={'RedFighter': 'Fighter', 'RCurrElo': 'Elo'})
    blue = fights_df[['BlueFighter', 'BCurrElo', 'Date']].rename(
        columns={'BlueFighter': 'Fighter', 'BCurrElo': 'Elo'})
    all_fights = pd.concat([red, blue])

    # Create separate DataFrame where each row is a fighter
    stats = all_fights.groupby('Fighter').agg(
        fights=('Date', 'count'),
        firstFight=('Date', 'min'),
        lastFight=('Date', 'max'),
        minElo=('Elo', 'min'),
        maxElo=('Elo', 'max'),
        avgElo=('Elo', 'mean'),
    ).reset_index()

    # Career span derived from the aggregated dates (vectorised, no per-group callback)
    stats['daysActive'] = (stats['lastFight'] - stats['firstFight']).dt.days

    return stats

# Build the per-fighter summary table and preview five randomly chosen fighters
fighter_stats = group_stats_by_fighter()
fighter_stats.sample(5)

Unnamed: 0,Fighter,fights,firstFight,lastFight,minElo,maxElo,avgElo,daysActive
1622,Quinn Mulhern,2,2013-03-16,2014-01-04,1485.027229,1500.0,1492.513614,294
406,Clifford Starks,3,2011-10-29,2013-04-20,1500.0,1516.0,1505.601251,539
1842,Shinsho Anzai,4,2014-08-23,2018-06-23,1484.0,1517.379323,1500.528908,1400
1657,Ray Rodriguez,2,2020-09-05,2021-03-13,1485.029236,1500.0,1492.514618,189
1581,Paul Buentello,1,2010-03-21,2010-03-21,1500.0,1500.0,1500.0,0


In [None]:
# Planning notes kept as a bare string literal; as the cell's last expression,
# Jupyter echoes it back as the cell output.
'''
Other Metrics to Build / View:
------------------------------------
Avg_strikes_per_round/match
Avg_control_time_per_round/match

In the future:
- add injuries -- current and healing from
- add fighter age in each fight
'''

'\nOther Metrics to Build:\nAvg_strikes_per_round/match\nAvg_control_time_per_round/match\nTime_since_last_fight\n\nIn the future:\n- add injuries -- current and healing from\n'