In [1]:
# !pip install --upgrade pandas

In [3]:
import pandas as pd
import numpy as np

In [10]:
events_df = pd.read_csv('ufc_event_details.csv')

column_mapping = {}
for col in events_df.columns:
    column_mapping[col] = col.lower()
events_df = events_df.rename(mapper=column_mapping, errors='raise', axis=1)

events_df['event'] = events_df['event'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower().dropna()
events_df['date'] = pd.to_datetime(events_df['date']).dropna()
events_df[['date', 'event']].head()

Unnamed: 0,date,event
0,2024-08-10,ufc fight night: tybura vs. spivac 2
1,2024-08-03,ufc fight night: sandhagen vs. nurmagomedov
2,2024-07-27,ufc 304: edwards vs. muhammad 2
3,2024-07-20,ufc fight night: lemos vs. jandiroba
4,2024-07-13,ufc fight night: namajunas vs. cortez


In [11]:
fight_results_df = pd.read_csv('./ufc_fight_results.csv')

column_mapping = {}
for col in fight_results_df.columns:
    column_mapping[col] = col.lower()
fight_results_df = fight_results_df.rename(mapper=column_mapping, errors='raise', axis=1)

fight_results_df = fight_results_df.dropna(subset=['outcome', 'weightclass', 'bout', 'event'])
fight_results_df = fight_results_df[(~fight_results_df['outcome'].isin(['D/D', 'NC/NC']))]
relevant_weight_classes = ['Flyweight', 'Bantamweight', 'Featherweight', 'Lightweight', 'Welterweight', 'Middleweight', 'Light Heavyweight']
modded_relevant_weight_classes = []
for weight_class in relevant_weight_classes:
  modded_relevant_weight_classes.append(weight_class + ' Bout')
  modded_relevant_weight_classes.append('UFC ' + weight_class + ' Title Bout')
  modded_relevant_weight_classes.append('UFC Interim ' + weight_class + ' Title Bout')
fight_results_df = fight_results_df[(fight_results_df['weightclass'].isin(modded_relevant_weight_classes))]
weightclass_mapping = {
    "Featherweight Bout": 145,
    "UFC Featherweight Title Bout": 145,
    "UFC Interim Featherweight Title Bout": 145,
    "Bantamweight Bout": 135,
    "UFC Bantamweight Title Bout": 135,
    "UFC Interim Bantamweight Title Bout": 135,
    "Lightweight Bout": 155,
    "UFC Lightweight Title Bout": 155,
    "UFC Interim Lightweight Title Bout": 155,
    "Welterweight Bout": 170,
    "UFC Welterweight Title Bout": 170,
    "UFC Interim Welterweight Title Bout": 170,
    "Middleweight Bout": 185,
    "UFC Middleweight Title Bout": 185,
    "UFC Interim Middleweight Title Bout": 185,
    "Light Heavyweight Bout": 205,
    "UFC Light Heavyweight Title Bout": 205,
    "UFC Interim Light Heavyweight Title Bout": 205,
    "Flyweight Bout": 125,
    "UFC Flyweight Title Bout": 125,
    "UFC Interim Flyweight Title Bout": 125
}
fight_results_df['weightclass'] = fight_results_df['weightclass'].map(weightclass_mapping)
fight_results_df[['fighter_a', 'fighter_b']] = fight_results_df['bout'].str.split(' vs. ', expand=True)
fight_results_df['fighter_a'] = fight_results_df['fighter_a'].str.replace(r'\s+', ' ', regex=True).str.strip()
fight_results_df['fighter_b'] = fight_results_df['fighter_b'].str.replace(r'\s+', ' ', regex=True).str.strip()
fight_results_df['outcome'] = fight_results_df['outcome'].apply(lambda x: 1 if x == 'W/L' else 0)
fight_results_df = fight_results_df[(fight_results_df['method'] != 'dq ')]
fight_results_df = fight_results_df.drop('referee', axis=1)
fight_results_df = fight_results_df.drop('details', axis=1)
fight_results_df['event'] = fight_results_df['event'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower()
fight_results_df['bout'] = fight_results_df['bout'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower()
fight_results_df['method'] = fight_results_df['method'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower().apply(lambda x: 'decision' if 'decision' in x else x)
fight_results_df = fight_results_df[(~fight_results_df['method'].isin(["tko - doctor's stoppage", 'dq']))]

print('pre-merge', fight_results_df.shape[0])

fight_results_df = pd.merge(fight_results_df, events_df[['event', 'date']], on='event', how='left').dropna()

print('post-merge', fight_results_df.shape[0])

def time_to_seconds(t):
    if pd.isnull(t):
        return np.nan
    if t == '0:00': return 0
    if '--' in t: return np.nan
    t = t.split(':')
    return int(t[0]) * 60 + int(t[1])

def fight_results_time(row):
    returner = time_to_seconds(row['time'])
    if pd.isnull(returner):
        returner = row['round'] * 300
    row['time'] = returner
    return row

fight_results_df = fight_results_df.apply(fight_results_time, axis=1).apply(pd.Series).dropna()
print('coerced times', fight_results_df.shape[0])

def total_duration(row):
    returner = 0
    if row['round'] == 1:
        returner = row['time']
    returner = row['time'] + (row['round'] - 1) * 300
    row['total_time'] = returner
    return row

fight_results_df = fight_results_df.apply(total_duration, axis=1)
print('durations applied', fight_results_df.shape[0])

fight_results_df = fight_results_df.drop(columns=['url', 'time format'], axis=1)
print(fight_results_df['method'].unique())
fight_results_df.head()

pre-merge 5842
post-merge 5842
coerced times 5842
durations applied 5842
['decision' 'submission' 'ko/tko']


Unnamed: 0,event,bout,outcome,weightclass,method,round,time,fighter_a,fighter_b,date,total_time
0,ufc fight night: tybura vs. spivac 2,damon jackson vs. chepe mariscal,0,145,decision,3,300,Damon Jackson,Chepe Mariscal,2024-08-10,900
1,ufc fight night: tybura vs. spivac 2,danny barlow vs. nikolay veretennikov,1,170,decision,3,300,Danny Barlow,Nikolay Veretennikov,2024-08-10,900
2,ufc fight night: tybura vs. spivac 2,chris gutierrez vs. quang le,1,135,decision,3,300,Chris Gutierrez,Quang Le,2024-08-10,900
3,ufc fight night: tybura vs. spivac 2,toshiomi kazama vs. charalampos grigoriou,1,135,submission,2,115,Toshiomi Kazama,Charalampos Grigoriou,2024-08-10,415
4,ufc fight night: tybura vs. spivac 2,youssef zalal vs. jarno errens,1,145,submission,1,232,Youssef Zalal,Jarno Errens,2024-08-10,232


In [35]:
def height_to_inches(height):
    if height == '--': return None
    feet, inches = height.split("' ")
    inches = inches.replace('"', '')
    return int(feet) * 12 + int(inches)

def reach_to_inches(reach):
    if reach == '--': return None
    return int(reach.replace('"', ''))

def weight_to_num(weight):
    if not ' lbs.' in weight: return None
    return int(weight.replace(' lbs.', ''))

fighters_df = pd.read_csv('./ufc_fighter_tott.csv')
column_mapping = {}
for col in fighters_df.columns:
    column_mapping[col] = col.lower()
fighters_df = fighters_df.rename(mapper=column_mapping, errors='raise', axis=1)
# Remove Bruno Silvas
fighters_df = fighters_df[fighters_df['fighter'] != 'Bruno Silva']
print('before...', fighters_df.shape[0])
fighters_df = fighters_df[(~fighters_df['dob'].isna()) & (fighters_df['dob'].str.len() > 3) & (~fighters_df['height'].isna()) & (~fighters_df['reach'].isna())]
fighters_df['dob'] = pd.to_datetime(fighters_df['dob'], errors='coerce')
fighters_df['weight'] = fighters_df['weight'].apply(weight_to_num).astype(float)
fighters_df['height'] = fighters_df['height'].apply(height_to_inches).astype(float)
fighters_df['reach'] = fighters_df['reach'].apply(reach_to_inches).astype(float)
fighters_df = fighters_df.dropna(subset=['height', 'weight', 'reach', 'dob'])
fighters_df = fighters_df.drop(columns=['stance', 'url'])
fighters_df = fighters_df.drop_duplicates(subset=['fighter', 'dob'])
fighters_df['fighter'] = fighters_df['fighter'].str.replace(r'\s+', ' ', regex=True).str.strip().dropna()
print('after...', fighters_df.shape[0])
fighters_df.head()

before... 8473
after... 2264


Unnamed: 0,fighter,height,weight,reach,dob
3,Shamil Abdurakhimov,75.0,235.0,76.0,1981-09-02
5,Daichi Abe,71.0,170.0,71.0,1991-11-27
8,Klidson Abreu,72.0,205.0,74.0,1992-12-24
11,Juan Adams,77.0,265.0,80.0,1992-01-16
12,Anthony Adams,73.0,185.0,76.0,1988-01-13


In [39]:
fight_stats_df = pd.read_csv('./ufc_fight_stats.csv')

def round_to_int(round):
    if pd.isnull(round): return None
    return int(round.split(' ')[1])

def get_stat_part(key, stat):
    if not isinstance(stat, str):
      return None
    if pd.isnull(stat):
      return None
    splitted = stat.split(' of ')
    if len(splitted) == 2:
      return splitted[key]
    return None

def get_left_stat(stat):
    return get_stat_part(0, stat)

def get_right_stat(stat):
    return get_stat_part(1, stat)

fight_stats_df['ROUND'] = fight_stats_df['ROUND'].apply(round_to_int)
fight_stats_df['ROUND'] = fight_stats_df['ROUND'].replace([np.inf, -np.inf], np.nan)
fight_stats_df['ROUND'] = fight_stats_df['ROUND'].dropna()

for after, before in [
  ['SIG_STR_', 'SIG.STR.'],
  ['TOTAL_STR_', 'TOTAL STR.'],
  ['TD_', 'TD'],
  ['HEAD_', 'HEAD'],
  ['BODY_', 'BODY'],
  ['LEG_', 'LEG'],
  ['DISTANCE_', 'DISTANCE'],
  ['CLINCH_', 'CLINCH'],
  ['GROUND_', 'GROUND'],
]:
    fight_stats_df[after + 'LANDED'] = fight_stats_df[before].apply(get_left_stat).astype(float)
    fight_stats_df[after + 'LANDED'] = fight_stats_df[after + 'LANDED'].astype(float)
    fight_stats_df[after + 'ATTEMPTED'] = fight_stats_df[before].apply(get_right_stat)
    fight_stats_df[after + 'ATTEMPTED'] = fight_stats_df[after + 'ATTEMPTED'].astype(float)
    fight_stats_df.drop(columns=[before], inplace=True)

fight_stats_df['CTRL'] = fight_stats_df['CTRL'].apply(time_to_seconds).dropna().astype(float)
fight_stats_df.drop(columns=['SIG.STR. %', 'TD %'], inplace=True)
fight_stats_df['EVENT'] = fight_stats_df['EVENT'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower().reset_index(drop=True)

fight_stats_df['BOUT'] = fight_stats_df['BOUT'].str.replace(r'\s+', ' ', regex=True).str.strip().str.lower().reset_index(drop=True)

column_mapping = {}
for col in fight_stats_df.columns:
    column_mapping[col] = col.lower()

column_mapping['REV.'] = 'reversals'
column_mapping['CTRL'] = 'control_time'
column_mapping['SUB.ATT'] = 'sub_attempted'
column_mapping['KD'] = 'knockdowns'

fight_stats_df = fight_stats_df.rename(mapper=column_mapping, errors='raise', axis=1).reset_index(drop=True)

def get_fight_data(fighter, event, bout):
    fight_results_row = fight_results_df[['fighter_a', 'fighter_b', 'date', 'outcome', 'method']][((fight_results_df['fighter_a'] == fighter) | (fight_results_df['fighter_b'] == fighter)) & (fight_results_df['event'] == event) & (fight_results_df['bout'] == bout)]
    if fight_results_row.empty:
        return [None, None, None, None]
    outcome = fight_results_row['outcome'].values[0]
    method = fight_results_row['method'].values[0]
    if fight_results_row['fighter_a'].values[0] == fighter:
        return [fight_results_row['fighter_b'].values[0], fight_results_row['date'].values[0], outcome, method]
    return [fight_results_row['fighter_a'].values[0], fight_results_row['date'].values[0], 1 if outcome == 0 else 0, method]

def apply_fight_data(row):
    row['opponent'], row['date'], row['outcome'], row['method'] = get_fight_data(row['fighter'], row['event'], row['bout'])
    return row

fight_stats_df = fight_stats_df.apply(apply_fight_data, axis=1).dropna()

for col in fight_stats_df.columns:
    print(col)

fight_stats_df[['event', 'bout', 'fighter', 'opponent', 'outcome', 'method']][(fight_stats_df['fighter'] == 'Youssef Zalal')].head()

event
bout
round
fighter
knockdowns
sub_attempted
reversals
control_time
sig_str_landed
sig_str_attempted
total_str_landed
total_str_attempted
td_landed
td_attempted
head_landed
head_attempted
body_landed
body_attempted
leg_landed
leg_attempted
distance_landed
distance_attempted
clinch_landed
clinch_attempted
ground_landed
ground_attempted
opponent
date
outcome
method


Unnamed: 0,event,bout,fighter,opponent,outcome,method
42,ufc fight night: tybura vs. spivac 2,youssef zalal vs. jarno errens,Youssef Zalal,Jarno Errens,1.0,submission
1090,ufc fight night: ribas vs. namajunas,billy quarantillo vs. youssef zalal,Youssef Zalal,Billy Quarantillo,1.0,submission
1091,ufc fight night: ribas vs. namajunas,billy quarantillo vs. youssef zalal,Youssef Zalal,Billy Quarantillo,1.0,submission
8057,ufc fight night: rozenstruik vs. sakai,sean woodson vs. youssef zalal,Youssef Zalal,Sean Woodson,0.0,decision
8058,ufc fight night: rozenstruik vs. sakai,sean woodson vs. youssef zalal,Youssef Zalal,Sean Woodson,0.0,decision


In [53]:
missing_fight_details = 0
missing_fighter_results = 0
missing_opp_results = 0
missing_fighter_stats = 0
missing_opp_stats = 0

# fighter_stats_df = pd.read_csv('fighter_stats.csv')

def get_fighter_stats(fighter, event, bout):
    global missing_fight_details
    global missing_fighter_results
    global missing_opp_results
    global missing_fighter_stats
    global missing_opp_stats
    fighter_results = fight_stats_df[(fight_stats_df['fighter'] == fighter) & (fight_stats_df['event'] == event) & (fight_stats_df['bout'] == bout)]
    if fighter_results.shape[0] == 0:
        missing_fighter_results += 1
        return None
    fight_date = fighter_results['date'].head(1).item()
    fighter_results = fighter_results.drop(columns=['fighter', 'event', 'bout', 'round'])
    fighter_stats = fighters_df[['weight', 'height', 'reach', 'dob']][fighters_df['fighter'] == fighter].head(1)
    if fighter_stats.shape[0] == 0:
        missing_fighter_stats += 1
        return None
    opponent_name = fighter_results['opponent'].head(1).item()
    opp_results = fight_stats_df[(fight_stats_df['fighter'] == opponent_name) & (fight_stats_df['event'] == event) & (fight_stats_df['bout'] == bout)]
    opp_results = opp_results.drop(columns=['fighter', 'event', 'bout', 'round'])
    if opp_results.shape[0] == 0:
        missing_opp_results += 1
        return None
    opp_stats = fighters_df[['weight', 'height', 'reach', 'dob']][fighters_df['fighter'] == opponent_name].head(1)
    if opp_stats.shape[0] == 0:
        missing_opp_stats += 1
        return None
    age = fight_date - fighter_stats['dob'].item()
    age = age.days / 365.25
    opponent_age = fight_date - opp_stats['dob'].item()
    opponent_age = opponent_age.days / 365.25
    returner = {
        'date': fight_date,
        'event': event,
        'bout': bout,
        'fighter': fighter,
        'weight': float(fighter_stats['weight'].astype(float).values[0]),
        'height': float(fighter_stats['height'].astype(float).values[0]),
        'reach': float(fighter_stats['reach'].astype(float).values[0]),
        'age': float(age),
        'outcome': float(fighter_results['outcome'].astype(float).values[0]),
        'method': fighter_results['method'].values[0],
        'opponent': opponent_name,
    }
    for col in filter(lambda x: x.endswith('landed') or x.endswith('attempted') or x in ['knockdowns', 'reversals', 'control_time'], fighter_results.columns):
        returner[col] = float(fighter_results[col].sum())
    for col in filter(lambda x: x.endswith('landed') or x.endswith('attempted') or x in ['knockdowns', 'reversals', 'control_time'], opp_results.columns):
        returner['opponent_' + col] = float(opp_results[col].sum())
    returner['opponent_weight'] = float(opp_stats['weight'].astype(float).values[0])
    returner['opponent_height'] = float(opp_stats['height'].astype(float).values[0])
    returner['opponent_reach'] = float(opp_stats['reach'].astype(float).values[0])
    returner['opponent_age'] = float((fight_date - opp_stats['dob'].item()).days / 365.25)
    for key in ['height', 'weight', 'reach', 'age', 'control_time', 'reversals', 'knockdowns']:
        numerator = float(returner[key] - returner['opponent_' + key])
        denominator = float(returner[key] + returner['opponent_' + key])
        returner[key + '_diff'] = (numerator if denominator != 0 else 0) / (denominator if denominator != 0 else 1)
    for key in ['total_str', 'sig_str', 'td', 'ground', 'head', 'body', 'leg', 'distance', 'clinch']:
        landed_numerator = float(returner[key + '_landed'] - returner['opponent_' + key + '_landed'])
        landed_denominator = float(returner[key + '_landed'] + returner['opponent_' + key + '_landed'])
        returner[key + '_landed_diff'] = (landed_numerator if landed_denominator != 0 else 0) / (landed_denominator if landed_denominator != 0 else 1)
        attempted_numerator = float(returner[key + '_attempted'] - returner['opponent_' + key + '_attempted'])
        attempted_denominator = float(returner[key + '_attempted'] + returner['opponent_' + key + '_attempted'])
        returner[key + '_attempted_diff'] = (attempted_numerator if attempted_denominator != 0 else 0) / (attempted_denominator if attempted_denominator != 0 else 1)
        absorbed = float(returner['opponent_' + key + '_landed'])
        defended = float(returner['opponent_' + key + '_attempted']) - absorbed
        if defended < 0:
            defended = 0
        returner[key + '_absorbed'] = absorbed
        returner[key + '_defended'] = defended
        absorbed_numerator = float(absorbed - defended)
        absorbed_denominator = float(absorbed + defended)
        returner[key + '_absorbed_diff'] = (absorbed_numerator if absorbed_denominator != 0 else 0) / (absorbed_denominator if absorbed_denominator != 0 else 1)
        returner[key + '_defended_diff'] = (defended if defended != 0 else 0) / (absorbed + defended if absorbed + defended != 0 else 1)
    knockdowns_absorbed = float(returner['opponent_knockdowns'])
    knockdowns_numerator = float(returner['knockdowns'] - knockdowns_absorbed)
    knockdowns_denominator = float(returner['knockdowns'] + knockdowns_absorbed)
    returner['knockdowns_diff'] = (knockdowns_numerator if knockdowns_denominator != 0 else 0) / (knockdowns_denominator if knockdowns_denominator != 0 else 1)
    return returner

def apply_get_fighter_stats(row):
    return get_fighter_stats(row['fighter'], row['event'], row['bout'])

unique_fights = fight_stats_df[['fighter', 'event', 'bout']].drop_duplicates()
unique_fights['fighter'] = unique_fights['fighter'].str.strip().dropna()
unique_fights['event'] = unique_fights['event'].str.strip().dropna()
unique_fights['bout'] = unique_fights['bout'].str.strip().dropna()
print('total unique fights', unique_fights.shape[0])

fighter_stats_df = unique_fights.apply(apply_get_fighter_stats, axis=1).apply(pd.Series).dropna()

total unique fights 11606


In [54]:
# fighter_stats_df[['date', 'fighter', 'opponent', 'outcome', 'method']][fighter_stats_df['fighter'] == 'Youssef Zalal']
fighter_stats_df.describe()

Unnamed: 0,date,weight,height,reach,age,outcome,knockdowns,sub_attempted,reversals,control_time,...,distance_absorbed,distance_defended,distance_absorbed_diff,distance_defended_diff,clinch_landed_diff,clinch_attempted_diff,clinch_absorbed,clinch_defended,clinch_absorbed_diff,clinch_defended_diff
count,10274,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,...,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0
mean,2016-11-17 19:48:33.217831168,163.138992,70.38028,72.324411,30.292191,0.5,0.237785,0.386704,0.140062,132.100058,...,28.029979,43.49416,-0.217096,0.605433,0.0,0.0,4.836188,2.173448,0.266387,0.251905
min,2000-04-14 00:00:00,125.0,62.0,62.0,18.171116,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,-1.0,-1.0,0.0,0.0,-1.0,0.0
25%,2013-06-08 00:00:00,145.0,68.0,70.0,27.479808,0.0,0.0,0.0,0.0,6.0,...,7.0,11.0,-0.409091,0.51518,-0.333333,-0.333333,0.0,0.0,0.0,0.0
50%,2017-04-22 00:00:00,155.0,70.0,72.0,30.071184,0.5,0.0,0.0,0.0,58.0,...,19.0,31.0,-0.22251,0.611255,0.0,0.0,2.0,1.0,0.2,0.2
75%,2021-01-20 00:00:00,185.0,73.0,75.0,32.845996,1.0,0.0,0.0,0.0,193.0,...,41.0,63.0,-0.030361,0.704545,0.333333,0.333333,6.0,3.0,0.666667,0.416667
max,2024-08-10 00:00:00,255.0,78.0,84.0,47.854894,1.0,5.0,10.0,6.0,1342.0,...,439.0,362.0,1.0,1.0,1.0,1.0,78.0,43.0,1.0,1.0
std,,23.425473,2.9435,3.493479,3.99242,0.500024,0.51608,0.828565,0.432255,174.140904,...,28.927755,41.872129,0.320577,0.167082,0.574173,0.535075,6.986904,3.371523,0.518399,0.284299


In [44]:
fighter_stats_df.to_csv('fighter_stats.csv', index=False)

In [55]:
def fighter_history_by_date(fighter, date):
    full_history = fighter_stats_df[(fighter_stats_df['fighter'] == fighter) & (fighter_stats_df['date'] < date)]
    recent_history = full_history[full_history['date'] >= date - pd.Timedelta(days=730)]
    returner = {}
    for key in ['weight', 'height', 'reach', 'age']:
        returner['avg_' + key + '_diff'] = full_history[key + '_diff'].mean() if full_history.shape[0] > 0 else 0.0
        returner['recent_avg_' + key + '_diff'] = recent_history[key + '_diff'].mean() if recent_history.shape[0] > 0 else 0.0
    to_do = []
    for key in ['knockdowns', 'reversals', 'control_time', 'age', 'weight', 'reach', 'height']:
        to_do += [key, key + '_diff']
    for key in ['total_str', 'sig_str', 'td', 'ground', 'head', 'body', 'leg', 'distance', 'clinch']:
        for suffix in ['landed_diff', 'attempted_diff', 'absorbed_diff', 'defended_diff', 'landed', 'attempted', 'absorbed', 'defended']:
            to_do.append('_'.join([key, suffix]))
    for key in to_do:
        avgK = '_'.join(['avg', key])
        peakK = '_'.join([key, 'peak'])
        valleyK = '_'.join([key, 'valley'])
        recentAvgK = '_'.join(['recent_avg', key])
        returner[avgK] = full_history[key].mean() if full_history.shape[0] > 0 else 0.0
        returner[recentAvgK] = recent_history[key].mean() if recent_history.shape[0] > 0 else 0.0
        if 'absorbed' in key:
            returner[peakK] = full_history[key].min() if full_history.shape[0] > 0 else 0.0
            returner[valleyK] = full_history[key].max() if full_history.shape[0] > 0 else 0.0
        else:
          returner[peakK] = full_history[key].max() if full_history.shape[0] > 0 else 0.0
          returner[valleyK] = full_history[key].min() if full_history.shape[0] > 0 else 0.0
        returner[recentAvgK + '_vs_peak'] = returner[recentAvgK] / returner[peakK] if returner[peakK] != 0 else 0.0
        returner[recentAvgK + '_vs_valley'] = returner[recentAvgK] / returner[valleyK] if returner[valleyK] != 0 else 0.0
        returner[avgK + '_vs_peak'] = returner[avgK] / returner[peakK] if returner[peakK] != 0 else 0.0
        returner[avgK + '_vs_valley'] = returner[avgK] / returner[valleyK] if returner[valleyK] != 0 else 0.0
    returner['recent_wins'] = recent_history[recent_history['outcome'] == 1].shape[0]
    returner['recent_losses'] = recent_history[recent_history['outcome'] == 0].shape[0]
    returner['wins'] = float(full_history[full_history['outcome'] == 1].shape[0])
    returner['losses'] = float(full_history[full_history['outcome'] == 0].shape[0])
    returner['win_ratio'] = returner['wins'] / (returner['wins'] + returner['losses']) if (returner['wins'] + returner['losses']) != 0 else 0.0
    for method, transformed in [['t/ko', 'ko'], ['submission', 'sub'], ['decision', 'dec']]:
        returner[transformed + '_wins'] = full_history[(full_history['method'] == method) & (full_history['outcome'] == 1)]['outcome'].sum()
        returner[transformed + '_losses'] = full_history[(full_history['method'] == method) & (full_history['outcome'] == 0)]['outcome'].sum()
        returner[transformed + '_win_ratio'] = returner[transformed + '_wins'] / (returner[transformed + '_wins'] + returner[transformed + '_losses']) if (returner[transformed + '_wins'] + returner[transformed + '_losses']) != 0 else 0.0
        returner[transformed + '_loss_ratio'] = returner[transformed + '_losses'] / (returner[transformed + '_losses'] + returner[transformed + '_wins']) if (returner[transformed + '_losses'] + returner[transformed + '_wins']) != 0 else 0.0
        returner['recent_' + transformed + '_wins'] = recent_history[(recent_history['method'] == method) & (recent_history['outcome'] == 1)]['outcome'].shape[0]
        returner['recent_' + transformed + '_losses'] = recent_history[(recent_history['method'] == method) & (recent_history['outcome'] == 0)]['outcome'].shape[0]
    return returner

def apply_fighter_history(row):
    row = row.to_dict()
    history_data = fighter_history_by_date(row['fighter'], row['date'])
    for key in history_data:
        row['precomp_' + key] = history_data[key]
    return row

fight_stats_with_history_df = fighter_stats_df.apply(apply_fighter_history, axis=1).apply(pd.Series).dropna()

In [56]:
fight_stats_with_history_df.describe()

Unnamed: 0,date,weight,height,reach,age,outcome,knockdowns,sub_attempted,reversals,control_time,...,precomp_sub_win_ratio,precomp_sub_loss_ratio,precomp_recent_sub_wins,precomp_recent_sub_losses,precomp_dec_wins,precomp_dec_losses,precomp_dec_win_ratio,precomp_dec_loss_ratio,precomp_recent_dec_wins,precomp_recent_dec_losses
count,10274,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,...,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0
mean,2016-11-17 19:48:33.217831168,163.138992,70.38028,72.324411,30.292191,0.5,0.237785,0.386704,0.140062,132.100058,...,0.341542,0.0,0.272825,0.167705,1.536695,0.0,0.583414,0.0,0.640062,0.450847
min,2000-04-14 00:00:00,125.0,62.0,62.0,18.171116,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2013-06-08 00:00:00,145.0,68.0,70.0,27.479808,0.0,0.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2017-04-22 00:00:00,155.0,70.0,72.0,30.071184,0.5,0.0,0.0,0.0,58.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
75%,2021-01-20 00:00:00,185.0,73.0,75.0,32.845996,1.0,0.0,0.0,0.0,193.0,...,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,1.0
max,2024-08-10 00:00:00,255.0,78.0,84.0,47.854894,1.0,5.0,10.0,6.0,1342.0,...,1.0,0.0,5.0,4.0,13.0,0.0,1.0,0.0,6.0,5.0
std,,23.425473,2.9435,3.493479,3.99242,0.500024,0.51608,0.828565,0.432255,174.140904,...,0.47425,0.0,0.578692,0.416262,2.025551,0.0,0.493017,0.0,0.847517,0.663879


In [47]:
fight_stats_with_history_df.to_csv('fighter_stats.csv', index=False)

In [35]:
for c in fight_stats_with_history_df.columns:
  print(c)

date
event
bout
fighter
weight
height
reach
age
outcome
method
opponent
knockdowns
sub_attempted
reversals
control_time
sig_str_landed
sig_str_attempted
total_str_landed
total_str_attempted
td_landed
td_attempted
head_landed
head_attempted
body_landed
body_attempted
leg_landed
leg_attempted
distance_landed
distance_attempted
clinch_landed
clinch_attempted
ground_landed
ground_attempted
opponent_knockdowns
opponent_sub_attempted
opponent_reversals
opponent_control_time
opponent_sig_str_landed
opponent_sig_str_attempted
opponent_total_str_landed
opponent_total_str_attempted
opponent_td_landed
opponent_td_attempted
opponent_head_landed
opponent_head_attempted
opponent_body_landed
opponent_body_attempted
opponent_leg_landed
opponent_leg_attempted
opponent_distance_landed
opponent_distance_attempted
opponent_clinch_landed
opponent_clinch_attempted
opponent_ground_landed
opponent_ground_attempted
opponent_weight
opponent_height
opponent_reach
opponent_age
height_diff
weight_diff
reach_diff
a

In [57]:
def get_history_diffs(fighter, opponent, date):
    returner = {}
    fighter_history = fighter_history_by_date(fighter, date)
    for key in fighter_history:
        returner['precomp_' + key] = fighter_history[key]
    opponent_history = fighter_history_by_date(opponent, date)
    for key in opponent_history:
        returner['opponent_precomp_' + key] = opponent_history[key]
    to_do = []
    for key in ['knockdowns', 'reversals', 'control_time', 'age', 'weight', 'reach', 'height']:
        to_do += [key, key + '_diff']
    for key in ['total_str', 'sig_str', 'td', 'ground', 'head', 'body', 'leg', 'distance', 'clinch']:
        for suffix in ['landed_diff', 'attempted_diff', 'landed', 'attempted', 'absorbed', 'defended']:
            to_do.append('_'.join([key, suffix]))
    for key in to_do:
        avgK = '_'.join(['avg', key])
        peakK = '_'.join([key, 'peak'])
        valleyK = '_'.join([key, 'valley'])
        recentAvgK = '_'.join(['recent_avg', key])
        for k in [avgK, peakK, valleyK, recentAvgK]:
          returner['precomp_' + k + '_vs_opp'] = returner['precomp_' + k] - returner['opponent_precomp_' + k]
    return returner

def apply_history_diffs(row):
    row = row.to_dict()
    history_diff = get_history_diffs(row['fighter'], row['opponent'], row['date'])
    for k in history_diff:
        row[k] = history_diff[k]
    return row

fight_stats_with_history_diffs_df = fight_stats_with_history_df.apply(apply_history_diffs, axis=1).apply(pd.Series).dropna()

In [58]:
fight_stats_with_history_diffs_df.describe()

Unnamed: 0,date,weight,height,reach,age,outcome,knockdowns,sub_attempted,reversals,control_time,...,precomp_clinch_attempted_valley_vs_opp,precomp_recent_avg_clinch_attempted_vs_opp,precomp_avg_clinch_absorbed_vs_opp,precomp_clinch_absorbed_peak_vs_opp,precomp_clinch_absorbed_valley_vs_opp,precomp_recent_avg_clinch_absorbed_vs_opp,precomp_avg_clinch_defended_vs_opp,precomp_clinch_defended_peak_vs_opp,precomp_clinch_defended_valley_vs_opp,precomp_recent_avg_clinch_defended_vs_opp
count,10274,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,...,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0,10274.0
mean,2016-11-17 19:48:33.217831168,163.138992,70.38028,72.324411,30.292191,0.5,0.237785,0.386704,0.140062,132.100058,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,2000-04-14 00:00:00,125.0,62.0,62.0,18.171116,0.0,0.0,0.0,0.0,0.0,...,-82.0,-82.0,-57.0,-73.0,-57.0,-57.0,-40.0,-43.0,-41.0,-40.5
25%,2013-06-08 00:00:00,145.0,68.0,70.0,27.479808,0.0,0.0,0.0,0.0,6.0,...,-1.0,-4.666667,-2.714286,-7.0,0.0,-3.0,-1.333333,-3.0,0.0,-1.416667
50%,2017-04-22 00:00:00,155.0,70.0,72.0,30.071184,0.5,0.0,0.0,0.0,58.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2021-01-20 00:00:00,185.0,73.0,75.0,32.845996,1.0,0.0,0.0,0.0,193.0,...,1.0,4.666667,2.714286,7.0,0.0,3.0,1.333333,3.0,0.0,1.416667
max,2024-08-10 00:00:00,255.0,78.0,84.0,47.854894,1.0,5.0,10.0,6.0,1342.0,...,82.0,82.0,57.0,73.0,57.0,57.0,40.0,43.0,41.0,40.5
std,,23.425473,2.9435,3.493479,3.99242,0.500024,0.51608,0.828565,0.432255,174.140904,...,6.540799,9.89282,5.955573,13.640102,4.502744,6.76064,3.007234,6.919304,2.283942,3.309885


In [59]:
fight_stats_with_history_diffs_df.to_csv('fighter_stats.csv', index=False)

In [69]:
df = pd.read_csv('fighter_stats.csv')

In [51]:
# fight_sequence = fight_results_df[['date', 'fighter_b', 'outcome']][(fight_results_df['fighter_a'] == 'Youssef Zalal') | (fight_results_df['fighter_b'] == 'Youssef Zalal')]
# fight_sequence['date'] = pd.to_datetime(fight_sequence['date'])
# fight_sequence = fight_sequence.sort_values(by=['date'])
# fight_sequence['days_since_last_comp'] = fight_sequence['date'].diff().dt.days
# fight_sequence.head()
fight_stats_with_history_diffs_df[['date', 'fighter', 'opponent']][(fight_stats_with_history_diffs_df['fighter'] == 'Youssef Zalal') | (fight_stats_with_history_diffs_df['opponent'] == 'Youssef Zalal')].head()
# df['date'] = pd.to_datetime(df['date'])
# df = df.sort_values(by=['date'])
# df['days_since_last_comp'] = df.groupby('fighter')['date'].diff().dt.days


Unnamed: 0,date,fighter,opponent
42,2024-08-10,Youssef Zalal,Jarno Errens
43,2024-08-10,Jarno Errens,Youssef Zalal
1088,2024-03-23,Billy Quarantillo,Youssef Zalal
1090,2024-03-23,Youssef Zalal,Billy Quarantillo
8054,2021-06-05,Sean Woodson,Youssef Zalal


In [61]:
import xgboost as xgb

model = xgb.Booster(model_file='./model.json')

In [97]:
def predict_outcome(fighter, opponent, date=np.datetime64('now')):
  hypothetical_fight_df = pd.DataFrame([get_history_diffs(fighter, opponent, date)]).filter(like='precomp')
  prediction = model.predict(xgb.DMatrix(hypothetical_fight_df, enable_categorical=True))[0]
  if prediction >= 0.5:
    return np.round(prediction * 100, 2), fighter
  return np.round((1 - prediction) * 100, 2), opponent

In [110]:
print('UFC 305\n')
fightDate = np.datetime64('2024-08-17')
for fighter, opponent in [['Dricus Du Plessis', 'Israel Adesanya'], ['Kai Kara-France', 'Steve Erceg'], ['Mateusz Gamrot', 'Dan Hooker'], ['Tai Tuivasa', 'Jairzinho Rozenstruik'], ['Li Jingliang', 'Carlos Prates'], ['Junior Tafa', 'Valter Walker'], ['Joshua Culibao', 'Ricardo Ramos'], ['Jack Jenkins', 'Herbert Burns']]:
  probability, fighter = predict_outcome(fighter, opponent, fightDate)
  print('%s has a %.2f%% chance of winning on %s' % (fighter, probability, fightDate))

print('\n\nUFC Fight Night: Tybura vs. Spivac')
fightDate = np.datetime64('2024-08-10')
for fighter, opponent in [['Marcin Tybura', 'Sergey Spivak'], ['Damon Jackson', 'Chepe Mariscal'], ['Danny Barlow', 'Nikolay Veretennikov'], ['Chris Gutierrez', 'Le Quang'], ['Toshiomi Kazama', 'Charalampos Grigoriou'], ['Jhonata Diniz', 'Karl Williams'], ['Youssef Zalal', 'Jarno Errens']]:
  probability, fighter = predict_outcome(fighter, opponent, fightDate)
  print('%s has a %.2f%% chance of winning on %s' % (fighter, probability, fightDate))

UFC 305

Dricus Du Plessis has a 59.51% chance of winning on 2024-08-17
Kai Kara-France has a 55.82% chance of winning on 2024-08-17
Mateusz Gamrot has a 51.31% chance of winning on 2024-08-17
Tai Tuivasa has a 50.29% chance of winning on 2024-08-17
Li Jingliang has a 54.13% chance of winning on 2024-08-17
Junior Tafa has a 50.29% chance of winning on 2024-08-17
Ricardo Ramos has a 50.85% chance of winning on 2024-08-17
Jack Jenkins has a 64.21% chance of winning on 2024-08-17


UFC Fight Night: Tybura vs. Spivac
Marcin Tybura has a 50.29% chance of winning on 2024-08-10
Chepe Mariscal has a 53.57% chance of winning on 2024-08-10
Danny Barlow has a 61.75% chance of winning on 2024-08-10
Chris Gutierrez has a 61.75% chance of winning on 2024-08-10
Toshiomi Kazama has a 56.19% chance of winning on 2024-08-10
Jhonata Diniz has a 50.29% chance of winning on 2024-08-10
Youssef Zalal has a 61.75% chance of winning on 2024-08-10
