# **3 - Data Preprocessing**

## Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import pickle
pd.set_option('display.max_columns', None)

## Importing Pickles

In [3]:
with open('fighters.pkl', 'rb') as f:
    fighters = pickle.load(f)
with open('fights.pkl', 'rb') as f:
    fights = pickle.load(f)

## Dropping Columns

In [4]:
# Dropping Post-Fight Columns
fights = fights[['A', 'B', 'Winner', 'Event', 'Date', 'Location', 'Division', 'Format', 'Referee']]

## Dropping Rows

In [5]:
# Dropping Draws
print(f'Before # of Draws: {fights[fights['Winner'] == 'Draw'].shape[0]}')
fights = fights[fights['Winner'] != 'Draw']
print(f'After # of Draws: {fights[fights["Winner"] == 'Draw'].shape[0]}')

Before # of Draws: 58
After # of Draws: 0


## Shuffling

In [6]:
# B = Scraped Default Winner. Shuffle A and B.
print('Before:')
display(fights.head(5))

mask = np.random.rand(len(fights)) < 0.5
fights.loc[mask, ['A', 'B']] = fights.loc[mask, ['B', 'A']].values

print('After:')
display(fights.head(5))

Before:


Unnamed: 0,A,B,Winner,Event,Date,Location,Division,Format,Referee
0,Sean O'Malley,Merab Dvalishvili,Merab Dvalishvili,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",UFC Bantamweight Title Bout,5 Rnd (5-5-5-5-5),Herb Dean
1,Alexa Grasso,Valentina Shevchenko,Valentina Shevchenko,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",UFC Women's Flyweight Title Bout,5 Rnd (5-5-5-5-5),Mark Smith
2,Brian Ortega,Diego Lopes,Diego Lopes,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",Featherweight Bout,3 Rnd (5-5-5),Marc Goddard
3,Daniel Zellhuber,Esteban Ribovics,Esteban Ribovics,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",Lightweight Bout,3 Rnd (5-5-5),Jason Herzog
4,Ronaldo Rodriguez,Ode Osbourne,Ronaldo Rodriguez,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",Flyweight Bout,3 Rnd (5-5-5),Chris Tognoni


After:


Unnamed: 0,A,B,Winner,Event,Date,Location,Division,Format,Referee
0,Sean O'Malley,Merab Dvalishvili,Merab Dvalishvili,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",UFC Bantamweight Title Bout,5 Rnd (5-5-5-5-5),Herb Dean
1,Valentina Shevchenko,Alexa Grasso,Valentina Shevchenko,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",UFC Women's Flyweight Title Bout,5 Rnd (5-5-5-5-5),Mark Smith
2,Brian Ortega,Diego Lopes,Diego Lopes,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",Featherweight Bout,3 Rnd (5-5-5),Marc Goddard
3,Daniel Zellhuber,Esteban Ribovics,Esteban Ribovics,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",Lightweight Bout,3 Rnd (5-5-5),Jason Herzog
4,Ode Osbourne,Ronaldo Rodriguez,Ronaldo Rodriguez,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",Flyweight Bout,3 Rnd (5-5-5),Chris Tognoni


## Merging DataFrames

In [8]:
# Before
display(fights.iloc[[0], :])
display(fighters.loc[(fighters['Name'] == "Sean O'Malley") | (fighters['Name'] == 'Merab Dvalishvili')])

Unnamed: 0,A,B,Winner,Event,Date,Location,Division,Format,Referee
0,Sean O'Malley,Merab Dvalishvili,Merab Dvalishvili,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",UFC Bantamweight Title Bout,5 Rnd (5-5-5-5-5),Herb Dean


Unnamed: 0,Name,Record,Height,Weight,Reach,Stance,Birthday,SLPM,SACC,SAPM,SD,TDAVG,TDACC,TDD,SAVG
1006,Merab Dvalishvili,18-4-0,"5' 6""",135 lbs.,"68""",Orthodox,"Jan 10, 1991",4.35,42%,2.31,56%,6.09,36%,80%,0.3
2768,Sean O'Malley,18-2-0 (1 NC),"5' 11""",135 lbs.,"72""",Switch,"Oct 24, 1994",6.7,61%,3.48,60%,0.29,42%,61%,0.3


In [7]:
df = pd.merge(fights, fighters, left_on = 'A', right_on = 'Name', how = 'left')
df = pd.merge(df, fighters, left_on = 'B', right_on = 'Name', how = 'left')

# Renaming Columns
df.columns = ['A', 'B', 'Winner', 'Event', 'Date', 'Location', 'Division', 'Format', 'Referee', 
              'A_Name', 'A_Record', 'A_Height', 'A_Weight', 'A_Reach', 'A_Stance', 'A_Birthday', 'A_SLPM', 'A_SACC', 'A_SAPM', 'A_SD', 'A_TDAVG', 'A_TDACC', 'A_TDD', 'A_SAVG',
              'B_Name', 'B_Record', 'B_Height', 'B_Weight', 'B_Reach', 'B_Stance', 'B_Birthday', 'B_SLPM', 'B_SACC', 'B_SAPM', 'B_SD', 'B_TDAVG', 'B_TDACC', 'B_TDD', 'B_SAVG']

# After
display(df.iloc[[0], :])

Unnamed: 0,A,B,Winner,Event,Date,Location,Division,Format,Referee,A_Name,A_Record,A_Height,A_Weight,A_Reach,A_Stance,A_Birthday,A_SLPM,A_SACC,A_SAPM,A_SD,A_TDAVG,A_TDACC,A_TDD,A_SAVG,B_Name,B_Record,B_Height,B_Weight,B_Reach,B_Stance,B_Birthday,B_SLPM,B_SACC,B_SAPM,B_SD,B_TDAVG,B_TDACC,B_TDD,B_SAVG
0,Sean O'Malley,Merab Dvalishvili,Merab Dvalishvili,UFC 306: Riyadh Season Noche UFC,"September 14, 2024","Las Vegas, Nevada, USA",UFC Bantamweight Title Bout,5 Rnd (5-5-5-5-5),Herb Dean,Sean O'Malley,18-2-0 (1 NC),"5' 11""",135 lbs.,"72""",Switch,"Oct 24, 1994",6.7,61%,3.48,60%,0.29,42%,61%,0.3,Merab Dvalishvili,18-4-0,"5' 6""",135 lbs.,"68""",Orthodox,"Jan 10, 1991",4.35,42%,2.31,56%,6.09,36%,80%,0.3


## Modifying Columns

### Converting Height, Weight, and Reach

In [8]:
print(f'Before:')
display(df[['A_Height', 'A_Weight', 'A_Reach', 'B_Height', 'B_Weight', 'B_Reach']].head())

# Convert from FT IN to CM
def convert_height(height):
    if pd.isna(height):
        return np.nan
    feet, inches = height.split(" ")
    feet = float(feet[0])
    inches = float(inches[0])
    return (feet * 30.48) + (inches * 2.54)

# Convert from LB to KG
def convert_weight(weight):
    if pd.isna(weight):
        return np.nan
    weight = weight.split(" ")
    weight = float(weight[0])
    return weight * 0.4536

# Convert from IN to CM
def convert_reach(reach):
    if pd.isna(reach):
        return np.nan
    reach = float(reach[0:2])
    return reach * 2.54

df['A_Height'] = df['A_Height'].apply(convert_height)
df['A_Weight'] = df['A_Weight'].apply(convert_weight)
df['A_Reach'] = df['A_Reach'].apply(convert_reach)
df['B_Height'] = df['B_Height'].apply(convert_height)
df['B_Weight'] = df['B_Weight'].apply(convert_weight)
df['B_Reach'] = df['B_Reach'].apply(convert_reach)

print(f'After:')
display(df[['A_Height', 'A_Weight', 'A_Reach', 'B_Height', 'B_Weight', 'B_Reach']].head())

Before:


Unnamed: 0,A_Height,A_Weight,A_Reach,B_Height,B_Weight,B_Reach
0,"5' 11""",135 lbs.,"72""","5' 6""",135 lbs.,"68"""
1,"5' 5""",125 lbs.,"66""","5' 5""",125 lbs.,"66"""
2,"5' 11""",145 lbs.,"72""","5' 8""",145 lbs.,"69"""
3,"5' 10""",155 lbs.,"69""","6' 1""",155 lbs.,"77"""
4,"5' 7""",125 lbs.,"73""","5' 6""",125 lbs.,"65"""


After:


Unnamed: 0,A_Height,A_Weight,A_Reach,B_Height,B_Weight,B_Reach
0,154.94,61.236,182.88,167.64,61.236,172.72
1,165.1,56.7,167.64,165.1,56.7,167.64
2,154.94,65.772,182.88,172.72,65.772,175.26
3,154.94,70.308,175.26,185.42,70.308,195.58
4,170.18,56.7,185.42,167.64,56.7,165.1


### Calculating Age

In [9]:
# Convert to DateTime
df['Date'] = pd.to_datetime(df['Date'])
df['A_Birthday'] = pd.to_datetime(df['A_Birthday'])
df['B_Birthday'] = pd.to_datetime(df['B_Birthday'])
# Age in Days
df['A_Age'] = (df['Date'] - df['A_Birthday']).dt.days
df['B_Age'] = (df['Date'] - df['B_Birthday']).dt.days

display(df[['Date', 'A_Birthday', 'B_Birthday', 'A_Age', 'B_Age']].head())

Unnamed: 0,Date,A_Birthday,B_Birthday,A_Age,B_Age
0,2024-09-14,1994-10-24,1991-01-10,10918.0,12301.0
1,2024-09-14,1988-03-07,1993-08-09,13340.0,11359.0
2,2024-09-14,1994-12-30,1991-02-21,10851.0,12259.0
3,2024-09-14,1996-04-27,1999-07-07,10367.0,9201.0
4,2024-09-14,1992-01-09,1999-05-07,11937.0,9262.0


### Binarize Winner

In [10]:
print('Before:')
display(df[['A', 'B', 'Winner']].head())

def binarize_winner(row):
    if row['Winner'] == row['A']:
        return 1
    else:
        return 0
df['Winner'] = df.apply(binarize_winner, axis = 1)

print('After:')
display(df[['A', 'B', 'Winner']].head())

Before:


Unnamed: 0,A,B,Winner
0,Sean O'Malley,Merab Dvalishvili,Merab Dvalishvili
1,Valentina Shevchenko,Alexa Grasso,Valentina Shevchenko
2,Diego Lopes,Brian Ortega,Diego Lopes
3,Esteban Ribovics,Daniel Zellhuber,Esteban Ribovics
4,Ode Osbourne,Ronaldo Rodriguez,Ronaldo Rodriguez


After:


Unnamed: 0,A,B,Winner
0,Sean O'Malley,Merab Dvalishvili,0
1,Valentina Shevchenko,Alexa Grasso,1
2,Diego Lopes,Brian Ortega,1
3,Esteban Ribovics,Daniel Zellhuber,1
4,Ode Osbourne,Ronaldo Rodriguez,0


## Handle Missing Values

In [11]:
# Imputing Reach = Height

print('Before')
print('------------------------------------')
for column in df.columns:
    print(column, 'Missing Values:', df[column].isnull().sum())

df['A_Reach'] = df['A_Reach'].fillna(df['A_Height'])
df['B_Reach'] = df['B_Reach'].fillna(df['B_Height'])

print('')
print('After')
print('------------------------------------')
for column in df.columns:
    print(column, 'Missing Values:', df[column].isnull().sum())

Before
------------------------------------
A Missing Values: 0
B Missing Values: 0
Winner Missing Values: 0
Event Missing Values: 0
Date Missing Values: 0
Location Missing Values: 0
Division Missing Values: 0
Format Missing Values: 0
Referee Missing Values: 25
A_Name Missing Values: 0
A_Record Missing Values: 0
A_Height Missing Values: 15
A_Weight Missing Values: 13
A_Reach Missing Values: 648
A_Stance Missing Values: 50
A_Birthday Missing Values: 141
A_SLPM Missing Values: 0
A_SACC Missing Values: 0
A_SAPM Missing Values: 0
A_SD Missing Values: 0
A_TDAVG Missing Values: 0
A_TDACC Missing Values: 0
A_TDD Missing Values: 0
A_SAVG Missing Values: 0
B_Name Missing Values: 0
B_Record Missing Values: 0
B_Height Missing Values: 16
B_Weight Missing Values: 14
B_Reach Missing Values: 679
B_Stance Missing Values: 49
B_Birthday Missing Values: 132
B_SLPM Missing Values: 0
B_SACC Missing Values: 0
B_SAPM Missing Values: 0
B_SD Missing Values: 0
B_TDAVG Missing Values: 0
B_TDACC Missing Values: 0

## Reorder Columns

In [12]:
print('Before:')
display(df.head(5))

df['Fight'] = df['A'] + ' vs. ' + df['B']
df = df[['Event', 'Fight', 'A', 'B', 'Winner', 'Date', 'Location', 'Division', 'Format', 'Referee', 
         'A_Record', 'A_Height', 'A_Weight', 'A_Reach', 'A_Stance', 'A_Birthday', 'A_Age', 'A_SLPM', 'A_SACC', 'A_SAPM', 'A_SD', 'A_TDAVG', 'A_TDACC', 'A_TDD', 'A_SAVG',
         'B_Record', 'B_Height', 'B_Weight', 'B_Reach', 'B_Stance', 'B_Birthday', 'B_Age', 'B_SLPM', 'B_SACC', 'B_SAPM', 'B_SD', 'B_TDAVG', 'B_TDACC', 'B_TDD', 'B_SAVG']]

print('')
print('After:')
display(df.head(5))

Before:


Unnamed: 0,A,B,Winner,Event,Date,Location,Division,Format,Referee,A_Name,A_Record,A_Height,A_Weight,A_Reach,A_Stance,A_Birthday,A_SLPM,A_SACC,A_SAPM,A_SD,A_TDAVG,A_TDACC,A_TDD,A_SAVG,B_Name,B_Record,B_Height,B_Weight,B_Reach,B_Stance,B_Birthday,B_SLPM,B_SACC,B_SAPM,B_SD,B_TDAVG,B_TDACC,B_TDD,B_SAVG,A_Age,B_Age
0,Sean O'Malley,Merab Dvalishvili,0,UFC 306: Riyadh Season Noche UFC,2024-09-14,"Las Vegas, Nevada, USA",UFC Bantamweight Title Bout,5 Rnd (5-5-5-5-5),Herb Dean,Sean O'Malley,18-2-0 (1 NC),154.94,61.236,182.88,Switch,1994-10-24,6.7,61%,3.48,60%,0.29,42%,61%,0.3,Merab Dvalishvili,18-4-0,167.64,61.236,172.72,Orthodox,1991-01-10,4.35,42%,2.31,56%,6.09,36%,80%,0.3,10918.0,12301.0
1,Valentina Shevchenko,Alexa Grasso,1,UFC 306: Riyadh Season Noche UFC,2024-09-14,"Las Vegas, Nevada, USA",UFC Women's Flyweight Title Bout,5 Rnd (5-5-5-5-5),Mark Smith,Valentina Shevchenko,24-4-1,165.1,56.7,167.64,Southpaw,1988-03-07,3.16,52%,2.0,64%,2.74,64%,71%,0.3,Alexa Grasso,16-4-1,165.1,56.7,167.64,Orthodox,1993-08-09,4.19,41%,3.69,58%,0.44,40%,54%,0.7,13340.0,11359.0
2,Diego Lopes,Brian Ortega,1,UFC 306: Riyadh Season Noche UFC,2024-09-14,"Las Vegas, Nevada, USA",Featherweight Bout,3 Rnd (5-5-5),Marc Goddard,Diego Lopes,26-6-0,154.94,65.772,182.88,Orthodox,1994-12-30,4.2,53%,4.11,50%,0.5,40%,52%,2.8,Brian Ortega,16-4-0 (1 NC),172.72,65.772,175.26,Switch,1991-02-21,4.08,37%,6.59,49%,1.07,26%,55%,1.1,10851.0,12259.0
3,Esteban Ribovics,Daniel Zellhuber,1,UFC 306: Riyadh Season Noche UFC,2024-09-14,"Las Vegas, Nevada, USA",Lightweight Bout,3 Rnd (5-5-5),Jason Herzog,Esteban Ribovics,14-1-0,154.94,70.308,175.26,Orthodox,1996-04-27,7.53,48%,4.95,57%,0.64,66%,61%,0.3,Daniel Zellhuber,15-2-0,185.42,70.308,195.58,Switch,1999-07-07,6.26,40%,6.02,55%,0.18,25%,94%,0.2,10367.0,9201.0
4,Ode Osbourne,Ronaldo Rodriguez,0,UFC 306: Riyadh Season Noche UFC,2024-09-14,"Las Vegas, Nevada, USA",Flyweight Bout,3 Rnd (5-5-5),Chris Tognoni,Ode Osbourne,12-8-0 (1 NC),170.18,56.7,185.42,Southpaw,1992-01-09,3.26,40%,4.01,45%,1.21,28%,63%,0.6,Ronaldo Rodriguez,17-2-0,167.64,56.7,165.1,Orthodox,1999-05-07,2.13,44%,3.15,46%,1.13,50%,64%,1.1,11937.0,9262.0



After:


Unnamed: 0,Event,Fight,A,B,Winner,Date,Location,Division,Format,Referee,A_Record,A_Height,A_Weight,A_Reach,A_Stance,A_Birthday,A_Age,A_SLPM,A_SACC,A_SAPM,A_SD,A_TDAVG,A_TDACC,A_TDD,A_SAVG,B_Record,B_Height,B_Weight,B_Reach,B_Stance,B_Birthday,B_Age,B_SLPM,B_SACC,B_SAPM,B_SD,B_TDAVG,B_TDACC,B_TDD,B_SAVG
0,UFC 306: Riyadh Season Noche UFC,Sean O'Malley vs. Merab Dvalishvili,Sean O'Malley,Merab Dvalishvili,0,2024-09-14,"Las Vegas, Nevada, USA",UFC Bantamweight Title Bout,5 Rnd (5-5-5-5-5),Herb Dean,18-2-0 (1 NC),154.94,61.236,182.88,Switch,1994-10-24,10918.0,6.7,61%,3.48,60%,0.29,42%,61%,0.3,18-4-0,167.64,61.236,172.72,Orthodox,1991-01-10,12301.0,4.35,42%,2.31,56%,6.09,36%,80%,0.3
1,UFC 306: Riyadh Season Noche UFC,Valentina Shevchenko vs. Alexa Grasso,Valentina Shevchenko,Alexa Grasso,1,2024-09-14,"Las Vegas, Nevada, USA",UFC Women's Flyweight Title Bout,5 Rnd (5-5-5-5-5),Mark Smith,24-4-1,165.1,56.7,167.64,Southpaw,1988-03-07,13340.0,3.16,52%,2.0,64%,2.74,64%,71%,0.3,16-4-1,165.1,56.7,167.64,Orthodox,1993-08-09,11359.0,4.19,41%,3.69,58%,0.44,40%,54%,0.7
2,UFC 306: Riyadh Season Noche UFC,Diego Lopes vs. Brian Ortega,Diego Lopes,Brian Ortega,1,2024-09-14,"Las Vegas, Nevada, USA",Featherweight Bout,3 Rnd (5-5-5),Marc Goddard,26-6-0,154.94,65.772,182.88,Orthodox,1994-12-30,10851.0,4.2,53%,4.11,50%,0.5,40%,52%,2.8,16-4-0 (1 NC),172.72,65.772,175.26,Switch,1991-02-21,12259.0,4.08,37%,6.59,49%,1.07,26%,55%,1.1
3,UFC 306: Riyadh Season Noche UFC,Esteban Ribovics vs. Daniel Zellhuber,Esteban Ribovics,Daniel Zellhuber,1,2024-09-14,"Las Vegas, Nevada, USA",Lightweight Bout,3 Rnd (5-5-5),Jason Herzog,14-1-0,154.94,70.308,175.26,Orthodox,1996-04-27,10367.0,7.53,48%,4.95,57%,0.64,66%,61%,0.3,15-2-0,185.42,70.308,195.58,Switch,1999-07-07,9201.0,6.26,40%,6.02,55%,0.18,25%,94%,0.2
4,UFC 306: Riyadh Season Noche UFC,Ode Osbourne vs. Ronaldo Rodriguez,Ode Osbourne,Ronaldo Rodriguez,0,2024-09-14,"Las Vegas, Nevada, USA",Flyweight Bout,3 Rnd (5-5-5),Chris Tognoni,12-8-0 (1 NC),170.18,56.7,185.42,Southpaw,1992-01-09,11937.0,3.26,40%,4.01,45%,1.21,28%,63%,0.6,17-2-0,167.64,56.7,165.1,Orthodox,1999-05-07,9262.0,2.13,44%,3.15,46%,1.13,50%,64%,1.1


## Changing Data Types

In [13]:
# Before
print('Before:')
print(df.info())

# Converting Percentages to Floats
cols = ['A_SACC', 'A_SD', 'A_TDACC', 'A_TDD', 'B_SACC', 'B_SD', 'B_TDACC', 'B_TDD']

for col in cols:
    df[col] = df[col].astype(str).str.rstrip('%').astype('float') / 100.0

# Converting into Floats, Objects (Strings), Datetime, and Categories
dtype_dict = {
    'Event': 'object', 'Fight': 'object', 'A': 'object', 'B': 'object', 'Winner': 'int64', 'Date': 'datetime64[ns]', 'Location': 'object', 'Division': 'category', 'Format': 'category', 'Referee': 'object',
    'A_Record': 'object', 'A_Height': 'float64', 'A_Weight': 'float64', 'A_Reach': 'float64', 'A_Stance': 'category', 'A_Birthday': 'datetime64[ns]', 'A_Age': 'float64', 'A_SLPM': 'float64', 'A_SACC': 'float64', 'A_SAPM': 'float64', 'A_SD': 'float64', 'A_TDAVG': 'float64', 'A_TDACC': 'float64', 'A_TDD': 'float64', 'A_SAVG': 'float64',
    'B_Record': 'object', 'B_Height': 'float64', 'B_Weight': 'float64', 'B_Reach': 'float64', 'B_Stance': 'category', 'B_Birthday': 'datetime64[ns]', 'B_Age': 'float64', 'B_SLPM': 'float64', 'B_SACC': 'float64', 'B_SAPM': 'float64', 'B_SD': 'float64', 'B_TDAVG': 'float64', 'B_TDACC': 'float64', 'B_TDD': 'float64', 'B_SAVG': 'float64'
}

df = df.astype(dtype_dict)

# After
print('')
print('After:')
print(df.info())

Before:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813 entries, 0 to 7812
Data columns (total 40 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Event       7813 non-null   object        
 1   Fight       7813 non-null   object        
 2   A           7813 non-null   object        
 3   B           7813 non-null   object        
 4   Winner      7813 non-null   int64         
 5   Date        7813 non-null   datetime64[ns]
 6   Location    7813 non-null   object        
 7   Division    7813 non-null   object        
 8   Format      7813 non-null   object        
 9   Referee     7788 non-null   object        
 10  A_Record    7813 non-null   object        
 11  A_Height    7798 non-null   float64       
 12  A_Weight    7800 non-null   float64       
 13  A_Reach     7799 non-null   float64       
 14  A_Stance    7763 non-null   object        
 15  A_Birthday  7672 non-null   datetime64[ns]
 16  A_Age       7672

## Exporting Pickle

In [14]:
with open('df.pkl', 'wb') as f:
    pickle.dump(df, f)