# Data Cleaning

### Cleaning Fighters Data

In [73]:
import sys
import os

parent_dir = os.path.dirname(os.getcwd())
scripts_path = os.path.join(parent_dir, "scripts")

if scripts_path not in sys.path:
  sys.path.insert(0, scripts_path)

In [74]:
import pandas as pd
import numpy as np
from helpers import base_path

In [75]:
fighters_df = pd.read_csv(base_path('raw_data/raw_fighters.csv'))

In [76]:
fighters_df.head()

Unnamed: 0,Fighter_Id,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt
0,93fe7332d16c6ad9,Tom,Aaron,,--,155 lbs.,--,,5,3,0,False
1,15df64c02b6b0fde,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,False
2,59a9d6dac61c2540,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,False
3,4961467134abd8be,Darion,Abbey,,"6' 2""",265 lbs.,"80.0""",Orthodox,9,5,0,False
4,b361180739bed4b0,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,False


In [77]:
def check_types(df):
    print(df.dtypes)

In [78]:
fighters_df.columns

Index(['Fighter_Id', 'First', 'Last', 'Nickname', 'Ht.', 'Wt.', 'Reach',
       'Stance', 'W', 'L', 'D', 'Belt'],
      dtype='object')

In [79]:
check_types(fighters_df)

Fighter_Id    object
First         object
Last          object
Nickname      object
Ht.           object
Wt.           object
Reach         object
Stance        object
W              int64
L              int64
D              int64
Belt            bool
dtype: object


In [80]:
def number_of_values(df):
    print("================ Number of unique values for each col ================")
    for col in df.columns.tolist():
        print(f'{col} : {df[col].nunique()}')
    print("="*70)
    


In [81]:
number_of_values(fighters_df)

Fighter_Id : 4443
First : 2053
Last : 3310
Nickname : 1939
Ht. : 28
Wt. : 114
Reach : 28
Stance : 5
W : 57
L : 40
D : 12
Belt : 2


In [82]:
# print(len("================ Columns having missing values ================"))
def print_missing_values(df):
    print("================ Columns having missing values ================")
    for col in df.columns.tolist():
        if df[col].isna().sum():
            print(f'{col} : {df[col].isna().sum()}')
    print("="*64)

In [83]:
# I did this because the Full Name column would be empty in case one of the columns is empty 
fighters_df.fillna({'First': ''}, inplace=True)
fighters_df.fillna({'Last': ''}, inplace=True)

fighters_df['Full Name'] = fighters_df['First'] + ' ' + fighters_df['Last']
fighters_df.drop(columns=['First', 'Last'], inplace=True)
cols = fighters_df.columns.tolist()
cols = [cols.pop()] + cols
fighters_df = fighters_df[cols]

Height, Weight ,Reach and Stance are missing. I think I can use imputation as follows :
- Height, Weight and Reach are imputed by the median weight of fighter's current weight_class
- For Stance I'm gonna simply use mode imputation


In [84]:
fighters_df.fillna({"First" : "Uknown"},inplace=True)
fighters_df.fillna({"Nickname" : "No Nickname"},inplace=True)
fighters_df.fillna({"Stance" : fighters_df['Stance'].mode().iloc[-1]},inplace=True)


In [85]:
# Ht. column
def format_height(height=""):
    if height == '--':
        return np.nan
    height = height.rstrip('"')
    height = height.replace("' ", ".")
    return height


fighters_df['Ht.'] = fighters_df['Ht.'].apply(format_height)
fighters_df['Ht.'] = fighters_df['Ht.'].astype('float32')

In [86]:
# Wt. column
def format_weight(height=""):
    if height == '--':
        return np.nan
    height = height.rstrip(' lbs.')
    return height


fighters_df['Wt.'] = fighters_df['Wt.'].apply(format_weight)
fighters_df['Wt.'] = fighters_df['Wt.'].astype('float32')

In [87]:
# Reach column
def format_reach(height=""):
    if height == '--':
        return np.nan
    height = height.rstrip('"')
    return height


fighters_df['Reach'] = fighters_df['Reach'].apply(format_reach)
fighters_df['Reach'] = fighters_df['Reach'].astype('float32')

In [88]:
# Making stance column categorical
fighters_df['Stance'] = fighters_df['Stance'].astype('category')

In [89]:
# W - L - D can not be so big.
# It is better to make them only 32-bits
for col in ['W','L','D'] : 
    fighters_df[col] = fighters_df[col].astype('int32')

In [90]:
fighters_df.head()

Unnamed: 0,Full Name,Fighter_Id,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt
0,Tom Aaron,93fe7332d16c6ad9,No Nickname,,155.0,,Orthodox,5,3,0,False
1,Danny Abbadi,15df64c02b6b0fde,The Assassin,5.11,155.0,,Orthodox,4,6,0,False
2,Nariman Abbasov,59a9d6dac61c2540,Bayraktar,5.8,155.0,66.0,Orthodox,28,4,0,False
3,Darion Abbey,4961467134abd8be,No Nickname,6.2,265.0,80.0,Orthodox,9,5,0,False
4,David Abbott,b361180739bed4b0,Tank,6.0,265.0,,Switch,10,15,0,False


In [91]:
print_missing_values(fighters_df)

Ht. : 354
Wt. : 86
Reach : 1976


I think this is enough for cleaning fighters data

In [92]:
fighters_df.to_csv(base_path('data/Fighters.csv'),index=False)

### CLeaning Events Data


In [93]:
events_df = pd.read_csv(base_path('raw_data/raw_events.csv'))
events_df.head()

Unnamed: 0,Event_Id,Name,Date,Location
0,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,"December 06, 2025","Las Vegas, Nevada, USA"
1,92c96df8bdab5fea,UFC Fight Night: Tsarukyan vs. Hooker,"November 22, 2025","Doha, Qatar"
2,8db1b36dde268ef6,UFC 322: Della Maddalena vs. Makhachev,"November 15, 2025","New York City, New York, USA"
3,6436029b50a9c255,UFC Fight Night: Bonfim vs. Brown,"November 08, 2025","Las Vegas, Nevada, USA"
4,0e2c2daf11b5d8f2,UFC Fight Night: Garcia vs. Onama,"November 01, 2025","Las Vegas, Nevada, USA"


In [94]:
check_types(events_df)

Event_Id    object
Name        object
Date        object
Location    object
dtype: object


In [95]:
print_missing_values(events_df)



! Perfect

In [96]:
events_df['Date'] = pd.to_datetime(events_df['Date'], format="%B %d, %Y")
events_df.head()

Unnamed: 0,Event_Id,Name,Date,Location
0,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,2025-12-06,"Las Vegas, Nevada, USA"
1,92c96df8bdab5fea,UFC Fight Night: Tsarukyan vs. Hooker,2025-11-22,"Doha, Qatar"
2,8db1b36dde268ef6,UFC 322: Della Maddalena vs. Makhachev,2025-11-15,"New York City, New York, USA"
3,6436029b50a9c255,UFC Fight Night: Bonfim vs. Brown,2025-11-08,"Las Vegas, Nevada, USA"
4,0e2c2daf11b5d8f2,UFC Fight Night: Garcia vs. Onama,2025-11-01,"Las Vegas, Nevada, USA"


In [97]:
events_df.to_csv(base_path('data/Events.csv'),index=False)

Now let's get to the largest dataframe :) 

### Cleaning Fights Data

In [98]:
fights_df = pd.read_csv(base_path('raw_data/raw_fights_detailed.csv')).set_index('Fight_Id')

In [99]:
check_types(fights_df)

Win/No Contest/Draw     object
Fighter_id_1            object
Fighter_id_2            object
Fighter_1               object
Fighter_2               object
KD_1                    object
KD_2                    object
STR_1                   object
STR_2                   object
TD_1                    object
TD_2                    object
SUB_1                   object
SUB_2                   object
Weight_Class            object
Method                  object
Round                    int64
Fight_Time              object
Event_Id_x              object
Result_1                object
Result_2                object
Time Format             object
Referee                 object
Method Details          object
Kd_1                   float64
Kd_2                   float64
Sig. Str._1             object
Sig. Str._2             object
Sig. Str. %_1           object
Sig. Str. %_2           object
Total Str._1            object
Total Str._2            object
Td_1                    object
Td_2    

In [100]:
def values_count(df):
    for col in df.columns.tolist():
        yield col, df[col].value_counts(dropna=False)

# usage:
# gen = unique_values(fights_df)
# next(gen)  # returns (column_name, value_counts_series) for the next column

In [101]:
cols = values_count(fights_df)

I am going to do this for the rest of columns

In [102]:

def clean_missing_fight_detail(col):
    fights_df[col] = (fights_df[col]
                      .replace('--', pd.NA)
                      .astype('Int32'))


def make_categorical(col):
    fights_df[col] = fights_df[col].astype('category')


def calculate_pct(pct):
    if pd.isna(pct) or pct == '---':
        return pd.NA
    pct = pct.rstrip('%')
    return int(pct)/100


def parse_seconds_from_time(time):
    if pd.isna(time) or time == '--':
        return pd.NA
    minutes, seconds = time.split(':')
    return int(minutes) * 60 + int(seconds)


def of_to_pct(exp):
    if pd.isna(exp):
        return pd.NA
    if exp == '0 of 0':
        return 0

    x, y = exp.split('of')
    x, y = int(x), int(y)
    try:
        return round(x/y, 2)
    except ZeroDivisionError:
        return 0


def remove_quotation(s):
    if s.startswith("'"):
        return s.strip("'")
    else:
        return s.strip('"')

In [103]:
def clean_fights_df():
    # Convert these columns to int and replace missing values '--' with NA 
    clean_missing_fight_detail('KD_1')
    clean_missing_fight_detail('KD_2')
    clean_missing_fight_detail('STR_1')
    clean_missing_fight_detail('STR_2')
    clean_missing_fight_detail('TD_1')
    clean_missing_fight_detail('TD_2')
    clean_missing_fight_detail('SUB_1')
    clean_missing_fight_detail('SUB_2')

    # Converting categorical columns
    make_categorical('Weight_Class')
    make_categorical('Result_1')
    make_categorical('Result_2')

    fights_df['Round'] = fights_df['Round'].astype('int8')

    # Event_Id is saved twice due to table joins
    fights_df.rename(columns={'Event_Id_x': 'Event_Id'}, inplace=True)
    # Removing redundant columns
    redundant_cols = ['Kd_1', 'Kd_2', 'Td_1',
                      'Td_2', 'Win/No Contest/Draw', 'Event_Id_y']
    fights_df.drop(columns=redundant_cols, inplace=True)

    # Values are transformed from string 'X%' to float (X/100)
    pct_cols = ['Sig. Str. %', 'Td %']
    for col in pct_cols:
        fights_df[f'{col}_1'] = fights_df[f'{col}_1'].apply(calculate_pct)
        fights_df[f'{col}_2'] = fights_df[f'{col}_2'].apply(calculate_pct)
    # Cols that are in this form : x of y
    of_cols = ['Head_', 'Body_', 'Leg_', 'Distance_',
               'Clinch_', 'Ground_', 'Total Str._','Sig. Str._']
    # This data is supposed to be already scraped
    # But I did not scrape it to do it myself (faster)
    for col in of_cols:
        fights_df[f"{col}%_1"] = fights_df[f"{col}1"].apply(of_to_pct)
        fights_df[f"{col}%_2"] = fights_df[f"{col}2"].apply(of_to_pct)
        fights_df.drop(columns=[f"{col}1", f"{col}2"], inplace=True)

    # Stripping quotation marks that came with scraped data
    fights_df['Weight_Class'] = fights_df['Weight_Class'].apply(
        remove_quotation)
    fights_df['Method'] = fights_df['Method'].apply(remove_quotation)
    fights_df['Fight_Time'] = fights_df['Fight_Time'].apply(remove_quotation)
    # Integer columns conversion
    fights_df['Sub. Att_1'] = fights_df['Sub. Att_1'].astype('Int8')
    fights_df['Sub. Att_2'] = fights_df['Sub. Att_2'].astype('Int8')
    fights_df['Rev._1'] = fights_df['Rev._1'].astype('Int8')
    fights_df['Rev._2'] = fights_df['Rev._2'].astype('Int8')
    # mm:ss to xxxx seconds
    fights_df['Ctrl_1'] = fights_df['Ctrl_1'].apply(
        parse_seconds_from_time).astype('Int32')
    fights_df['Ctrl_2'] = fights_df['Ctrl_2'].apply(
        parse_seconds_from_time).astype('Int32')

    fights_df.fillna({"Method Details": "No details provided"}, inplace=True)
    fights_df.fillna({"Referee": "Uknown referee"}, inplace=True)

    fights_df.dropna(subset=['KD_1'], inplace=True)
    # I already have a takedowns column + 37% of the data missing is too much to be imputed
    fights_df.drop(columns=['Td %_1', 'Td %_2'], inplace=True)

In [104]:
clean_fights_df()

In [105]:
fights_df.columns

Index(['Fighter_id_1', 'Fighter_id_2', 'Fighter_1', 'Fighter_2', 'KD_1',
       'KD_2', 'STR_1', 'STR_2', 'TD_1', 'TD_2', 'SUB_1', 'SUB_2',
       'Weight_Class', 'Method', 'Round', 'Fight_Time', 'Event_Id', 'Result_1',
       'Result_2', 'Time Format', 'Referee', 'Method Details', 'Sig. Str. %_1',
       'Sig. Str. %_2', 'Sub. Att_1', 'Sub. Att_2', 'Rev._1', 'Rev._2',
       'Ctrl_1', 'Ctrl_2', 'Head_%_1', 'Head_%_2', 'Body_%_1', 'Body_%_2',
       'Leg_%_1', 'Leg_%_2', 'Distance_%_1', 'Distance_%_2', 'Clinch_%_1',
       'Clinch_%_2', 'Ground_%_1', 'Ground_%_2', 'Total Str._%_1',
       'Total Str._%_2', 'Sig. Str._%_1', 'Sig. Str._%_2'],
      dtype='object')

In [106]:
print_missing_values(fights_df)

Sig. Str. %_1 : 39
Sig. Str. %_2 : 58
Ctrl_1 : 181
Ctrl_2 : 181


I can not impute `['Wt.', 'Ht.','Reach']` by their means grouped by the current fighter `weight_class` because there is nearly no fights recorded for players with missing weight

In [107]:
missing_weight = fighters_df[fighters_df['Wt.'].isna()]['Full Name']
fights_of_missing_weight_1 = fights_df[fights_df['Fighter_1'].isin(missing_weight)]
fights_of_missing_weight_2 = fights_df[fights_df['Fighter_2'].isin(missing_weight)]
print(len(missing_weight))
print(len(fights_of_missing_weight_1)+len(fights_of_missing_weight_2))

86
17


Doing this for all the columns I wanted to impute revealed a huge difference between the length of the two subsets. 
So maybe I'll consider dropping these fighters and their fights before training the model.

In [108]:
fights_df.to_csv(base_path('data/Fights.csv'),index=False)