# Data Cleaning

### Cleaning Fighters Data

In [17]:
import pandas as pd
import numpy as np

In [18]:
fighters_df = pd.read_csv('raw_data/raw_fighters.csv')

In [19]:
fighters_df.head()

Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt
0,Tom,Aaron,,--,155 lbs.,--,,5,3,0,
1,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,
2,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,
3,Darion,Abbey,,"6' 2""",265 lbs.,"80.0""",Orthodox,9,5,0,
4,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,


In [20]:
def check_types(df):
    print(df.dtypes)

In [21]:
check_types(fighters_df)

First        object
Last         object
Nickname     object
Ht.          object
Wt.          object
Reach        object
Stance       object
W             int64
L             int64
D             int64
Belt        float64
dtype: object


In [22]:
def number_of_values(df):
    print("================ Number of unique values for each col ================")
    for col in df.columns.tolist():
        print(f'{col} : {df[col].nunique()}')
    print("="*70)
    


In [23]:
number_of_values(fighters_df)

First : 1747
Last : 2660
Nickname : 1599
Ht. : 28
Wt. : 106
Reach : 28
Stance : 5
W : 54
L : 38
D : 12
Belt : 0


In [24]:
# Ht. column
def format_height(height=""):
    if height == '--':
        return np.nan
    height = height.rstrip('"')
    height = height.replace("' ", ".")
    return height


fighters_df['Ht.'] = fighters_df['Ht.'].apply(format_height)
fighters_df['Ht.'] = fighters_df['Ht.'].astype('float32')

In [25]:
# Wt. column
def format_weight(height=""):
    if height == '--':
        return np.nan
    height = height.rstrip(' lbs.')
    return height


fighters_df['Wt.'] = fighters_df['Wt.'].apply(format_weight)
fighters_df['Wt.'] = fighters_df['Wt.'].astype('float32')

In [26]:
# Reach column
def format_reach(height=""):
    if height == '--':
        return np.nan
    height = height.rstrip('"')
    return height


fighters_df['Reach'] = fighters_df['Reach'].apply(format_reach)
fighters_df['Reach'] = fighters_df['Reach'].astype('float32')

In [27]:
# print(len("================ Columns having missing values ================"))
def print_missing_values(df):
    print("================ Columns having missing values ================")
    for col in df.columns.tolist():
        if df[col].isna().sum():
            print(f'{col} : {df[col].isna().sum()}')
    print("="*64)

In [28]:
print_missing_values(fighters_df)

First : 11
Nickname : 1590
Ht. : 315
Wt. : 74
Reach : 1618
Stance : 734
Belt : 3567


In [29]:
# Making stance column categorical
fighters_df['Stance'] = fighters_df['Stance'].astype('category')

In [30]:
# W - L - D can not be so big.
# It is better to make them only 32-bits
for col in ['W','L','D'] : 
    fighters_df[col] = fighters_df[col].astype('int32')

In [31]:
fighters_df.head()

Unnamed: 0,First,Last,Nickname,Ht.,Wt.,Reach,Stance,W,L,D,Belt
0,Tom,Aaron,,,155.0,,,5,3,0,
1,Danny,Abbadi,The Assassin,5.11,155.0,,Orthodox,4,6,0,
2,Nariman,Abbasov,Bayraktar,5.8,155.0,66.0,Orthodox,28,4,0,
3,Darion,Abbey,,6.2,265.0,80.0,Orthodox,9,5,0,
4,David,Abbott,Tank,6.0,265.0,,Switch,10,15,0,


I think this is enough for cleaning fighters data

In [32]:
fighters_df.to_csv('data/Fighters.csv')

### CLeaning Events Data


In [33]:
events_df = pd.read_csv('raw_data/raw_events.csv')
events_df.head()

Unnamed: 0,Event_Id,Name,Date,Location
0,754968e325d6f60d,UFC Fight Night: Walker vs. Zhang,"August 23, 2025","Shanghai, Hebei, China"
1,421ccfc6ddb17958,UFC 319: Du Plessis vs. Chimaev,"August 16, 2025","Chicago, Illinois, USA"
2,6cd3dfc54f01287f,UFC Fight Night: Dolidze vs. Hernandez,"August 09, 2025","Las Vegas, Nevada, USA"
3,f2c934689243fe4e,UFC Fight Night: Taira vs. Park,"August 02, 2025","Las Vegas, Nevada, USA"
4,28d8638ea0a71908,UFC Fight Night: Whittaker vs. De Ridder,"July 26, 2025","Abu Dhabi, Abu Dhabi, United Arab Emirates"


In [34]:
check_types(events_df)

Event_Id    object
Name        object
Date        object
Location    object
dtype: object


In [35]:
print_missing_values(events_df)



! Perfect

In [36]:
events_df['Date'] = pd.to_datetime(events_df['Date'], format="%B %d, %Y")
events_df.head()

Unnamed: 0,Event_Id,Name,Date,Location
0,754968e325d6f60d,UFC Fight Night: Walker vs. Zhang,2025-08-23,"Shanghai, Hebei, China"
1,421ccfc6ddb17958,UFC 319: Du Plessis vs. Chimaev,2025-08-16,"Chicago, Illinois, USA"
2,6cd3dfc54f01287f,UFC Fight Night: Dolidze vs. Hernandez,2025-08-09,"Las Vegas, Nevada, USA"
3,f2c934689243fe4e,UFC Fight Night: Taira vs. Park,2025-08-02,"Las Vegas, Nevada, USA"
4,28d8638ea0a71908,UFC Fight Night: Whittaker vs. De Ridder,2025-07-26,"Abu Dhabi, Abu Dhabi, United Arab Emirates"


In [37]:
events_df.to_csv('data/Events.csv')

Now let's get to the largest dataframe :) 

### Cleaning Fights Data

In [272]:
fights_df = pd.read_csv('raw_data/raw_fights_detailed.csv').set_index('Fight_Id')

In [262]:
check_types(fights_df)

Win/No Contest/Draw     object
Fighter_1               object
Fighter_2               object
KD_1                    object
KD_2                    object
STR_1                   object
STR_2                   object
TD_1                    object
TD_2                    object
SUB_1                   object
SUB_2                   object
Weight_Class            object
Method                  object
Round                    int64
Fight_Time              object
Event_Id_x              object
Result_1                object
Result_2                object
Time Format             object
Referee                 object
Method Details          object
Kd_1                   float64
Kd_2                   float64
Sig. Str._1             object
Sig. Str._2             object
Sig. Str. %_1           object
Sig. Str. %_2           object
Total Str._1            object
Total Str._2            object
Td_1                    object
Td_2                    object
Td %_1                  object
Td %_2  

--> All columns are objects

In [263]:
def values_count(df):
    for col in df.columns.tolist():
        yield col, df[col].value_counts(dropna=False)

# usage:
# gen = unique_values(fights_df)
# next(gen)  # returns (column_name, value_counts_series) for the next column

In [264]:
cols = values_count(fights_df)

I am going to do this for the rest of columns

In [None]:

def clean_missing_fight_detail(col):
    fights_df[col] = (fights_df[col]
                      .replace('--', pd.NA)
                      .astype('Int32'))


def make_categorical(col):
    fights_df[col] = fights_df[col].astype('category')


def calculate_pct(pct):
    if pd.isna(pct) or pct == '---':
        return pd.NA
    pct = pct.rstrip('%')
    return int(pct)/100


def parse_seconds_from_time(time):
    if pd.isna(time) or time == '--':
        return pd.NA
    minutes, seconds = time.split(':')
    return int(minutes) * 60 + int(seconds)


def of_to_pct(exp):
    if pd.isna(exp):
        return pd.NA
    if exp == '0 of 0':
        return 0

    x, y = exp.split('of')
    x, y = int(x), int(y)
    try:
        return round(x/y, 2)
    except ZeroDivisionError:
        return 0


def remove_quotation(s):
    if s.startswith("'"):
        return s.strip("'")
    else:
        return s.strip('"')

In [266]:
def clean_fights_df():
    
    clean_missing_fight_detail('KD_1')
    clean_missing_fight_detail('KD_2')
    clean_missing_fight_detail('STR_1')
    clean_missing_fight_detail('STR_2')
    clean_missing_fight_detail('TD_1')
    clean_missing_fight_detail('TD_2')
    clean_missing_fight_detail('SUB_1')
    clean_missing_fight_detail('SUB_2')

    make_categorical('Weight_Class')
    make_categorical('Result_1')
    make_categorical('Result_2')

    fights_df['Round'] = fights_df['Round'].astype('int8')
    
    fights_df.rename(columns={'Event_Id_x' : 'Event_Id'}, inplace=True)
    
    redundant_cols = ['Kd_1', 'Kd_2', 'Td_1',
                      'Td_2', 'Win/No Contest/Draw', 'Event_Id_y']
    fights_df.drop(columns=redundant_cols, inplace=True)

    pct_cols = ['Sig. Str. %', 'Td %']
    for col in pct_cols:
        fights_df[f'{col}_1'] = fights_df[f'{col}_1'].apply(calculate_pct)
        fights_df[f'{col}_2'] = fights_df[f'{col}_2'].apply(calculate_pct)
    # Cols that are in this form : x of y    
    of_cols = ['Head_','Body_','Leg_','Distance_','Clinch_','Ground_','Total Str._']
    # This data is supposed to be already scraped
    # But I did not scrape it to do it myself (faster)
    for col in of_cols :
        fights_df[f"{col}%_1"] = fights_df[f"{col}1"].apply(of_to_pct)
        fights_df[f"{col}%_2"] = fights_df[f"{col}2"].apply(of_to_pct)
        fights_df.drop(columns=[f"{col}1",f"{col}2"],inplace=True)

    fights_df['Weight_Class'] = fights_df['Weight_Class'].apply(remove_quotation)
    fights_df['Method'] = fights_df['Method'].apply(remove_quotation)
    fights_df['Fight_Time'] = fights_df['Fight_Time'].apply(remove_quotation)

    fights_df['Sub. Att_1'] = fights_df['Sub. Att_1'].astype('Int8')
    fights_df['Sub. Att_2'] = fights_df['Sub. Att_2'].astype('Int8')
    fights_df['Rev._1'] = fights_df['Rev._1'].astype('Int8')
    fights_df['Rev._2'] = fights_df['Rev._2'].astype('Int8')

    fights_df['Ctrl_1'] = fights_df['Ctrl_1'].apply(parse_seconds_from_time).astype('Int32')
    fights_df['Ctrl_2'] = fights_df['Ctrl_2'].apply(parse_seconds_from_time).astype('Int32')
    
    fights_df['Method Details'].fillna("No fight details provided",inplace=True)
    fights_df['Referee'].fillna("Uknown referee",inplace=True)
    
    fights_df.dropna(subset=['KD_1'], inplace=True)
    # I already have a takedowns column + 37% of the data missing is too much to be imputed
    fights_df.drop(columns=['Td %_1','Td %_2'],inplace=True)
    

In [267]:
clean_fights_df()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fights_df['Method Details'].fillna("No fight details provided",inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fights_df['Referee'].fillna("Uknown referee",inplace=True)


In [268]:
fights_df.columns

Index(['Fighter_1', 'Fighter_2', 'KD_1', 'KD_2', 'STR_1', 'STR_2', 'TD_1',
       'TD_2', 'SUB_1', 'SUB_2', 'Weight_Class', 'Method', 'Round',
       'Fight_Time', 'Event_Id', 'Result_1', 'Result_2', 'Time Format',
       'Referee', 'Method Details', 'Sig. Str._1', 'Sig. Str._2',
       'Sig. Str. %_1', 'Sig. Str. %_2', 'Sub. Att_1', 'Sub. Att_2', 'Rev._1',
       'Rev._2', 'Ctrl_1', 'Ctrl_2', 'Head_%_1', 'Head_%_2', 'Body_%_1',
       'Body_%_2', 'Leg_%_1', 'Leg_%_2', 'Distance_%_1', 'Distance_%_2',
       'Clinch_%_1', 'Clinch_%_2', 'Ground_%_1', 'Ground_%_2',
       'Total Str._%_1', 'Total Str._%_2'],
      dtype='object')

In [269]:
(fights_df['Sig. Str._1'].isna()).sum()

np.int64(0)

In [270]:
print_missing_values(fights_df)

Sig. Str. %_1 : 38
Sig. Str. %_2 : 58
Ctrl_1 : 181
Ctrl_2 : 181


L missing data li mezelt maadch fiha hatta 7all ken IMPUTATION 