# Preprocessing 

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

In [2]:
# Make the modules that live in ../src/ importable from this notebook.
src_dir = "../src/"
sys.path.append(src_dir)

In [3]:
from utilities import *

In [4]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [5]:
# Load the feature-engineered dataset. The directory is looked up under the
# "feature_engineered" key of `dir_dict`.
# NOTE(review): `dir_dict` is presumably provided by the `utilities` star
# import — confirm.
# The file name is a plain string literal: it has no placeholders, so no
# f-string prefix is needed.
filepath = os.path.join(dir_dict["feature_engineered"], "completed_feateng.parquet")
df = pd.read_parquet(filepath)

In [6]:
# Store fighter names as a categorical column.
df = df.astype({"fighter_name": "category"})

In [7]:
# Preview every column that is neither numeric nor boolean.
excluded_dtypes = ["number", "bool"]
df.select_dtypes(exclude=excluded_dtypes)

Unnamed: 0,event_name,event_date,event_city,event_state,event_country,fight_bout,fight_method,fight_time_format,fight_referee,fight_details,fight_weight_class,fighter_name,fighter_nickname,fighter_stance,fighter_dob
0,UFC - Ultimate Ultimate '95,1995-12-16,Denver,Colorado,USA,open_weight_bout,decision_unanimous,1 Rnd (18),John McCarthy,,open_weight,Dan Severn,The Beast,southpaw,1958-06-08
1,UFC - Ultimate Ultimate '95,1995-12-16,Denver,Colorado,USA,ultimate_ultimate_95_tournament_title_bout,decision_unanimous,1 Rnd + OT (27-3),John McCarthy,,open_weight,Dan Severn,The Beast,southpaw,1958-06-08
2,UFC - Ultimate Ultimate '95,1995-12-16,Denver,Colorado,USA,open_weight_bout,submission,1 Rnd (15),John McCarthy,Ankle Lock From Inoki-Ali Position,open_weight,Dave Beneteau,Dangerous,orthodox,NaT
3,UFC - Ultimate Ultimate '95,1995-12-16,Denver,Colorado,USA,open_weight_bout,submission,1 Rnd (15),John McCarthy,Rear Naked Choke,open_weight,Keith Hackney,The Giant Killer,sideways,NaT
4,UFC - Ultimate Ultimate '95,1995-12-16,Denver,Colorado,USA,ultimate_ultimate_95_tournament_title_bout,decision_unanimous,1 Rnd + OT (27-3),John McCarthy,,open_weight,Oleg Taktarov,The Russian Bear,orthodox,1967-08-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13475,UFC Fight Night: Cannonier vs. Strickland,2022-12-17,Las Vegas,Nevada,USA,middleweight_bout,decision_split,5 Rnd (5-5-5-5-5),Herb Dean,Derek Cleary ...,middleweight,Sean Strickland,,orthodox,1991-02-27
13476,UFC Fight Night: Cannonier vs. Strickland,2022-12-17,Las Vegas,Nevada,USA,flyweight_bout,ko_tko,3 Rnd (5-5-5),Keith Peterson,Punch to Head At Distance,flyweight,Alessandro Costa,Nono,orthodox,1996-01-28
13477,UFC Fight Night: Cannonier vs. Strickland,2022-12-17,Las Vegas,Nevada,USA,welterweight_bout,decision_unanimous,3 Rnd (5-5-5),Herb Dean,Adalaide Byrd ...,welterweight,Bryan Battle,Pooh Bear,orthodox,1994-09-21
13478,UFC Fight Night: Cannonier vs. Strickland,2022-12-17,Las Vegas,Nevada,USA,lightweight_bout,decision_unanimous,3 Rnd (5-5-5),Mark Smith,Mike Bell ...,lightweight,Damir Ismagulov,Qazaq,orthodox,1991-02-03


In [8]:
# Descriptive / identifying columns that are removed before encoding.
to_drop = [
    # event metadata
    "event_name",
    "event_city",
    "event_state",
    "event_country",
    # free-text fight descriptors
    "fight_bout",
    "fight_details",
    # fighter identity fields
    "fighter_dob",
    "fighter_name",
    "fighter_nickname",
]

In [9]:
# Remove the columns listed in `to_drop` from the frame.
df = df.drop(columns=to_drop)

### One-Hot Encoding

In [10]:
# Every remaining non-numeric, non-boolean column is treated as categorical,
# except the event date, which is kept as-is.
non_numeric = df.select_dtypes(exclude=["number", "bool"])
cat_cols = list(non_numeric.columns)
cat_cols.remove("event_date")
cat_cols

['fight_method',
 'fight_time_format',
 'fight_referee',
 'fight_weight_class',
 'fighter_stance']

In [11]:
# Cast the categorical columns to pandas' `category` dtype.
as_category = df[cat_cols].astype("category")
df[cat_cols] = as_category

In [12]:
# Build indicator columns for the categorical features and append them to the
# right of the existing columns (the source columns are still present here).
dummies = pd.get_dummies(df[cat_cols])
df = pd.concat([df, dummies], axis="columns")

In [13]:
# The original categorical columns are no longer needed once they are encoded.
df = df.drop(columns=cat_cols)

In [14]:
# Convert boolean columns to uint8 values.
bool_cols = df.select_dtypes(include="bool").columns
as_uint8 = df[bool_cols].astype("uint8")
df[bool_cols] = as_uint8

### Drop Fight columns

In [15]:
# Drop every column whose name contains "fight_", keeping only the fight
# identifier and the win flag. "fighter_..." names do not contain "fight_", so
# they are kept unless "fight_" appears elsewhere in the name. The indicator
# columns generated from the `fight_*` categoricals do match and are removed
# here as well.
to_drop = [name for name in df.columns if "fight_" in name]
for protected in ("fight_id", "fight_fighter_win"):
    # list.remove raises ValueError if an expected column is missing.
    to_drop.remove(protected)
df = df.drop(columns=to_drop)

### Deal with Null Values

In [16]:
# Missing-value count per column, smallest first.
(
    df.isna()
      .sum()
      .sort_values()
)

event_date                                             0
fighter_opponents_cummean_overall_body_ss_total        0
fighter_opponents_cummean_round1_body_ss_landed        0
fighter_opponents_cummean_round1_body_ss_total         0
fighter_opponents_cummean_overall_leg_ss_landed        0
                                                   ...  
fighter_opponents_cummean_round5_sig_str_%         11800
fighter_opponents_cummean_round4_td_%              12171
fighter_cummean_round4_td_%                        12297
fighter_opponents_cummean_round5_td_%              12364
fighter_cummean_round5_td_%                        12486
Length: 394, dtype: int64

In [17]:
# Per-column missing counts, and the same counts as a fraction of all rows
# (sorted ascending).
n_rows = len(df)
null_counter = df.isna().sum()
null_pct = null_counter.sort_values().div(n_rows)

In [18]:
# Columns where more than half of the rows are missing.
over_half_missing = null_pct > 0.5
too_much_missing = null_pct[over_half_missing].index.to_list()

In [19]:
# Number of columns with more than half of their values missing.
len(too_much_missing)

108

All 108 columns with more than 50% missing values are round 4 or round 5 statistics.

In [20]:
# Of the mostly-missing columns, pick the ones that describe round 4 or round 5.
late_round_tags = ("round4", "round5")
round_4_5_cols = [
    name
    for name in too_much_missing
    if any(tag in name for tag in late_round_tags)
]
len(round_4_5_cols)

108

In [21]:
# Remove the round 4 / round 5 columns that are mostly missing.
df = df.drop(columns=round_4_5_cols)

In [22]:
# Recompute the missing counts and fractions on the reduced frame.
n_rows = len(df)
null_counter = df.isna().sum()
null_pct = null_counter.sort_values().div(n_rows)

In [23]:
# Display the per-column missing fraction (sorted ascending).
null_pct

event_date                                         0.000000
fighter_opponents_cummean_overall_body_ss_total    0.000000
fighter_opponents_cummean_round1_body_ss_landed    0.000000
fighter_opponents_cummean_round1_body_ss_total     0.000000
fighter_opponents_cummean_overall_leg_ss_landed    0.000000
                                                     ...   
fighter_cummean_round1_td_%                        0.177003
fighter_opponents_cummean_round2_td_%              0.228858
fighter_cummean_round2_td_%                        0.263056
fighter_opponents_cummean_round3_td_%              0.298813
fighter_cummean_round3_td_%                        0.321810
Length: 286, dtype: float64

In [24]:
# Keep only rows where height, age and reach are all known, then renumber the
# index from zero.
desc_cols = ["fighter_height_inches", "fighter_age", "fighter_reach_inches"]
df = df.dropna(subset=desc_cols).reset_index(drop=True)

In [25]:
# Recompute the missing fractions after the row filter and list every column
# that still has at least one missing value.
null_counter = df.isna().sum()
null_pct = null_counter.sort_values().div(len(df))
has_nulls = null_pct > 0
null_cols = null_pct[has_nulls].index.to_list()

In [26]:
# Number of columns that still contain at least one missing value.
len(null_cols)

124

In [27]:
# How many of the columns with missing values are "cummean" columns.
sum("cummean" in name for name in null_cols)

124

#### Fighter Stats Columns

In [28]:
# Columns with missing values whose name mentions "fighter".
stat_cols = list(filter(lambda name: "fighter" in name, null_cols))

In [29]:
# Fill gaps in the stat columns using only rows of the same fighter: carry the
# last known value forward, then back-fill whatever is still missing at the
# start of that fighter's rows. Groups with no value at all stay NaN.
# `Series.ffill()` / `Series.bfill()` are used instead of
# `fillna(method=...)`, which is deprecated in recent pandas releases.
# NOTE(review): the result depends on row order within each fighter. If rows
# are chronological, the back-fill copies values from later fights into
# earlier ones — confirm this is acceptable for the downstream model.
df[stat_cols] = (
    df.groupby("fighter_id")[stat_cols]
      .transform(lambda stats: stats.ffill().bfill())
)

In [30]:
# Missing values that remain after the per-fighter fill, smallest first.
(
    df.isna()
      .sum()
      .sort_values()
)

event_date                                             0
fighter_opponents_cummean_round1_head_ss_total         0
fighter_opponents_cummean_overall_body_ss_landed       0
fighter_opponents_cummean_overall_body_ss_total        0
fighter_opponents_cummean_round1_body_ss_landed        0
                                                    ... 
fighter_opponents_cummean_round2_td_%                645
fighter_cummean_round1_td_%                          738
fighter_opponents_cummean_round3_td_%               1135
fighter_cummean_round2_td_%                         1251
fighter_cummean_round3_td_%                         1631
Length: 286, dtype: int64

In [31]:
# Any value still missing is replaced by the mean of its column, computed over
# all rows of the frame.
column_means = df[stat_cols].mean()
df[stat_cols] = df[stat_cols].fillna(column_means)

In [32]:
# Final check of the per-column missing-value counts.
(
    df.isna()
      .sum()
      .sort_values()
)

event_date                                        0
fighter_opponents_cummean_overall_leg_ss_total    0
fighter_opponents_cummean_round1_leg_ss_landed    0
fighter_opponents_cummean_round1_leg_ss_total     0
fighter_opponents_cummean_round2_leg_ss_landed    0
                                                 ..
fighter_cummean_round1_ground_ss_total            0
fighter_cummean_round1_ground_ss_landed           0
fighter_cummean_overall_ground_ss_total           0
fighter_cummean_round3_sig_str_total              0
fighter_stance_switch                             0
Length: 286, dtype: int64

In [33]:
# Write the preprocessed frame to the directory stored under the
# "preprocessed" key of `dir_dict`.
# NOTE(review): `dir_dict` is presumably provided by the `utilities` star
# import — confirm.
# The file name is a plain string literal: it has no placeholders, so no
# f-string prefix is needed.
filepath = os.path.join(dir_dict["preprocessed"], "agg_processed.parquet")
df.to_parquet(filepath)