# Preprocessing 

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

In [2]:
sys.path.append("../src/")

In [3]:
from utilities import *

In [4]:
pd.set_option("display.max_columns", None)

In [5]:
filepath = os.path.join(dir_dict["feature_engineered"], f"completed_feateng.parquet")
df = pd.read_parquet(filepath)

In [6]:
df["fighter_name"] = df["fighter_name"].astype("category")

In [7]:
df.select_dtypes(exclude=["number", "bool"])

Unnamed: 0,event_name,event_date,event_city,event_state,event_country,fight_bout,fight_method,fight_time_format,fight_referee,fight_details,fight_weight_class,fighter_name,fighter_nickname,fighter_stance,fighter_dob
0,,NaT,,,,light_heavyweight_bout,submission,3 Rnd (5-5-5),Chris Tognoni,Rear Naked Choke,light_heavyweight,,,,NaT
1,,NaT,,,,light_heavyweight_bout,submission,3 Rnd (5-5-5),Chris Tognoni,Rear Naked Choke,light_heavyweight,,,,NaT
2,,NaT,,,,welterweight_bout,decision_unanimous,3 Rnd (5-5-5),Yves Lavigne,Patricia Morse-Jarman ...,welterweight,,,,NaT
3,,NaT,,,,welterweight_bout,decision_unanimous,3 Rnd (5-5-5),Yves Lavigne,Patricia Morse-Jarman ...,welterweight,,,,NaT
4,,NaT,,,,middleweight_bout,decision_unanimous,3 Rnd (5-5-5),Herb Dean,Tony Weeks ...,middleweight,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13475,,NaT,,,,heavyweight_bout,ko_tko,3 Rnd (5-5-5),Gabe Barahona,Kick to Head At Distance Hook Kick,heavyweight,,,,NaT
13476,,NaT,,,,welterweight_bout,decision_unanimous,3 Rnd (5-5-5),Jason Herzog,Derek Cleary ...,welterweight,,,,NaT
13477,,NaT,,,,welterweight_bout,decision_unanimous,3 Rnd (5-5-5),Jason Herzog,Derek Cleary ...,welterweight,,,,NaT
13478,,NaT,,,,lightweight_bout,decision_split,5 Rnd (5-5-5-5-5),John McCarthy,Gabriel Sabaitis ...,lightweight,,,,NaT


In [8]:
to_drop = ["event_name", "event_city", "event_state", "event_country",
           "fight_bout", "fight_details", "fighter_dob",  
           "fighter_name", "fighter_nickname"]

In [9]:
df = df.drop(to_drop, axis=1)

### One Hot Encoding 

In [10]:
cat_cols = df.select_dtypes(exclude=["number", "bool"]).columns.to_list()
cat_cols.remove("event_date")
cat_cols

['fight_method',
 'fight_time_format',
 'fight_referee',
 'fight_weight_class',
 'fighter_stance']

In [11]:
df[cat_cols] = df[cat_cols].astype("category")

In [12]:
df = pd.concat([df, pd.get_dummies(df[cat_cols])], axis=1)

In [13]:
df = df.drop(cat_cols, axis=1)

In [14]:
bool_cols = df.select_dtypes("bool").columns
df[bool_cols] = df[bool_cols].astype("uint8")

### Drop Fight columns

In [15]:
to_drop = [col for col in df.columns if "fight_" in col] 
to_drop.remove("fight_id")
to_drop.remove("fight_fighter_win")
df = df.drop(to_drop, axis=1)

### Deal with Null Values

In [16]:
df.isna().sum().sort_values()

fighter_stance_switch                                  0
fighter_opponents_cummean_round1_body_ss_landed        0
fighter_method_decision_majority_winrate               0
fighter_method_decision_majority_losses                0
fighter_method_decision_majority_wins                  0
                                                   ...  
fighter_height_inches                              13473
fighter_height_reach_prod                          13474
fighter_reach_inches                               13474
fighter_age                                        13480
event_date                                         13480
Length: 394, dtype: int64

In [17]:
null_counter = df.isna().sum()

null_pct = null_counter.sort_values() / df.shape[0]

In [18]:
too_much_missing = null_pct[null_pct > 0.5].index.to_list()

In [19]:
len(too_much_missing)

113

Round4 or Round5 columns

In [20]:
round_4_5_cols = [col for col in too_much_missing if "round4" in col or "round5" in col]
len(round_4_5_cols)

108

In [21]:
df = df.drop(round_4_5_cols, axis=1)

In [22]:
null_counter = df.isna().sum()

null_pct = null_counter.sort_values() / df.shape[0]

In [23]:
null_pct

fighter_stance_switch                       0.000000
fighter_opponents_cummean_round1_sub_att    0.000000
fighter_opponents_cummean_overall_rev       0.000000
fighter_opponents_cummean_round1_rev        0.000000
fighter_method_decision_unanimous_wins      0.000000
                                              ...   
fighter_height_inches                       0.999481
fighter_height_reach_prod                   0.999555
fighter_reach_inches                        0.999555
fighter_age                                 1.000000
event_date                                  1.000000
Length: 286, dtype: float64

In [24]:
desc_cols = ["fighter_height_inches", "fighter_age", "fighter_reach_inches"]
df = df.loc[df[desc_cols].notna().all(axis=1)].reset_index(drop=True)

In [25]:
null_counter = df.isna().sum()

null_pct = null_counter.sort_values() / df.shape[0]

null_cols = null_pct[null_pct > 0].index.to_list()

In [26]:
len(null_cols)

0

In [27]:
len([col for col in null_cols if "cummean" in col])

0

#### Fighter Stats Columns

In [33]:
if null_cols:
    stat_cols = [col for col in null_cols if "fighter" in col]

    df[stat_cols] = \
        df.groupby("fighter_id")[stat_cols].transform(lambda x: x.fillna(method="ffill")\
                                                                          .fillna(method="bfill"))

In [34]:
df.isna().sum().sort_values()

event_date                                        0.0
fighter_opponents_cummean_overall_leg_ss_total    0.0
fighter_opponents_cummean_round1_leg_ss_landed    0.0
fighter_opponents_cummean_round1_leg_ss_total     0.0
fighter_opponents_cummean_round2_leg_ss_landed    0.0
                                                 ... 
fighter_cummean_round1_ground_ss_total            0.0
fighter_cummean_round1_ground_ss_landed           0.0
fighter_cummean_overall_ground_ss_total           0.0
fighter_cummean_round3_sig_str_total              0.0
fighter_stance_switch                             0.0
Length: 286, dtype: float64

In [35]:
if null_cols:
    df[stat_cols] = df[stat_cols].fillna(df[stat_cols].mean())

In [36]:
df.isna().sum().sort_values()

event_date                                        0.0
fighter_opponents_cummean_overall_leg_ss_total    0.0
fighter_opponents_cummean_round1_leg_ss_landed    0.0
fighter_opponents_cummean_round1_leg_ss_total     0.0
fighter_opponents_cummean_round2_leg_ss_landed    0.0
                                                 ... 
fighter_cummean_round1_ground_ss_total            0.0
fighter_cummean_round1_ground_ss_landed           0.0
fighter_cummean_overall_ground_ss_total           0.0
fighter_cummean_round3_sig_str_total              0.0
fighter_stance_switch                             0.0
Length: 286, dtype: float64

In [37]:
filepath = os.path.join(dir_dict["preprocessed"], f"agg_processed.parquet")
df.to_parquet(filepath)