## Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
from IPython.core.display import HTML, display

In [2]:
#Setting columns and rows to display all the results
pd.set_option("display.max_columns", None, "display.max_rows", None)

In [3]:
def load_notebook_config(width=True):
    """
    Loads all neccesary configuration for the notebook's style:
     - plots styling.
     - pandas table sizes and limiting amount of float decimals.
     - adjust the notebook cells width
    """
    pd.options.display.max_columns = 0
    pd.set_option('display.float_format', lambda x: '%.4f' % x)
    pd.options.mode.chained_assignment = None

    if width:
        display(HTML("""<link href='https://fonts.googleapis.com/css?family=Montserrat' rel='stylesheet'>
                        <style> div.text_cell_render{font-family: 'Montserrat';}
                                .container { width:95% !important;}
                        </style>"""))
load_notebook_config()

In [4]:
# Load the data
file_path = "../Resources/ufc-master.csv"
master_df = pd.read_csv(file_path, low_memory=False)
master_df.head()

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,title_bout,weight_class,gender,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,B_avg_TD_landed,B_avg_TD_pct,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,B_win_by_Decision_Majority,B_win_by_Decision_Split,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_SIG_STR_landed,R_avg_SIG_STR_pct,R_avg_SUB_ATT,R_avg_TD_landed,R_avg_TD_pct,R_longest_win_streak,R_losses,R_total_rounds_fought,R_total_title_bouts,R_win_by_Decision_Majority,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,...,empty_arena,constant_1,B_match_weightclass_rank,R_match_weightclass_rank,R_Women's Flyweight_rank,R_Women's Featherweight_rank,R_Women's Strawweight_rank,R_Women's Bantamweight_rank,R_Heavyweight_rank,R_Light Heavyweight_rank,R_Middleweight_rank,R_Welterweight_rank,R_Lightweight_rank,R_Featherweight_rank,R_Bantamweight_rank,R_Flyweight_rank,R_Pound-for-Pound_rank,B_Women's Flyweight_rank,B_Women's Featherweight_rank,B_Women's Strawweight_rank,B_Women's Bantamweight_rank,B_Heavyweight_rank,B_Light Heavyweight_rank,B_Middleweight_rank,B_Welterweight_rank,B_Lightweight_rank,B_Featherweight_rank,B_Bantamweight_rank,B_Flyweight_rank,B_Pound-for-Pound_rank,better_rank,finish,finish_details,finish_round,finish_round_time,total_fight_time_secs,R_kd_bout,B_kd_bout,R_sig_str_landed_bout,B_sig_str_landed_bout,R_sig_str_attempted_bout,B_sig_str_attempted_bout,R_sig_str_pct_bout,B_sig_str_pct_bout,R_tot_str_landed_bout,B_tot_str_landed_bout,R_tot_str_attempted_bout,B_tot_str_attempted_bout,R_td_landed_bout,B_td_landed_bout,R_td_attempted_bout,B_td_attempted_bout,R_td_pct_bout,B_td_pct_bout,R_sub_attempts_bout,B_sub_attempts_bout,R_pass_bout,B_pass_bout,R_rev_bout,B_rev_bout
0,Alistair Overeem,Alexander Volkov,150,-182,150.0,54.9451,2/6/2021,"Las Vegas, Nevada, USA",USA,Blue,False,Heavyweight,MALE,5,0,1,0,4.76,0.58,0.2,0.69,0.7,4,2,26,0,0,1,2,3,0,0,6,Orthodox,200.66,203.2,250,0,2,0,3.71,0.64,0.8,1.44,0.56,11,15,83,1,1,0,4,20,8,0,33,Orthodox,193.04,203.2,265,...,1,1,6.0,5.0,,,,,5.0,,,,,,,,,,,,,6.0,,,,,,,,,Red,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Cory Sandhagen,Frankie Edgar,-400,300,25.0,300.0,2/6/2021,"Las Vegas, Nevada, USA",USA,Red,False,Bantamweight,MALE,3,0,1,1,3.7,0.39,0.3,2.28,0.31,5,8,94,9,0,1,10,5,2,0,18,Orthodox,167.64,172.72,135,0,1,0,6.88,0.48,0.5,1.07,0.5,5,1,14,0,0,1,1,3,1,0,6,Switch,180.34,177.8,135,...,1,1,4.0,2.0,,,,,,,,,,,2.0,,,,,,,,,,,,,4.0,,,Red,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Alexandre Pantoja,Manel Kape,-125,100,80.0,100.0,2/6/2021,"Las Vegas, Nevada, USA",USA,Red,False,Flyweight,MALE,3,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,Southpaw,165.1,172.72,125,1,0,0,4.2,0.47,1.1,1.08,0.36,3,3,21,0,0,1,1,2,2,0,6,Orthodox,165.1,170.18,125,...,1,1,,5.0,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,Red,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Diego Ferreira,Beneil Dariush,-125,105,80.0,105.0,2/6/2021,"Las Vegas, Nevada, USA",USA,Blue,False,Lightweight,MALE,3,0,5,1,3.98,0.49,1.1,1.75,0.32,5,4,36,0,0,1,4,3,5,0,13,Southpaw,177.8,182.88,155,0,6,0,5.07,0.36,1.0,1.04,0.27,6,2,21,0,0,0,3,3,2,0,8,Orthodox,175.26,187.96,155,...,1,1,13.0,10.0,,,,,,,,,10.0,,,,,,,,,,,,,13.0,,,,,Red,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Michael Johnson,Clay Guida,-225,175,44.4444,175.0,2/6/2021,"Las Vegas, Nevada, USA",USA,Blue,False,Lightweight,MALE,3,2,0,0,2.41,0.33,0.7,3.37,0.38,4,15,87,0,0,3,8,2,4,0,17,Orthodox,170.18,177.8,155,3,0,0,4.23,0.38,0.1,0.51,0.45,4,12,58,0,0,1,6,4,0,0,11,Southpaw,177.8,185.42,155,...,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [5]:
file_path_1 = "../Resources/upcoming-event.csv"
upcoming_df = pd.read_csv(file_path_1, low_memory=False)
upcoming_df.head()

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,title_bout,weight_class,gender,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,B_avg_TD_landed,B_avg_TD_pct,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,B_win_by_Decision_Majority,B_win_by_Decision_Split,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_SIG_STR_landed,R_avg_SIG_STR_pct,R_avg_SUB_ATT,R_avg_TD_landed,R_avg_TD_pct,R_longest_win_streak,R_losses,R_total_rounds_fought,R_total_title_bouts,R_win_by_Decision_Majority,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,...,empty_arena,constant_1,B_match_weightclass_rank,R_match_weightclass_rank,R_Women's Flyweight_rank,R_Women's Featherweight_rank,R_Women's Strawweight_rank,R_Women's Bantamweight_rank,R_Heavyweight_rank,R_Light Heavyweight_rank,R_Middleweight_rank,R_Welterweight_rank,R_Lightweight_rank,R_Featherweight_rank,R_Bantamweight_rank,R_Flyweight_rank,R_Pound-for-Pound_rank,B_Women's Flyweight_rank,B_Women's Featherweight_rank,B_Women's Strawweight_rank,B_Women's Bantamweight_rank,B_Heavyweight_rank,B_Light Heavyweight_rank,B_Middleweight_rank,B_Welterweight_rank,B_Lightweight_rank,B_Featherweight_rank,B_Bantamweight_rank,B_Flyweight_rank,B_Pound-for-Pound_rank,better_rank,finish,finish_details,finish_round,finish_round_time,total_fight_time_secs,R_kd_bout,B_kd_bout,R_sig_str_landed_bout,B_sig_str_landed_bout,R_sig_str_attempted_bout,B_sig_str_attempted_bout,R_sig_str_pct_bout,B_sig_str_pct_bout,R_tot_str_landed_bout,B_tot_str_landed_bout,R_tot_str_attempted_bout,B_tot_str_attempted_bout,R_td_landed_bout,B_td_landed_bout,R_td_attempted_bout,B_td_attempted_bout,R_td_pct_bout,B_td_pct_bout,R_sub_attempts_bout,B_sub_attempts_bout,R_pass_bout,B_pass_bout,R_rev_bout,B_rev_bout
0,Kamaru Usman,Gilbert Burns,-278.0,228.0,35.97122302,228.0,2/13/2021,"Las Vegas, Nevada, USA",USA,,True,Welterweight,MALE,5.0,0.0,6.0,0.0,3.15,0.46,0.7,2.21,0.37,6.0,3.0,36.0,0.0,0.0,0.0,5.0,3.0,4.0,0.0,12.0,Orthodox,177.8,180.34,170.0,0.0,12.0,0.0,4.5,0.53,0.1,3.38,0.47,12.0,0.0,43.0,3.0,0.0,0.0,9.0,2.0,1.0,0.0,12.0,Switch,182.88,193.04,170.0,...,1.0,1.0,1.0,0.0,,,,,,,,0.0,,,,,5.0,,,,,,,,1.0,,,,,,Red,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Maycee Barber,Alexa Grasso,-107.0,-107.0,93.45794393,93.4579,2/13/2021,"Las Vegas, Nevada, USA",USA,,False,Women's Flyweight,FEMALE,3.0,0.0,1.0,0.0,5.35,0.42,0.5,0.32,0.4,1.0,3.0,19.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,4.0,Orthodox,165.1,167.64,125.0,1.0,0.0,0.0,6.02,0.6,0.3,0.95,0.3,4.0,1.0,11.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,Switch,165.1,165.1,125.0,...,1.0,1.0,15.0,10.0,10.0,,,,,,,,,,,,,15.0,,,,,,,,,,,,,Red,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Kelvin Gastelum,Ian Heinisch,-205.0,174.0,48.7804878,174.0,2/13/2021,"Las Vegas, Nevada, USA",USA,,False,Middleweight,MALE,3.0,0.0,1.0,0.0,3.62,0.52,0.5,1.16,0.19,3.0,2.0,14.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,4.0,Orthodox,180.34,182.88,185.0,3.0,0.0,0.0,3.72,0.43,0.1,0.87,0.37,5.0,6.0,44.0,1.0,0.0,3.0,2.0,3.0,2.0,0.0,10.0,Southpaw,175.26,180.34,185.0,...,1.0,1.0,15.0,9.0,,,,,,,9.0,,,,,,,,,,,,,15.0,,,,,,,Red,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Ricky Simon,Brian Kelleher,-253.0,210.0,39.5256917,210.0,2/13/2021,"Las Vegas, Nevada, USA",USA,,False,Featherweight,MALE,3.0,0.0,1.0,0.0,4.45,0.4,0.8,1.2,0.25,2.0,4.0,19.0,0.0,0.0,0.0,1.0,2.0,3.0,0.0,6.0,Switch,167.64,167.64,145.0,0.0,2.0,0.0,3.09,0.42,0.3,7.07,0.53,4.0,2.0,21.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,6.0,Orthodox,167.64,175.26,135.0,...,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Maki Pitolo,Julian Marquez,145.0,-177.0,145.0,56.4972,2/13/2021,"Las Vegas, Nevada, USA",USA,,False,Middleweight,MALE,3.0,1.0,0.0,0.0,3.8,0.4,3.0,0.0,0.0,2.0,1.0,7.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,Orthodox,187.96,182.88,185.0,2.0,0.0,0.0,4.87,0.43,0.4,1.45,0.66,1.0,3.0,10.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,Orthodox,177.8,190.5,185.0,...,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
# Final row in upcoming dataset is null so we must drop it
upcoming_df.drop(index=[11], inplace=True)


In [7]:
# List dataframe data types
master_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4566 entries, 0 to 4565
Columns: 137 entries, R_fighter to B_rev_bout
dtypes: bool(1), float64(73), int64(47), object(16)
memory usage: 4.7+ MB


In [8]:
master_df.isnull().sum().sort_values(ascending=False)

B_Women's Featherweight_rank    4566
R_Women's Featherweight_rank    4553
B_Pound-for-Pound_rank          4538
B_Women's Flyweight_rank        4532
R_Women's Flyweight_rank        4516
B_Women's Strawweight_rank      4511
B_Women's Bantamweight_rank     4492
B_Bantamweight_rank             4486
B_Lightweight_rank              4484
B_Welterweight_rank             4484
B_Featherweight_rank            4482
B_Light Heavyweight_rank        4479
B_Flyweight_rank                4476
B_Middleweight_rank             4476
R_Women's Strawweight_rank      4470
B_Heavyweight_rank              4464
R_Women's Bantamweight_rank     4457
R_Middleweight_rank             4445
R_Featherweight_rank            4441
R_Bantamweight_rank             4438
R_Lightweight_rank              4438
R_Welterweight_rank             4435
R_Light Heavyweight_rank        4434
R_Heavyweight_rank              4433
R_Flyweight_rank                4432
R_Pound-for-Pound_rank          4418
B_match_weightclass_rank        3754
R

In [9]:
# Find duplicate entries
print(f"Duplicate entries: {master_df.duplicated().sum()}")

Duplicate entries: 0


In [10]:
#Encoding label so it is easier to find correlation-- did this encoding first so we can check feature corrleation
master_df['Winner'] = [1 if winner == 'Red' else 0 for winner in master_df.Winner]
#upcoming_df['Winner'] = [1 if winner == 'Red' else 0 for winner in master_df.Winner]

#### Top Features correlated to winning

In [11]:
# Gather correlation between variables and target
num_corr_col = [column for column in master_df.columns if master_df[column].dtype == 'int64' or master_df[column].dtype == 'float64']
corr_dict = {}
#Getting absolute values of correlation since we would need to inspect negative correlation too
for column in num_corr_col:
    corr_dict[column] = abs(master_df[column].corr(master_df['Winner']))

In [12]:
for w in sorted(corr_dict, key=corr_dict.get):
    print(w, corr_dict[w])

B_Pound-for-Pound_rank 4.0438642413036e-18
B_avg_SUB_ATT 0.0006736112614573925
B_avg_TD_pct 0.0009201808895340117
R_Welterweight_rank 0.0012701269778216627
B_win_by_Submission 0.0017995641358488264
R_win_by_Decision_Unanimous 0.002687505760424579
total_title_bout_dif 0.003499821552798515
B_longest_win_streak 0.004181782202401373
R_Weight_lbs 0.005904824770245006
R_Height_cms 0.005970663201641085
R_win_by_Submission 0.006133959910974233
sub_dif 0.006449232915702231
R_draw 0.007935935240262116
B_win_by_TKO_Doctor_Stoppage 0.009936743234671318
R_rev_bout 0.01052663529382445
B_avg_SIG_STR_pct 0.010679686254663588
R_Pound-for-Pound_rank 0.010723703312317751
R_wins 0.010795991776121673
B_Middleweight_rank 0.010808014397003114
B_wins 0.011037180629812016
R_Reach_cms 0.012441208177052288
R_win_by_KO/TKO 0.012514062387041855
empty_arena 0.013754836001593093
B_win_by_Decision_Majority 0.013951715582271667
B_win_by_Decision_Unanimous 0.015089180046768083
R_win_by_Decision_Majority 0.0159562117792

#### Feature Engineering
We have a very robust dataset with 137 columns of data.  In order to achieve better model performance we must
1. Feature Engineering- Limit input features by combining similar variables in order to increase correlation
2. Eliminate 'noisy' data
3. Find null values and fill them
4. Encode categorical variables

In [13]:
# Take all variables for both 'r' and 'b' fighters, and engineer them into a difference.  This will concentrate our data and should increase performance

master_df['draw_diff'] = (master_df['B_draw']-master_df['R_draw'])
master_df['avg_sig_str_pct_diff'] = (master_df['B_avg_SIG_STR_pct']-master_df['R_avg_SIG_STR_pct'])
master_df['avg_TD_pct_diff'] = (master_df['B_avg_TD_pct']-master_df['B_avg_TD_pct'])
master_df['win_by_Decision_Majority_diff'] = (master_df['B_win_by_Decision_Majority']-master_df['R_win_by_Decision_Majority'])
master_df['win_by_Decision_Split_diff'] = (master_df['B_win_by_Decision_Split']-master_df['R_win_by_Decision_Split'])
master_df['win_by_Decision_Unanimous_diff'] = (master_df['B_win_by_Decision_Unanimous']-master_df['R_win_by_Decision_Unanimous'])
master_df['win_by_TKO_Doctor_Stoppage_diff'] = (master_df['B_win_by_TKO_Doctor_Stoppage']-master_df['R_win_by_TKO_Doctor_Stoppage'])
master_df['odds_diff'] = (master_df['B_odds']-master_df['R_odds'])
master_df['ev_diff'] = (master_df['B_ev']-master_df['R_ev'])
master_df['kd_bout_diff']=(master_df['B_kd_bout']-master_df['R_kd_bout'])
master_df['sig_str_landed_bout_diff']=(master_df['B_sig_str_landed_bout']-master_df['R_sig_str_landed_bout'])
master_df['sig_str_attempted_bout_diff']=(master_df['B_sig_str_attempted_bout']-master_df['R_sig_str_attempted_bout'])
master_df['sig_str_attempted_bout_diff']=(master_df['B_sig_str_attempted_bout']-master_df['R_sig_str_attempted_bout'])
master_df['sig_str_pct_bout_diff']=(master_df['B_sig_str_pct_bout']-master_df['R_sig_str_pct_bout'])
master_df['tot_str_landed_bout_diff']=(master_df['B_tot_str_landed_bout']-master_df['R_tot_str_landed_bout'])
master_df['tot_str_attempted_bout_diff']=(master_df['B_tot_str_attempted_bout']-master_df['R_tot_str_attempted_bout'])
master_df['td_landed_bout_diff']=(master_df['B_td_landed_bout']-master_df['R_td_landed_bout'])
master_df['td_attempted_bout_diff']=(master_df['B_td_attempted_bout']-master_df['R_td_attempted_bout'])
master_df['td_pct_bout_diff']=(master_df['B_td_pct_bout']-master_df['R_td_pct_bout'])
master_df['td_pct_bout_diff']=(master_df['B_td_pct_bout']-master_df['R_td_pct_bout'])
master_df['sub_attempts_bout_diff']=(master_df['B_sub_attempts_bout']-master_df['R_sub_attempts_bout'])
master_df['pass_bout_diff']=(master_df['B_pass_bout']-master_df['R_pass_bout'])
master_df['rev_bout_diff']=(master_df['B_rev_bout']-master_df['R_rev_bout'])

In [14]:
# R_ev column is a string and we must convert to float for feature engineering in upcoming df
upcoming_df['R_ev'] = pd.to_numeric(upcoming_df['R_ev'], errors='coerce')

In [15]:
 # Repeat for upcoming df

upcoming_df['draw_diff'] = (upcoming_df['B_draw']-upcoming_df['R_draw'])
upcoming_df['avg_sig_str_pct_diff'] = (upcoming_df['B_avg_SIG_STR_pct']-upcoming_df['R_avg_SIG_STR_pct'])
upcoming_df['avg_TD_pct_diff'] = (upcoming_df['B_avg_TD_pct']-upcoming_df['B_avg_TD_pct'])
upcoming_df['win_by_Decision_Majority_diff'] = (upcoming_df['B_win_by_Decision_Majority']-upcoming_df['R_win_by_Decision_Majority'])
upcoming_df['win_by_Decision_Split_diff'] = (upcoming_df['B_win_by_Decision_Split']-upcoming_df['R_win_by_Decision_Split'])
upcoming_df['win_by_Decision_Unanimous_diff'] = (upcoming_df['B_win_by_Decision_Unanimous']-upcoming_df['R_win_by_Decision_Unanimous'])
upcoming_df['win_by_TKO_Doctor_Stoppage_diff'] = (upcoming_df['B_win_by_TKO_Doctor_Stoppage']-upcoming_df['R_win_by_TKO_Doctor_Stoppage'])
upcoming_df['odds_diff'] = (upcoming_df['B_odds']-upcoming_df['R_odds'])
upcoming_df['ev_diff'] = (upcoming_df['B_ev']-upcoming_df['R_ev'])
upcoming_df['kd_bout_diff']=(upcoming_df['B_kd_bout']-upcoming_df['R_kd_bout'])
upcoming_df['sig_str_landed_bout_diff']=(upcoming_df['B_sig_str_landed_bout']-upcoming_df['R_sig_str_landed_bout'])
upcoming_df['sig_str_attempted_bout_diff']=(upcoming_df['B_sig_str_attempted_bout']-upcoming_df['R_sig_str_attempted_bout'])
upcoming_df['sig_str_attempted_bout_diff']=(upcoming_df['B_sig_str_attempted_bout']-upcoming_df['R_sig_str_attempted_bout'])
upcoming_df['sig_str_pct_bout_diff']=(upcoming_df['B_sig_str_pct_bout']-upcoming_df['R_sig_str_pct_bout'])
upcoming_df['tot_str_landed_bout_diff']=(upcoming_df['B_tot_str_landed_bout']-upcoming_df['R_tot_str_landed_bout'])
upcoming_df['tot_str_attempted_bout_diff']=(upcoming_df['B_tot_str_attempted_bout']-upcoming_df['R_tot_str_attempted_bout'])
upcoming_df['td_landed_bout_diff']=(upcoming_df['B_td_landed_bout']-upcoming_df['R_td_landed_bout'])
upcoming_df['td_attempted_bout_diff']=(upcoming_df['B_td_attempted_bout']-upcoming_df['R_td_attempted_bout'])
upcoming_df['td_pct_bout_diff']=(upcoming_df['B_td_pct_bout']-upcoming_df['R_td_pct_bout'])
upcoming_df['td_pct_bout_diff']=(upcoming_df['B_td_pct_bout']-upcoming_df['R_td_pct_bout'])
upcoming_df['sub_attempts_bout_diff']=(upcoming_df['B_sub_attempts_bout']-upcoming_df['R_sub_attempts_bout'])
upcoming_df['pass_bout_diff']=(upcoming_df['B_pass_bout']-upcoming_df['R_pass_bout'])
upcoming_df['rev_bout_diff']=(upcoming_df['B_rev_bout']-upcoming_df['R_rev_bout'])

In [16]:
#Dropping original variables, necessary information has been extracted.
var_drop = [
'B_odds',
'R_odds',
'B_ev',
'R_ev',
'R_kd_bout',
'B_kd_bout',
'R_sig_str_landed_bout',
'B_sig_str_landed_bout',
'R_sig_str_attempted_bout',
'B_sig_str_attempted_bout',
'R_sig_str_pct_bout',
'B_sig_str_pct_bout',
'R_tot_str_landed_bout',
'B_tot_str_landed_bout',
'R_tot_str_attempted_bout',
'B_tot_str_attempted_bout',
'R_td_landed_bout',
'B_td_landed_bout',
'R_td_attempted_bout',
'B_td_attempted_bout',
'R_td_pct_bout',
'B_td_pct_bout',
'R_sub_attempts_bout',
'B_sub_attempts_bout',
'R_pass_bout',
'B_pass_bout',
'R_rev_bout',
'B_rev_bout',
'B_current_lose_streak', 'R_current_lose_streak',
'B_current_win_streak', 'R_current_win_streak',
'B_longest_win_streak', 'R_longest_win_streak',
'B_wins', 'R_wins',
'B_losses', 'R_losses',
'B_total_rounds_fought', 'R_total_rounds_fought',
'B_total_title_bouts', 'R_total_title_bouts',
'B_win_by_KO/TKO', 'R_win_by_KO/TKO',
'B_win_by_Submission', 'R_win_by_Submission',
'B_Height_cms', 'R_Height_cms',
'B_Reach_cms', 'R_Reach_cms',
'B_age', 'R_age',
'B_avg_SIG_STR_landed', 'R_avg_SIG_STR_landed',
'B_avg_SUB_ATT', 'R_avg_SUB_ATT',
'B_avg_TD_landed', 'R_avg_TD_landed',
'B_draw','B_avg_SIG_STR_pct','B_avg_TD_pct','B_win_by_Decision_Majority','B_win_by_Decision_Split','B_win_by_Decision_Unanimous','B_win_by_TKO_Doctor_Stoppage',
'R_draw','R_avg_SIG_STR_pct','R_avg_TD_pct','R_win_by_Decision_Majority','R_win_by_Decision_Split','R_win_by_Decision_Unanimous','R_win_by_TKO_Doctor_Stoppage']
master_df.drop(var_drop, axis=1, inplace = True)
upcoming_df.drop(var_drop, axis=1, inplace = True)

In [17]:
# First we will drop all variables that are common to both fighters, and therefore will not help predict the outcome in this model
noisy_data = [
'date','location','country','weight_class','gender','no_of_rounds','empty_arena','constant_1','finish','finish_details','finish_round','finish_round_time','total_fight_time_secs','B_Weight_lbs','R_Weight_lbs'
]
master_df.drop(noisy_data, axis=1, inplace=True)
upcoming_df.drop(noisy_data, axis=1, inplace=True)

In [18]:
# When analyzing null values above it is apparent that 'rank' values contain mostly null values 
# because each fighter can only be ranked in 1 weight class in most cases, we will dig deeper.  
# These columns are all located between cells CB and DD in the .csv file
master_df.loc[:,'B_match_weightclass_rank':'better_rank'].isnull().sum()

B_match_weightclass_rank        3754
R_match_weightclass_rank        3333
R_Women's Flyweight_rank        4516
R_Women's Featherweight_rank    4553
R_Women's Strawweight_rank      4470
R_Women's Bantamweight_rank     4457
R_Heavyweight_rank              4433
R_Light Heavyweight_rank        4434
R_Middleweight_rank             4445
R_Welterweight_rank             4435
R_Lightweight_rank              4438
R_Featherweight_rank            4441
R_Bantamweight_rank             4438
R_Flyweight_rank                4432
R_Pound-for-Pound_rank          4418
B_Women's Flyweight_rank        4532
B_Women's Featherweight_rank    4566
B_Women's Strawweight_rank      4511
B_Women's Bantamweight_rank     4492
B_Heavyweight_rank              4464
B_Light Heavyweight_rank        4479
B_Middleweight_rank             4476
B_Welterweight_rank             4484
B_Lightweight_rank              4484
B_Featherweight_rank            4482
B_Bantamweight_rank             4486
B_Flyweight_rank                4476
B

In [19]:
# better_rank is the only column that does not have a significant number of null values, we will keep and remove the other columns
master_df.drop(master_df.loc[:,'B_match_weightclass_rank':'B_Pound-for-Pound_rank'], axis=1, inplace = True)
upcoming_df.drop(upcoming_df.loc[:,'B_match_weightclass_rank':'B_Pound-for-Pound_rank'], axis=1, inplace = True)

In [20]:
# B_Stance has a repetitive value, space added after Switch
# use .loc to change all mispelled Switch values to correct format
master_df['B_Stance'].loc[master_df['B_Stance']=='Switch '] = 'Switch'
upcoming_df['B_Stance'].loc[master_df['B_Stance']=='Switch '] = 'Switch'

In [21]:
# Create a dictionary in order to encode the stance variables
stance_enc = {
    'Open Stance': 1,
    'Switch': 2,
    'Southpaw': 3,
    'Orthodox': 4
}
# Assign stances to list so they can be dropped
stance = ['B_Stance', 'R_Stance', 'R_Stance_enc', 'B_Stance_enc']

In [22]:
# Use lambda function to custom encode R_Stance and B_ Stance to the stance_enc dictionary
master_df["R_Stance_enc"] = master_df["R_Stance"].apply(lambda x: stance_enc[x])
master_df["B_Stance_enc"] = master_df["B_Stance"].apply(lambda x: stance_enc[x])
master_df['Stance_diff'] = (master_df['B_Stance_enc'] - master_df['R_Stance_enc'])
upcoming_df["R_Stance_enc"] = upcoming_df["R_Stance"].apply(lambda x: stance_enc[x])
upcoming_df["B_Stance_enc"] = upcoming_df["B_Stance"].apply(lambda x: stance_enc[x])
upcoming_df['Stance_diff'] = (upcoming_df['B_Stance_enc'] - upcoming_df['R_Stance_enc'])

In [23]:
# Drop stance variables that have numerical info extracted
master_df.drop(stance, axis=1, inplace=True)
upcoming_df.drop(stance, axis=1, inplace=True)

In [24]:
# Create a dictionary in order to encode 'better rank column'
rank_enc = {
    'Red': -1,
    'Blue': 1,
    'neither': 0
}
# Assign better rank to list so it can be dropped
rank = ["better_rank"]

In [25]:
# Custom encode better rank with lambda fx
master_df["better_rank_enc"] = master_df["better_rank"].apply(lambda x: rank_enc[x])
upcoming_df["better_rank_enc"] = upcoming_df["better_rank"].apply(lambda x: rank_enc[x])
# Drop better rank column
master_df.drop(rank, axis=1, inplace=True)
upcoming_df.drop(rank, axis=1, inplace=True)

In [26]:
master_df['title_bout'] = [1 if x==True else 0 for x in master_df['title_bout']]
upcoming_df['title_bout'] = [1 if x==True else 0 for x in upcoming_df['title_bout']]

In [27]:
# Filling these columns nan values with 0-  there were no actions of that particular type in the fight so the value should be represented with 0
master_df['kd_bout_diff'] = master_df['kd_bout_diff'].fillna(0)
master_df['sig_str_landed_bout_diff'] = master_df['sig_str_landed_bout_diff'].fillna(0)
master_df['rev_bout_diff'] = master_df['rev_bout_diff'].fillna(0)
master_df['pass_bout_diff'] = master_df['pass_bout_diff'].fillna(0)
master_df['sub_attempts_bout_diff'] = master_df['sub_attempts_bout_diff'].fillna(0)
master_df['td_pct_bout_diff'] = master_df['td_pct_bout_diff'].fillna(0)
master_df['td_attempted_bout_diff'] = master_df['td_attempted_bout_diff'].fillna(0)
master_df['td_landed_bout_diff'] = master_df['td_landed_bout_diff'].fillna(0)
master_df['tot_str_attempted_bout_diff'] = master_df['tot_str_attempted_bout_diff'].fillna(0)
master_df['tot_str_landed_bout_diff'] = master_df['tot_str_landed_bout_diff'].fillna(0)
master_df['sig_str_pct_bout_diff'] = master_df['sig_str_pct_bout_diff'].fillna(0)
master_df['sig_str_attempted_bout_diff'] = master_df['sig_str_attempted_bout_diff'].fillna(0)
master_df['avg_sig_str_pct_diff'] = master_df['kd_bout_diff'].fillna(0)
master_df['avg_TD_pct_diff'] = master_df['avg_TD_pct_diff'].fillna(0)

In [28]:
# repeat for upcoming df
upcoming_df['kd_bout_diff'] = upcoming_df['kd_bout_diff'].fillna(0)
upcoming_df['sig_str_landed_bout_diff'] = upcoming_df['sig_str_landed_bout_diff'].fillna(0)
upcoming_df['rev_bout_diff'] = upcoming_df['rev_bout_diff'].fillna(0)
upcoming_df['pass_bout_diff'] = upcoming_df['pass_bout_diff'].fillna(0)
upcoming_df['sub_attempts_bout_diff'] = upcoming_df['sub_attempts_bout_diff'].fillna(0)
upcoming_df['td_pct_bout_diff'] = upcoming_df['td_pct_bout_diff'].fillna(0)
upcoming_df['td_attempted_bout_diff'] = upcoming_df['td_attempted_bout_diff'].fillna(0)
upcoming_df['td_landed_bout_diff'] = upcoming_df['td_landed_bout_diff'].fillna(0)
upcoming_df['tot_str_attempted_bout_diff'] = upcoming_df['tot_str_attempted_bout_diff'].fillna(0)
upcoming_df['tot_str_landed_bout_diff'] = upcoming_df['tot_str_landed_bout_diff'].fillna(0)
upcoming_df['sig_str_pct_bout_diff'] = upcoming_df['sig_str_pct_bout_diff'].fillna(0)
upcoming_df['sig_str_attempted_bout_diff'] = upcoming_df['sig_str_attempted_bout_diff'].fillna(0)
upcoming_df['avg_sig_str_pct_diff'] = upcoming_df['kd_bout_diff'].fillna(0)
upcoming_df['avg_TD_pct_diff'] = upcoming_df['avg_TD_pct_diff'].fillna(0)

In [29]:
# The random forest classifier cannot process strings so we must remove the R_fighter and B_figther columns
names = ['R_fighter', 'B_fighter']
master_df.drop(names, axis=1, inplace = True)
upcoming_df.drop(names, axis=1, inplace = True)

In [30]:
master_df.reset_index(level=0, inplace=True)
master_df['level_0'] = master_df['index']
master_df.head()

Unnamed: 0,index,Winner,title_bout,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,draw_diff,avg_sig_str_pct_diff,avg_TD_pct_diff,win_by_Decision_Majority_diff,win_by_Decision_Split_diff,win_by_Decision_Unanimous_diff,win_by_TKO_Doctor_Stoppage_diff,odds_diff,ev_diff,kd_bout_diff,sig_str_landed_bout_diff,sig_str_attempted_bout_diff,sig_str_pct_bout_diff,tot_str_landed_bout_diff,tot_str_attempted_bout_diff,td_landed_bout_diff,td_attempted_bout_diff,td_pct_bout_diff,sub_attempts_bout_diff,pass_bout_diff,rev_bout_diff,Stance_diff,better_rank_enc,level_0
0,0,0,0,0,-1,-7,-27,-13,-57,-1,-17,-8,7.62,0.0,-8,1.05,-0.6,-0.75,0,0.0,0.0,-1,1,-2,0,-332,-95.0549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,-1,0
1,1,1,0,0,0,0,12,7,80,9,2,1,-12.7,-5.08,11,-3.18,-0.2,1.21,1,0.0,0.0,0,0,9,0,700,275.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,-1,1
2,2,1,0,-1,0,-3,-6,-3,-21,0,-2,-2,0.0,2.54,-3,-4.2,-1.1,-1.08,0,0.0,0.0,0,-1,-1,0,225,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,2
3,3,0,0,0,-1,-1,5,2,15,0,0,3,2.54,-5.08,-5,-1.09,0.1,0.71,1,0.0,0.0,0,1,1,0,230,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1,-1,3
4,4,0,0,-1,0,0,6,3,29,0,-2,4,-7.62,-7.62,5,-1.82,0.6,2.86,0,0.0,0.0,0,2,2,0,400,130.5556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,4


In [31]:
# Saving cleaned data
file_path = "../Resources/data_preprocessed.csv"
master_df.to_csv(file_path, index=False, header=True)

In [32]:
file_path_1 = "../Resources/upcoming_data_preprocessed.csv"
upcoming_df.to_csv(file_path_1, index=False, header=True)