In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import sys, warnings, os
from sklearn.dummy import DummyClassifier

In [2]:
# To ignore max-iteration warnings while cross validating scores 
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

In [3]:
# Setting Columns and rows to display all the results 
pd.set_option("display.max_columns", None, "display.max_rows", None)

In [4]:
#Reading the dataset
ufc_master_ds = pd.read_csv("~/Desktop/ufc_data/ufc-master.csv")
#Separating label from input 
label = ufc_master_ds.Winner

#Removed "B_Women's Featherweight_rank" because inputing with this feature in the dataset gives me a ton of erors in the baseline model
X = ufc_master_ds.drop(['Winner', "B_Women's Featherweight_rank"], axis=1)

In [5]:
#Display the first few rows of a Dataframe
X.head(5)

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,title_bout,weight_class,gender,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,B_avg_TD_landed,B_avg_TD_pct,B_longest_win_streak,B_losses,B_total_rounds_fought,B_total_title_bouts,B_win_by_Decision_Majority,B_win_by_Decision_Split,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_SIG_STR_landed,R_avg_SIG_STR_pct,R_avg_SUB_ATT,R_avg_TD_landed,R_avg_TD_pct,R_longest_win_streak,R_losses,R_total_rounds_fought,R_total_title_bouts,R_win_by_Decision_Majority,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,R_age,B_age,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,B_match_weightclass_rank,R_match_weightclass_rank,R_Women's Flyweight_rank,R_Women's Featherweight_rank,R_Women's Strawweight_rank,R_Women's Bantamweight_rank,R_Heavyweight_rank,R_Light Heavyweight_rank,R_Middleweight_rank,R_Welterweight_rank,R_Lightweight_rank,R_Featherweight_rank,R_Bantamweight_rank,R_Flyweight_rank,R_Pound-for-Pound_rank,B_Women's Flyweight_rank,B_Women's Strawweight_rank,B_Women's Bantamweight_rank,B_Heavyweight_rank,B_Light Heavyweight_rank,B_Middleweight_rank,B_Welterweight_rank,B_Lightweight_rank,B_Featherweight_rank,B_Bantamweight_rank,B_Flyweight_rank,B_Pound-for-Pound_rank,better_rank,finish,finish_details,finish_round,finish_round_time,total_fight_time_secs,R_kd_bout,B_kd_bout,R_sig_str_landed_bout,B_sig_str_landed_bout,R_sig_str_attempted_bout,B_sig_str_attempted_bout,R_sig_str_pct_bout,B_sig_str_pct_bout,R_tot_str_landed_bout,B_tot_str_landed_bout,R_tot_str_attempted_bout,B_tot_str_attempted_bout,R_td_landed_bout,B_td_landed_bout,R_td_attempted_bout,B_td_attempted_bout,R_td_pct_bout,B_td_pct_bout,R_sub_attempts_bout,B_sub_attempts_bout,R_pass_bout,B_pass_bout,R_rev_bout,B_rev_bout
0,Paul Felder,Rafael Dos Anjos,165,-200,165.0,50.0,11/14/2020,"Las Vegas, Nevada, USA",USA,False,Lightweight,MALE,5,2,0,0,3.47,0.45,0.6,1.82,0.39,5,11,86,4,0,0,10,4,4,0,18,Southpaw,172.72,177.8,155,1,0,0,3.67,0.43,0.2,0.25,0.25,3,5,38,0,0,2,2,4,1,0,9,Orthodox,180.34,177.8,155,36,36,1,0,2,9,6,48,4,0,3,-7.62,0.0,0,-0.2,0.4,1.57,1,1,12.0,7.0,,,,,,,,,7.0,,,,,,,,,,,,12.0,,,,,Red,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Abdul Razak Alhassan,Khaos Williams,-240,185,41.666667,185.0,11/14/2020,"Las Vegas, Nevada, USA",USA,False,Welterweight,MALE,3,0,1,0,22.22,0.58,0.0,0.0,0.0,1,0,1,0,0,0,0,1,0,0,1,Orthodox,182.88,195.58,170,1,0,0,4.54,0.48,0.0,0.73,0.28,3,2,10,0,0,0,0,4,0,0,4,Orthodox,177.8,185.42,170,35,26,-1,1,-2,-3,-2,-9,0,-3,0,5.08,10.16,-9,17.68,0.0,-0.73,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Kay Hansen,Cory McKenna,-230,180,43.478261,180.0,11/14/2020,"Las Vegas, Nevada, USA",USA,False,Women's Strawweight,FEMALE,3,0,1,0,3.13,0.49,0.0,1.0,1.0,1,0,3,0,0,0,1,0,0,0,1,Orthodox,160.02,147.32,115,0,1,0,2.49,0.56,1.2,2.41,0.18,1,0,3,0,0,0,0,0,1,0,1,Orthodox,157.48,160.02,115,21,21,0,0,0,0,0,0,0,0,-1,2.54,-12.7,0,0.64,-1.2,-1.41,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Brendan Allen,Sean Strickland,-118,-106,84.745763,94.339623,11/14/2020,"Las Vegas, Nevada, USA",USA,False,Catch Weight,MALE,3,0,2,0,4.64,0.36,0.4,1.22,0.57,3,3,28,0,0,2,3,2,1,0,8,Orthodox,185.42,193.04,185,0,4,0,2.67,0.6,2.4,1.42,0.37,4,0,7,0,0,0,1,1,2,0,4,Orthodox,187.96,190.5,185,24,29,0,-2,-1,4,3,21,0,1,-1,-2.54,2.54,5,1.97,-2.0,-0.2,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Ashley Yoder,Miranda Granger,135,-167,135.0,59.88024,11/14/2020,"Las Vegas, Nevada, USA",USA,False,Women's Strawweight,FEMALE,3,1,0,0,3.47,0.43,0.0,0.0,0.0,1,1,4,0,0,0,1,0,0,0,1,Orthodox,170.18,172.72,125,2,0,0,2.96,0.4,0.4,1.14,0.36,2,5,21,0,0,1,1,0,0,0,2,Southpaw,170.18,175.26,115,33,28,-1,0,-1,-1,-4,-17,0,0,0,0.0,-2.54,-5,0.51,-0.4,-1.14,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,neither,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [59]:
#Encoding categorical variables 
#Separating the features based on their data types 
cat_col = [col for col in X.columns if X[col].dtypes == 'object']
for col in X.columns: 
    print(X[col].dtypes)
    print(X[cat_col].shape)
num_col = [col for col in X.columns if col not in cat_col]


int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
float64
(4473, 0)
float64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
bool
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
float64
(4473, 0)
float64
(4473, 0)
float64
(4473, 0)
float64
(4473, 0)
float64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
float64
(4473, 0)
float64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
float64
(4473, 0)
float64
(4473, 0)
float64
(4473, 0)
float64
(4473, 0)
float64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
float64
(4473, 0)
float64
(4473, 0)
int64
(4473, 0)
int64
(4473, 0)
int64
(44

In [44]:
enc = LabelEncoder()
for i in X[cat_col]:
    #using astype(str) to avoid columns with 'float and str' to throw errors
    X[i] = enc.fit_transform(X[i].astype(str))

In [45]:
#Also encoding Label for Red to be 1 and Blue to be 0 
label = [1 if win == 'Red' else 0 for win in label]

In [46]:
# By setting a random seed with `random_state` parameter, you ensure that the data is split in the same way every time you run the code
# aka you will always get the same datasets
X_train, X_valid, y_train, y_valid = train_test_split(X, label, random_state = 2, test_size = 0.3)

In [47]:
X_train.isnull().sum().sort_values(ascending=False)

R_Women's Featherweight_rank    3127
B_Pound-for-Pound_rank          3118
B_Women's Flyweight_rank        3112
R_Women's Flyweight_rank        3100
B_Women's Strawweight_rank      3093
B_Welterweight_rank             3085
B_Lightweight_rank              3079
B_Women's Bantamweight_rank     3078
B_Bantamweight_rank             3075
B_Featherweight_rank            3073
B_Flyweight_rank                3071
B_Light Heavyweight_rank        3068
B_Middleweight_rank             3065
R_Women's Strawweight_rank      3060
B_Heavyweight_rank              3059
R_Women's Bantamweight_rank     3052
R_Welterweight_rank             3048
R_Lightweight_rank              3048
R_Middleweight_rank             3043
R_Flyweight_rank                3041
R_Bantamweight_rank             3039
R_Light Heavyweight_rank        3039
R_Featherweight_rank            3039
R_Heavyweight_rank              3037
R_Pound-for-Pound_rank          3030
B_match_weightclass_rank        2584
R_match_weightclass_rank        2272
R

In [48]:
# Using imputation to fill in null values for numerical columns
imp = SimpleImputer(strategy='most_frequent')
print(imp)
imp.fit(X_train[num_col])
X_train[num_col] = imp.transform(X_train[num_col])
X_valid[num_col] = imp.transform(X_valid[num_col])

SimpleImputer(strategy='most_frequent')


## The `cat_col` for the next cell is an empty Dataframe and is causing the code to error out

In [51]:
# Using imputation to fill in null values for categorical columns
cat_imp = SimpleImputer(strategy = 'most_frequent')
print(X_train[cat_col].shape)
cat_imp.fit(X_train[cat_col])
X_train[cat_col] = cat_imp.transform(X_train[cat_col])
X_valid[cat_col] = cat_imp.transform(X_valid[cat_col])


(3131, 0)
X_train: Empty DataFrame
Columns: []
Index: [1553, 1954]


ValueError: at least one array or dtype is required