In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the UFC test-set feature table and preview its first rows.
df = pd.read_csv(filepath_or_buffer='dataframe/UFC_Test_Classif_X.csv')
df.head(n=5)

Unnamed: 0,id,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0,Tecia Torres,Juliana Lima,Chris Tognoni,2017-07-07,"Las Vegas, Nevada, USA",False,WomenStrawweight,0.0,0.0,...,4,0,0,0,Orthodox,154.94,152.4,115.0,35.0,27.0
1,1,John Howard,Lorenz Larkin,Herb Dean,2015-01-18,"Boston, Massachusetts, USA",False,Welterweight,0.0,0.25,...,1,2,0,0,Orthodox,170.18,182.88,170.0,28.0,31.0
2,2,Kyle Bochniak,Jeremy Kennedy,Todd Ronald Anderson,2017-07-22,"Uniondale, New York, USA",False,Featherweight,0.0,0.5,...,0,0,0,0,Orthodox,170.18,177.8,145.0,24.0,30.0
3,3,Yao Zhikui,Royston Wee,Steve Perceval,2014-08-23,"Macau, China",False,Bantamweight,0.0,0.0,...,0,0,0,0,Orthodox,165.1,162.56,125.0,27.0,23.0
4,4,Carlos Newton,Pat Miletich,John McCarthy,2001-05-04,"Atlantic City, New Jersey, USA",True,Welterweight,0.0,0.0,...,0,0,1,0,Orthodox,175.26,,170.0,33.0,24.0


In [3]:
# Partition the columns by dtype: text (object) columns on one side,
# numeric columns on the other.
# NOTE(review): boolean columns (title_bout looks like one in the preview
# above) match neither selector, so they appear to be left out of both
# halves -- confirm this is intended.
df_object = df.select_dtypes(include=['object'])
df_number = df.select_dtypes(include=[np.number])

In [4]:
# Impute missing numeric values with an iterative (model-based) imputer.
# enable_iterative_imputer has to be imported before IterativeImputer: it is
# the explicit opt-in for this experimental scikit-learn estimator.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.tree import ExtraTreeRegressor

# NOTE(review): every numeric column, including `id`, is fed to the imputer
# as a predictor -- confirm the identifier is meant to take part.
imputer = IterativeImputer(
    estimator=ExtraTreeRegressor(random_state=42),
    initial_strategy='median',
    max_iter=15,
    random_state=42,
)

# fit_transform returns a bare ndarray. Restore the column labels AND the
# original row index, so the later column-wise concat with df_object aligns
# row by row even if df does not carry a default RangeIndex.
df_imputed_number = pd.DataFrame(
    imputer.fit_transform(df_number),
    columns=df_number.columns,
    index=df_number.index,
)



In [5]:
# Remove the fighter-name, referee, location and date columns from the
# text half of the data.
df_object = df_object.drop(
    columns=['R_fighter', 'B_fighter', 'Referee', 'location', 'date']
)

In [6]:
# Put the remaining text columns and the imputed numeric columns back
# together side by side, then preview the result.
df = pd.concat(objs=[df_object, df_imputed_number], axis='columns')
df.head(n=5)

Unnamed: 0,weight_class,B_Stance,R_Stance,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,WomenStrawweight,Orthodox,Orthodox,0.0,0.0,0.0,0.49375,0.44875,0.475,0.1775,...,0.0,4.0,0.0,0.0,0.0,154.94,152.4,115.0,35.0,27.0
1,Welterweight,Orthodox,Orthodox,1.0,0.0,0.25,0.473125,0.371875,0.0,0.20375,...,3.0,1.0,2.0,0.0,0.0,170.18,182.88,170.0,28.0,31.0
2,Featherweight,Orthodox,Orthodox,2.0,0.0,0.5,0.5,0.48,0.615,0.0,...,1.0,0.0,0.0,0.0,0.0,170.18,177.8,145.0,24.0,30.0
3,Bantamweight,Orthodox,Orthodox,3.0,0.0,0.0,0.66,0.69,0.85,0.0,...,0.0,0.0,0.0,0.0,0.0,165.1,162.56,125.0,27.0,23.0
4,Welterweight,Orthodox,Orthodox,4.0,0.0,0.0,0.505312,0.439375,0.815937,0.067187,...,0.0,0.0,0.0,1.0,0.0,175.26,182.88,170.0,33.0,24.0


In [7]:
# Impute missing R_Stance and B_Stance with each column's most frequent value.
# The filled Series is assigned back rather than calling
# df[col].fillna(..., inplace=True): that form is chained assignment on an
# intermediate object, which pandas warns about and which does not update df
# when Copy-on-Write is enabled.
for stance_col in ('R_Stance', 'B_Stance'):
    df[stance_col] = df[stance_col].fillna(df[stance_col].mode()[0])

In [8]:
def check_outlier(df):
    """Flag values outside the 1.5 * IQR fences and print per-column counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; quantiles are computed per column.

    Returns
    -------
    pandas.DataFrame
        Boolean frame shaped like ``df``; True marks a value below
        Q1 - 1.5*IQR or above Q3 + 1.5*IQR of its column.
    """
    first_quartile, third_quartile = df.quantile(0.25), df.quantile(0.75)

    # Distance from each quartile to its fence.
    whisker = 1.5 * (third_quartile - first_quartile)

    too_low = df.lt(first_quartile - whisker)
    too_high = df.gt(third_quartile + whisker)
    outliers = too_low | too_high

    # Report how many outliers each attribute has.
    print ("Outlier pada tiap atribut:")
    print(outliers.sum())

    return outliers

In [9]:
# Split the current df again into its text (object) and numeric columns so
# that both halves reflect the imputed data.
df_object = df.select_dtypes(include=['object'])
df_number = df.select_dtypes(include=[np.number])

In [10]:
# Per-column outlier counts (check_outlier prints them as well), followed by
# the same counts expressed as a percentage of the rows.
outliers = dict(check_outlier(df_number).sum())
row_count = len(df_number)
print("\n\npercentage of outliers in each column:")
for key, outlier_count in outliers.items():
    print(f"{key} = {outlier_count/row_count * 100}%")

Outlier pada tiap atribut:
id                         0
B_avg_KD                  10
B_avg_opp_KD             108
B_avg_SIG_STR_pct         21
B_avg_opp_SIG_STR_pct     15
                        ... 
R_Height_cms               3
R_Reach_cms               12
R_Weight_lbs              32
B_age                     11
R_age                      1
Length: 134, dtype: int64


percentage of outliers in each column:
id = 0.0%
B_avg_KD = 1.6611295681063125%
B_avg_opp_KD = 17.940199335548172%
B_avg_SIG_STR_pct = 3.488372093023256%
B_avg_opp_SIG_STR_pct = 2.4916943521594686%
B_avg_TD_pct = 0.0%
B_avg_opp_TD_pct = 0.0%
B_avg_SUB_ATT = 6.976744186046512%
B_avg_opp_SUB_ATT = 4.152823920265781%
B_avg_REV = 17.441860465116278%
B_avg_opp_REV = 20.59800664451827%
B_avg_SIG_STR_att = 2.990033222591362%
B_avg_SIG_STR_landed = 3.322259136212625%
B_avg_opp_SIG_STR_att = 2.6578073089700998%
B_avg_opp_SIG_STR_landed = 2.3255813953488373%
B_avg_TOTAL_STR_att = 2.1594684385382057%
B_avg_TOTAL_STR_landed = 1.49

In [11]:
from scipy.stats.mstats import winsorize

# Identify columns in which more than 0.5% of the rows are IQR outliers.
n_rows = len(df_number)
outlier_columns = [key for key, count in outliers.items() if count / n_rows > 0.005]

# Work on an explicit copy so the column assignments below never write into
# a view of the frame df_number was selected from.
df_number = df_number.copy()

# Winsorize the identified columns: limits=(0, 0.1) leaves the low end
# untouched and caps the highest 10% of values.
# NOTE(review): the outlier check is two-sided but only the upper tail is
# capped here -- confirm that is intended.
for column in outlier_columns:
    capped = winsorize(df_number[column].to_numpy(), limits=(0, 0.1))
    # winsorize returns a numpy MaskedArray; store a plain ndarray so later
    # quantile computations do not emit "'partition' will ignore the 'mask'"
    # warnings.
    df_number[column] = np.asarray(capped)

# Combine df_object and df_number
df = pd.concat([df_object, df_number], axis=1)
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,WomenStrawweight,Orthodox,Orthodox,0.0,0.0,0.0,0.49375,0.44875,0.475,0.1775,...,0.0,4.0,0.0,0.0,0.0,154.94,152.4,115.0,35.0,27.0
1,Welterweight,Orthodox,Orthodox,1.0,0.0,0.25,0.473125,0.371875,0.0,0.20375,...,1.0,1.0,2.0,0.0,0.0,170.18,182.88,170.0,28.0,31.0
2,Featherweight,Orthodox,Orthodox,2.0,0.0,0.5,0.5,0.48,0.615,0.0,...,1.0,0.0,0.0,0.0,0.0,170.18,177.8,145.0,24.0,30.0
3,Bantamweight,Orthodox,Orthodox,3.0,0.0,0.0,0.615,0.605,0.85,0.0,...,0.0,0.0,0.0,0.0,0.0,165.1,162.56,125.0,27.0,23.0
4,Welterweight,Orthodox,Orthodox,4.0,0.0,0.0,0.505312,0.439375,0.815937,0.067187,...,0.0,0.0,0.0,1.0,0.0,175.26,182.88,170.0,33.0,24.0


In [12]:
# Recompute the per-column outlier counts after winsorizing and print them
# as a percentage of the rows.
outliers = dict(check_outlier(df_number).sum())
row_count = len(df_number)
print("\n\npercentage of outliers in each column:")
for key, outlier_count in outliers.items():
    print(f"{key} = {outlier_count/row_count * 100}%")

  arr.partition(
  arr.partition(


Outlier pada tiap atribut:
id                         0
B_avg_KD                   0
B_avg_opp_KD             108
B_avg_SIG_STR_pct          8
B_avg_opp_SIG_STR_pct      4
                        ... 
R_Height_cms               3
R_Reach_cms                6
R_Weight_lbs               0
B_age                      0
R_age                      1
Length: 134, dtype: int64


percentage of outliers in each column:
id = 0.0%
B_avg_KD = 0.0%
B_avg_opp_KD = 17.940199335548172%
B_avg_SIG_STR_pct = 1.3289036544850499%
B_avg_opp_SIG_STR_pct = 0.6644518272425249%
B_avg_TD_pct = 0.0%
B_avg_opp_TD_pct = 0.0%
B_avg_SUB_ATT = 0.0%
B_avg_opp_SUB_ATT = 0.0%
B_avg_REV = 17.441860465116278%
B_avg_opp_REV = 20.59800664451827%
B_avg_SIG_STR_att = 0.0%
B_avg_SIG_STR_landed = 0.0%
B_avg_opp_SIG_STR_att = 0.0%
B_avg_opp_SIG_STR_landed = 0.0%
B_avg_TOTAL_STR_att = 0.0%
B_avg_TOTAL_STR_landed = 0.0%
B_avg_opp_TOTAL_STR_att = 0.0%
B_avg_opp_TOTAL_STR_landed = 0.0%
B_avg_TD_att = 0.0%
B_avg_TD_landed = 0.0%
B_avg_

In [13]:
# Derive a gender label from the weight class name: any class containing
# "women" (case-insensitive) is labelled 'women', everything else 'male'.
df['gender'] = df['weight_class'].map(
    lambda label: 'women' if 'women' in label.lower() else 'male'
)

In [14]:
# Remove the literal text 'Women' so both genders share the same weight class
# names; regex=False spells out that this is a plain substring replacement.
df['weight_class'] = df['weight_class'].str.replace('Women', '', regex=False)

In [15]:
# Collect the distinct weight_class values, in order of first appearance.
weight_class = pd.unique(df['weight_class'])
weight_class

array(['Strawweight', 'Welterweight', 'Featherweight', 'Bantamweight',
       'Middleweight', 'Lightweight', 'LightHeavyweight', 'Heavyweight',
       'CatchWeight', 'OpenWeight', 'Flyweight'], dtype=object)

In [16]:
# Integer code for every weight class: the position of the name in this
# list is the code assigned to it (CatchWeight -> 0 ... OpenWeight -> 10).
weight_class_dict = {
    class_name: code
    for code, class_name in enumerate([
        'CatchWeight',
        'Strawweight',
        'Flyweight',
        'Bantamweight',
        'Featherweight',
        'Lightweight',
        'Welterweight',
        'Middleweight',
        'LightHeavyweight',
        'Heavyweight',
        'OpenWeight',
    ])
}

# Binary code for the derived gender label.
gender_dict = dict(male=1, women=0)

In [17]:
# Replace each weight_class name with its integer code from
# weight_class_dict; names missing from the dictionary become NaN.
df = df.assign(weight_class=df['weight_class'].map(weight_class_dict))

In [18]:
# Replace each gender label with its numeric code from gender_dict.
df = df.assign(gender=df['gender'].map(gender_dict))

In [19]:
# Preview the frame after the weight_class and gender encodings.
df.head(n=5)

Unnamed: 0,weight_class,B_Stance,R_Stance,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,1,Orthodox,Orthodox,0.0,0.0,0.0,0.49375,0.44875,0.475,0.1775,...,4.0,0.0,0.0,0.0,154.94,152.4,115.0,35.0,27.0,0
1,6,Orthodox,Orthodox,1.0,0.0,0.25,0.473125,0.371875,0.0,0.20375,...,1.0,2.0,0.0,0.0,170.18,182.88,170.0,28.0,31.0,1
2,4,Orthodox,Orthodox,2.0,0.0,0.5,0.5,0.48,0.615,0.0,...,0.0,0.0,0.0,0.0,170.18,177.8,145.0,24.0,30.0,1
3,3,Orthodox,Orthodox,3.0,0.0,0.0,0.615,0.605,0.85,0.0,...,0.0,0.0,0.0,0.0,165.1,162.56,125.0,27.0,23.0,1
4,6,Orthodox,Orthodox,4.0,0.0,0.0,0.505312,0.439375,0.815937,0.067187,...,0.0,0.0,1.0,0.0,175.26,182.88,170.0,33.0,24.0,1


In [21]:
# Whatever is still of object dtype at this point gets one-hot encoded.
df_object = df.select_dtypes(include=['object'])

# One indicator column per distinct value of each remaining text column.
df_encoded = pd.get_dummies(df_object)

# Swap the original text columns for their indicator columns: the non-text
# columns keep their order and the indicators are appended after them.
df = pd.concat([df.drop(columns=df_object.columns), df_encoded], axis=1)

df.head()

Unnamed: 0,weight_class,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,...,R_age,gender,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,1,0.0,0.0,0.0,0.49375,0.44875,0.475,0.1775,0.0,0.125,...,27.0,0,False,True,False,False,False,True,False,False
1,6,1.0,0.0,0.25,0.473125,0.371875,0.0,0.20375,0.0,0.5,...,31.0,1,False,True,False,False,False,True,False,False
2,4,2.0,0.0,0.5,0.5,0.48,0.615,0.0,0.0,0.0,...,30.0,1,False,True,False,False,False,True,False,False
3,3,3.0,0.0,0.0,0.615,0.605,0.85,0.0,0.0,1.0625,...,23.0,1,False,True,False,False,False,True,False,False
4,6,4.0,0.0,0.0,0.505312,0.439375,0.815937,0.067187,1.25,0.640625,...,24.0,1,False,True,False,False,False,True,False,False


In [23]:
# Keep only the 20 feature columns listed here, in this order.
df = df.loc[:, [
    'B_avg_DISTANCE_landed',
    'R_avg_opp_SIG_STR_pct',
    'R_age',
    'B_avg_CTRL_time(seconds)',
    'B_avg_opp_SIG_STR_pct',
    'B_avg_HEAD_att',
    'B_avg_opp_LEG_att',
    'B_age',
    'B_avg_DISTANCE_att',
    'R_losses',
    'R_avg_opp_HEAD_landed',
    'R_avg_opp_LEG_att',
    'R_avg_LEG_att',
    'B_avg_GROUND_att',
    'R_avg_GROUND_landed',
    'R_avg_opp_BODY_landed',
    'R_avg_SUB_ATT',
    'R_avg_opp_DISTANCE_att',
    'R_avg_BODY_landed',
    'R_avg_opp_CTRL_time(seconds)',
]]

df.head()

Unnamed: 0,B_avg_DISTANCE_landed,R_avg_opp_SIG_STR_pct,R_age,B_avg_CTRL_time(seconds),B_avg_opp_SIG_STR_pct,B_avg_HEAD_att,B_avg_opp_LEG_att,B_age,B_avg_DISTANCE_att,R_losses,R_avg_opp_HEAD_landed,R_avg_opp_LEG_att,R_avg_LEG_att,B_avg_GROUND_att,R_avg_GROUND_landed,R_avg_opp_BODY_landed,R_avg_SUB_ATT,R_avg_opp_DISTANCE_att,R_avg_BODY_landed,R_avg_opp_CTRL_time(seconds)
0,17.6875,0.26625,27.0,349.5,0.44875,60.3125,0.625,35.0,58.125,1.0,20.5625,4.9375,15.211609,12.875,1.9375,13.1875,0.0,126.8125,13.5625,94.9375
1,16.0625,0.536279,31.0,9.6875,0.371875,37.125,8.125,28.0,42.6875,5.0,22.541016,14.3125,13.258789,0.125,1.556641,13.320312,0.073242,67.904297,8.336914,273.668945
2,14.0,0.36,30.0,349.5,0.48,61.0,4.0,24.0,40.0,1.0,41.5,14.3125,15.211609,19.0,1.0,13.320312,0.0,126.8125,5.0,40.5
3,0.0,0.4,23.0,349.5,0.605,11.0,4.0,27.0,2.0,0.0,30.21875,5.632935,2.6875,9.0,11.0,11.0,0.0,126.8125,6.0,37.28125
4,5.71875,0.33,24.0,219.75,0.439375,24.25,3.046875,33.0,18.984375,1.0,13.0,0.0,4.5,7.703125,0.5,1.0,1.125,5.0,1.0,0.0


In [None]:
# Write the selected features to CSV without the row index.
df.to_csv(path_or_buf='dataframe/UFC_kombinasi3_kaggle.csv', index=False)
