## Kombinasi 7 :
- Delete duplicate
- Impute missing value using median/mode
- Outlier handling with winsorization
- Encoding
- Standard Scaler
- Feature selection with Decision Tree

In [55]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [56]:
df = pd.read_csv("dataframe/UFC_Test_Classif_X.csv")
df.head()

Unnamed: 0,id,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0,Tecia Torres,Juliana Lima,Chris Tognoni,2017-07-07,"Las Vegas, Nevada, USA",False,WomenStrawweight,0.0,0.0,...,4,0,0,0,Orthodox,154.94,152.4,115.0,35.0,27.0
1,1,John Howard,Lorenz Larkin,Herb Dean,2015-01-18,"Boston, Massachusetts, USA",False,Welterweight,0.0,0.25,...,1,2,0,0,Orthodox,170.18,182.88,170.0,28.0,31.0
2,2,Kyle Bochniak,Jeremy Kennedy,Todd Ronald Anderson,2017-07-22,"Uniondale, New York, USA",False,Featherweight,0.0,0.5,...,0,0,0,0,Orthodox,170.18,177.8,145.0,24.0,30.0
3,3,Yao Zhikui,Royston Wee,Steve Perceval,2014-08-23,"Macau, China",False,Bantamweight,0.0,0.0,...,0,0,0,0,Orthodox,165.1,162.56,125.0,27.0,23.0
4,4,Carlos Newton,Pat Miletich,John McCarthy,2001-05-04,"Atlantic City, New Jersey, USA",True,Welterweight,0.0,0.0,...,0,0,1,0,Orthodox,175.26,,170.0,33.0,24.0


In [57]:
# Delete duplicate
print(f"Shape before dropping duplicate : {df.shape}")
df.drop_duplicates(inplace=True)
print(f"Shape after dropping duplicate : {df.shape}")

Shape before dropping duplicate : (602, 143)
Shape after dropping duplicate : (602, 143)


In [58]:
def check_null(df):
    col_na = df.isnull().sum().sort_values(ascending=True)
    percent = col_na / len(df)
    missing_data = pd.concat([col_na, percent], axis=1, keys=['Total', 'Percent'])

    if (missing_data[missing_data['Total'] > 0].shape[0] == 0):
        print("Tidak ditemukan missing value pada dataset")
    else:
        print(missing_data[missing_data['Total'] > 0])

In [59]:
check_null(df)

                          Total   Percent
Referee                       1  0.001661
R_Stance                      2  0.003322
B_Stance                      2  0.003322
R_age                         7  0.011628
B_age                        11  0.018272
...                         ...       ...
B_avg_CTRL_time(seconds)    134  0.222591
B_avg_opp_GROUND_att        134  0.222591
B_avg_GROUND_landed         134  0.222591
B_avg_BODY_landed           134  0.222591
B_avg_DISTANCE_att          134  0.222591

[104 rows x 2 columns]


In [60]:
# Split string and number
# Split dataframe into object type and number type
df_object = df.select_dtypes(include='object')
df_number = df.select_dtypes(include=np.number)


In [61]:
# Impute missing value using median/mode
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
df_imputed_number = imputer.fit_transform(df_number)
df_imputed_number = pd.DataFrame(df_imputed_number, columns=df_number.columns)

imputer = SimpleImputer(strategy='most_frequent')
df_imputed_object = imputer.fit_transform(df_object)
df_imputed_object = pd.DataFrame(df_imputed_object, columns=df_object.columns)

In [62]:
# Drop R_fighter, B_fighter, Referee, location, date
df_object = df_object.drop(['R_fighter', 'B_fighter', 'Referee', 'location', 'date'], axis=1)

In [63]:
# Combine df_object and df_imputed_number
df = pd.concat([df_object, df_imputed_number], axis=1)
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,WomenStrawweight,Orthodox,Orthodox,0.0,0.0,0.0,0.49375,0.44875,0.475,0.1775,...,0.0,4.0,0.0,0.0,0.0,154.94,152.4,115.0,35.0,27.0
1,Welterweight,Orthodox,Orthodox,1.0,0.0,0.25,0.473125,0.371875,0.0,0.20375,...,3.0,1.0,2.0,0.0,0.0,170.18,182.88,170.0,28.0,31.0
2,Featherweight,Orthodox,Orthodox,2.0,0.0,0.5,0.5,0.48,0.615,0.0,...,1.0,0.0,0.0,0.0,0.0,170.18,177.8,145.0,24.0,30.0
3,Bantamweight,Orthodox,Orthodox,3.0,0.0,0.0,0.66,0.69,0.85,0.0,...,0.0,0.0,0.0,0.0,0.0,165.1,162.56,125.0,27.0,23.0
4,Welterweight,Orthodox,Orthodox,4.0,0.0,0.0,0.505312,0.439375,0.815937,0.067187,...,0.0,0.0,0.0,1.0,0.0,175.26,182.88,170.0,33.0,24.0


In [64]:
check_null(df)

          Total   Percent
R_Stance      2  0.003322
B_Stance      2  0.003322


In [65]:
# Impute missing R_stance and B_stance with mode
df['R_Stance'].fillna(df['R_Stance'].mode()[0], inplace=True)
df['B_Stance'].fillna(df['B_Stance'].mode()[0], inplace=True)


In [66]:
check_null(df)

Tidak ditemukan missing value pada dataset


In [67]:
def check_outlier(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)

    # Menghitung RUB dan RLB.
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5*IQR
    upper_limit = Q3 + 1.5*IQR

    # Menampilkan banyaknya outlier pada atribut.
    outliers = (df < lower_limit) | (df > upper_limit)
    print ("Outlier pada tiap atribut:")
    print(outliers.sum())

    return outliers

In [68]:
# Split string and number
# Split dataframe into object type and number type
df_object = df.select_dtypes(include='object')
df_number = df.select_dtypes(include=np.number)


In [69]:
outliers = dict(check_outlier(df_number).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number[key].shape[0] * 100}%")

Outlier pada tiap atribut:
id                         0
B_avg_KD                  49
B_avg_opp_KD             126
B_avg_SIG_STR_pct         66
B_avg_opp_SIG_STR_pct     54
                        ... 
R_Height_cms               3
R_Reach_cms               11
R_Weight_lbs              32
B_age                     11
R_age                      1
Length: 134, dtype: int64


percentage of outliers in each column:
id = 0.0%
B_avg_KD = 8.13953488372093%
B_avg_opp_KD = 20.930232558139537%
B_avg_SIG_STR_pct = 10.96345514950166%
B_avg_opp_SIG_STR_pct = 8.970099667774086%
B_avg_TD_pct = 4.318936877076411%
B_avg_opp_TD_pct = 6.64451827242525%
B_avg_SUB_ATT = 7.807308970099667%
B_avg_opp_SUB_ATT = 5.647840531561462%
B_avg_REV = 21.428571428571427%
B_avg_opp_REV = 23.920265780730897%
B_avg_SIG_STR_att = 5.48172757475083%
B_avg_SIG_STR_landed = 4.485049833887043%
B_avg_opp_SIG_STR_att = 4.983388704318937%
B_avg_opp_SIG_STR_landed = 5.3156146179401995%
B_avg_TOTAL_STR_att = 5.3156146179401995%
B_avg_

In [70]:
from scipy.stats.mstats import winsorize

# Identify columns with more than 0.5% outliers
outlier_columns = []
for key in outliers.keys():
    if outliers[key] / df_number[key].shape[0] > 0.005:
        outlier_columns.append(key)

# Winsorize the identified columns
for column in outlier_columns:
    df_number[column] = winsorize(df_number[column], limits=(0, 0.1))

# Combine df_object and df_number
df = pd.concat([df_object, df_number], axis=1)
df.head()


Unnamed: 0,weight_class,B_Stance,R_Stance,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,WomenStrawweight,Orthodox,Orthodox,0.0,0.0,0.0,0.49375,0.44875,0.475,0.1775,...,0.0,4.0,0.0,0.0,0.0,154.94,152.4,115.0,35.0,27.0
1,Welterweight,Orthodox,Orthodox,1.0,0.0,0.25,0.473125,0.371875,0.0,0.20375,...,1.0,1.0,2.0,0.0,0.0,170.18,182.88,170.0,28.0,31.0
2,Featherweight,Orthodox,Orthodox,2.0,0.0,0.5,0.5,0.48,0.615,0.0,...,1.0,0.0,0.0,0.0,0.0,170.18,177.8,145.0,24.0,30.0
3,Bantamweight,Orthodox,Orthodox,3.0,0.0,0.0,0.595,0.57,0.62,0.0,...,0.0,0.0,0.0,0.0,0.0,165.1,162.56,125.0,27.0,23.0
4,Welterweight,Orthodox,Orthodox,4.0,0.0,0.0,0.505312,0.439375,0.62,0.067187,...,0.0,0.0,0.0,1.0,0.0,175.26,182.88,170.0,33.0,24.0


In [71]:
outliers = dict(check_outlier(df_number).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number[key].shape[0] * 100}%")

Outlier pada tiap atribut:
id                         0
B_avg_KD                   0
B_avg_opp_KD             126
B_avg_SIG_STR_pct         32
B_avg_opp_SIG_STR_pct     25
                        ... 
R_Height_cms               3
R_Reach_cms                6
R_Weight_lbs               0
B_age                      0
R_age                      1
Length: 134, dtype: int64


percentage of outliers in each column:
id = 0.0%
B_avg_KD = 0.0%
B_avg_opp_KD = 20.930232558139537%
B_avg_SIG_STR_pct = 5.3156146179401995%
B_avg_opp_SIG_STR_pct = 4.152823920265781%
B_avg_TD_pct = 0.0%
B_avg_opp_TD_pct = 0.0%
B_avg_SUB_ATT = 0.0%
B_avg_opp_SUB_ATT = 0.0%
B_avg_REV = 21.428571428571427%
B_avg_opp_REV = 23.920265780730897%
B_avg_SIG_STR_att = 0.0%
B_avg_SIG_STR_landed = 0.0%
B_avg_opp_SIG_STR_att = 0.0%
B_avg_opp_SIG_STR_landed = 0.0%
B_avg_TOTAL_STR_att = 0.0%
B_avg_TOTAL_STR_landed = 0.0%
B_avg_opp_TOTAL_STR_att = 0.0%
B_avg_opp_TOTAL_STR_landed = 0.0%
B_avg_TD_att = 0.0%
B_avg_TD_landed = 10.13289036

  arr.partition(
  arr.partition(


In [72]:
df['gender'] = df['weight_class'].apply(lambda x: 'women' if 'women' in x.lower() else 'male')

In [73]:
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,WomenStrawweight,Orthodox,Orthodox,0.0,0.0,0.0,0.49375,0.44875,0.475,0.1775,...,4.0,0.0,0.0,0.0,154.94,152.4,115.0,35.0,27.0,women
1,Welterweight,Orthodox,Orthodox,1.0,0.0,0.25,0.473125,0.371875,0.0,0.20375,...,1.0,2.0,0.0,0.0,170.18,182.88,170.0,28.0,31.0,male
2,Featherweight,Orthodox,Orthodox,2.0,0.0,0.5,0.5,0.48,0.615,0.0,...,0.0,0.0,0.0,0.0,170.18,177.8,145.0,24.0,30.0,male
3,Bantamweight,Orthodox,Orthodox,3.0,0.0,0.0,0.595,0.57,0.62,0.0,...,0.0,0.0,0.0,0.0,165.1,162.56,125.0,27.0,23.0,male
4,Welterweight,Orthodox,Orthodox,4.0,0.0,0.0,0.505312,0.439375,0.62,0.067187,...,0.0,0.0,1.0,0.0,175.26,182.88,170.0,33.0,24.0,male


In [74]:
df['weight_class'] = df['weight_class'].str.replace('Women', '')

In [75]:
# Get all weight_class values
weight_class = df['weight_class'].unique()
weight_class

array(['Strawweight', 'Welterweight', 'Featherweight', 'Bantamweight',
       'Middleweight', 'Lightweight', 'LightHeavyweight', 'Heavyweight',
       'CatchWeight', 'OpenWeight', 'Flyweight'], dtype=object)

In [76]:
# Create a dictionary to map weight_class values to numbers
weight_class_dict = {
    'CatchWeight' : 0,
    'Strawweight' : 1,
    'Flyweight' : 2,
    'Bantamweight' : 3,
    'Featherweight' : 4,
    'Lightweight' : 5,
    'Welterweight' : 6,
    'Middleweight' : 7,
    'LightHeavyweight' : 8,
    'Heavyweight' : 9,
    'OpenWeight' : 10,
}

gender_dict = {
    'male' : 1,
    'women' : 0
}

In [77]:
# Map each weight_class value to the correct number
df['weight_class'] = df['weight_class'].map(weight_class_dict)

In [78]:
# Map each gender value to the correct number
df['gender'] = df['gender'].map(gender_dict)

In [79]:
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,1,Orthodox,Orthodox,0.0,0.0,0.0,0.49375,0.44875,0.475,0.1775,...,4.0,0.0,0.0,0.0,154.94,152.4,115.0,35.0,27.0,0
1,6,Orthodox,Orthodox,1.0,0.0,0.25,0.473125,0.371875,0.0,0.20375,...,1.0,2.0,0.0,0.0,170.18,182.88,170.0,28.0,31.0,1
2,4,Orthodox,Orthodox,2.0,0.0,0.5,0.5,0.48,0.615,0.0,...,0.0,0.0,0.0,0.0,170.18,177.8,145.0,24.0,30.0,1
3,3,Orthodox,Orthodox,3.0,0.0,0.0,0.595,0.57,0.62,0.0,...,0.0,0.0,0.0,0.0,165.1,162.56,125.0,27.0,23.0,1
4,6,Orthodox,Orthodox,4.0,0.0,0.0,0.505312,0.439375,0.62,0.067187,...,0.0,0.0,1.0,0.0,175.26,182.88,170.0,33.0,24.0,1


In [81]:
# Get the remaining object columns
df_object = df.select_dtypes(include='object')

# Perform one-hot encoding
df_encoded = pd.get_dummies(df_object)

# Combine the encoded columns with the original dataframe
df = pd.concat([df, df_encoded], axis=1)

# Drop the original object columns
df.drop(df_object.columns, axis=1, inplace=True)

df.head()


Unnamed: 0,weight_class,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,...,R_age,gender,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,1,0.0,0.0,0.0,0.49375,0.44875,0.475,0.1775,0.0,0.125,...,27.0,0,False,True,False,False,False,True,False,False
1,6,1.0,0.0,0.25,0.473125,0.371875,0.0,0.20375,0.0,0.5,...,31.0,1,False,True,False,False,False,True,False,False
2,4,2.0,0.0,0.5,0.5,0.48,0.615,0.0,0.0,0.0,...,30.0,1,False,True,False,False,False,True,False,False
3,3,3.0,0.0,0.0,0.595,0.57,0.62,0.0,0.0,1.0,...,23.0,1,False,True,False,False,False,True,False,False
4,6,4.0,0.0,0.0,0.505312,0.439375,0.62,0.067187,1.002136,0.640625,...,24.0,1,False,True,False,False,False,True,False,False


In [82]:
df

Unnamed: 0,weight_class,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,...,R_age,gender,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,1,0.0,0.000000,0.00,0.493750,0.448750,0.475000,0.177500,0.000000,0.125000,...,27.0,0,False,True,False,False,False,True,False,False
1,6,1.0,0.000000,0.25,0.473125,0.371875,0.000000,0.203750,0.000000,0.500000,...,31.0,1,False,True,False,False,False,True,False,False
2,4,2.0,0.000000,0.50,0.500000,0.480000,0.615000,0.000000,0.000000,0.000000,...,30.0,1,False,True,False,False,False,True,False,False
3,3,3.0,0.000000,0.00,0.595000,0.570000,0.620000,0.000000,0.000000,1.000000,...,23.0,1,False,True,False,False,False,True,False,False
4,6,4.0,0.000000,0.00,0.505312,0.439375,0.620000,0.067187,1.002136,0.640625,...,24.0,1,False,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,3,597.0,0.015625,0.00,0.461132,0.430000,0.217562,0.192031,0.125000,0.068893,...,24.0,0,False,True,False,False,False,True,False,False
598,5,598.0,0.000000,0.50,0.585000,0.570000,0.000000,0.500000,1.000000,0.000000,...,22.0,1,False,False,True,False,False,True,False,False
599,5,599.0,0.750000,0.50,0.360254,0.501284,0.250000,0.119775,0.070312,0.008789,...,32.0,1,False,True,False,False,False,True,False,False
600,9,600.0,0.000000,0.00,0.520000,0.557500,0.500000,0.080000,0.000000,0.250000,...,33.0,1,False,False,True,False,False,True,False,False


In [83]:
df = df.drop(["B_Reach_cms", "R_Stance_Sideways", "B_Stance_Sideways"], axis=1)

KeyError: "['B_Reach_cms', 'R_Stance_Sideways', 'B_Stance_Sideways'] not found in axis"

In [84]:
df = df.drop(["Winner_Blue",	"Winner_Draw",	"Winner_Red"], axis=1)

KeyError: "['Winner_Blue', 'Winner_Draw', 'Winner_Red'] not found in axis"

In [85]:
df

Unnamed: 0,weight_class,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,...,R_age,gender,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,1,0.0,0.000000,0.00,0.493750,0.448750,0.475000,0.177500,0.000000,0.125000,...,27.0,0,False,True,False,False,False,True,False,False
1,6,1.0,0.000000,0.25,0.473125,0.371875,0.000000,0.203750,0.000000,0.500000,...,31.0,1,False,True,False,False,False,True,False,False
2,4,2.0,0.000000,0.50,0.500000,0.480000,0.615000,0.000000,0.000000,0.000000,...,30.0,1,False,True,False,False,False,True,False,False
3,3,3.0,0.000000,0.00,0.595000,0.570000,0.620000,0.000000,0.000000,1.000000,...,23.0,1,False,True,False,False,False,True,False,False
4,6,4.0,0.000000,0.00,0.505312,0.439375,0.620000,0.067187,1.002136,0.640625,...,24.0,1,False,True,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,3,597.0,0.015625,0.00,0.461132,0.430000,0.217562,0.192031,0.125000,0.068893,...,24.0,0,False,True,False,False,False,True,False,False
598,5,598.0,0.000000,0.50,0.585000,0.570000,0.000000,0.500000,1.000000,0.000000,...,22.0,1,False,False,True,False,False,True,False,False
599,5,599.0,0.750000,0.50,0.360254,0.501284,0.250000,0.119775,0.070312,0.008789,...,32.0,1,False,True,False,False,False,True,False,False
600,9,600.0,0.000000,0.00,0.520000,0.557500,0.500000,0.080000,0.000000,0.250000,...,33.0,1,False,False,True,False,False,True,False,False


In [None]:
# Export to CSV
df.to_csv('dataframe/UFC_Pre_kombinasi7.csv', index=False)