### Kombinasi 4
* Delete Duplicate
* Impute Null pake iterative imputer
* Outlier capping pake Log Transform
* Encoding 
* Standard scaler
* Feature selection -> K-Best

In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('../UFC_train.csv')

df.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,Winner
0,Joe Riggs,Joe Doerksen,Steve Mazzagatti,2004-08-21,"Las Vegas, Nevada, USA",False,Middleweight,,,,...,0,0,0,Southpaw,182.88,177.8,185.0,26.0,21.0,Red
1,Jorge Masvidal,Al Iaquinta,Keith Peterson,2015-04-04,"Fairfax, Virginia, USA",False,Lightweight,1.15625,0.0,0.394141,...,0,1,0,Orthodox,180.34,187.96,170.0,27.0,30.0,Blue
2,Dan Stittgen,Stephen Thompson,Josh Rosenthal,2012-02-04,"Las Vegas, Nevada, USA",False,Welterweight,,,,...,0,0,0,Orthodox,185.42,,170.0,28.0,31.0,Blue
3,Josh Koscheck,Johny Hendricks,Kevin Mulhall,2012-05-05,"East Rutherford, New Jersey, USA",False,Welterweight,0.695312,0.0,0.783359,...,6,3,0,Orthodox,177.8,185.42,170.0,28.0,34.0,Blue
4,John Dodson,Manvel Gamburyan,James Warring,2016-04-16,"Tampa, Florida, USA",False,Bantamweight,0.5,0.266602,0.381462,...,3,0,1,Orthodox,160.02,167.64,135.0,34.0,31.0,Red


In [3]:
# Delete duplicate
print(f"Shape before dropping duplicates : {df.shape}") # Before dropping duplicates
df.drop_duplicates(inplace=True)
print(f"Shape after dropping duplicates: {df.shape}") # After dropping duplicates

Shape before dropping duplicates : (5410, 144)
Shape after dropping duplicates: (5410, 144)


In [4]:
def check_null(df):
    col_na = df.isnull().sum().sort_values(ascending=True)
    percent = col_na / len(df)
    missing_data = pd.concat([col_na, percent], axis=1, keys=['Total', 'Percent'])

    if (missing_data[missing_data['Total'] > 0].shape[0] == 0):
        print("Tidak ditemukan missing value pada dataset")
    else:
        print(missing_data[missing_data['Total'] > 0])

In [5]:
check_null(df)

                         Total   Percent
R_Weight_lbs                 2  0.000370
R_Height_cms                 4  0.000739
B_Weight_lbs                 8  0.001479
B_Height_cms                10  0.001848
R_Stance                    27  0.004991
...                        ...       ...
B_avg_GROUND_landed       1293  0.239002
B_avg_GROUND_att          1293  0.239002
B_avg_opp_CLINCH_landed   1293  0.239002
B_avg_TD_att              1293  0.239002
B_avg_HEAD_landed         1293  0.239002

[109 rows x 2 columns]


In [6]:
# Split string and number
# Split dataframe into object type and number type
df_object = df.select_dtypes(include='object')
df_number = df.select_dtypes(include=np.number)

In [7]:
# Impute missing value using iterative imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imputer = IterativeImputer()
df_imputed_number = imputer.fit_transform(df_number)
df_imputed_number = pd.DataFrame(df_imputed_number, columns=df_number.columns)



In [8]:
# Drop R_fighter, B_fighter, Referee, location, and date
df_object = df_object.drop(['R_fighter', 'B_fighter', 'Referee', 'location', 'date'], axis=1)

In [9]:
# Combine df_object and df_imputed_number
df = pd.concat([df_object, df_imputed_number], axis=1)
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,Winner,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Middleweight,Orthodox,Southpaw,Red,-1.558,-269.960915,474.696197,1.001642,0.286333,3.761259,...,0.0,0.0,0.0,0.0,0.0,182.88,177.8,185.0,26.0,21.0
1,Lightweight,Orthodox,Orthodox,Blue,1.15625,0.0,0.394141,0.352422,0.239219,0.011484,...,0.0,4.0,0.0,1.0,0.0,180.34,187.96,170.0,27.0,30.0
2,Welterweight,Orthodox,Orthodox,Blue,-0.169817,-59.25779,104.407134,0.565607,0.25313,1.065947,...,0.0,0.0,0.0,0.0,0.0,185.42,177.846319,170.0,28.0,31.0
3,Welterweight,Southpaw,Orthodox,Blue,0.695312,0.0,0.783359,0.185547,0.088281,0.104375,...,1.0,5.0,6.0,3.0,0.0,177.8,185.42,170.0,28.0,34.0
4,Bantamweight,Orthodox,Orthodox,Red,0.5,0.266602,0.381462,0.456558,0.429614,0.46957,...,0.0,2.0,3.0,0.0,1.0,160.02,167.64,135.0,34.0,31.0


In [10]:
# Impute missing R_stance and B_stance with mode
df['R_Stance'].fillna(df['R_Stance'].mode()[0], inplace=True)
df['B_Stance'].fillna(df['B_Stance'].mode()[0], inplace=True)

In [11]:
check_null(df)

Tidak ditemukan missing value pada dataset


In [12]:
def check_outlier(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)

    # Menghitung RUB dan RLB.
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5*IQR
    upper_limit = Q3 + 1.5*IQR

    # Menampilkan banyaknya outlier pada atribut.
    outliers = (df < lower_limit) | (df > upper_limit)
    print ("Outlier pada tiap atribut:")
    print(outliers.sum())

    return outliers

In [13]:
# Split string and number
# Split dataframe into object type and number type
df_object = df.select_dtypes(include='object')
df_number = df.select_dtypes(include=np.number)

In [14]:
outliers = dict(check_outlier(df_number).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number[key].shape[0] * 100}%")

Outlier pada tiap atribut:
B_avg_KD                  519
B_avg_opp_KD             1292
B_avg_SIG_STR_pct        1311
B_avg_opp_SIG_STR_pct     323
B_avg_TD_pct              186
                         ... 
R_Height_cms               19
R_Reach_cms               106
R_Weight_lbs              301
B_age                      38
R_age                      98
Length: 134, dtype: int64


percentage of outliers in each column:
B_avg_KD = 9.593345656192238%
B_avg_opp_KD = 23.88170055452865%
B_avg_SIG_STR_pct = 24.23290203327172%
B_avg_opp_SIG_STR_pct = 5.970425138632162%
B_avg_TD_pct = 3.4380776340110906%
B_avg_opp_TD_pct = 8.484288354898336%
B_avg_SUB_ATT = 9.353049907578558%
B_avg_opp_SUB_ATT = 11.44177449168207%
B_avg_REV = 12.513863216266174%
B_avg_opp_REV = 40.46210720887246%
B_avg_SIG_STR_att = 4.306839186691312%
B_avg_SIG_STR_landed = 6.7467652495378925%
B_avg_opp_SIG_STR_att = 4.953789279112754%
B_avg_opp_SIG_STR_landed = 5.138632162661738%
B_avg_TOTAL_STR_att = 3.5674676524953792%
B_a

In [20]:
from scipy.stats import boxcox

# Add a constant value to make all values positive
df_number_positive = df_number + abs(df_number.min()) + 1

df_number_log = np.log(df_number_positive)

In [21]:
df_number_log

Unnamed: 0,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,B_avg_opp_REV,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,1.507976,6.276408,7.163003,0.936400,0.251796,2.357310,2.389336,3.015841,0.441275,6.252145,...,0.000000,0.000000,0.000000,0.000000,0.000000,5.817944,5.311745,5.707110,3.463272,6.062935
1,1.978492,6.686904,6.705001,0.642675,0.214481,1.918792,2.750676,2.574997,0.190949,5.794258,...,0.000000,1.609438,0.000000,0.693147,0.000000,5.810362,5.360652,5.655992,3.494118,6.083667
2,1.775928,6.610128,6.824909,0.748935,0.225644,2.062701,2.700207,2.710131,0.345085,5.914811,...,0.000000,0.000000,0.000000,0.000000,0.000000,5.825469,5.311974,5.655992,3.524042,6.085944
3,1.912633,6.686904,6.705478,0.550827,0.084600,1.932335,2.746675,2.569628,0.190949,5.794258,...,0.693147,1.791759,1.945910,1.386294,0.000000,5.802722,5.348648,5.655992,3.524042,6.092745
4,1.883363,6.687236,6.704986,0.695992,0.357405,1.983868,2.825827,2.564850,0.289229,5.795780,...,0.000000,1.098612,1.386294,0.000000,0.693147,5.747544,5.260323,5.525453,3.686909,6.085944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5405,1.824643,6.686904,6.705108,0.739143,0.543051,1.979102,2.780183,2.593273,0.900323,5.794639,...,0.000000,0.693147,0.000000,0.000000,0.000000,5.795024,5.324198,5.602119,3.463272,6.081385
5406,1.806846,6.686904,6.704990,0.622066,0.478694,1.946322,2.775812,2.590054,0.198985,5.794645,...,0.000000,1.609438,0.693147,1.791759,0.000000,5.825469,5.336498,5.655992,3.524042,6.092745
5407,1.844601,6.687216,6.705257,0.724713,0.285179,1.964485,2.756648,2.564831,0.190949,5.794258,...,0.000000,0.000000,0.000000,0.000000,0.000000,5.795024,5.311745,5.564520,3.398569,6.083667
5408,1.938825,6.687060,6.705184,0.680766,0.030772,1.940537,2.740644,2.566033,0.190949,5.794258,...,0.693147,1.609438,0.693147,0.000000,0.000000,5.795024,5.324198,5.602119,3.494118,6.092745


In [22]:
outliers = dict(check_outlier(df_number_log).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number_log[key].shape[0] * 100}%")

Outlier pada tiap atribut:
B_avg_KD                  511
B_avg_opp_KD             1292
B_avg_SIG_STR_pct        1311
B_avg_opp_SIG_STR_pct     304
B_avg_TD_pct                0
                         ... 
R_Height_cms               19
R_Reach_cms               162
R_Weight_lbs              210
B_age                      21
R_age                      98
Length: 134, dtype: int64


percentage of outliers in each column:
B_avg_KD = 9.44547134935305%
B_avg_opp_KD = 23.88170055452865%
B_avg_SIG_STR_pct = 24.23290203327172%
B_avg_opp_SIG_STR_pct = 5.619223659889094%
B_avg_TD_pct = 0.0%
B_avg_opp_TD_pct = 8.262476894639557%
B_avg_SUB_ATT = 9.279112754158964%
B_avg_opp_SUB_ATT = 11.386321626617375%
B_avg_REV = 11.977818853974123%
B_avg_opp_REV = 40.46210720887246%
B_avg_SIG_STR_att = 2.9205175600739373%
B_avg_SIG_STR_landed = 6.728280961182994%
B_avg_opp_SIG_STR_att = 2.865064695009242%
B_avg_opp_SIG_STR_landed = 5.027726432532347%
B_avg_TOTAL_STR_att = 2.1256931608133085%
B_avg_TOTAL_STR_la

In [23]:
df = pd.concat([df_object, df_number_log], axis=1)

In [24]:
df['R_Gender'] = df['weight_class'].apply(lambda x: 'women' if 'women' in x.lower() else 'male')
df['B_Gender'] = df['weight_class'].apply(lambda x: 'women' if 'women' in x.lower() else 'male')

In [25]:
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,Winner,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,R_Gender,B_Gender
0,Middleweight,Orthodox,Southpaw,Red,1.507976,6.276408,7.163003,0.9364,0.251796,2.35731,...,0.0,0.0,0.0,5.817944,5.311745,5.70711,3.463272,6.062935,male,male
1,Lightweight,Orthodox,Orthodox,Blue,1.978492,6.686904,6.705001,0.642675,0.214481,1.918792,...,0.0,0.693147,0.0,5.810362,5.360652,5.655992,3.494118,6.083667,male,male
2,Welterweight,Orthodox,Orthodox,Blue,1.775928,6.610128,6.824909,0.748935,0.225644,2.062701,...,0.0,0.0,0.0,5.825469,5.311974,5.655992,3.524042,6.085944,male,male
3,Welterweight,Southpaw,Orthodox,Blue,1.912633,6.686904,6.705478,0.550827,0.0846,1.932335,...,1.94591,1.386294,0.0,5.802722,5.348648,5.655992,3.524042,6.092745,male,male
4,Bantamweight,Orthodox,Orthodox,Red,1.883363,6.687236,6.704986,0.695992,0.357405,1.983868,...,1.386294,0.0,0.693147,5.747544,5.260323,5.525453,3.686909,6.085944,male,male


In [26]:
df['weight_class'] = df['weight_class'].str.replace('Women', '')

In [27]:
# Get all weight_class values
weight_class = df['weight_class'].unique()
weight_class

array(['Middleweight', 'Lightweight', 'Welterweight', 'Bantamweight',
       'Flyweight', 'LightHeavyweight', 'Strawweight', 'Featherweight',
       'OpenWeight', 'Heavyweight', 'CatchWeight'], dtype=object)

In [28]:
# Create a dictionary to map weight_class values to numbers
weight_class_dict = {
    'CatchWeight' : 0,
    'Strawweight' : 1,
    'Flyweight' : 2,
    'Bantamweight' : 3,
    'Featherweight' : 4,
    'Lightweight' : 5,
    'Welterweight' : 6,
    'Middleweight' : 7,
    'LightHeavyweight' : 8,
    'Heavyweight' : 9,
    'OpenWeight' : 10,
}

gender_dict = {
    'male' : 1,
    'women' : 0
}

In [29]:
# Map each weight_class value to the correct number
df['weight_class'] = df['weight_class'].map(weight_class_dict)

In [30]:
# Map each gender value to the correct number
df['R_Gender'] = df['R_Gender'].map(gender_dict)
df['B_Gender'] = df['B_Gender'].map(gender_dict)

In [31]:
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,Winner,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,R_Gender,B_Gender
0,7,Orthodox,Southpaw,Red,1.507976,6.276408,7.163003,0.9364,0.251796,2.35731,...,0.0,0.0,0.0,5.817944,5.311745,5.70711,3.463272,6.062935,1,1
1,5,Orthodox,Orthodox,Blue,1.978492,6.686904,6.705001,0.642675,0.214481,1.918792,...,0.0,0.693147,0.0,5.810362,5.360652,5.655992,3.494118,6.083667,1,1
2,6,Orthodox,Orthodox,Blue,1.775928,6.610128,6.824909,0.748935,0.225644,2.062701,...,0.0,0.0,0.0,5.825469,5.311974,5.655992,3.524042,6.085944,1,1
3,6,Southpaw,Orthodox,Blue,1.912633,6.686904,6.705478,0.550827,0.0846,1.932335,...,1.94591,1.386294,0.0,5.802722,5.348648,5.655992,3.524042,6.092745,1,1
4,3,Orthodox,Orthodox,Red,1.883363,6.687236,6.704986,0.695992,0.357405,1.983868,...,1.386294,0.0,0.693147,5.747544,5.260323,5.525453,3.686909,6.085944,1,1


In [32]:
# Get the remaining object columns
df_object = df.select_dtypes(include='object')

# Perform one-hot encoding
df_encoded = pd.get_dummies(df_object)

# Combine the encoded columns with the original dataframe
df = pd.concat([df, df_encoded], axis=1)

# Drop the original object columns
df.drop(df_object.columns, axis=1, inplace=True)

df.head()

Unnamed: 0,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,...,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch,Winner_Blue,Winner_Draw,Winner_Red
0,7,1.507976,6.276408,7.163003,0.9364,0.251796,2.35731,2.389336,3.015841,0.441275,...,False,False,False,False,False,True,False,False,False,True
1,5,1.978492,6.686904,6.705001,0.642675,0.214481,1.918792,2.750676,2.574997,0.190949,...,False,False,False,True,False,False,False,True,False,False
2,6,1.775928,6.610128,6.824909,0.748935,0.225644,2.062701,2.700207,2.710131,0.345085,...,False,False,False,True,False,False,False,True,False,False
3,6,1.912633,6.686904,6.705478,0.550827,0.0846,1.932335,2.746675,2.569628,0.190949,...,True,False,False,True,False,False,False,True,False,False
4,3,1.883363,6.687236,6.704986,0.695992,0.357405,1.983868,2.825827,2.56485,0.289229,...,False,False,False,True,False,False,False,False,False,True


In [33]:
from sklearn.feature_selection import SelectKBest, f_regression
# Perform feature selection using SelectKBest
selector = SelectKBest(k=20, score_func=f_regression)
X = df.drop('B_Reach_cms', axis=1) 
y = df['B_Reach_cms'] 
X_selected = selector.fit_transform(X, y)

# Get the selected feature names
selected_feature_names = X.columns[selector.get_support()]
print(selected_feature_names)
# Create a new dataframe with only the selected features
df_selected = pd.DataFrame(X_selected, columns=selected_feature_names)
df_selected.head()

Index(['R_avg_opp_SIG_STR_pct', 'R_avg_opp_SUB_ATT', 'R_avg_opp_REV',
       'R_avg_opp_SIG_STR_landed', 'R_avg_TOTAL_STR_att',
       'R_avg_TOTAL_STR_landed', 'R_avg_TD_att', 'R_avg_opp_BODY_att',
       'R_avg_opp_BODY_landed', 'R_avg_LEG_att', 'R_avg_LEG_landed',
       'R_avg_opp_LEG_landed', 'R_avg_DISTANCE_att', 'R_avg_DISTANCE_landed',
       'R_avg_CLINCH_landed', 'R_avg_opp_CLINCH_landed', 'R_avg_GROUND_att',
       'R_avg_opp_GROUND_att', 'R_avg_opp_CTRL_time(seconds)', 'R_Reach_cms'],
      dtype='object')


Unnamed: 0,R_avg_opp_SIG_STR_pct,R_avg_opp_SUB_ATT,R_avg_opp_REV,R_avg_opp_SIG_STR_landed,R_avg_TOTAL_STR_att,R_avg_TOTAL_STR_landed,R_avg_TD_att,R_avg_opp_BODY_att,R_avg_opp_BODY_landed,R_avg_LEG_att,R_avg_LEG_landed,R_avg_opp_LEG_landed,R_avg_DISTANCE_att,R_avg_DISTANCE_landed,R_avg_CLINCH_landed,R_avg_opp_CLINCH_landed,R_avg_GROUND_att,R_avg_opp_GROUND_att,R_avg_opp_CTRL_time(seconds),R_Reach_cms
0,178.033727,176.736584,176.451765,166.664484,180.067444,179.766654,178.998086,168.823364,168.724889,168.639512,167.262963,167.520023,171.74667,171.164668,178.258639,182.049306,178.805676,176.985423,183.66463,5.311745
1,178.159867,176.862724,176.577905,166.790624,180.193584,179.892794,179.124226,168.949504,168.851029,168.765652,167.389103,167.646163,171.87281,171.290808,178.384779,182.175446,178.931816,177.111563,183.79077,5.360652
2,178.130734,176.833591,176.548771,166.761491,180.164451,179.86366,179.095093,168.920371,168.821896,168.736518,167.359969,167.617029,171.843676,171.261674,178.355646,182.146313,178.902683,177.082429,183.761637,5.311974
3,178.159867,176.862724,176.577905,166.790624,180.193584,179.892794,179.124226,168.949504,168.851029,168.765652,167.389103,167.646163,171.87281,171.290808,178.384779,182.175446,178.931816,177.111563,183.79077,5.348648
4,178.159867,176.862724,176.577905,166.790624,180.193584,179.892794,179.124226,168.949504,168.851029,168.765652,167.389103,167.646163,171.87281,171.290808,178.384779,182.175446,178.931816,177.111563,183.79077,5.260323


In [34]:
print(df.shape)

(5410, 150)
