## Kombinasi 5 :
- Delete duplicate
- Impute missing value using iterative imputer
- Outlier handling with log tranformation
- Encoding
- Standard Scaler
- Feature selection with Decision Tree

In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("../UFC_train.csv")

df.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,Winner
0,Joe Riggs,Joe Doerksen,Steve Mazzagatti,2004-08-21,"Las Vegas, Nevada, USA",False,Middleweight,,,,...,0,0,0,Southpaw,182.88,177.8,185.0,26.0,21.0,Red
1,Jorge Masvidal,Al Iaquinta,Keith Peterson,2015-04-04,"Fairfax, Virginia, USA",False,Lightweight,1.15625,0.0,0.394141,...,0,1,0,Orthodox,180.34,187.96,170.0,27.0,30.0,Blue
2,Dan Stittgen,Stephen Thompson,Josh Rosenthal,2012-02-04,"Las Vegas, Nevada, USA",False,Welterweight,,,,...,0,0,0,Orthodox,185.42,,170.0,28.0,31.0,Blue
3,Josh Koscheck,Johny Hendricks,Kevin Mulhall,2012-05-05,"East Rutherford, New Jersey, USA",False,Welterweight,0.695312,0.0,0.783359,...,6,3,0,Orthodox,177.8,185.42,170.0,28.0,34.0,Blue
4,John Dodson,Manvel Gamburyan,James Warring,2016-04-16,"Tampa, Florida, USA",False,Bantamweight,0.5,0.266602,0.381462,...,3,0,1,Orthodox,160.02,167.64,135.0,34.0,31.0,Red


In [3]:
# Delete duplicate
print(f"Shape before dropping duplicate : {df.shape}")
df.drop_duplicates(inplace=True)
print(f"Shape after dropping duplicate : {df.shape}")

Shape before dropping duplicate : (5410, 144)
Shape after dropping duplicate : (5410, 144)


In [4]:
def check_null(df):
    col_na = df.isnull().sum().sort_values(ascending=True)
    percent = col_na / len(df)
    missing_data = pd.concat([col_na, percent], axis=1, keys=['Total', 'Percent'])

    if (missing_data[missing_data['Total'] > 0].shape[0] == 0):
        print("Tidak ditemukan missing value pada dataset")
    else:
        print(missing_data[missing_data['Total'] > 0])

In [5]:
check_null(df)

                         Total   Percent
R_Weight_lbs                 2  0.000370
R_Height_cms                 4  0.000739
B_Weight_lbs                 8  0.001479
B_Height_cms                10  0.001848
R_Stance                    27  0.004991
...                        ...       ...
B_avg_GROUND_landed       1293  0.239002
B_avg_GROUND_att          1293  0.239002
B_avg_opp_CLINCH_landed   1293  0.239002
B_avg_TD_att              1293  0.239002
B_avg_HEAD_landed         1293  0.239002

[109 rows x 2 columns]


In [6]:
# Split string and number
# Split dataframe into object type and number type
df_object = df.select_dtypes(include='object')
df_number = df.select_dtypes(include=np.number)

In [7]:
# Impute missing value using iterative imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import ExtraTreeRegressor

imputer = IterativeImputer(random_state=42, max_iter=15, initial_strategy='median', estimator=ExtraTreeRegressor())
df_imputed_number = imputer.fit_transform(df_number)
df_imputed_number = pd.DataFrame(df_imputed_number, columns=df_number.columns)



In [8]:
# Drop R_fighter, B_fighter, Referee, location, date
df_object = df_object.drop(['R_fighter', 'B_fighter', 'Referee', 'location', 'date'], axis=1)

In [9]:
# Combine df_object and df_imputed_number
df = pd.concat([df_object, df_imputed_number], axis=1)
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,Winner,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Middleweight,Orthodox,Southpaw,Red,0.0,0.0,0.24,0.44,0.5625,1.0,...,0.0,0.0,0.0,0.0,0.0,182.88,177.8,185.0,26.0,21.0
1,Lightweight,Orthodox,Orthodox,Blue,1.15625,0.0,0.394141,0.352422,0.239219,0.011484,...,0.0,4.0,0.0,1.0,0.0,180.34,187.96,170.0,27.0,30.0
2,Welterweight,Orthodox,Orthodox,Blue,0.0,0.0,0.51,0.18,0.55,0.0,...,0.0,0.0,0.0,0.0,0.0,185.42,180.34,170.0,28.0,31.0
3,Welterweight,Southpaw,Orthodox,Blue,0.695312,0.0,0.783359,0.185547,0.088281,0.104375,...,1.0,5.0,6.0,3.0,0.0,177.8,185.42,170.0,28.0,34.0
4,Bantamweight,Orthodox,Orthodox,Red,0.5,0.266602,0.381462,0.456558,0.429614,0.46957,...,0.0,2.0,3.0,0.0,1.0,160.02,167.64,135.0,34.0,31.0


In [10]:
check_null(df)

          Total   Percent
R_Stance     27  0.004991
B_Stance     64  0.011830


In [11]:
# Impute missing R_stance and B_stance with mode
df['R_Stance'].fillna(df['R_Stance'].mode()[0], inplace=True)
df['B_Stance'].fillna(df['B_Stance'].mode()[0], inplace=True)

In [12]:
check_null(df)

Tidak ditemukan missing value pada dataset


In [13]:
def check_outlier(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)

    # Menghitung RUB dan RLB.
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5*IQR
    upper_limit = Q3 + 1.5*IQR

    # Menampilkan banyaknya outlier pada atribut.
    outliers = (df < lower_limit) | (df > upper_limit)
    print ("Outlier pada tiap atribut:")
    print(outliers.sum())

    return outliers

In [14]:
# Split string and number
# Split dataframe into object type and number type
df_object = df.select_dtypes(include='object')
df_number = df.select_dtypes(include=np.number)

In [15]:
outliers = dict(check_outlier(df_number).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number[key].shape[0] * 100}%")

Outlier pada tiap atribut:
B_avg_KD                 569
B_avg_opp_KD             557
B_avg_SIG_STR_pct        126
B_avg_opp_SIG_STR_pct    141
B_avg_TD_pct               0
                        ... 
R_Height_cms              19
R_Reach_cms               83
R_Weight_lbs             301
B_age                     31
R_age                     75
Length: 134, dtype: int64


percentage of outliers in each column:
B_avg_KD = 10.517560073937153%
B_avg_opp_KD = 10.295748613678374%
B_avg_SIG_STR_pct = 2.32902033271719%
B_avg_opp_SIG_STR_pct = 2.6062846580406656%
B_avg_TD_pct = 0.0%
B_avg_opp_TD_pct = 0.0%
B_avg_SUB_ATT = 7.22735674676525%
B_avg_opp_SUB_ATT = 6.709796672828096%
B_avg_REV = 12.476894639556377%
B_avg_opp_REV = 11.571164510166358%
B_avg_SIG_STR_att = 3.4565619223659887%
B_avg_SIG_STR_landed = 3.197781885397412%
B_avg_opp_SIG_STR_att = 3.807763401109057%
B_avg_opp_SIG_STR_landed = 2.902033271719039%
B_avg_TOTAL_STR_att = 2.255083179297597%
B_avg_TOTAL_STR_landed = 2.273567467652495

In [16]:
# Add a constant value to make all values positive
df_number_positive = df_number + abs(df_number.min()) + 1

df_number_log = np.log(df_number_positive)


In [17]:
df_number_log.head()

Unnamed: 0,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,B_avg_opp_REV,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0.0,0.0,0.215111,0.364643,0.446287,0.693147,1.098612,0.0,0.693147,1.098612,...,0.0,0.0,0.0,0.0,0.0,5.817944,5.802722,5.70711,3.806662,3.713572
1,0.768371,0.0,0.332278,0.301897,0.214481,0.011419,0.145182,0.124703,0.0,0.0,...,0.0,1.609438,0.0,0.693147,0.0,5.810362,5.832938,5.655992,3.828641,3.912023
2,0.0,0.0,0.41211,0.165514,0.438255,0.0,0.0,1.098612,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.825469,5.810362,5.655992,3.850148,3.931826
3,0.527867,0.0,0.578499,0.170204,0.0846,0.09928,0.089612,0.060625,0.0,0.0,...,0.693147,1.791759,1.94591,1.386294,0.0,5.802722,5.825469,5.655992,3.850148,3.988984
4,0.405465,0.236337,0.323143,0.376076,0.357405,0.38497,0.86623,0.000244,0.117783,0.405465,...,0.0,1.098612,1.386294,0.0,0.693147,5.747544,5.771566,5.525453,3.970292,3.931826


In [18]:
outliers = dict(check_outlier(df_number).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number[key].shape[0] * 100}%")

Outlier pada tiap atribut:
B_avg_KD                 569
B_avg_opp_KD             557
B_avg_SIG_STR_pct        126
B_avg_opp_SIG_STR_pct    141
B_avg_TD_pct               0
                        ... 
R_Height_cms              19
R_Reach_cms               83
R_Weight_lbs             301
B_age                     31
R_age                     75
Length: 134, dtype: int64


percentage of outliers in each column:
B_avg_KD = 10.517560073937153%
B_avg_opp_KD = 10.295748613678374%
B_avg_SIG_STR_pct = 2.32902033271719%
B_avg_opp_SIG_STR_pct = 2.6062846580406656%
B_avg_TD_pct = 0.0%
B_avg_opp_TD_pct = 0.0%
B_avg_SUB_ATT = 7.22735674676525%
B_avg_opp_SUB_ATT = 6.709796672828096%
B_avg_REV = 12.476894639556377%
B_avg_opp_REV = 11.571164510166358%
B_avg_SIG_STR_att = 3.4565619223659887%
B_avg_SIG_STR_landed = 3.197781885397412%
B_avg_opp_SIG_STR_att = 3.807763401109057%
B_avg_opp_SIG_STR_landed = 2.902033271719039%
B_avg_TOTAL_STR_att = 2.255083179297597%
B_avg_TOTAL_STR_landed = 2.273567467652495

In [19]:
df['gender'] = df['weight_class'].apply(lambda x: 'women' if 'women' in x.lower() else 'male')

In [20]:
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,Winner,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,Middleweight,Orthodox,Southpaw,Red,0.0,0.0,0.24,0.44,0.5625,1.0,...,0.0,0.0,0.0,0.0,182.88,177.8,185.0,26.0,21.0,male
1,Lightweight,Orthodox,Orthodox,Blue,1.15625,0.0,0.394141,0.352422,0.239219,0.011484,...,4.0,0.0,1.0,0.0,180.34,187.96,170.0,27.0,30.0,male
2,Welterweight,Orthodox,Orthodox,Blue,0.0,0.0,0.51,0.18,0.55,0.0,...,0.0,0.0,0.0,0.0,185.42,180.34,170.0,28.0,31.0,male
3,Welterweight,Southpaw,Orthodox,Blue,0.695312,0.0,0.783359,0.185547,0.088281,0.104375,...,5.0,6.0,3.0,0.0,177.8,185.42,170.0,28.0,34.0,male
4,Bantamweight,Orthodox,Orthodox,Red,0.5,0.266602,0.381462,0.456558,0.429614,0.46957,...,2.0,3.0,0.0,1.0,160.02,167.64,135.0,34.0,31.0,male


In [21]:
df['weight_class'] = df['weight_class'].str.replace('Women', '')

In [22]:
# Get all weight_class values
weight_class = df['weight_class'].unique()
weight_class

array(['Middleweight', 'Lightweight', 'Welterweight', 'Bantamweight',
       'Flyweight', 'LightHeavyweight', 'Strawweight', 'Featherweight',
       'OpenWeight', 'Heavyweight', 'CatchWeight'], dtype=object)

In [23]:
# Create a dictionary to map weight_class values to numbers
weight_class_dict = {
    'CatchWeight' : 0,
    'Strawweight' : 1,
    'Flyweight' : 2,
    'Bantamweight' : 3,
    'Featherweight' : 4,
    'Lightweight' : 5,
    'Welterweight' : 6,
    'Middleweight' : 7,
    'LightHeavyweight' : 8,
    'Heavyweight' : 9,
    'OpenWeight' : 10,
}

gender_dict = {
    'male' : 1,
    'women' : 0
}

In [24]:
# Map each weight_class value to the correct number
df['weight_class'] = df['weight_class'].map(weight_class_dict)

In [25]:
# Map each gender value to the correct number
df['gender'] = df['gender'].map(gender_dict)

In [26]:
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,Winner,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,7,Orthodox,Southpaw,Red,0.0,0.0,0.24,0.44,0.5625,1.0,...,0.0,0.0,0.0,0.0,182.88,177.8,185.0,26.0,21.0,1
1,5,Orthodox,Orthodox,Blue,1.15625,0.0,0.394141,0.352422,0.239219,0.011484,...,4.0,0.0,1.0,0.0,180.34,187.96,170.0,27.0,30.0,1
2,6,Orthodox,Orthodox,Blue,0.0,0.0,0.51,0.18,0.55,0.0,...,0.0,0.0,0.0,0.0,185.42,180.34,170.0,28.0,31.0,1
3,6,Southpaw,Orthodox,Blue,0.695312,0.0,0.783359,0.185547,0.088281,0.104375,...,5.0,6.0,3.0,0.0,177.8,185.42,170.0,28.0,34.0,1
4,3,Orthodox,Orthodox,Red,0.5,0.266602,0.381462,0.456558,0.429614,0.46957,...,2.0,3.0,0.0,1.0,160.02,167.64,135.0,34.0,31.0,1


In [27]:
df_classif = df.copy()
df_classif.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,Winner,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,7,Orthodox,Southpaw,Red,0.0,0.0,0.24,0.44,0.5625,1.0,...,0.0,0.0,0.0,0.0,182.88,177.8,185.0,26.0,21.0,1
1,5,Orthodox,Orthodox,Blue,1.15625,0.0,0.394141,0.352422,0.239219,0.011484,...,4.0,0.0,1.0,0.0,180.34,187.96,170.0,27.0,30.0,1
2,6,Orthodox,Orthodox,Blue,0.0,0.0,0.51,0.18,0.55,0.0,...,0.0,0.0,0.0,0.0,185.42,180.34,170.0,28.0,31.0,1
3,6,Southpaw,Orthodox,Blue,0.695312,0.0,0.783359,0.185547,0.088281,0.104375,...,5.0,6.0,3.0,0.0,177.8,185.42,170.0,28.0,34.0,1
4,3,Orthodox,Orthodox,Red,0.5,0.266602,0.381462,0.456558,0.429614,0.46957,...,2.0,3.0,0.0,1.0,160.02,167.64,135.0,34.0,31.0,1


In [28]:
y_classif = df_classif['Winner']
df_classif.drop(columns=['Winner'], axis=1)

Unnamed: 0,weight_class,B_Stance,R_Stance,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,7,Orthodox,Southpaw,0.000000,0.000000,0.240000,0.440000,0.562500,1.000000,2.000000,...,0.0,0.0,0.0,0.0,182.88,177.80,185.0,26.0,21.0,1
1,5,Orthodox,Orthodox,1.156250,0.000000,0.394141,0.352422,0.239219,0.011484,0.156250,...,4.0,0.0,1.0,0.0,180.34,187.96,170.0,27.0,30.0,1
2,6,Orthodox,Orthodox,0.000000,0.000000,0.510000,0.180000,0.550000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,185.42,180.34,170.0,28.0,31.0,1
3,6,Southpaw,Orthodox,0.695312,0.000000,0.783359,0.185547,0.088281,0.104375,0.093750,...,5.0,6.0,3.0,0.0,177.80,185.42,170.0,28.0,34.0,1
4,3,Orthodox,Orthodox,0.500000,0.266602,0.381462,0.456558,0.429614,0.469570,1.377930,...,2.0,3.0,0.0,1.0,160.02,167.64,135.0,34.0,31.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5405,5,Southpaw,Orthodox,0.125000,0.000000,0.481250,0.545000,0.721250,0.435000,0.625000,...,1.0,0.0,0.0,0.0,175.26,180.34,155.0,26.0,29.0,1
5406,6,Southpaw,Southpaw,0.015625,0.000000,0.384980,0.313633,0.613965,0.201641,0.554688,...,4.0,1.0,5.0,0.0,185.42,182.88,170.0,28.0,34.0,1
5407,4,Southpaw,Orthodox,0.250000,0.250000,0.602500,0.515000,0.330000,0.330000,0.250000,...,0.0,0.0,0.0,0.0,175.26,177.80,145.0,24.0,30.0,1
5408,5,Orthodox,Orthodox,0.875000,0.125000,0.543125,0.426250,0.031250,0.161250,0.000000,...,4.0,1.0,0.0,0.0,175.26,180.34,155.0,27.0,34.0,1


In [29]:
# Get the remaining object columns
df_classif_object = df_classif.select_dtypes(include='object')

# Perform one-hot encoding
df_classif_encoded = pd.get_dummies(df_classif_object)

# Combine the encoded columns with the original dataframe
df_classif = pd.concat([df_classif, df_classif_encoded], axis=1)

# Drop the original object columns
df_classif.drop(df_classif_object.columns, axis=1, inplace=True)

df_classif.head()

Unnamed: 0,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,...,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch,Winner_Blue,Winner_Draw,Winner_Red
0,7,0.0,0.0,0.24,0.44,0.5625,1.0,2.0,0.0,1.0,...,False,False,False,False,False,True,False,False,False,True
1,5,1.15625,0.0,0.394141,0.352422,0.239219,0.011484,0.15625,0.132812,0.0,...,False,False,False,True,False,False,False,True,False,False
2,6,0.0,0.0,0.51,0.18,0.55,0.0,0.0,2.0,0.0,...,False,False,False,True,False,False,False,True,False,False
3,6,0.695312,0.0,0.783359,0.185547,0.088281,0.104375,0.09375,0.0625,0.0,...,True,False,False,True,False,False,False,True,False,False
4,3,0.5,0.266602,0.381462,0.456558,0.429614,0.46957,1.37793,0.000244,0.125,...,False,False,False,True,False,False,False,False,False,True


In [30]:
df_classif = df_classif.drop(columns=['Winner_Blue', 'Winner_Draw', 'Winner_Red'])
df_classif.head()

Unnamed: 0,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,...,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch
0,7,0.0,0.0,0.24,0.44,0.5625,1.0,2.0,0.0,1.0,...,False,True,False,False,False,False,False,False,True,False
1,5,1.15625,0.0,0.394141,0.352422,0.239219,0.011484,0.15625,0.132812,0.0,...,False,True,False,False,False,False,True,False,False,False
2,6,0.0,0.0,0.51,0.18,0.55,0.0,0.0,2.0,0.0,...,False,True,False,False,False,False,True,False,False,False
3,6,0.695312,0.0,0.783359,0.185547,0.088281,0.104375,0.09375,0.0625,0.0,...,False,False,False,True,False,False,True,False,False,False
4,3,0.5,0.266602,0.381462,0.456558,0.429614,0.46957,1.37793,0.000244,0.125,...,False,True,False,False,False,False,True,False,False,False


In [31]:
from sklearn.tree import DecisionTreeClassifier

# Feature selection with Decision Tree

# Splitting data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(df_classif, y_classif, test_size=0.2, random_state=2020)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [32]:
feature_importances = model.feature_importances_

In [33]:
top_n = 20
top_features_indices = feature_importances.argsort()[-top_n:][::-1]

In [34]:
top_features_indices

array([104, 134,  70,  36, 107,  34,  28,  30,  43,  92,  48, 110,  95,
       105,  90,  39, 133,  86,  27,  82], dtype=int64)

In [35]:
# Get the names of the top features
top_features = df_classif.columns[top_features_indices]
top_features

Index(['R_avg_opp_DISTANCE_landed', 'R_age', 'R_avg_opp_SIG_STR_pct',
       'B_avg_DISTANCE_landed', 'R_avg_opp_CLINCH_att', 'B_avg_opp_LEG_landed',
       'B_avg_BODY_landed', 'B_avg_opp_BODY_landed', 'B_avg_GROUND_att',
       'R_avg_opp_HEAD_landed', 'B_avg_opp_CTRL_time(seconds)',
       'R_avg_GROUND_landed', 'R_avg_opp_BODY_att', 'R_avg_CLINCH_att',
       'R_avg_HEAD_landed', 'B_avg_CLINCH_att', 'B_age', 'R_avg_TD_landed',
       'B_avg_BODY_att', 'R_avg_TOTAL_STR_landed'],
      dtype='object')

In [36]:
# Create a dataframe with only the top features
df_classif_top_features = df_classif[top_features]
df_classif_top_features.head()


Unnamed: 0,R_avg_opp_DISTANCE_landed,R_age,R_avg_opp_SIG_STR_pct,B_avg_DISTANCE_landed,R_avg_opp_CLINCH_att,B_avg_opp_LEG_landed,B_avg_BODY_landed,B_avg_opp_BODY_landed,B_avg_GROUND_att,R_avg_opp_HEAD_landed,B_avg_opp_CTRL_time(seconds),R_avg_GROUND_landed,R_avg_opp_BODY_att,R_avg_CLINCH_att,R_avg_HEAD_landed,B_avg_CLINCH_att,B_age,R_avg_TD_landed,B_avg_BODY_att,R_avg_TOTAL_STR_landed
0,31.0,21.0,0.37,19.0,7.5,3.4375,3.5,8.433838,0.0,19.827877,74.0,0.455658,13.5,3.621492,40.0,12.0,26.0,1.0,6.25,61.234375
1,38.40625,30.0,0.339062,46.8125,4.5625,2.09375,7.765625,2.070312,4.757812,27.65625,14.601562,7.25,17.59375,14.28125,37.78125,5.671875,27.0,1.875,13.625,87.59375
2,15.0,31.0,0.373164,17.0,14.43969,0.25,5.25,2.125,22.75,15.5625,40.0,0.0,6.0,11.586592,21.25,7.0,28.0,1.625,7.375,36.0
3,33.409025,34.0,0.325567,8.875,6.983452,1.710938,4.546875,4.453125,1.648438,27.688076,67.15625,4.844223,8.554874,8.778824,19.695154,11.867188,28.0,1.440809,5.429688,41.379938
4,43.398438,31.0,0.42,20.501221,23.234375,6.250977,5.452637,9.236328,7.906982,35.625,181.141357,0.828125,24.078125,15.484375,15.8125,9.979004,34.0,1.171875,8.149902,72.429688


In [37]:
df_classif_top_features['Winner'] = y_classif
df_classif_top_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_classif_top_features['Winner'] = y_classif


Unnamed: 0,R_avg_opp_DISTANCE_landed,R_age,R_avg_opp_SIG_STR_pct,B_avg_DISTANCE_landed,R_avg_opp_CLINCH_att,B_avg_opp_LEG_landed,B_avg_BODY_landed,B_avg_opp_BODY_landed,B_avg_GROUND_att,R_avg_opp_HEAD_landed,...,R_avg_GROUND_landed,R_avg_opp_BODY_att,R_avg_CLINCH_att,R_avg_HEAD_landed,B_avg_CLINCH_att,B_age,R_avg_TD_landed,B_avg_BODY_att,R_avg_TOTAL_STR_landed,Winner
0,31.0,21.0,0.37,19.0,7.5,3.4375,3.5,8.433838,0.0,19.827877,...,0.455658,13.5,3.621492,40.0,12.0,26.0,1.0,6.25,61.234375,Red
1,38.40625,30.0,0.339062,46.8125,4.5625,2.09375,7.765625,2.070312,4.757812,27.65625,...,7.25,17.59375,14.28125,37.78125,5.671875,27.0,1.875,13.625,87.59375,Blue
2,15.0,31.0,0.373164,17.0,14.43969,0.25,5.25,2.125,22.75,15.5625,...,0.0,6.0,11.586592,21.25,7.0,28.0,1.625,7.375,36.0,Blue
3,33.409025,34.0,0.325567,8.875,6.983452,1.710938,4.546875,4.453125,1.648438,27.688076,...,4.844223,8.554874,8.778824,19.695154,11.867188,28.0,1.440809,5.429688,41.379938,Blue
4,43.398438,31.0,0.42,20.501221,23.234375,6.250977,5.452637,9.236328,7.906982,35.625,...,0.828125,24.078125,15.484375,15.8125,9.979004,34.0,1.171875,8.149902,72.429688,Red


In [38]:
df_classif_top_features.to_csv('../classification/dataframe/UFC_kombinasi5.csv', index=False)


In [39]:
# Get the remaining object columns
df_object = df.select_dtypes(include='object')

# Perform one-hot encoding
df_encoded = pd.get_dummies(df_object)

# Combine the encoded columns with the original dataframe
df = pd.concat([df, df_encoded], axis=1)

# Drop the original object columns
df.drop(df_object.columns, axis=1, inplace=True)

df.head()


Unnamed: 0,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,...,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch,Winner_Blue,Winner_Draw,Winner_Red
0,7,0.0,0.0,0.24,0.44,0.5625,1.0,2.0,0.0,1.0,...,False,False,False,False,False,True,False,False,False,True
1,5,1.15625,0.0,0.394141,0.352422,0.239219,0.011484,0.15625,0.132812,0.0,...,False,False,False,True,False,False,False,True,False,False
2,6,0.0,0.0,0.51,0.18,0.55,0.0,0.0,2.0,0.0,...,False,False,False,True,False,False,False,True,False,False
3,6,0.695312,0.0,0.783359,0.185547,0.088281,0.104375,0.09375,0.0625,0.0,...,True,False,False,True,False,False,False,True,False,False
4,3,0.5,0.266602,0.381462,0.456558,0.429614,0.46957,1.37793,0.000244,0.125,...,False,False,False,True,False,False,False,False,False,True


In [40]:
from sklearn.tree import DecisionTreeRegressor

# Feature selection with Decision Tree
X = df.drop('B_Reach_cms', axis=1)
y = df['B_Reach_cms']

# Splitting data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

model = DecisionTreeRegressor()
model.fit(X_train, y_train)

In [41]:
feature_importances = model.feature_importances_

In [42]:
top_n = 20
top_features_indices = feature_importances.argsort()[-top_n:][::-1]

In [43]:
top_features_indices

array([ 64,  65,  31,   3, 134,  39, 132, 131,  62,  15,  91, 113,  18,
        63,  97,  48,  32,  23,  16,  46], dtype=int64)

In [44]:
# Get the names of the top features
top_features = X.columns[top_features_indices]
top_features

Index(['B_Height_cms', 'B_Weight_lbs', 'B_avg_LEG_att', 'B_avg_SIG_STR_pct',
       'gender', 'B_avg_CLINCH_att', 'B_age', 'R_Weight_lbs',
       'B_win_by_Submission', 'B_avg_TOTAL_STR_att', 'R_avg_opp_HEAD_landed',
       'R_avg_opp_CTRL_time(seconds)', 'B_avg_opp_TOTAL_STR_landed',
       'B_win_by_TKO_Doctor_Stoppage', 'R_avg_LEG_landed',
       'B_avg_opp_CTRL_time(seconds)', 'B_avg_LEG_landed', 'B_avg_HEAD_att',
       'B_avg_TOTAL_STR_landed', 'B_avg_opp_GROUND_landed'],
      dtype='object')

In [45]:
# Create a dataframe with only the top features
df_top_features = df[top_features]
df_top_features['B_Reach_cms'] = df['B_Reach_cms']
df_top_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top_features['B_Reach_cms'] = df['B_Reach_cms']


Unnamed: 0,B_Height_cms,B_Weight_lbs,B_avg_LEG_att,B_avg_SIG_STR_pct,gender,B_avg_CLINCH_att,B_age,R_Weight_lbs,B_win_by_Submission,B_avg_TOTAL_STR_att,...,R_avg_opp_CTRL_time(seconds),B_avg_opp_TOTAL_STR_landed,B_win_by_TKO_Doctor_Stoppage,R_avg_LEG_landed,B_avg_opp_CTRL_time(seconds),B_avg_LEG_landed,B_avg_HEAD_att,B_avg_TOTAL_STR_landed,B_avg_opp_GROUND_landed,B_Reach_cms
0,182.88,185.0,0.25,0.24,1,12.0,26.0,185.0,0.0,96.34214,...,44.375,47.451416,0.0,5.0,74.0,0.25,70.0,47.708145,4.332031,190.5
1,177.8,155.0,6.523438,0.394141,1,5.671875,27.0,170.0,0.0,141.210938,...,115.0,33.078125,0.0,5.3125,14.601562,5.492188,117.3125,56.71875,0.359375,177.8
2,182.88,170.0,3.0625,0.51,1,7.0,28.0,170.0,0.0,78.5,...,122.375,23.038086,0.0,2.59375,40.0,2.0,40.503113,54.0,0.25,190.5
3,175.26,185.0,3.421875,0.783359,1,11.867188,28.0,170.0,0.0,51.5,...,116.292816,28.671875,0.0,0.155367,67.15625,2.664062,30.367188,30.742188,0.8125,175.26
4,165.1,135.0,8.278564,0.381462,1,9.979004,34.0,135.0,3.0,102.17041,...,162.796875,71.359619,0.0,9.9375,181.141357,5.88208,58.093994,51.280762,3.843018,170.18


In [46]:
df_top_features['B_Reach_cms'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top_features['B_Reach_cms'] = y


In [47]:
# Export to CSV
df_top_features.to_csv('../Punya Andi/UFC_kombinasi5.csv', index=False)
