In [17]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [18]:
df = pd.read_csv("dataframe/UFC_Test_Classif_X.csv")
df.head()

Unnamed: 0,id,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0,Tecia Torres,Juliana Lima,Chris Tognoni,2017-07-07,"Las Vegas, Nevada, USA",False,WomenStrawweight,0.0,0.0,...,4,0,0,0,Orthodox,154.94,152.4,115.0,35.0,27.0
1,1,John Howard,Lorenz Larkin,Herb Dean,2015-01-18,"Boston, Massachusetts, USA",False,Welterweight,0.0,0.25,...,1,2,0,0,Orthodox,170.18,182.88,170.0,28.0,31.0
2,2,Kyle Bochniak,Jeremy Kennedy,Todd Ronald Anderson,2017-07-22,"Uniondale, New York, USA",False,Featherweight,0.0,0.5,...,0,0,0,0,Orthodox,170.18,177.8,145.0,24.0,30.0
3,3,Yao Zhikui,Royston Wee,Steve Perceval,2014-08-23,"Macau, China",False,Bantamweight,0.0,0.0,...,0,0,0,0,Orthodox,165.1,162.56,125.0,27.0,23.0
4,4,Carlos Newton,Pat Miletich,John McCarthy,2001-05-04,"Atlantic City, New Jersey, USA",True,Welterweight,0.0,0.0,...,0,0,1,0,Orthodox,175.26,,170.0,33.0,24.0


In [19]:
# Delete duplicate
print(f"Shape before dropping duplicates : {df.shape}") # Before dropping duplicates
df.drop_duplicates(inplace=True)
print(f"Shape after dropping duplicates: {df.shape}") # After dropping duplicates

Shape before dropping duplicates : (602, 143)
Shape after dropping duplicates: (602, 143)


In [20]:
def check_null(df):
    col_na = df.isnull().sum().sort_values(ascending=True)
    percent = col_na / len(df)
    missing_data = pd.concat([col_na, percent], axis=1, keys=['Total', 'Percent'])

    if (missing_data[missing_data['Total'] > 0].shape[0] == 0):
        print("Tidak ditemukan missing value pada dataset")
    else:
        print(missing_data[missing_data['Total'] > 0])

In [21]:
check_null(df)

                          Total   Percent
Referee                       1  0.001661
R_Stance                      2  0.003322
B_Stance                      2  0.003322
R_age                         7  0.011628
B_age                        11  0.018272
...                         ...       ...
B_avg_CTRL_time(seconds)    134  0.222591
B_avg_opp_GROUND_att        134  0.222591
B_avg_GROUND_landed         134  0.222591
B_avg_BODY_landed           134  0.222591
B_avg_DISTANCE_att          134  0.222591

[104 rows x 2 columns]


In [22]:
# Split string and number
# Split dataframe into object type and number type
df_object = df.select_dtypes(include='object')
df_number = df.select_dtypes(include=np.number)

In [23]:
df_number_imputed = df_number.fillna(df_number.median())

In [24]:
df

Unnamed: 0,id,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0,Tecia Torres,Juliana Lima,Chris Tognoni,2017-07-07,"Las Vegas, Nevada, USA",False,WomenStrawweight,0.00000,0.000000,...,4,0,0,0,Orthodox,154.94,152.40,115.0,35.0,27.0
1,1,John Howard,Lorenz Larkin,Herb Dean,2015-01-18,"Boston, Massachusetts, USA",False,Welterweight,0.00000,0.250000,...,1,2,0,0,Orthodox,170.18,182.88,170.0,28.0,31.0
2,2,Kyle Bochniak,Jeremy Kennedy,Todd Ronald Anderson,2017-07-22,"Uniondale, New York, USA",False,Featherweight,0.00000,0.500000,...,0,0,0,0,Orthodox,170.18,177.80,145.0,24.0,30.0
3,3,Yao Zhikui,Royston Wee,Steve Perceval,2014-08-23,"Macau, China",False,Bantamweight,0.00000,0.000000,...,0,0,0,0,Orthodox,165.10,162.56,125.0,27.0,23.0
4,4,Carlos Newton,Pat Miletich,John McCarthy,2001-05-04,"Atlantic City, New Jersey, USA",True,Welterweight,0.00000,0.000000,...,0,0,1,0,Orthodox,175.26,,170.0,33.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,597,Julianna Pena,Jessica Rakoczy,Mario Yamasaki,2013-11-30,"Las Vegas, Nevada, USA",True,WomenBantamweight,,,...,0,0,0,0,Orthodox,167.64,175.26,135.0,36.0,24.0
598,598,Jake Matthews,Andrew Holbrook,John Sharp,2016-11-26,"Melbourne, Victoria, Australia",False,Lightweight,0.00000,0.500000,...,0,0,3,1,Orthodox,180.34,185.42,170.0,30.0,22.0
599,599,Gregor Gillespie,Yancy Medeiros,Todd Ronald Anderson,2019-01-19,"Brooklyn, New York, USA",False,Lightweight,0.80127,2.015625,...,1,2,2,0,Orthodox,170.18,180.34,155.0,31.0,32.0
600,600,Pat Barry,Shawn Jordan,Jerin Valel,2013-06-15,"Winnipeg, Manitoba, Canada",False,Heavyweight,0.00000,0.000000,...,1,4,0,0,Orthodox,180.34,187.96,235.0,28.0,33.0


In [25]:
# Drop R_fighter, B_fighter, Referee, location, and date
df_object = df_object.drop(['R_fighter', 'B_fighter', 'Referee', 'location', 'date'], axis=1)

In [26]:
# Combine df_object and df_imputed_number
df = pd.concat([df_object, df_number_imputed], axis=1)
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,WomenStrawweight,Orthodox,Orthodox,0,0.0,0.0,0.49375,0.44875,0.475,0.1775,...,0,4,0,0,0,154.94,152.4,115.0,35.0,27.0
1,Welterweight,Orthodox,Orthodox,1,0.0,0.25,0.473125,0.371875,0.0,0.20375,...,3,1,2,0,0,170.18,182.88,170.0,28.0,31.0
2,Featherweight,Orthodox,Orthodox,2,0.0,0.5,0.5,0.48,0.615,0.0,...,1,0,0,0,0,170.18,177.8,145.0,24.0,30.0
3,Bantamweight,Orthodox,Orthodox,3,0.0,0.0,0.66,0.69,0.85,0.0,...,0,0,0,0,0,165.1,162.56,125.0,27.0,23.0
4,Welterweight,Orthodox,Orthodox,4,0.0,0.0,0.505312,0.439375,0.815937,0.067187,...,0,0,0,1,0,175.26,182.88,170.0,33.0,24.0


In [27]:
# Impute missing R_stance and B_stance with mode
df['R_Stance'].fillna(df['R_Stance'].mode()[0], inplace=True)
df['B_Stance'].fillna(df['B_Stance'].mode()[0], inplace=True)

In [28]:
check_null(df)

Tidak ditemukan missing value pada dataset


In [29]:
def check_outlier(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)

    # Menghitung RUB dan RLB.
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5*IQR
    upper_limit = Q3 + 1.5*IQR

    # Menampilkan banyaknya outlier pada atribut.
    outliers = (df < lower_limit) | (df > upper_limit)
    print ("Outlier pada tiap atribut:")
    print(outliers.sum())

    return outliers

In [30]:
# Split string and number
# Split dataframe into object type and number type
df_object = df.select_dtypes(include='object')
df_number = df.select_dtypes(include=np.number)

In [31]:
outliers = dict(check_outlier(df_number).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number[key].shape[0] * 100}%")

Outlier pada tiap atribut:
id                         0
B_avg_KD                  49
B_avg_opp_KD             126
B_avg_SIG_STR_pct         66
B_avg_opp_SIG_STR_pct     54
                        ... 
R_Height_cms               3
R_Reach_cms               11
R_Weight_lbs              32
B_age                     11
R_age                      1
Length: 134, dtype: int64


percentage of outliers in each column:
id = 0.0%
B_avg_KD = 8.13953488372093%
B_avg_opp_KD = 20.930232558139537%
B_avg_SIG_STR_pct = 10.96345514950166%
B_avg_opp_SIG_STR_pct = 8.970099667774086%
B_avg_TD_pct = 4.318936877076411%
B_avg_opp_TD_pct = 6.64451827242525%
B_avg_SUB_ATT = 7.807308970099667%
B_avg_opp_SUB_ATT = 5.647840531561462%
B_avg_REV = 21.428571428571427%
B_avg_opp_REV = 23.920265780730897%
B_avg_SIG_STR_att = 5.48172757475083%
B_avg_SIG_STR_landed = 4.485049833887043%
B_avg_opp_SIG_STR_att = 4.983388704318937%
B_avg_opp_SIG_STR_landed = 5.3156146179401995%
B_avg_TOTAL_STR_att = 5.3156146179401995%
B_avg_

In [32]:
# Add a constant value to make all values positive
df_reach = df_number['B_Reach_cms']
df_number_positive = df_number + abs(df_number.min()) + 1

df_number_log = np.log(df_number_positive)
df_number_log['B_Reach_cms'] = df_reach

KeyError: 'B_Reach_cms'

In [None]:
df_id = df_test_number['id']
df_test_number.drop(['id'], axis=1, inplace=True)
df_test_number_positive = df_test_number + abs(df_test_number.min()) + 1
df_test_number_log = np.log(df_test_number_positive)
df_test_number_log = pd.concat([df_id, df_test_number_log], axis=1)

In [None]:
df_number_log

Unnamed: 0,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,B_avg_opp_REV,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0.015504,0.000000,0.371564,0.355925,0.223144,0.182322,0.145182,0.117783,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,5.817944,5.802722,5.707110,3.806662,3.713572
1,0.768371,0.000000,0.332278,0.301897,0.214481,0.011419,0.145182,0.124703,0.000000,0.000000,...,0.000000,1.609438,0.000000,0.693147,0.000000,5.810362,5.832938,5.655992,3.828641,3.912023
2,0.015504,0.000000,0.371564,0.355925,0.223144,0.182322,0.145182,0.117783,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,5.825469,5.817944,5.655992,3.850148,3.931826
3,0.527867,0.000000,0.578499,0.170204,0.084600,0.099280,0.089612,0.060625,0.000000,0.000000,...,0.693147,1.791759,1.945910,1.386294,0.000000,5.802722,5.825469,5.655992,3.850148,3.988984
4,0.405465,0.236337,0.323143,0.376076,0.357405,0.384970,0.866230,0.000244,0.117783,0.405465,...,0.000000,1.098612,1.386294,0.000000,0.693147,5.747544,5.771566,5.525453,3.970292,3.931826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5405,0.117783,0.000000,0.392886,0.435024,0.543051,0.361165,0.485508,0.318454,0.810930,0.117783,...,0.000000,0.693147,0.000000,0.000000,0.000000,5.795024,5.810362,5.602119,3.806662,3.891820
5406,0.015504,0.000000,0.325686,0.272796,0.478694,0.183688,0.441275,0.286705,0.009718,0.119518,...,0.000000,1.609438,0.693147,1.791759,0.000000,5.825469,5.817944,5.655992,3.850148,3.988984
5407,0.223144,0.223144,0.471565,0.415415,0.285179,0.285179,0.223144,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,5.795024,5.802722,5.564520,3.761200,3.912023
5408,0.628609,0.117783,0.433810,0.355049,0.030772,0.149497,0.000000,0.015504,0.000000,0.000000,...,0.693147,1.609438,0.693147,0.000000,0.000000,5.795024,5.810362,5.602119,3.828641,3.988984


In [None]:
outliers = dict(check_outlier(df_number_log).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number_log[key].shape[0] * 100}%")

NameError: name 'df_number_log' is not defined

In [None]:
df = pd.concat([df_object, df_number_log], axis=1)
df_test = pd.concat([df_test_object, df_test_number_log], axis=1)

In [33]:
df['gender'] = df['weight_class'].apply(lambda x: 'women' if 'women' in x.lower() else 'male')

In [34]:
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,WomenStrawweight,Orthodox,Orthodox,0,0.0,0.0,0.49375,0.44875,0.475,0.1775,...,4,0,0,0,154.94,152.4,115.0,35.0,27.0,women
1,Welterweight,Orthodox,Orthodox,1,0.0,0.25,0.473125,0.371875,0.0,0.20375,...,1,2,0,0,170.18,182.88,170.0,28.0,31.0,male
2,Featherweight,Orthodox,Orthodox,2,0.0,0.5,0.5,0.48,0.615,0.0,...,0,0,0,0,170.18,177.8,145.0,24.0,30.0,male
3,Bantamweight,Orthodox,Orthodox,3,0.0,0.0,0.66,0.69,0.85,0.0,...,0,0,0,0,165.1,162.56,125.0,27.0,23.0,male
4,Welterweight,Orthodox,Orthodox,4,0.0,0.0,0.505312,0.439375,0.815937,0.067187,...,0,0,1,0,175.26,182.88,170.0,33.0,24.0,male


In [35]:
df['weight_class'] = df['weight_class'].str.replace('Women', '')

In [36]:
# Get all weight_class values
weight_class = df['weight_class'].unique()
weight_class

array(['Strawweight', 'Welterweight', 'Featherweight', 'Bantamweight',
       'Middleweight', 'Lightweight', 'LightHeavyweight', 'Heavyweight',
       'CatchWeight', 'OpenWeight', 'Flyweight'], dtype=object)

In [37]:
# Create a dictionary to map weight_class values to numbers
weight_class_dict = {
    'CatchWeight' : 0,
    'Strawweight' : 1,
    'Flyweight' : 2,
    'Bantamweight' : 3,
    'Featherweight' : 4,
    'Lightweight' : 5,
    'Welterweight' : 6,
    'Middleweight' : 7,
    'LightHeavyweight' : 8,
    'Heavyweight' : 9,
    'OpenWeight' : 10,
}

gender_dict = {
    'male' : 1,
    'women' : 0
}

In [38]:
# Map each weight_class value to the correct number
df['weight_class'] = df['weight_class'].map(weight_class_dict)

In [39]:
# Map each gender value to the correct number
df['gender'] = df['gender'].map(gender_dict)

In [40]:
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,1,Orthodox,Orthodox,0,0.0,0.0,0.49375,0.44875,0.475,0.1775,...,4,0,0,0,154.94,152.4,115.0,35.0,27.0,0
1,6,Orthodox,Orthodox,1,0.0,0.25,0.473125,0.371875,0.0,0.20375,...,1,2,0,0,170.18,182.88,170.0,28.0,31.0,1
2,4,Orthodox,Orthodox,2,0.0,0.5,0.5,0.48,0.615,0.0,...,0,0,0,0,170.18,177.8,145.0,24.0,30.0,1
3,3,Orthodox,Orthodox,3,0.0,0.0,0.66,0.69,0.85,0.0,...,0,0,0,0,165.1,162.56,125.0,27.0,23.0,1
4,6,Orthodox,Orthodox,4,0.0,0.0,0.505312,0.439375,0.815937,0.067187,...,0,0,1,0,175.26,182.88,170.0,33.0,24.0,1


In [41]:
# Get the remaining object columns
df_object = df.select_dtypes(include='object')

# Perform one-hot encoding
df_encoded = pd.get_dummies(df_object)

# Combine the encoded columns with the original dataframe
df = pd.concat([df, df_encoded], axis=1)

# Drop the original object columns
df.drop(df_object.columns, axis=1, inplace=True)

df.head()

Unnamed: 0,weight_class,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,...,R_age,gender,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,1,0,0.0,0.0,0.49375,0.44875,0.475,0.1775,0.0,0.125,...,27.0,0,False,True,False,False,False,True,False,False
1,6,1,0.0,0.25,0.473125,0.371875,0.0,0.20375,0.0,0.5,...,31.0,1,False,True,False,False,False,True,False,False
2,4,2,0.0,0.5,0.5,0.48,0.615,0.0,0.0,0.0,...,30.0,1,False,True,False,False,False,True,False,False
3,3,3,0.0,0.0,0.66,0.69,0.85,0.0,0.0,2.0,...,23.0,1,False,True,False,False,False,True,False,False
4,6,4,0.0,0.0,0.505312,0.439375,0.815937,0.067187,1.28125,0.640625,...,24.0,1,False,True,False,False,False,True,False,False


In [42]:
# Get the remaining object columns
df_test_object = df_test.select_dtypes(include='object')

# Perform one-hot encoding
df_test_encoded = pd.get_dummies(df_test_object)

# Combine the encoded columns with the original dataframe
df_test = pd.concat([df_test, df_test_encoded], axis=1)

# Drop the original object columns
df_test.drop(df_test_object.columns, axis=1, inplace=True)

df_test.head()

NameError: name 'df_test' is not defined

In [None]:
df.to_csv('../UFC_Pre_kombinasi6.csv', index=False)

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
# Perform feature selection using SelectKBest
selector = SelectKBest(k=25, score_func=f_regression)
X = df.drop('B_Reach_cms', axis=1) 
y = df['B_Reach_cms'] 
X_selected = selector.fit_transform(X, y)

# Get the selected feature names
selected_feature_names = X.columns[selector.get_support()]
print(selected_feature_names)
# Create a new dataframe with only the selected features
df_selected = pd.DataFrame(X_selected, columns=selected_feature_names)
df_selected.head()

ValueError: could not convert string to float: 'Red'

In [None]:
print(df.shape)

(5410, 146)


In [None]:
df_selected['B_Reach_cms'] = y

In [None]:
df_selected.to_csv('../Punya Andi/UFC_kombinasi6.csv', index=False)

In [None]:
df_test_id = df_test['id']
df_test = df_test.drop(columns=['id'])
df_test = df_test.reindex(columns=df_selected.columns, fill_value=0)
df_test = pd.concat([df_test_id, df_test], axis=1)

In [None]:
df_test.to_csv('../regression_kaggle/UFC_kombinasi6.csv', index=False)