### Kombinasi 8 :
* Delete Duplicate
* Delete Null
* Outlier capping pake log
* Encoding 
* Standard scaler
* Feature selection -> K-Best

In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("dataframe/UFC_Test_Classif_X.csv")
df.head()

Unnamed: 0,id,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0,Tecia Torres,Juliana Lima,Chris Tognoni,2017-07-07,"Las Vegas, Nevada, USA",False,WomenStrawweight,0.0,0.0,...,4,0,0,0,Orthodox,154.94,152.4,115.0,35.0,27.0
1,1,John Howard,Lorenz Larkin,Herb Dean,2015-01-18,"Boston, Massachusetts, USA",False,Welterweight,0.0,0.25,...,1,2,0,0,Orthodox,170.18,182.88,170.0,28.0,31.0
2,2,Kyle Bochniak,Jeremy Kennedy,Todd Ronald Anderson,2017-07-22,"Uniondale, New York, USA",False,Featherweight,0.0,0.5,...,0,0,0,0,Orthodox,170.18,177.8,145.0,24.0,30.0
3,3,Yao Zhikui,Royston Wee,Steve Perceval,2014-08-23,"Macau, China",False,Bantamweight,0.0,0.0,...,0,0,0,0,Orthodox,165.1,162.56,125.0,27.0,23.0
4,4,Carlos Newton,Pat Miletich,John McCarthy,2001-05-04,"Atlantic City, New Jersey, USA",True,Welterweight,0.0,0.0,...,0,0,1,0,Orthodox,175.26,,170.0,33.0,24.0


In [3]:
# Delete duplicate
print(f"Shape before dropping duplicates : {df.shape}") # Before dropping duplicates
df.drop_duplicates(inplace=True)
print(f"Shape after dropping duplicates: {df.shape}") # After dropping duplicates

Shape before dropping duplicates : (602, 143)
Shape after dropping duplicates: (602, 143)


In [4]:
def check_null(df):
    col_na = df.isnull().sum().sort_values(ascending=True)
    percent = col_na / len(df)
    missing_data = pd.concat([col_na, percent], axis=1, keys=['Total', 'Percent'])

    if (missing_data[missing_data['Total'] > 0].shape[0] == 0):
        print("Tidak ditemukan missing value pada dataset")
    else:
        print(missing_data[missing_data['Total'] > 0])

In [5]:
check_null(df)

                          Total   Percent
Referee                       1  0.001661
R_Stance                      2  0.003322
B_Stance                      2  0.003322
R_age                         7  0.011628
B_age                        11  0.018272
...                         ...       ...
B_avg_CTRL_time(seconds)    134  0.222591
B_avg_opp_GROUND_att        134  0.222591
B_avg_GROUND_landed         134  0.222591
B_avg_BODY_landed           134  0.222591
B_avg_DISTANCE_att          134  0.222591

[104 rows x 2 columns]


In [7]:
df.dropna(inplace=True)

In [9]:
check_null(df)

Tidak ditemukan missing value pada dataset


In [12]:
# Drop R_fighter, B_fighter, Referee, location, and date
df = df.drop(['R_fighter', 'B_fighter', 'Referee', 'location', 'date'], axis=1)

In [13]:
check_null(df)

Tidak ditemukan missing value pada dataset


In [14]:
check_null(df)

Tidak ditemukan missing value pada dataset


In [15]:
# Impute missing R_stance and B_stance with mode
df['R_Stance'].fillna(df['R_Stance'].mode()[0], inplace=True)
df['B_Stance'].fillna(df['B_Stance'].mode()[0], inplace=True)

In [16]:
check_null(df)

Tidak ditemukan missing value pada dataset


In [17]:
def check_outlier(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)

    # Menghitung RUB dan RLB.
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5*IQR
    upper_limit = Q3 + 1.5*IQR

    # Menampilkan banyaknya outlier pada atribut.
    outliers = (df < lower_limit) | (df > upper_limit)
    print ("Outlier pada tiap atribut:")
    print(outliers.sum())

    return outliers

In [18]:
# Split string and number
# Split dataframe into object type and number type
df_object = df.select_dtypes(include='object')
df_number = df.select_dtypes(include=np.number)


In [19]:
outliers = dict(check_outlier(df_number).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number[key].shape[0] * 100}%")

Outlier pada tiap atribut:
id                        0
B_avg_KD                  7
B_avg_opp_KD             25
B_avg_SIG_STR_pct        13
B_avg_opp_SIG_STR_pct    10
                         ..
R_Height_cms              3
R_Reach_cms              11
R_Weight_lbs             20
B_age                     2
R_age                     1
Length: 134, dtype: int64


percentage of outliers in each column:
id = 0.0%
B_avg_KD = 1.6203703703703702%
B_avg_opp_KD = 5.787037037037037%
B_avg_SIG_STR_pct = 3.009259259259259%
B_avg_opp_SIG_STR_pct = 2.314814814814815%
B_avg_TD_pct = 0.0%
B_avg_opp_TD_pct = 3.7037037037037033%
B_avg_SUB_ATT = 6.481481481481481%
B_avg_opp_SUB_ATT = 6.944444444444445%
B_avg_REV = 17.12962962962963%
B_avg_opp_REV = 19.90740740740741%
B_avg_SIG_STR_att = 2.083333333333333%
B_avg_SIG_STR_landed = 2.5462962962962963%
B_avg_opp_SIG_STR_att = 1.8518518518518516%
B_avg_opp_SIG_STR_landed = 2.314814814814815%
B_avg_TOTAL_STR_att = 1.6203703703703702%
B_avg_TOTAL_STR_landed = 2.0

In [20]:
# Add a constant value to make all values positive
df_reach = df_number['B_Reach_cms']
df_number_positive = df_number + abs(df_number.min()) + 1

df_number_log = np.log(df_number_positive)
df_number_log['B_Reach_cms'] = df_reach

KeyError: 'B_Reach_cms'

In [69]:
df_id = df_test_number['id']
df_test_number.drop(['id'], axis=1, inplace=True)
df_test_number_positive = df_test_number + abs(df_test_number.min()) + 1
df_test_number_log = np.log(df_test_number_positive)
df_test_number_log = pd.concat([df_id, df_test_number_log], axis=1)

In [21]:
outliers = dict(check_outlier(df_number_log).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number_log[key].shape[0] * 100}%")

NameError: name 'df_number_log' is not defined

In [71]:
df = pd.concat([df_object, df_number_log], axis=1)

In [72]:
check_null(df_test)

Tidak ditemukan missing value pada dataset


In [22]:
df['gender'] = df['weight_class'].apply(lambda x: 'women' if 'women' in x.lower() else 'male')

In [23]:
df.head()

Unnamed: 0,id,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,0,False,WomenStrawweight,0.0,0.0,0.49375,0.44875,0.475,0.1775,0.0,...,0,0,0,Orthodox,154.94,152.4,115.0,35.0,27.0,women
1,1,False,Welterweight,0.0,0.25,0.473125,0.371875,0.0,0.20375,0.0,...,2,0,0,Orthodox,170.18,182.88,170.0,28.0,31.0,male
2,2,False,Featherweight,0.0,0.5,0.5,0.48,0.615,0.0,0.0,...,0,0,0,Orthodox,170.18,177.8,145.0,24.0,30.0,male
6,6,False,Middleweight,0.5,0.0,0.675,0.55,0.5,0.2,0.0,...,6,1,0,Southpaw,185.42,187.96,185.0,30.0,39.0,male
7,7,False,Middleweight,0.25,0.0,0.57,0.1575,0.595,0.0825,0.5,...,2,0,0,Orthodox,185.42,187.96,185.0,21.0,31.0,male


In [24]:
df['weight_class'] = df['weight_class'].str.replace('Women', '')

In [25]:
# Get all weight_class values
weight_class = df['weight_class'].unique()
weight_class

array(['Strawweight', 'Welterweight', 'Featherweight', 'Middleweight',
       'Bantamweight', 'LightHeavyweight', 'Lightweight', 'Heavyweight',
       'CatchWeight', 'Flyweight'], dtype=object)

In [26]:
# Create a dictionary to map weight_class values to numbers
weight_class_dict = {
    'CatchWeight' : 0,
    'Strawweight' : 1,
    'Flyweight' : 2,
    'Bantamweight' : 3,
    'Featherweight' : 4,
    'Lightweight' : 5,
    'Welterweight' : 6,
    'Middleweight' : 7,
    'LightHeavyweight' : 8,
    'Heavyweight' : 9,
    'OpenWeight' : 10,
}

gender_dict = {
    'male' : 1,
    'women' : 0
}

In [27]:
# Map each weight_class value to the correct number
df['weight_class'] = df['weight_class'].map(weight_class_dict)

In [28]:
# Map each gender value to the correct number
df['gender'] = df['gender'].map(gender_dict)

In [29]:
df.head()

Unnamed: 0,id,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,0,False,1,0.0,0.0,0.49375,0.44875,0.475,0.1775,0.0,...,0,0,0,Orthodox,154.94,152.4,115.0,35.0,27.0,0
1,1,False,6,0.0,0.25,0.473125,0.371875,0.0,0.20375,0.0,...,2,0,0,Orthodox,170.18,182.88,170.0,28.0,31.0,1
2,2,False,4,0.0,0.5,0.5,0.48,0.615,0.0,0.0,...,0,0,0,Orthodox,170.18,177.8,145.0,24.0,30.0,1
6,6,False,7,0.5,0.0,0.675,0.55,0.5,0.2,0.0,...,6,1,0,Southpaw,185.42,187.96,185.0,30.0,39.0,1
7,7,False,7,0.25,0.0,0.57,0.1575,0.595,0.0825,0.5,...,2,0,0,Orthodox,185.42,187.96,185.0,21.0,31.0,1


In [30]:
# Get the remaining object columns
df_object = df.select_dtypes(include='object')

# Perform one-hot encoding
df_encoded = pd.get_dummies(df_object)

# Combine the encoded columns with the original dataframe
df = pd.concat([df, df_encoded], axis=1)

# Drop the original object columns
df.drop(df_object.columns, axis=1, inplace=True)

df.head()

Unnamed: 0,id,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,...,R_age,gender,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,0,False,1,0.0,0.0,0.49375,0.44875,0.475,0.1775,0.0,...,27.0,0,False,True,False,False,False,True,False,False
1,1,False,6,0.0,0.25,0.473125,0.371875,0.0,0.20375,0.0,...,31.0,1,False,True,False,False,False,True,False,False
2,2,False,4,0.0,0.5,0.5,0.48,0.615,0.0,0.0,...,30.0,1,False,True,False,False,False,True,False,False
6,6,False,7,0.5,0.0,0.675,0.55,0.5,0.2,0.0,...,39.0,1,False,False,True,False,False,False,True,False
7,7,False,7,0.25,0.0,0.57,0.1575,0.595,0.0825,0.5,...,31.0,1,False,True,False,False,False,True,False,False


In [82]:
# Get the remaining object columns
df_test_object = df_test.select_dtypes(include='object')

# Perform one-hot encoding
df_test_encoded = pd.get_dummies(df_test_object)

# Combine the encoded columns with the original dataframe
df_test = pd.concat([df_test, df_test_encoded], axis=1)

# Drop the original object columns
df_test.drop(df_test_object.columns, axis=1, inplace=True)

df_test.head()

Unnamed: 0,weight_class,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,...,B_age,R_age,gender,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,1,0.0,0.405465,0.0,0.500775,0.392042,0.0,0.14842,0.0,0.0,...,3.951244,3.7612,0,True,False,False,False,True,False,False
1,3,1.0,0.0,0.0,0.521766,0.398776,0.559616,0.693147,0.405465,0.405465,...,3.951244,3.850148,0,True,False,False,False,True,False,False
2,7,2.0,0.45038,0.363118,0.5326,0.401902,0.004437,0.153746,0.004324,0.226896,...,4.204693,4.060443,1,True,False,False,False,False,True,False
3,2,3.0,0.405465,0.0,0.441476,0.34359,0.223144,0.223144,1.098612,0.0,...,3.871201,3.89182,1,False,True,False,False,True,False,False
4,3,4.0,0.0,0.0,0.239017,0.402126,0.285179,0.165514,0.0,0.0,...,3.89182,3.850148,1,True,False,False,False,False,True,False


In [33]:
df = df.drop([ "R_Stance_Sideways", "B_Stance_Sideways"], axis=1)

KeyError: "['R_Stance_Sideways', 'B_Stance_Sideways'] not found in axis"

In [32]:
df = df.drop(columns="B_Reach_cms")

KeyError: "['B_Reach_cms'] not found in axis"

In [None]:
df.to_csv('../Without Feature Selection/UFC_kombinasi8.csv', index=False)

In [34]:
from sklearn.feature_selection import SelectKBest, f_regression
# Perform feature selection using SelectKBest
selector = SelectKBest(k=25, score_func=f_regression)
X = df.drop('B_Reach_cms', axis=1) 
y = df['B_Reach_cms'] 
X_selected = selector.fit_transform(X, y)

# Get the selected feature names
selected_feature_names = X.columns[selector.get_support()]
print(selected_feature_names)
# Create a new dataframe with only the selected features
df_selected = pd.DataFrame(X_selected, columns=selected_feature_names)
df_selected.head()

KeyError: "['B_Reach_cms'] not found in axis"

In [None]:
y = df['B_Reach_cms'].reset_index(drop=True)

In [None]:
y

0       177.80
1       175.26
2       170.18
3       167.64
4       190.50
         ...  
3475    162.56
3476    190.50
3477    180.34
3478    190.50
3479    195.58
Name: B_Reach_cms, Length: 3480, dtype: float64

In [None]:
df_selected = pd.concat([df_selected, y], axis=1)

In [None]:
df_selected.to_csv('../Punya Andi/UFC_kombinasi8.csv', index=False)

In [None]:
df_test_id = df_test['id']
df_test = df_test.drop(columns=['id'])
df_test = df_test.reindex(columns=df_selected.columns, fill_value=0)
df_test = pd.concat([df_test_id, df_test], axis=1)

In [None]:
df_test.to_csv('../regression_kaggle/UFC_kombinasi8.csv', index=False)

In [None]:
y_has_missing_values = y.isnull().any()
print(f"Does y have any missing values? {y_has_missing_values}")


Does y have any missing values? False


In [None]:
len(y)

3480

In [None]:
df_selected.shape

(3480, 26)

In [None]:
check_null(df_selected)

Tidak ditemukan missing value pada dataset
