### Kombinasi 4
* Delete Duplicate
* Impute nulls using the iterative imputer
* Outlier capping using a log transform
* Encoding
* Standard scaler
* Feature selection -> K-Best

In [210]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [211]:
# Load the training data and the regression test features (paths are relative to the notebook's working directory).
df = pd.read_csv("../UFC_train.csv")
df_test = pd.read_csv('../regression/New_UFC_Test_Regression_X(3).csv')
# Preview the first training rows.
df.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,Winner
0,Joe Riggs,Joe Doerksen,Steve Mazzagatti,2004-08-21,"Las Vegas, Nevada, USA",False,Middleweight,,,,...,0,0,0,Southpaw,182.88,177.8,185.0,26.0,21.0,Red
1,Jorge Masvidal,Al Iaquinta,Keith Peterson,2015-04-04,"Fairfax, Virginia, USA",False,Lightweight,1.15625,0.0,0.394141,...,0,1,0,Orthodox,180.34,187.96,170.0,27.0,30.0,Blue
2,Dan Stittgen,Stephen Thompson,Josh Rosenthal,2012-02-04,"Las Vegas, Nevada, USA",False,Welterweight,,,,...,0,0,0,Orthodox,185.42,,170.0,28.0,31.0,Blue
3,Josh Koscheck,Johny Hendricks,Kevin Mulhall,2012-05-05,"East Rutherford, New Jersey, USA",False,Welterweight,0.695312,0.0,0.783359,...,6,3,0,Orthodox,177.8,185.42,170.0,28.0,34.0,Blue
4,John Dodson,Manvel Gamburyan,James Warring,2016-04-16,"Tampa, Florida, USA",False,Bantamweight,0.5,0.266602,0.381462,...,3,0,1,Orthodox,160.02,167.64,135.0,34.0,31.0,Red


In [212]:
# Delete duplicate rows.
# The index is rebuilt afterwards: later cells turn NumPy arrays back into
# DataFrames (which get a fresh 0..n-1 index) and concatenate them column-wise
# with slices of `df`. If dropped rows left gaps in the index, that concat
# would misalign rows and introduce NaNs.
print(f"Shape before dropping duplicates : {df.shape}") # Before dropping duplicates
df = df.drop_duplicates().reset_index(drop=True)
print(f"Shape after dropping duplicates: {df.shape}") # After dropping duplicates

Shape before dropping duplicates : (5410, 144)
Shape after dropping duplicates: (5410, 144)


In [213]:
def check_null(df):
    """Print a per-column summary of missing values.

    For every column of ``df`` the number of nulls ('Total') and the share of
    rows that are null ('Percent', a fraction of ``len(df)``) are computed and
    ordered by ascending null count. Only columns with at least one null are
    printed; if there are none, a single message is printed instead.

    Returns None; the function only prints.
    """
    null_counts = df.isnull().sum().sort_values(ascending=True)
    null_share = null_counts / len(df)
    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])

    # Keep only the columns that actually contain missing values
    with_nulls = summary[summary['Total'] > 0]

    if with_nulls.shape[0] > 0:
        print(with_nulls)
    else:
        print("Tidak ditemukan missing value pada dataset")

In [214]:
# Report which training columns contain missing values before any imputation.
check_null(df)

                         Total   Percent
R_Weight_lbs                 2  0.000370
R_Height_cms                 4  0.000739
B_Weight_lbs                 8  0.001479
B_Height_cms                10  0.001848
R_Stance                    27  0.004991
...                        ...       ...
B_avg_GROUND_landed       1293  0.239002
B_avg_GROUND_att          1293  0.239002
B_avg_opp_CLINCH_landed   1293  0.239002
B_avg_TD_att              1293  0.239002
B_avg_HEAD_landed         1293  0.239002

[109 rows x 2 columns]


In [215]:
# Partition the train and test frames by dtype: text (object) columns on one
# side, numeric columns on the other.
# NOTE(review): a column that is neither object nor numeric (e.g. a boolean
# flag) lands in neither partition — confirm that is intended.
df_object, df_test_object = (
    frame.select_dtypes(include='object') for frame in (df, df_test)
)
df_number, df_test_number = (
    frame.select_dtypes(include=np.number) for frame in (df, df_test)
)

In [216]:
# Impute missing numeric values using the iterative imputer.
# `enable_iterative_imputer` has to be imported before IterativeImputer: the
# estimator is still experimental in scikit-learn.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import ExtraTreeRegressor

imputer = IterativeImputer(initial_strategy='median', random_state=42, estimator=ExtraTreeRegressor(random_state=42), max_iter=15)

# fit_transform returns a bare NumPy array. Rebuild the frame with the ORIGINAL
# index (not a fresh 0..n-1 one) so that the later column-wise concat with the
# object columns lines up row for row even when the index has gaps.
df_imputed_number = pd.DataFrame(
    imputer.fit_transform(df_number),
    columns=df_number.columns,
    index=df_number.index,
)

# NOTE(review): the imputer is re-fitted on the test features instead of
# reusing the fit from the training data. The two frames do not share the same
# numeric columns at this point (the test frame is selected straight from the
# raw test file), so a plain `imputer.transform` is not a drop-in replacement
# — revisit if train/test consistency of the imputation matters.
df_test_imputed_number = pd.DataFrame(
    imputer.fit_transform(df_test_number),
    columns=df_test_number.columns,
    index=df_test_number.index,
)



In [217]:
# Remove the free-text identifier columns (fighters, referee, location, date)
# from both frames; the training frame additionally loses 'Winner'.
identifier_columns = ['R_fighter', 'B_fighter', 'Referee', 'location', 'date']
df_object = df_object.drop(columns=identifier_columns + ['Winner'])
df_test_object = df_test_object.drop(columns=identifier_columns)

In [218]:
# Combine df_object and df_imputed_number
# The column-wise concat aligns rows on the index, so both parts must carry matching index labels.
df = pd.concat([df_object, df_imputed_number], axis=1)
df_test = pd.concat([df_test_object, df_test_imputed_number], axis=1)
# Preview the recombined training frame.
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Middleweight,Orthodox,Southpaw,0.0,0.0,0.43,0.44375,1.0,0.165,0.0,...,0.0,0.0,0.0,0.0,0.0,182.88,177.8,185.0,26.0,21.0
1,Lightweight,Orthodox,Orthodox,1.15625,0.0,0.394141,0.352422,0.239219,0.011484,0.15625,...,0.0,4.0,0.0,1.0,0.0,180.34,187.96,170.0,27.0,30.0
2,Welterweight,Orthodox,Orthodox,0.5,0.0,0.4525,0.313125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,185.42,185.42,170.0,28.0,31.0
3,Welterweight,Southpaw,Orthodox,0.695312,0.0,0.783359,0.185547,0.088281,0.104375,0.09375,...,1.0,5.0,6.0,3.0,0.0,177.8,185.42,170.0,28.0,34.0
4,Bantamweight,Orthodox,Orthodox,0.5,0.266602,0.381462,0.456558,0.429614,0.46957,1.37793,...,0.0,2.0,3.0,0.0,1.0,160.02,167.64,135.0,34.0,31.0


In [219]:
# Impute missing R_Stance and B_Stance with the most frequent value (mode) of
# the respective column in the respective frame.
# The result is assigned back explicitly: `frame[col].fillna(..., inplace=True)`
# works on an intermediate Series, which raises a FutureWarning in pandas 2.x
# and leaves the frame unchanged once Copy-on-Write is active.
for frame in (df, df_test):
    for stance_col in ('R_Stance', 'B_Stance'):
        frame[stance_col] = frame[stance_col].fillna(frame[stance_col].mode()[0])

In [220]:
# Verify that the imputation steps left no missing values in the training frame.
check_null(df)

Tidak ditemukan missing value pada dataset


In [221]:
def check_outlier(df):
    """Flag IQR outliers in every column of a numeric DataFrame.

    A value counts as an outlier when it lies below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR of its own column. The number of flagged values per column
    is printed.

    Returns a boolean DataFrame shaped like ``df`` (True marks an outlier).
    """
    first_quartile, third_quartile = df.quantile(0.25), df.quantile(0.75)

    # Tukey fences: 1.5 interquartile ranges beyond each quartile
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5*spread
    high_fence = third_quartile + 1.5*spread

    # Anything outside the fences of its column is marked as an outlier
    outliers = df.lt(low_fence) | df.gt(high_fence)

    # Show how many outliers each attribute has
    print ("Outlier pada tiap atribut:")
    print(outliers.sum())

    return outliers

In [222]:
# Re-partition the (now imputed) train and test frames into their text
# (object) columns and their numeric columns.
df_object, df_test_object = (
    frame.select_dtypes(include='object') for frame in (df, df_test)
)
df_number, df_test_number = (
    frame.select_dtypes(include=np.number) for frame in (df, df_test)
)

In [223]:
# Count the IQR outliers of every numeric column, then express each count as a
# percentage of the number of rows.
outliers = dict(check_outlier(df_number).sum())
print("\n\npercentage of outliers in each column:")
for column, n_flagged in outliers.items():
    print(f"{column} = {n_flagged/df_number[column].shape[0] * 100}%")

Outlier pada tiap atribut:
B_avg_KD                 476
B_avg_opp_KD             646
B_avg_SIG_STR_pct        171
B_avg_opp_SIG_STR_pct    143
B_avg_TD_pct               0
                        ... 
R_Height_cms              19
R_Reach_cms               85
R_Weight_lbs             301
B_age                     31
R_age                     75
Length: 134, dtype: int64


percentage of outliers in each column:
B_avg_KD = 8.79852125693161%
B_avg_opp_KD = 11.940850277264325%
B_avg_SIG_STR_pct = 3.1608133086876156%
B_avg_opp_SIG_STR_pct = 2.6432532347504623%
B_avg_TD_pct = 0.0%
B_avg_opp_TD_pct = 0.0%
B_avg_SUB_ATT = 6.506469500924214%
B_avg_opp_SUB_ATT = 6.691312384473198%
B_avg_REV = 17.689463955637706%
B_avg_opp_REV = 17.13493530499076%
B_avg_SIG_STR_att = 3.3086876155268024%
B_avg_SIG_STR_landed = 2.7356746765249538%
B_avg_opp_SIG_STR_att = 3.6229205175600745%
B_avg_opp_SIG_STR_landed = 2.9944547134935307%
B_avg_TOTAL_STR_att = 1.77449168207024%
B_avg_TOTAL_STR_landed = 1.7190388170055

In [224]:
# Log-transform the numeric training columns.
# Each column is first shifted by |column minimum| + 1 so every value is
# strictly positive before the logarithm is taken.
df_number_reach = df['B_Reach_cms']
df_number_positive = df_number.add(df_number.min().abs()).add(1)
# 'B_Reach_cms' is put back on its original (untransformed) scale
df_number_log = np.log(df_number_positive).assign(B_Reach_cms=df_number_reach)

In [225]:
# Log-transform the numeric test columns with the SAME per-column offsets as
# the training data. Shifting the test set by its own minima would map
# identical raw values to different log values in train and test.

# `id` is an identifier, not a feature: keep it aside, untouched by the
# transform. It is read from df_test and dropped without `inplace`, so this
# cell can be re-run without failing on an already-removed column.
df_id = df_test['id']
df_test_number = df_test_number.drop(columns=['id'], errors='ignore')

# Offsets learned on the training frame (|min| + 1 per column), aligned to the
# test columns. A test column unknown to the training frame falls back to an
# offset derived from the test data itself.
train_offset = (abs(df_number.min()) + 1).reindex(df_test_number.columns)
train_offset = train_offset.fillna(abs(df_test_number.min()) + 1)

# NOTE(review): a test value far below the training minimum would still end up
# non-positive here and produce -inf/NaN in the log — check if that can occur.
df_test_number_positive = df_test_number + train_offset
df_test_number_log = np.log(df_test_number_positive)
df_test_number_log = pd.concat([df_id, df_test_number_log], axis=1)

In [226]:
# Inspect the log-transformed numeric training columns.
df_number_log

Unnamed: 0,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,B_avg_opp_REV,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0.000000,0.000000,0.357674,0.367244,0.693147,0.152721,0.000000,0.693147,0.117783,0.693147,...,0.000000,0.000000,0.000000,0.000000,0.000000,5.817944,5.802722,5.707110,3.806662,3.713572
1,0.768371,0.000000,0.332278,0.301897,0.214481,0.011419,0.145182,0.124703,0.000000,0.000000,...,0.000000,1.609438,0.000000,0.693147,0.000000,5.810362,5.832938,5.655992,3.828641,3.912023
2,0.405465,0.000000,0.373286,0.272410,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,5.825469,5.825469,5.655992,3.850148,3.931826
3,0.527867,0.000000,0.578499,0.170204,0.084600,0.099280,0.089612,0.060625,0.000000,0.000000,...,0.693147,1.791759,1.945910,1.386294,0.000000,5.802722,5.825469,5.655992,3.850148,3.988984
4,0.405465,0.236337,0.323143,0.376076,0.357405,0.384970,0.866230,0.000244,0.117783,0.405465,...,0.000000,1.098612,1.386294,0.000000,0.693147,5.747544,5.771566,5.525453,3.970292,3.931826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5405,0.117783,0.000000,0.392886,0.435024,0.543051,0.361165,0.485508,0.318454,0.810930,0.117783,...,0.000000,0.693147,0.000000,0.000000,0.000000,5.795024,5.810362,5.602119,3.806662,3.891820
5406,0.015504,0.000000,0.325686,0.272796,0.478694,0.183688,0.441275,0.286705,0.009718,0.119518,...,0.000000,1.609438,0.693147,1.791759,0.000000,5.825469,5.817944,5.655992,3.850148,3.988984
5407,0.223144,0.223144,0.471565,0.415415,0.285179,0.285179,0.223144,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,5.795024,5.802722,5.564520,3.761200,3.912023
5408,0.628609,0.117783,0.433810,0.355049,0.030772,0.149497,0.000000,0.015504,0.000000,0.000000,...,0.693147,1.609438,0.693147,0.000000,0.000000,5.795024,5.810362,5.602119,3.828641,3.988984


In [227]:
# Repeat the IQR outlier count on the log-transformed columns and report each
# count as a percentage of the number of rows.
outliers = dict(check_outlier(df_number_log).sum())
print("\n\npercentage of outliers in each column:")
for column, n_flagged in outliers.items():
    print(f"{column} = {n_flagged/df_number_log[column].shape[0] * 100}%")

Outlier pada tiap atribut:
B_avg_KD                 409
B_avg_opp_KD             578
B_avg_SIG_STR_pct        166
B_avg_opp_SIG_STR_pct    123
B_avg_TD_pct               0
                        ... 
R_Height_cms              19
R_Reach_cms               85
R_Weight_lbs             210
B_age                     21
R_age                     66
Length: 134, dtype: int64


percentage of outliers in each column:
B_avg_KD = 7.560073937153419%
B_avg_opp_KD = 10.683918669131238%
B_avg_SIG_STR_pct = 3.0683918669131236%
B_avg_opp_SIG_STR_pct = 2.2735674676524953%
B_avg_TD_pct = 0.0%
B_avg_opp_TD_pct = 0.0%
B_avg_SUB_ATT = 0.9796672828096119%
B_avg_opp_SUB_ATT = 2.051756007393715%
B_avg_REV = 11.34935304990758%
B_avg_opp_REV = 16.6728280961183%
B_avg_SIG_STR_att = 4.288354898336414%
B_avg_SIG_STR_landed = 3.6598890942698707%
B_avg_opp_SIG_STR_att = 5.157116451016636%
B_avg_opp_SIG_STR_landed = 3.7523105360443623%
B_avg_TOTAL_STR_att = 4.473197781885398%
B_avg_TOTAL_STR_landed = 3.53049907578558

In [228]:
# Reassemble the full frames: text columns plus the log-transformed numeric columns
# (the test part also carries the `id` column). Rows are aligned on the index.
df = pd.concat([df_object, df_number_log], axis=1)
df_test = pd.concat([df_test_object, df_test_number_log], axis=1)

In [229]:
# Derive a gender label from the division name: 'women' when the name contains
# "women" (case-insensitive), otherwise 'male'. This has to happen while the
# division names still carry that marker.
for frame in (df, df_test):
    frame['gender'] = [
        'women' if 'women' in division.lower() else 'male'
        for division in frame['weight_class']
    ]

In [230]:
# Preview the training frame with the new `gender` column.
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,Middleweight,Orthodox,Southpaw,0.0,0.0,0.357674,0.367244,0.693147,0.152721,0.0,...,0.0,0.0,0.0,0.0,5.817944,5.802722,5.70711,3.806662,3.713572,male
1,Lightweight,Orthodox,Orthodox,0.768371,0.0,0.332278,0.301897,0.214481,0.011419,0.145182,...,1.609438,0.0,0.693147,0.0,5.810362,5.832938,5.655992,3.828641,3.912023,male
2,Welterweight,Orthodox,Orthodox,0.405465,0.0,0.373286,0.27241,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.825469,5.825469,5.655992,3.850148,3.931826,male
3,Welterweight,Southpaw,Orthodox,0.527867,0.0,0.578499,0.170204,0.0846,0.09928,0.089612,...,1.791759,1.94591,1.386294,0.0,5.802722,5.825469,5.655992,3.850148,3.988984,male
4,Bantamweight,Orthodox,Orthodox,0.405465,0.236337,0.323143,0.376076,0.357405,0.38497,0.86623,...,1.098612,1.386294,0.0,0.693147,5.747544,5.771566,5.525453,3.970292,3.931826,male


In [231]:
# Remove the literal text "Women" from the division names so that both genders
# share one set of weight-class labels (gender is kept in its own column).
for frame in (df, df_test):
    frame['weight_class'] = frame['weight_class'].str.replace('Women', '', regex=False)

In [232]:
# Get all weight_class values
# (used to check which labels the mapping dictionary has to cover)
weight_class = df['weight_class'].unique()
weight_class

array(['Middleweight', 'Lightweight', 'Welterweight', 'Bantamweight',
       'Flyweight', 'LightHeavyweight', 'Strawweight', 'Featherweight',
       'OpenWeight', 'Heavyweight', 'CatchWeight'], dtype=object)

In [233]:
# Ordinal codes for the weight classes: a label's position in the sequence
# below is its code (0..10). The named divisions run from Strawweight (1) to
# Heavyweight (9); CatchWeight and OpenWeight sit at the two ends.
# NOTE(review): presumably ordered lightest to heaviest — confirm the
# placement of CatchWeight/OpenWeight is intended.
weight_class_dict = {
    label: code
    for code, label in enumerate((
        'CatchWeight',
        'Strawweight',
        'Flyweight',
        'Bantamweight',
        'Featherweight',
        'Lightweight',
        'Welterweight',
        'Middleweight',
        'LightHeavyweight',
        'Heavyweight',
        'OpenWeight',
    ))
}

# Binary code for the derived gender label
gender_dict = dict(male=1, women=0)

In [234]:
# Map each weight_class value to the correct number
# NOTE: Series.map yields NaN for any label that is missing from weight_class_dict.
df['weight_class'] = df['weight_class'].map(weight_class_dict)
df_test['weight_class'] = df_test['weight_class'].map(weight_class_dict)

In [235]:
# Map each gender value to the correct number
# ('male' -> 1, 'women' -> 0, as defined in gender_dict)
df['gender'] = df['gender'].map(gender_dict)
df_test['gender'] = df_test['gender'].map(gender_dict)

In [236]:
# Keep an independent copy of the frame as it is before one-hot encoding.
# NOTE(review): df_classif is not referenced again in the cells visible here.
df_classif = df.copy()
df_classif.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,7,Orthodox,Southpaw,0.0,0.0,0.357674,0.367244,0.693147,0.152721,0.0,...,0.0,0.0,0.0,0.0,5.817944,5.802722,5.70711,3.806662,3.713572,1
1,5,Orthodox,Orthodox,0.768371,0.0,0.332278,0.301897,0.214481,0.011419,0.145182,...,1.609438,0.0,0.693147,0.0,5.810362,5.832938,5.655992,3.828641,3.912023,1
2,6,Orthodox,Orthodox,0.405465,0.0,0.373286,0.27241,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.825469,5.825469,5.655992,3.850148,3.931826,1
3,6,Southpaw,Orthodox,0.527867,0.0,0.578499,0.170204,0.0846,0.09928,0.089612,...,1.791759,1.94591,1.386294,0.0,5.802722,5.825469,5.655992,3.850148,3.988984,1
4,3,Orthodox,Orthodox,0.405465,0.236337,0.323143,0.376076,0.357405,0.38497,0.86623,...,1.098612,1.386294,0.0,0.693147,5.747544,5.771566,5.525453,3.970292,3.931826,1


In [237]:
# One-hot encode every text column that is still left in the training frame
df_object = df.select_dtypes(include='object')
df_encoded = pd.get_dummies(df_object)

# Replace the raw text columns with their indicator columns, which are
# appended on the right-hand side of the frame
df = pd.concat([df.drop(columns=df_object.columns), df_encoded], axis=1)

df.head()

Unnamed: 0,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,...,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch
0,7,0.0,0.0,0.357674,0.367244,0.693147,0.152721,0.0,0.693147,0.117783,...,False,True,False,False,False,False,False,False,True,False
1,5,0.768371,0.0,0.332278,0.301897,0.214481,0.011419,0.145182,0.124703,0.0,...,False,True,False,False,False,False,True,False,False,False
2,6,0.405465,0.0,0.373286,0.27241,0.0,0.0,0.0,0.0,0.0,...,False,True,False,False,False,False,True,False,False,False
3,6,0.527867,0.0,0.578499,0.170204,0.0846,0.09928,0.089612,0.060625,0.0,...,False,False,False,True,False,False,True,False,False,False
4,3,0.405465,0.236337,0.323143,0.376076,0.357405,0.38497,0.86623,0.000244,0.117783,...,False,True,False,False,False,False,True,False,False,False


In [238]:
# One-hot encode every text column that is still left in the test frame.
# NOTE(review): the indicator columns are derived from the test data alone, so
# categories that do not occur in the test set get no column here.
df_test_object = df_test.select_dtypes(include='object')
df_test_encoded = pd.get_dummies(df_test_object)

# Replace the raw text columns with their indicator columns, which are
# appended on the right-hand side of the frame
df_test = pd.concat([df_test.drop(columns=df_test_object.columns), df_test_encoded], axis=1)

df_test.head()

Unnamed: 0,weight_class,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,...,B_age,R_age,gender,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,1,0.0,0.405465,0.0,0.500775,0.392042,0.0,0.14842,0.0,0.0,...,3.951244,3.7612,0,True,False,False,False,True,False,False
1,3,1.0,0.0,0.0,0.521766,0.398776,0.559616,0.693147,0.405465,0.405465,...,3.951244,3.850148,0,True,False,False,False,True,False,False
2,7,2.0,0.45038,0.363118,0.5326,0.401902,0.004437,0.153746,0.004324,0.226896,...,4.204693,4.060443,1,True,False,False,False,False,True,False
3,2,3.0,0.405465,0.0,0.441476,0.34359,0.223144,0.223144,1.098612,0.0,...,3.871201,3.89182,1,False,True,False,False,True,False,False
4,3,4.0,0.0,0.0,0.239017,0.402126,0.285179,0.165514,0.0,0.0,...,3.89182,3.850148,1,True,False,False,False,False,True,False


In [239]:
# Persist the fully encoded training frame as it is before feature selection.
# NOTE(review): the file name says "kombinasi45" while this notebook is titled "Kombinasi 4" — confirm the name is intended.
df.to_csv('../Without Feature Selection/UFC_kombinasi45_wt_featureselection.csv', index=False)

In [240]:
from sklearn.feature_selection import SelectKBest, f_regression

# Feature selection: keep the 25 features with the highest univariate
# F-statistic with respect to the regression target 'B_Reach_cms'.
selector = SelectKBest(k=25, score_func=f_regression)
X = df.drop('B_Reach_cms', axis=1)
y = df['B_Reach_cms']
X_selected = selector.fit_transform(X, y)

# Get the selected feature names
selected_feature_names = X.columns[selector.get_support()]
print(selected_feature_names)

# Create a new dataframe with only the selected features.
# The selector returns a bare array, so the original row index is passed on
# explicitly. Otherwise a later `df_selected['B_Reach_cms'] = y` (which aligns
# on index labels) could attach targets to the wrong rows or produce NaNs
# whenever `df` does not carry a plain 0..n-1 index.
df_selected = pd.DataFrame(X_selected, columns=selected_feature_names, index=X.index)
df_selected.head()

Index(['weight_class', 'B_avg_opp_SIG_STR_att', 'B_avg_TOTAL_STR_att',
       'B_avg_opp_TOTAL_STR_att', 'B_avg_opp_HEAD_att', 'B_avg_opp_BODY_att',
       'B_avg_opp_BODY_landed', 'B_avg_DISTANCE_att', 'B_avg_opp_DISTANCE_att',
       'B_avg_opp_DISTANCE_landed', 'B_win_by_KO/TKO', 'B_Height_cms',
       'B_Weight_lbs', 'R_avg_opp_SIG_STR_att', 'R_avg_opp_TOTAL_STR_att',
       'R_avg_BODY_att', 'R_avg_opp_BODY_att', 'R_avg_opp_BODY_landed',
       'R_avg_DISTANCE_att', 'R_avg_opp_DISTANCE_att', 'R_win_by_KO/TKO',
       'R_Height_cms', 'R_Reach_cms', 'R_Weight_lbs', 'gender'],
      dtype='object')


Unnamed: 0,weight_class,B_avg_opp_SIG_STR_att,B_avg_TOTAL_STR_att,B_avg_opp_TOTAL_STR_att,B_avg_opp_HEAD_att,B_avg_opp_BODY_att,B_avg_opp_BODY_landed,B_avg_DISTANCE_att,B_avg_opp_DISTANCE_att,B_avg_opp_DISTANCE_landed,...,R_avg_BODY_att,R_avg_opp_BODY_att,R_avg_opp_BODY_landed,R_avg_DISTANCE_att,R_avg_opp_DISTANCE_att,R_win_by_KO/TKO,R_Height_cms,R_Reach_cms,R_Weight_lbs,gender
0,7.0,4.424847,3.637586,4.887997,4.189655,2.484907,2.071598,3.223863,4.276666,3.07385,...,2.224624,1.791759,1.609438,3.713572,4.023117,0.0,5.817944,5.802722,5.70711,1.0
1,5.0,4.510344,4.957311,4.531339,4.433325,1.601595,1.121779,4.852274,4.496419,3.462067,...,3.120436,2.922826,2.231358,4.597642,4.71178,0.0,5.810362,5.832938,5.655992,1.0
2,6.0,4.329911,4.204693,4.594251,4.106767,2.197225,1.827569,4.110874,4.18205,3.448001,...,2.60269,1.378451,0.693147,4.353499,4.421602,0.0,5.825469,5.825469,5.655992,1.0
3,6.0,3.643531,3.960813,3.971176,3.393617,2.035522,1.696189,3.284781,3.228207,1.995975,...,2.217149,2.257051,1.742787,4.225975,4.642033,1.94591,5.802722,5.825469,5.655992,1.0
4,3.0,4.546479,4.636382,4.843083,4.346158,2.43521,2.325943,4.054156,4.259439,3.315916,...,3.26558,3.221996,2.781339,4.488636,4.640929,1.386294,5.747544,5.771566,5.525453,1.0


In [241]:
# Shape of the encoded training frame (all features, before selection).
print(df.shape)

(5410, 146)


In [242]:
# Re-attach the regression target to the selected features; the assignment aligns y to df_selected on the index.
df_selected['B_Reach_cms'] = y

In [243]:
# Persist the selected training features together with the target column.
df_selected.to_csv('../Punya Andi/UFC_kombinasi4.csv', index=False)

In [244]:
# Bring the test frame into the column layout of df_selected: set `id` aside,
# select/reorder the columns, then put `id` back as the first column.
# NOTE(review): df_selected.columns already contains the target 'B_Reach_cms'
# when the cells run in order, and reindex creates every column the test frame
# lacks filled with 0 — so the exported test file presumably gets an all-zero
# 'B_Reach_cms' column, and a missing feature would be zero-filled silently.
# Confirm downstream consumers expect that.
df_test_id = df_test['id']
df_test = df_test.drop(columns=['id'])
df_test = df_test.reindex(columns=df_selected.columns, fill_value=0)
df_test = pd.concat([df_test_id, df_test], axis=1)

In [245]:
# Persist the aligned test frame (`id` followed by the df_selected columns).
df_test.to_csv('../regression_kaggle/UFC_kombinasi4.csv', index=False)