## Kombinasi 5 :
- Delete duplicate
- Impute missing value using iterative imputer
- Outlier handling with log tranformation
- Encoding
- Standard Scaler
- Feature selection with Decision Tree

In [145]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [146]:
df = pd.read_csv("../UFC_train.csv")
df_test = pd.read_csv('../regression/New_UFC_Test_Regression_X(3).csv')
df.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,title_bout,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,Winner
0,Joe Riggs,Joe Doerksen,Steve Mazzagatti,2004-08-21,"Las Vegas, Nevada, USA",False,Middleweight,,,,...,0,0,0,Southpaw,182.88,177.8,185.0,26.0,21.0,Red
1,Jorge Masvidal,Al Iaquinta,Keith Peterson,2015-04-04,"Fairfax, Virginia, USA",False,Lightweight,1.15625,0.0,0.394141,...,0,1,0,Orthodox,180.34,187.96,170.0,27.0,30.0,Blue
2,Dan Stittgen,Stephen Thompson,Josh Rosenthal,2012-02-04,"Las Vegas, Nevada, USA",False,Welterweight,,,,...,0,0,0,Orthodox,185.42,,170.0,28.0,31.0,Blue
3,Josh Koscheck,Johny Hendricks,Kevin Mulhall,2012-05-05,"East Rutherford, New Jersey, USA",False,Welterweight,0.695312,0.0,0.783359,...,6,3,0,Orthodox,177.8,185.42,170.0,28.0,34.0,Blue
4,John Dodson,Manvel Gamburyan,James Warring,2016-04-16,"Tampa, Florida, USA",False,Bantamweight,0.5,0.266602,0.381462,...,3,0,1,Orthodox,160.02,167.64,135.0,34.0,31.0,Red


In [147]:
# Delete duplicate
print(f"Shape before dropping duplicate : {df.shape}")
df.drop_duplicates(inplace=True)
print(f"Shape after dropping duplicate : {df.shape}")

Shape before dropping duplicate : (5410, 144)
Shape after dropping duplicate : (5410, 144)


In [148]:
def check_null(df):
    col_na = df.isnull().sum().sort_values(ascending=True)
    percent = col_na / len(df)
    missing_data = pd.concat([col_na, percent], axis=1, keys=['Total', 'Percent'])

    if (missing_data[missing_data['Total'] > 0].shape[0] == 0):
        print("Tidak ditemukan missing value pada dataset")
    else:
        print(missing_data[missing_data['Total'] > 0])

In [149]:
check_null(df)

                         Total   Percent
R_Weight_lbs                 2  0.000370
R_Height_cms                 4  0.000739
B_Weight_lbs                 8  0.001479
B_Height_cms                10  0.001848
R_Stance                    27  0.004991
...                        ...       ...
B_avg_GROUND_landed       1293  0.239002
B_avg_GROUND_att          1293  0.239002
B_avg_opp_CLINCH_landed   1293  0.239002
B_avg_TD_att              1293  0.239002
B_avg_HEAD_landed         1293  0.239002

[109 rows x 2 columns]


In [150]:
# Split string and number
# Split dataframe into object type and number type
df_object = df.select_dtypes(include='object')
df_number = df.select_dtypes(include=np.number)

df_test_object = df_test.select_dtypes(include='object')
df_test_number = df_test.select_dtypes(include=np.number)

In [151]:
# Impute missing value using iterative imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import ExtraTreeRegressor

imputer = IterativeImputer(initial_strategy='median', random_state=42, estimator=ExtraTreeRegressor(random_state=42), max_iter=15)
df_imputed_number = imputer.fit_transform(df_number)
df_imputed_number = pd.DataFrame(df_imputed_number, columns=df_number.columns)

df_test_imputed_number = imputer.fit_transform(df_test_number)
df_test_imputed_number = pd.DataFrame(df_test_imputed_number, columns=df_test_number.columns)



In [152]:
# Drop R_fighter, B_fighter, Referee, location, and date
df_object = df_object.drop(['R_fighter', 'B_fighter', 'Referee', 'location', 'date'], axis=1)
df_test_object = df_test_object.drop(['R_fighter', 'B_fighter', 'Referee', 'location', 'date'], axis=1)

In [153]:
# Combine df_object and df_imputed_number
df = pd.concat([df_object, df_imputed_number], axis=1)
df_test = pd.concat([df_test_object, df_test_imputed_number], axis=1)
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,Winner,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Middleweight,Orthodox,Southpaw,Red,0.0,0.0,0.43,0.44375,1.0,0.165,...,0.0,0.0,0.0,0.0,0.0,182.88,177.8,185.0,26.0,21.0
1,Lightweight,Orthodox,Orthodox,Blue,1.15625,0.0,0.394141,0.352422,0.239219,0.011484,...,0.0,4.0,0.0,1.0,0.0,180.34,187.96,170.0,27.0,30.0
2,Welterweight,Orthodox,Orthodox,Blue,0.5,0.0,0.4525,0.313125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,185.42,185.42,170.0,28.0,31.0
3,Welterweight,Southpaw,Orthodox,Blue,0.695312,0.0,0.783359,0.185547,0.088281,0.104375,...,1.0,5.0,6.0,3.0,0.0,177.8,185.42,170.0,28.0,34.0
4,Bantamweight,Orthodox,Orthodox,Red,0.5,0.266602,0.381462,0.456558,0.429614,0.46957,...,0.0,2.0,3.0,0.0,1.0,160.02,167.64,135.0,34.0,31.0


In [154]:
check_null(df)

          Total   Percent
R_Stance     27  0.004991
B_Stance     64  0.011830


In [155]:
# Impute missing R_stance and B_stance with mode
df['R_Stance'].fillna(df['R_Stance'].mode()[0], inplace=True)
df['B_Stance'].fillna(df['B_Stance'].mode()[0], inplace=True)

df_test['R_Stance'].fillna(df_test['R_Stance'].mode()[0], inplace=True)
df_test['B_Stance'].fillna(df_test['B_Stance'].mode()[0], inplace=True)

In [156]:
check_null(df)

Tidak ditemukan missing value pada dataset


In [157]:
def check_outlier(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)

    # Menghitung RUB dan RLB.
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5*IQR
    upper_limit = Q3 + 1.5*IQR

    # Menampilkan banyaknya outlier pada atribut.
    outliers = (df < lower_limit) | (df > upper_limit)
    print ("Outlier pada tiap atribut:")
    print(outliers.sum())

    return outliers

In [158]:
# Split string and number
# Split dataframe into object type and number type
df_object = df.select_dtypes(include='object')
df_number = df.select_dtypes(include=np.number)

df_test_object = df_test.select_dtypes(include='object')
df_test_number = df_test.select_dtypes(include=np.number)

In [159]:
outliers = dict(check_outlier(df_number).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number[key].shape[0] * 100}%")

Outlier pada tiap atribut:
B_avg_KD                 476
B_avg_opp_KD             646
B_avg_SIG_STR_pct        171
B_avg_opp_SIG_STR_pct    143
B_avg_TD_pct               0
                        ... 
R_Height_cms              19
R_Reach_cms               85
R_Weight_lbs             301
B_age                     31
R_age                     75
Length: 134, dtype: int64


percentage of outliers in each column:
B_avg_KD = 8.79852125693161%
B_avg_opp_KD = 11.940850277264325%
B_avg_SIG_STR_pct = 3.1608133086876156%
B_avg_opp_SIG_STR_pct = 2.6432532347504623%
B_avg_TD_pct = 0.0%
B_avg_opp_TD_pct = 0.0%
B_avg_SUB_ATT = 6.506469500924214%
B_avg_opp_SUB_ATT = 6.691312384473198%
B_avg_REV = 17.689463955637706%
B_avg_opp_REV = 17.13493530499076%
B_avg_SIG_STR_att = 3.3086876155268024%
B_avg_SIG_STR_landed = 2.7356746765249538%
B_avg_opp_SIG_STR_att = 3.6229205175600745%
B_avg_opp_SIG_STR_landed = 2.9944547134935307%
B_avg_TOTAL_STR_att = 1.77449168207024%
B_avg_TOTAL_STR_landed = 1.7190388170055

In [160]:
# Add a constant value to make all values positive
df_number_positive = df_number + abs(df_number.min()) + 1

df_number_log = np.log(df_number_positive)


In [161]:
# Scaling
scaler = StandardScaler()
df_number_log_scaled = scaler.fit_transform(df_number_log)
df_number_log_scaled = pd.DataFrame(df_number_log_scaled, columns=df_number_log.columns)

In [162]:
df_test_number_positive = df_test_number + abs(df_test_number.min()) + 1
df_test_number_log = np.log(df_test_number_positive)

In [163]:
df_number_log.head()

Unnamed: 0,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,B_avg_opp_REV,...,R_win_by_Decision_Split,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0.0,0.0,0.357674,0.367244,0.693147,0.152721,0.0,0.693147,0.117783,0.693147,...,0.0,0.0,0.0,0.0,0.0,5.817944,5.802722,5.70711,3.806662,3.713572
1,0.768371,0.0,0.332278,0.301897,0.214481,0.011419,0.145182,0.124703,0.0,0.0,...,0.0,1.609438,0.0,0.693147,0.0,5.810362,5.832938,5.655992,3.828641,3.912023
2,0.405465,0.0,0.373286,0.27241,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.825469,5.825469,5.655992,3.850148,3.931826
3,0.527867,0.0,0.578499,0.170204,0.0846,0.09928,0.089612,0.060625,0.0,0.0,...,0.693147,1.791759,1.94591,1.386294,0.0,5.802722,5.825469,5.655992,3.850148,3.988984
4,0.405465,0.236337,0.323143,0.376076,0.357405,0.38497,0.86623,0.000244,0.117783,0.405465,...,0.0,1.098612,1.386294,0.0,0.693147,5.747544,5.771566,5.525453,3.970292,3.931826


In [164]:
outliers = dict(check_outlier(df_number).sum())
print("\n\npercentage of outliers in each column:")
for key in outliers.keys():
    print(f"{key} = {outliers[key]/df_number[key].shape[0] * 100}%")

Outlier pada tiap atribut:
B_avg_KD                 476
B_avg_opp_KD             646
B_avg_SIG_STR_pct        171
B_avg_opp_SIG_STR_pct    143
B_avg_TD_pct               0
                        ... 
R_Height_cms              19
R_Reach_cms               85
R_Weight_lbs             301
B_age                     31
R_age                     75
Length: 134, dtype: int64


percentage of outliers in each column:
B_avg_KD = 8.79852125693161%
B_avg_opp_KD = 11.940850277264325%
B_avg_SIG_STR_pct = 3.1608133086876156%
B_avg_opp_SIG_STR_pct = 2.6432532347504623%
B_avg_TD_pct = 0.0%
B_avg_opp_TD_pct = 0.0%
B_avg_SUB_ATT = 6.506469500924214%
B_avg_opp_SUB_ATT = 6.691312384473198%
B_avg_REV = 17.689463955637706%
B_avg_opp_REV = 17.13493530499076%
B_avg_SIG_STR_att = 3.3086876155268024%
B_avg_SIG_STR_landed = 2.7356746765249538%
B_avg_opp_SIG_STR_att = 3.6229205175600745%
B_avg_opp_SIG_STR_landed = 2.9944547134935307%
B_avg_TOTAL_STR_att = 1.77449168207024%
B_avg_TOTAL_STR_landed = 1.7190388170055

In [165]:
df = pd.concat([df_object, df_number_log_scaled], axis=1)
df_test = pd.concat([df_test_object, df_test_number_log], axis=1)

In [166]:
df['gender'] = df['weight_class'].apply(lambda x: 'women' if 'women' in x.lower() else 'male')
df_test['gender'] = df_test['weight_class'].apply(lambda x: 'women' if 'women' in x.lower() else 'male')

In [167]:
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,Winner,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,Middleweight,Orthodox,Southpaw,Red,-0.640974,-0.62727,-0.089688,0.093006,2.01218,-0.407253,...,-0.948928,-0.895763,-0.73328,-0.25627,0.451875,-0.557639,0.462803,-0.803318,-2.249718,male
1,Lightweight,Orthodox,Orthodox,Blue,2.476374,-0.62727,-0.368532,-0.6453,-0.223517,-1.09119,...,1.612046,-0.895763,0.514507,-0.25627,0.166462,0.404493,0.03682,-0.539624,0.136262,male
2,Welterweight,Orthodox,Orthodox,Blue,1.004034,-0.62727,0.081725,-0.978454,-1.22529,-1.146461,...,-0.948928,-0.895763,-0.73328,-0.25627,0.735141,0.166672,0.03682,-0.281601,0.374349,male
3,Welterweight,Southpaw,Orthodox,Blue,1.50063,-0.62727,2.334904,-2.1332,-0.830152,-0.665923,...,1.90216,2.064332,1.762295,-0.25627,-0.121132,0.166672,0.03682,-0.281601,1.061566,male
4,Bantamweight,Orthodox,Orthodox,Red,1.004034,0.318045,-0.468838,0.192792,0.444034,0.71689,...,0.799208,1.213051,-0.73328,3.629505,-2.198265,-1.549751,-1.050991,1.159842,0.374349,male


In [168]:
df['weight_class'] = df['weight_class'].str.replace('Women', '')
df_test['weight_class'] = df_test['weight_class'].str.replace('Women', '')

In [169]:
# Get all weight_class values
weight_class = df['weight_class'].unique()
weight_class

array(['Middleweight', 'Lightweight', 'Welterweight', 'Bantamweight',
       'Flyweight', 'LightHeavyweight', 'Strawweight', 'Featherweight',
       'OpenWeight', 'Heavyweight', 'CatchWeight'], dtype=object)

In [170]:
# Create a dictionary to map weight_class values to numbers
weight_class_dict = {
    'CatchWeight' : 0,
    'Strawweight' : 1,
    'Flyweight' : 2,
    'Bantamweight' : 3,
    'Featherweight' : 4,
    'Lightweight' : 5,
    'Welterweight' : 6,
    'Middleweight' : 7,
    'LightHeavyweight' : 8,
    'Heavyweight' : 9,
    'OpenWeight' : 10,
}

gender_dict = {
    'male' : 1,
    'women' : 0
}

In [171]:
# Map each weight_class value to the correct number
df['weight_class'] = df['weight_class'].map(weight_class_dict)
df_test['weight_class'] = df_test['weight_class'].map(weight_class_dict)

In [172]:
# Map each gender value to the correct number
df['gender'] = df['gender'].map(gender_dict)
df_test['gender'] = df_test['gender'].map(gender_dict)

In [173]:
df.head()

Unnamed: 0,weight_class,B_Stance,R_Stance,Winner,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age,gender
0,7,Orthodox,Southpaw,Red,-0.640974,-0.62727,-0.089688,0.093006,2.01218,-0.407253,...,-0.948928,-0.895763,-0.73328,-0.25627,0.451875,-0.557639,0.462803,-0.803318,-2.249718,1
1,5,Orthodox,Orthodox,Blue,2.476374,-0.62727,-0.368532,-0.6453,-0.223517,-1.09119,...,1.612046,-0.895763,0.514507,-0.25627,0.166462,0.404493,0.03682,-0.539624,0.136262,1
2,6,Orthodox,Orthodox,Blue,1.004034,-0.62727,0.081725,-0.978454,-1.22529,-1.146461,...,-0.948928,-0.895763,-0.73328,-0.25627,0.735141,0.166672,0.03682,-0.281601,0.374349,1
3,6,Southpaw,Orthodox,Blue,1.50063,-0.62727,2.334904,-2.1332,-0.830152,-0.665923,...,1.90216,2.064332,1.762295,-0.25627,-0.121132,0.166672,0.03682,-0.281601,1.061566,1
4,3,Orthodox,Orthodox,Red,1.004034,0.318045,-0.468838,0.192792,0.444034,0.71689,...,0.799208,1.213051,-0.73328,3.629505,-2.198265,-1.549751,-1.050991,1.159842,0.374349,1


In [174]:
# Get the remaining object columns
df_object = df.select_dtypes(include='object')

# Perform one-hot encoding
df_encoded = pd.get_dummies(df_object)

# Combine the encoded columns with the original dataframe
df = pd.concat([df, df_encoded], axis=1)

# Drop the original object columns
df.drop(df_object.columns, axis=1, inplace=True)

df.head()


Unnamed: 0,weight_class,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,B_avg_REV,...,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch,Winner_Blue,Winner_Draw,Winner_Red
0,7,-0.640974,-0.62727,-0.089688,0.093006,2.01218,-0.407253,-0.803907,1.065387,-0.057964,...,0,0,0,0,0,1,0,0,0,1
1,5,2.476374,-0.62727,-0.368532,-0.6453,-0.223517,-1.09119,-0.43128,-0.445783,-0.538077,...,0,0,0,1,0,0,0,1,0,0
2,6,1.004034,-0.62727,0.081725,-0.978454,-1.22529,-1.146461,-0.803907,-0.777299,-0.538077,...,0,0,0,1,0,0,0,1,0,0
3,6,1.50063,-0.62727,2.334904,-2.1332,-0.830152,-0.665923,-0.573907,-0.616132,-0.538077,...,1,0,0,1,0,0,0,1,0,0
4,3,1.004034,0.318045,-0.468838,0.192792,0.444034,0.71689,1.419378,-0.77665,-0.057964,...,0,0,0,1,0,0,0,0,0,1


In [175]:
# Get the remaining object columns
df_test_object = df_test.select_dtypes(include='object')

# Perform one-hot encoding
df_test_encoded = pd.get_dummies(df_test_object)

# Combine the encoded columns with the original dataframe
df_test = pd.concat([df_test, df_test_encoded], axis=1)

# Drop the original object columns
df_test.drop(df_test_object.columns, axis=1, inplace=True)

df_test.head()

Unnamed: 0,weight_class,id,B_avg_KD,B_avg_opp_KD,B_avg_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_avg_TD_pct,B_avg_opp_TD_pct,B_avg_SUB_ATT,B_avg_opp_SUB_ATT,...,B_age,R_age,gender,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,1,0.0,0.405465,0.0,0.500775,0.392042,0.0,0.14842,0.0,0.0,...,3.951244,3.7612,0,1,0,0,0,1,0,0
1,3,0.693147,0.0,0.0,0.521766,0.398776,0.559616,0.693147,0.405465,0.405465,...,3.951244,3.850148,0,1,0,0,0,1,0,0
2,7,1.098612,0.45038,0.363118,0.5326,0.401902,0.004437,0.153746,0.004324,0.226896,...,4.204693,4.060443,1,1,0,0,0,0,1,0
3,2,1.386294,0.405465,0.0,0.441476,0.34359,0.223144,0.223144,1.098612,0.0,...,3.871201,3.89182,1,0,1,0,0,1,0,0
4,3,1.609438,0.0,0.0,0.239017,0.402126,0.285179,0.165514,0.0,0.0,...,3.89182,3.850148,1,1,0,0,0,0,1,0


In [176]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Feature selection with Lasso
X = df.drop('B_Reach_cms', axis=1)
y = df['B_Reach_cms']

# Create the KNN model
lasso = Lasso(random_state=42)

lasso.fit(X, y)

# Get features with non-zero weights
non_zero_features = np.where(lasso.coef_ != 0)[0]

# Display the feature indices with non-zero weights
print("Features with non-zero weights:", non_zero_features)

Features with non-zero weights: [0]


In [177]:
# Define the parameter grid
param_grid = {
    'alpha': [0.1, 0.5],
    'fit_intercept': [True, False],
    'precompute': [True, False],
    'max_iter': [1000, 2000, 5000],
    'tol': [0.0001, 0.001, 0.01],
    'selection': ['cyclic', 'random']
}

# Create the GridSearchCV object
grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='r2', n_jobs=-1)

# Fit the data to perform grid search
grid_search.fit(X, y)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best R-squared Score:", grid_search.best_score_)

Best Parameters: {'alpha': 0.1, 'fit_intercept': False, 'max_iter': 1000, 'precompute': True, 'selection': 'cyclic', 'tol': 0.01}
Best R-squared Score: 0.783599972551135


In [178]:
# Feature importance
feature_importance = grid_search.best_estimator_.coef_
print(feature_importance)

# Get the indices of the feature importance
feature_importance_indices = np.argsort(feature_importance)

[ 0.00482174  0.          0.         -0.          0.         -0.
 -0.         -0.         -0.          0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.          0.          0.         -0.
 -0.         -0.          0.          0.          0.         -0.
  0.          0.          0.          0.          0.         -0.
 -0.          0.          0.          0.          0.68515278  0.1067579
  0.          0.          0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.  

In [179]:
print(len(feature_importance_indices))

148


In [180]:
non_zero_features = np.where(feature_importance != 0)[0]

non_zero_features_indices = np.argsort(feature_importance[non_zero_features])

print(len(non_zero_features_indices))

5


In [181]:
# df_feature_selection is df that has column with non-zero weight
df_feature_selection = df.iloc[:, non_zero_features]

df_feature_selection.head()

Unnamed: 0,weight_class,B_Height_cms,B_Reach_cms,R_win_by_TKO_Doctor_Stoppage,R_Height_cms
0,7,0.460636,0.665583,-0.25627,0.451875
1,5,-0.117598,-0.558654,-0.25627,0.166462
2,6,0.460636,0.665583,-0.25627,0.735141
3,6,-0.41005,-0.809216,-0.25627,-0.121132
4,3,-1.602906,-1.316294,3.629505,-2.198265


In [182]:
df_feature_selection['B_Reach_cms'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feature_selection['B_Reach_cms'] = y


In [183]:
# Export to CSV
df_feature_selection.to_csv('../Punya Andi/UFC_kombinasi5_FS_lasso.csv', index=False)


In [184]:
df_test_id = df_test['id']
df_test = df_test.drop(columns=['id'])
df_test = df_test.reindex(columns=df_feature_selection.columns, fill_value=0)
df_test = pd.concat([df_test_id, df_test], axis=1)
df_test.to_csv('../regression_kaggle/UFC_kombinasi5.csv', index=False)

In [185]:
# Export to CSV
df_test.to_csv('../Punya Andi/UFC_kombinasi5.csv', index=False)

In [186]:
from sklearn.tree import DecisionTreeRegressor

# Feature selection with Decision Tree
X = df.drop('B_Reach_cms', axis=1)
y = df['B_Reach_cms']

# Splitting data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

model = DecisionTreeRegressor()
model.fit(X_train, y_train)

DecisionTreeRegressor()

In [187]:
feature_importances = model.feature_importances_

In [188]:
top_n = 20
top_features_indices = feature_importances.argsort()[-top_n:][::-1]

In [189]:
top_features_indices

array([ 64,  65, 134,  63,  39,  31,  32,  47, 132,  62, 113,  26,  21,
        66,   8,  18,  16,  45,  69,   4])

In [190]:
# Get the names of the top features
top_features = X.columns[top_features_indices]
top_features

Index(['B_Height_cms', 'B_Weight_lbs', 'gender',
       'B_win_by_TKO_Doctor_Stoppage', 'B_avg_CLINCH_att', 'B_avg_LEG_att',
       'B_avg_LEG_landed', 'B_avg_CTRL_time(seconds)', 'B_age',
       'B_win_by_Submission', 'R_avg_opp_CTRL_time(seconds)',
       'B_avg_opp_HEAD_landed', 'B_avg_opp_TD_att', 'R_avg_KD',
       'B_avg_opp_SUB_ATT', 'B_avg_opp_TOTAL_STR_landed',
       'B_avg_TOTAL_STR_landed', 'B_avg_opp_GROUND_att',
       'R_avg_opp_SIG_STR_pct', 'B_avg_opp_SIG_STR_pct'],
      dtype='object')

In [191]:
# Create a dataframe with only the top features
df_top_features = df[top_features]
df_top_features['B_Reach_cms'] = df['B_Reach_cms']
df_top_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top_features['B_Reach_cms'] = df['B_Reach_cms']


Unnamed: 0,B_Height_cms,B_Weight_lbs,gender,B_win_by_TKO_Doctor_Stoppage,B_avg_CLINCH_att,B_avg_LEG_att,B_avg_LEG_landed,B_avg_CTRL_time(seconds),B_age,B_win_by_Submission,...,B_avg_opp_HEAD_landed,B_avg_opp_TD_att,R_avg_KD,B_avg_opp_SUB_ATT,B_avg_opp_TOTAL_STR_landed,B_avg_TOTAL_STR_landed,B_avg_opp_GROUND_att,R_avg_opp_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_Reach_cms
0,0.460636,0.460662,1,-0.203023,-0.078703,0.625452,0.631658,0.399169,-0.803318,-0.596205,...,0.569593,-0.858338,-0.716784,1.065387,0.928123,-0.631524,0.378318,0.192736,0.093006,0.665583
1,-0.117598,-0.402,1,-0.203023,0.069085,0.414684,0.473823,-0.464297,-0.539624,-0.596205,...,0.750149,-0.151009,0.970997,-0.445783,-0.252617,0.392716,-1.168628,-0.673627,-0.6453,-0.558654
2,0.460636,0.040647,1,-0.203023,-2.136089,-0.065873,-0.482006,-0.979286,-0.281601,-0.596205,...,0.528993,-1.678281,-0.716784,-0.777299,-0.008321,-0.508442,-1.379686,-1.686652,-0.978454,0.665583
3,-0.41005,0.460662,1,-0.203023,0.832198,-0.210392,-0.234427,-0.041181,-0.281601,-0.596205,...,-0.68596,-0.264255,-0.63415,-0.616132,-0.448532,-0.433333,-1.01167,-0.788885,-2.1332,-0.809216
4,-1.602906,-1.031926,1,-0.203023,0.647809,0.661304,0.546033,0.708066,1.159842,2.193018,...,0.740503,0.583684,0.439865,-0.77665,0.812871,0.256011,0.298496,-0.005849,0.192792,-1.316294


In [192]:
df_top_features['B_Reach_cms'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top_features['B_Reach_cms'] = y


In [193]:
df_top_features.head()

Unnamed: 0,B_Height_cms,B_Weight_lbs,gender,B_win_by_TKO_Doctor_Stoppage,B_avg_CLINCH_att,B_avg_LEG_att,B_avg_LEG_landed,B_avg_CTRL_time(seconds),B_age,B_win_by_Submission,...,B_avg_opp_HEAD_landed,B_avg_opp_TD_att,R_avg_KD,B_avg_opp_SUB_ATT,B_avg_opp_TOTAL_STR_landed,B_avg_TOTAL_STR_landed,B_avg_opp_GROUND_att,R_avg_opp_SIG_STR_pct,B_avg_opp_SIG_STR_pct,B_Reach_cms
0,0.460636,0.460662,1,-0.203023,-0.078703,0.625452,0.631658,0.399169,-0.803318,-0.596205,...,0.569593,-0.858338,-0.716784,1.065387,0.928123,-0.631524,0.378318,0.192736,0.093006,0.665583
1,-0.117598,-0.402,1,-0.203023,0.069085,0.414684,0.473823,-0.464297,-0.539624,-0.596205,...,0.750149,-0.151009,0.970997,-0.445783,-0.252617,0.392716,-1.168628,-0.673627,-0.6453,-0.558654
2,0.460636,0.040647,1,-0.203023,-2.136089,-0.065873,-0.482006,-0.979286,-0.281601,-0.596205,...,0.528993,-1.678281,-0.716784,-0.777299,-0.008321,-0.508442,-1.379686,-1.686652,-0.978454,0.665583
3,-0.41005,0.460662,1,-0.203023,0.832198,-0.210392,-0.234427,-0.041181,-0.281601,-0.596205,...,-0.68596,-0.264255,-0.63415,-0.616132,-0.448532,-0.433333,-1.01167,-0.788885,-2.1332,-0.809216
4,-1.602906,-1.031926,1,-0.203023,0.647809,0.661304,0.546033,0.708066,1.159842,2.193018,...,0.740503,0.583684,0.439865,-0.77665,0.812871,0.256011,0.298496,-0.005849,0.192792,-1.316294
