In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Kombinasi 3 :
- Delete duplicate
- Impute missing value using iterative imputer
- Outlier capping with winsorization
- Encoding
- Standard Scaler
- Feature selection with Decision Tree

In [2]:
df = pd.read_csv('../../Without Feature Selection/UFC_kombinasi9_all_features.csv')

In [3]:
X = df.drop(['B_Reach_cms'], axis=1)
y = df['B_Reach_cms']

In [4]:
df_test = pd.read_csv('../../regression_kaggle/UFC_kombinasi9_all_features.csv')
df_test = df_test.drop(['B_Reach_cms'], axis=1, errors='ignore')
df_test_id = df_test['id']
df_test = df_test.drop(['id'], axis=1, errors='ignore')
# Get the common columns between df and df_test
common_columns = list(set(X.columns) & set(df_test.columns))
# Update df_test to only include the common columns
df_test = df_test[common_columns]
X = X[common_columns]

In [5]:
# Read lasso pickle
import joblib
lasso = joblib.load('../lasso_regression/Lasso2.pkl')

lasso_coef = lasso.coef_

print(lasso_coef)

[ 0.         -0.          0.11707767 ...  0.02283567  0.
 -0.04792495]


In [6]:
# Non-zero indices
non_zero_indices = np.where(lasso_coef != 0)[0]

print(non_zero_indices)

[   2    4    8   13   15   26   32   33   39   44   47   48   53   57
   58   65   70   72   74   78   82   85   89   97  109  110  111  112
  113  115  118  120  125  127  136  142  143  145  146  151  159  167
  173  174  177  182  187  190  193  199  207  208  212  213  219  248
  252  256  264  267  271  273  278  283  291  292  296  301  304  307
  308  312  316  320  321  323  326  327  333  334  344  347  350  354
  375  386  388  391  407  421  428  437  440  441  447  451  454  455
  457  468  472  478  481  485  496  505  507  511  515  519  529  530
  545  546  549  556  557  560  562  565  567  572  579  580  583  585
  587  588  593  599  600  612  614  618  621  622  623  626  627  646
  656  659  662  665  666  668  673  678  679  687  691  692  697  700
  706  708  709  712  713  724  731  741  742  760  761  764  765  775
  777  782  785  787  792  803  805  814  828  831  834  837  839  848
  849  850  854  864  866  870  879  881  888  898  904  908  911  914
  918 

In [7]:
# Feature selection
X = X.iloc[:, non_zero_indices]

df_test = df_test.iloc[:, non_zero_indices]

X.head()

Unnamed: 0,B_fighter_Ryan Hall,R_fighter_Jessica Eye,"location_Jaragua do Sul, Santa Catarina, Brazil",B_fighter_Robbie Lawler,Referee_Jimmy Neely,R_fighter_Viscardi Andrade,B_avg_REV,R_fighter_Yan Cabral,R_fighter_Erik Koch,B_fighter_John Cholish,...,B_fighter_Edwin Figueroa,R_fighter_Evan Dunham,R_fighter_Brandon Vera,Referee_Jason McCoy,B_fighter_Tony Sims,B_win_by_TKO_Doctor_Stoppage,R_fighter_Daniel Omielanczuk,B_fighter_Amanda Lemos,Referee_Steve Rita,R_avg_HEAD_att
0,False,False,False,False,False,False,0.0,False,False,False,...,False,False,False,False,False,0,False,False,False,93.46875
1,False,False,False,False,False,False,0.0,False,False,False,...,False,False,False,False,False,0,False,False,False,72.988626
2,False,False,False,False,False,False,0.125,False,False,False,...,False,False,False,False,False,0,False,False,False,68.460938
3,False,False,False,False,False,False,0.0,False,False,False,...,False,False,False,False,False,0,False,False,False,73.3125
4,False,False,False,False,False,False,0.0,False,False,False,...,False,False,False,False,False,0,False,False,False,86.625


In [8]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [9]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

# Create the KNN model
svr = SVR()

# Create the GridSearchCV object
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='r2', n_jobs=-1)

# Fit the data to perform grid search
grid_search.fit(X, y)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best R-squared Score:", grid_search.best_score_)

Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best R-squared Score: 0.7856096525330604


In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Evaluate the best model using 5-fold cross-validation
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='r2')

# Calculate the mean R-squared
mean_r2 = cv_scores.mean()
print("Mean R-squared:", mean_r2)

Mean R-squared: 0.7856096525330604


In [11]:
# Calculate the RMSE using 5-fold cross-validation
cv_rmse = np.sqrt(np.abs(cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='neg_mean_squared_error')))

# Calculate the mean RMSE
mean_rmse = cv_rmse.mean()
print("Mean RMSE:", mean_rmse)

Mean RMSE: 4.328437712000643


In [12]:
df_test = scaler.transform(df_test)
y_pred = grid_search.best_estimator_.predict(df_test)
submission = pd.DataFrame({'id': df_test_id, 'B_Reach_cms': y_pred})
submission.to_csv('pred_kombinasi3_support_vector.csv', index=False)