In [10]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Kombinasi 3 :
- Delete duplicate
- Impute missing value using iterative imputer
- Outlier capping with winsorization
- Encoding
- Standard Scaler
- Feature selection with Decision Tree

In [11]:
df = pd.read_csv('../../Without Feature Selection/UFC_kombinasi9_all_features.csv')

In [12]:
X = df.drop(['B_Reach_cms'], axis=1)
y = df['B_Reach_cms']

In [13]:
df_test = pd.read_csv('../../regression_kaggle/UFC_kombinasi9_all_features.csv')
df_test = df_test.drop(['B_Reach_cms'], axis=1, errors='ignore')
df_test_id = df_test['id']
df_test = df_test.drop(['id'], axis=1, errors='ignore')
# Get the common columns between df and df_test
common_columns = list(set(X.columns) & set(df_test.columns))
# Update df_test to only include the common columns
df_test = df_test[common_columns]
X = X[common_columns]

In [14]:
# Read lasso pickle
import joblib
lasso = joblib.load('../lasso_regression/Lasso2.pkl')

lasso_coef = lasso.coef_

print(lasso_coef)

[ 0.31184921  0.         -0.         ... -0.         -0.
  0.        ]


In [15]:
# Non-zero indices
non_zero_indices = np.where(lasso_coef != 0)[0]

print(non_zero_indices)

[   0    4    8   15   16   19   26   28   42   43   47   60   61   69
   70   73   74   78   92   98  102  103  104  113  116  118  122  125
  128  131  135  139  143  144  148  153  154  156  158  166  175  177
  179  187  188  190  199  202  205  210  211  218  219  221  224  225
  228  244  249  258  260  262  268  269  270  274  277  284  287  288
  291  299  302  308  309  314  317  328  333  334  335  337  344  346
  352  360  372  373  379  381  387  388  390  402  406  410  411  416
  420  422  427  435  447  449  457  466  492  498  500  504  506  512
  518  520  522  524  527  532  533  534  546  552  553  556  563  564
  567  568  572  574  577  588  603  610  612  622  629  632  638  640
  642  643  644  651  653  660  661  666  680  688  690  697  698  702
  707  709  714  719  723  724  726  731  737  742  748  751  752  754
  755  761  762  763  769  779  786  790  791  792  799  801  805  824
  827  832  833  834  835  836  845  846  850  857  861  868  871  873
  881 

In [16]:
# Feature selection
X = X.iloc[:, non_zero_indices]

df_test = df_test.iloc[:, non_zero_indices]

X.head()

Unnamed: 0,R_avg_opp_TOTAL_STR_att,"location_Oakland, California, USA",R_fighter_Jimy Hettes,B_fighter_Francis Carmont,"location_Atlantic City, New Jersey, USA",R_fighter_Luiz Cane,"location_Jaragua do Sul, Santa Catarina, Brazil",R_fighter_Evan Dunham,R_fighter_Jordan Mein,R_fighter_Corey Anderson,...,B_avg_opp_LEG_att,"location_Fresno, California, USA",R_fighter_Phil Davis,B_avg_HEAD_landed,R_fighter_Derek Brunson,R_fighter_Andre Ewell,B_fighter_Andre Winner,B_fighter_Jeremy Kennedy,Referee_Joe Solis,B_fighter_Lyoto Machida
0,129.59375,False,False,False,False,False,False,False,False,False,...,2.78125,False,False,39.976562,False,False,False,False,False,False
1,125.441042,False,False,False,False,False,False,False,False,False,...,1.796875,False,False,11.804688,False,False,False,False,False,False
2,168.796875,False,False,False,False,False,False,False,False,False,...,6.700195,False,False,18.651855,False,False,False,False,False,False
3,75.1875,False,False,False,False,False,False,False,False,False,...,6.0,False,False,16.0,False,False,False,False,False,False
4,94.375,False,False,False,False,False,False,False,False,False,...,4.25,False,False,25.5,False,False,False,False,False,False


In [17]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create the KNN model
random_forest = RandomForestRegressor(random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(random_forest, param_grid, cv=5, scoring='r2', n_jobs=-1)

# Fit the data to perform grid search
grid_search.fit(X, y)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best R-squared Score:", grid_search.best_score_)


KeyboardInterrupt: 

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Evaluate the best model using 5-fold cross-validation
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='r2')

# Calculate the mean R-squared
mean_r2 = cv_scores.mean()
print("Mean R-squared:", mean_r2)

Mean R-squared: 0.8134256176140257


In [7]:
# Calculate the RMSE using 5-fold cross-validation
cv_rmse = np.sqrt(np.abs(cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='neg_mean_squared_error')))

# Calculate the mean RMSE
mean_rmse = cv_rmse.mean()
print("Mean RMSE:", mean_rmse)

Mean RMSE: 4.05282046595969


In [8]:
df_test = scaler.transform(df_test)
y_pred = grid_search.best_estimator_.predict(df_test)
submission = pd.DataFrame({'id': df_test_id, 'B_Reach_cms': y_pred})
submission.to_csv('pred_kombinasi3_random_forest.csv', index=False)