In [44]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Kombinasi 9:
* Delete Duplicate
* Delete Null
* Outlier capping winsorize sisi kanan
* Encoding
* Standard scaler
* Feature selection Implied

In [45]:
# Load the "kombinasi 9" CSV; the path is relative to the notebook's working directory.
data_path = '../../Without Feature Selection/UFC_kombinasi9.csv'
df = pd.read_csv(data_path)

In [46]:
# Separate the regression target from the remaining predictor columns.
target_col = 'B_Reach_cms'
y = df[target_col]
X = df.drop(columns=[target_col])

In [47]:
# Standardise the predictors; X is replaced by the scaler's transformed output.
# NOTE(review): the scaler is fitted on all rows before the 5-fold
# cross-validation in later cells, so every validation fold has contributed
# to the scaling statistics. Moving the scaler into a Pipeline would avoid this.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [48]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Fit a Lasso regressor with default hyper-parameters on the scaled data.
# The variable keeps the name `decisionTree` (although it holds a Lasso)
# because the grid-search cell passes it to GridSearchCV under that name.
decisionTree = Lasso(random_state=42).fit(X, y)

# Positions of the columns whose fitted coefficient is not zero
non_zero_features = np.flatnonzero(decisionTree.coef_)

# Show which column indices survived the L1 penalty
print("Features with non-zero weights:", non_zero_features)

Features with non-zero weights: [  0  64  65 130 134]


In [49]:
# Candidate hyper-parameter values; every combination is evaluated.
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
    'fit_intercept': [True, False],
    'precompute': [True, False],
    'max_iter': [1000, 2000, 5000],
    'tol': [0.0001, 0.001, 0.01],
    'selection': ['cyclic', 'random'],
}

# Exhaustive search with 5-fold cross-validation, ranked by R^2.
# n_jobs=-1 asks scikit-learn to use all available processors.
grid_search = GridSearchCV(
    estimator=decisionTree,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
).fit(X, y)

# Report the winning combination and its mean cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best R-squared Score:", grid_search.best_score_)

Best Parameters: {'alpha': 0.1, 'fit_intercept': True, 'max_iter': 1000, 'precompute': True, 'selection': 'random', 'tol': 0.01}
Best R-squared Score: 0.8027484551381054


In [50]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Score the tuned estimator again with 5-fold cross-validated R^2.
# NOTE(review): the recorded outputs show this equals grid_search.best_score_,
# i.e. it repeats the selection-time evaluation and is not an independent
# hold-out estimate.
best_model = grid_search.best_estimator_
cv_scores = cross_val_score(best_model, X, y, scoring='r2', cv=5)

# Average the per-fold scores
mean_r2 = np.mean(cv_scores)
print("Mean R-squared:", mean_r2)

Mean R-squared: 0.8027484551381054


In [51]:
# Calculate the RMSE using 5-fold cross-validation
cv_rmse = np.sqrt(np.abs(cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='neg_mean_squared_error')))

# Calculate the mean RMSE
mean_rmse = cv_rmse.mean()
print("Mean RMSE:", mean_rmse)

Mean RMSE: 4.152562291543263


In [52]:
# Feature importance
feature_importance = grid_search.best_estimator_.coef_
print(feature_importance)

[ 7.41098684e-01  2.32617091e-02  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  5.00002571e-02 -4.88705328e-02  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -4.02567295e-02 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -1.93942161e-02 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -7.57812274e-02 -8.69653960e-03
 -4.20898065e-03  1.97837950e-01  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -8.74910743e-02
 -0.00000000e+00 -9.28186510e-02 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  1.23080686e-01 -0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  4.81882463e-03
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000

In [53]:
import joblib

# The tuned Lasso was trained on X after it had been transformed by `scaler`,
# so the fitted scaler is persisted as well; without it the pickled model
# cannot be applied to new, unscaled rows.
scaler_filename = 'Lasso_scaler.pkl'
joblib.dump(scaler, scaler_filename)

# Save the tuned model as a pickle file. This call stays last so the cell
# still displays ['Lasso.pkl'].
# NOTE: only load these pickle files from trusted sources.
filename = 'Lasso.pkl'
joblib.dump(grid_search.best_estimator_, filename)

['Lasso.pkl']