In [13]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Kombinasi 9 :
* Delete Duplicate
* Delete Null
* Outlier capping winsorize sisi kanan
* Encoding 
* Standard scaler
* Feature selection Implied

In [14]:
df = pd.read_csv('../../Without Feature Selection/UFC_kombinasi9_all_features.csv')

In [15]:
X = df.drop(['B_Reach_cms'], axis=1)
y = df['B_Reach_cms']

In [16]:
df_test = pd.read_csv('../../regression_kaggle/UFC_kombinasi9_all_features.csv')
df_test = df_test.drop(['B_Reach_cms'], axis=1, errors='ignore')
df_test_id = df_test['id']
df_test = df_test.drop(['id'], axis=1, errors='ignore')
# Get the common columns between df and df_test
common_columns = list(set(X.columns) & set(df_test.columns))
# Update df_test to only include the common columns
df_test = df_test[common_columns]
X = X[common_columns]

In [17]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [18]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Create the KNN model
decisionTree = Lasso(random_state=42)

decisionTree.fit(X, y)

# Get features with non-zero weights
non_zero_features = np.where(decisionTree.coef_ != 0)[0]

# Display the feature indices with non-zero weights
print("Features with non-zero weights:", non_zero_features)

Features with non-zero weights: [ 109  447  760  839 1005]


In [19]:
# Define the parameter grid
param_grid = {
    'alpha': [0.001, 0.0001, 0.1, 0.5, 1.0],
    'fit_intercept': [True, False],
    'precompute': [True, False],
    'max_iter': [1000, 2000, 5000, 10000],
    'tol': [0.0001, 0.001, 0.01],
    'selection': ['cyclic', 'random']
}



# Create the GridSearchCV object
grid_search = GridSearchCV(decisionTree, param_grid, cv=5, scoring='r2', n_jobs=-1)

# Fit the data to perform grid search
grid_search.fit(X, y)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best R-squared Score:", grid_search.best_score_)

Best Parameters: {'alpha': 0.1, 'fit_intercept': True, 'max_iter': 1000, 'precompute': False, 'selection': 'cyclic', 'tol': 0.001}
Best R-squared Score: 0.832243546043563


In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Evaluate the best model using 5-fold cross-validation
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='r2')

# Calculate the mean R-squared
mean_r2 = cv_scores.mean()
print("Mean R-squared:", mean_r2)

Mean R-squared: 0.832243546043563


In [21]:
# Calculate the RMSE using 5-fold cross-validation
cv_rmse = np.sqrt(np.abs(cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='neg_mean_squared_error')))

# Calculate the mean RMSE
mean_rmse = cv_rmse.mean()
print("Mean RMSE:", mean_rmse)

Mean RMSE: 3.8291569662392435


In [22]:
# Feature importance
feature_importance = grid_search.best_estimator_.coef_
print(feature_importance)

[ 0.         -0.          0.11707767 ...  0.02283567  0.
 -0.04792495]


In [23]:
import joblib

# Save the model as a pickle file
filename = 'Lasso2.pkl'
joblib.dump(grid_search.best_estimator_, filename)

['Lasso2.pkl']

In [24]:
df_test = scaler.transform(df_test)
y_pred = grid_search.best_estimator_.predict(df_test)
submission = pd.DataFrame({'id': df_test_id, 'B_Reach_cms': y_pred})
submission.to_csv('pred_kombinasi2_lasso_fix.csv', index=False)