In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Turn off warnings
import warnings
warnings.filterwarnings('ignore')

### Kombinasi 9 :
* Delete Duplicate
* Delete Null
* Outlier capping winsorize sisi kanan
* Encoding 
* Standard scaler
* Feature selection Implied

In [2]:
df = pd.read_csv('../../Without Feature Selection/UFC_kombinasi9.csv')

In [3]:
X = df.drop(['B_Reach_cms'], axis=1)
y = df['B_Reach_cms']

In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Define the parameter grid
param_grid = {
    'criterion': ['mse', 'friedman_mse', 'mae', 'poisson'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

# Create the KNN model
decisionTree = DecisionTreeRegressor()

# Create the GridSearchCV object
grid_search = GridSearchCV(decisionTree, param_grid, cv=5, scoring='r2', n_jobs=-1)

# Fit the data to perform grid search
grid_search.fit(X, y)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best R-squared Score:", grid_search.best_score_)




Best Parameters: {'criterion': 'mse', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'splitter': 'best'}
Best R-squared Score: 0.8062180160434405


In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Evaluate the best model using 5-fold cross-validation
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='r2')

# Calculate the mean R-squared
mean_r2 = cv_scores.mean()
print("Mean R-squared:", mean_r2)

Mean R-squared: 0.8042681535440682


In [7]:
# Calculate the RMSE using 5-fold cross-validation
cv_rmse = np.sqrt(np.abs(cross_val_score(grid_search.best_estimator_, X, y, cv=5, scoring='neg_mean_squared_error')))

# Calculate the mean RMSE
mean_rmse = cv_rmse.mean()
print("Mean RMSE:", mean_rmse)

Mean RMSE: 4.116977860423779


In [8]:
import joblib

# Model with best parameters
best_model = grid_search.best_estimator_

# Save the model as a pickle file
filename = 'decision_tree_model.pkl'
joblib.dump(best_model, filename)

['decision_tree_model.pkl']

In [None]:
df_test = pd.read_csv('../../regression_kaggle/UFC_kombinasi9_no_fs.csv')
df_test = df_test.drop(['B_Reach_cms'], axis=1, errors='ignore')
df_test_id = df_test['id']
df_test = df_test.drop(['id'], axis=1)
df_test = scaler.transform(df_test)
y_pred = grid_search.best_estimator_.predict(df_test)
submission = pd.DataFrame({'id': df_test_id, 'B_Reach_cms': y_pred})
submission.to_csv('pred_kombinasi2_decision_tree.csv', index=False)