In [1]:
import pandas as pd

# Load the uploaded Excel file to check its structure
file_path = '../dataset/rab107row.xlsx'
data = pd.read_excel(file_path)

# Display the first few rows to understand the structure
data.head()

Unnamed: 0,namaproyek,rab,waktu,provinsi,tahun,luas,subitem,tinggi,lantai,ikk,ihbp,inflasi
0,Gedung Apartemen,26419140000.0,196,Jawa Barat,2019,17401.0,4,21.8,6,103.03,143.72,3.21
1,Gedung Apartemen,88210790000.0,273,DKI Jakarta,2022,16320.59,6,47.6,14,121.48,107.66,4.21
2,Gedung Apartemen,81712000000.0,348,DKI Jakarta,2023,23688.0,5,34.0,10,116.73,113.88,2.28
3,Gedung Apartemen,369000000000.0,830,Banten,2021,55000.0,6,49.8,16,97.64,109.64,1.91
4,Gedung Apartemen,540031600000.0,700,DKI Jakarta,2023,150000.0,6,105.0,30,116.73,113.88,2.28


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import BayesianRidge, Lasso, LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [3]:
# 6. One-Hot Encoding untuk 'provinsi' dan 'namaproyek'
data = pd.get_dummies(data, columns=['namaproyek', 'provinsi'])

In [4]:
data.head()

Unnamed: 0,rab,waktu,tahun,luas,subitem,tinggi,lantai,ikk,ihbp,inflasi,...,provinsi_DKI Jakarta,provinsi_Jawa Barat,provinsi_Jawa Tengah,provinsi_Jawa Timur,provinsi_Kalimantan Timur,provinsi_Kepulauan Riau,provinsi_Lampung,provinsi_Nusa Tenggara Barat,provinsi_Papua,provinsi_Sulawesi Selatan
0,26419140000.0,196,2019,17401.0,4,21.8,6,103.03,143.72,3.21,...,False,True,False,False,False,False,False,False,False,False
1,88210790000.0,273,2022,16320.59,6,47.6,14,121.48,107.66,4.21,...,True,False,False,False,False,False,False,False,False,False
2,81712000000.0,348,2023,23688.0,5,34.0,10,116.73,113.88,2.28,...,True,False,False,False,False,False,False,False,False,False
3,369000000000.0,830,2021,55000.0,6,49.8,16,97.64,109.64,1.91,...,False,False,False,False,False,False,False,False,False,False
4,540031600000.0,700,2023,150000.0,6,105.0,30,116.73,113.88,2.28,...,True,False,False,False,False,False,False,False,False,False


In [4]:
# Split data into features and target
X1 = data.drop(columns=['rab'])
y1 = data['rab']

# Split into training and testing sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y1, 
    test_size=0.3, 
    random_state=42)

# Split into training and testing sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X1, y1, 
    test_size=0.2, 
    random_state=42)



In [5]:
# Initialize models
models = {
    'AdaBoost': AdaBoostRegressor(),
    'Bayesian Ridge': BayesianRidge(),
    'Decision Tree': DecisionTreeRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'KNN Regressor': KNeighborsRegressor(),
    'Lasso Regression': Lasso(),
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Ridge Regression': Ridge(),
    'SVR': SVR(),
    'XGBoost': xgb.XGBRegressor()
}

# 70:30
# Train and evaluate each model with additional metrics
results1 = []
for name1, model1 in models.items():
    # Fit model to training data
    model1.fit(X_train1, y_train1)
    
    # Predict on training data
    y_train_pred1 = model1.predict(X_train1)
    
    # Predict on testing data
    y_pred1 = model1.predict(X_test1)
    
    # Calculate metrics for training data
    r2_train1 = r2_score(y_train1, y_train_pred1)
    
    # Calculate metrics for testing data
    mse1 = mean_squared_error(y_test1, y_pred1)
    mae1 = mean_absolute_error(y_test1, y_pred1)
    r21 = r2_score(y_test1, y_pred1)
    
    # Append results
    results1.append({
        'Model': name1,
        'Training R2 Score': r2_train1,
        'Testing RMSE': np.sqrt(mse1),
        'Testing MAE': mae1,
        'Testing R2 Score': r21
    })

# Convert results into a DataFrame for better visualization
results_df1 = pd.DataFrame(results1).sort_values(by='Testing R2 Score', ascending=False)

# 80:20
# Train and evaluate each model with additional metrics
results2 = []
for name2, model2 in models.items():
    # Fit model to training data
    model2.fit(X_train2, y_train2)
    
    # Predict on training data
    y_train_pred2 = model2.predict(X_train2)
    
    # Predict on testing data
    y_pred2 = model2.predict(X_test2)
    
    # Calculate metrics for training data
    r2_train2 = r2_score(y_train2, y_train_pred2)
    
    # Calculate metrics for testing data
    mse2 = mean_squared_error(y_test2, y_pred2)
    mae2 = mean_absolute_error(y_test2, y_pred2)
    r22 = r2_score(y_test2, y_pred2)
    
    # Append results
    results2.append({
        'Model': name2,
        'Training R2 Score': r2_train2,
        'Testing RMSE': np.sqrt(mse2),
        'Testing MAE': mae2,
        'Testing R2 Score': r22
    })

# Convert results into a DataFrame for better visualization
results_df2 = pd.DataFrame(results2).sort_values(by='Testing R2 Score', ascending=False)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Hasil 70:30

In [6]:
results_df1

Unnamed: 0,Model,Training R2 Score,Testing RMSE,Testing MAE,Testing R2 Score
3,Gradient Boosting,0.9992779,603600500000.0,212089400000.0,0.397987
7,Random Forest,0.8746062,671878400000.0,194789300000.0,0.254087
4,KNN Regressor,0.348822,678155500000.0,210800000000.0,0.240085
8,Ridge Regression,0.773586,678759900000.0,230852800000.0,0.23873
6,Linear Regression,0.7876641,696885700000.0,237462600000.0,0.197528
5,Lasso Regression,0.7876641,698193900000.0,241806900000.0,0.194513
0,AdaBoost,0.9677269,698643800000.0,208379800000.0,0.193474
1,Bayesian Ridge,7.12097e-13,782561400000.0,262619800000.0,-0.011914
2,Decision Tree,1.0,794958300000.0,239573300000.0,-0.044228
9,SVR,-0.08377046,802312100000.0,238110100000.0,-0.063637


Hasil 80:20

In [7]:
results_df2

Unnamed: 0,Model,Training R2 Score,Testing RMSE,Testing MAE,Testing R2 Score
4,KNN Regressor,0.3882603,806085800000.0,257347400000.0,0.268514
8,Ridge Regression,0.7302853,850566600000.0,268210200000.0,0.185559
7,Random Forest,0.9098887,855819900000.0,255504900000.0,0.175467
6,Linear Regression,0.7472063,869942900000.0,269813400000.0,0.148029
5,Lasso Regression,0.7472063,871675700000.0,273915100000.0,0.144632
0,AdaBoost,0.9457109,906816200000.0,285871100000.0,0.074275
3,Gradient Boosting,0.9986625,925237200000.0,289305900000.0,0.036283
1,Bayesian Ridge,9.15712e-13,954087400000.0,332002000000.0,-0.024754
2,Decision Tree,1.0,954526600000.0,323819800000.0,-0.025698
9,SVR,-0.08530357,976148200000.0,299961000000.0,-0.072691
