In [12]:
import pandas as pd

# Load the uploaded Excel file to check its structure
file_path = '../dataset/rab107row.xlsx'
data = pd.read_excel(file_path)

# Display the first few rows to understand the structure
data.head()

Unnamed: 0,namaproyek,rab,waktu,provinsi,tahun,luas,subitem,tinggi,lantai,ikk,ihbp,inflasi
0,Gedung Apartemen,26419140000.0,196,Jawa Barat,2019,17401.0,4,21.8,6,103.03,143.72,3.21
1,Gedung Apartemen,88210790000.0,273,DKI Jakarta,2022,16320.59,6,47.6,14,121.48,107.66,4.21
2,Gedung Apartemen,81712000000.0,348,DKI Jakarta,2023,23688.0,5,34.0,10,116.73,113.88,2.28
3,Gedung Apartemen,369000000000.0,830,Banten,2021,55000.0,6,49.8,16,97.64,109.64,1.91
4,Gedung Apartemen,540031600000.0,700,DKI Jakarta,2023,150000.0,6,105.0,30,116.73,113.88,2.28


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import BayesianRidge, Lasso, LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [14]:
# Label encoding kolom 'Provinsi' dan nama proyek karena nilainya kategorikal
label_encoder = LabelEncoder()
data['label_provinsi'] = label_encoder.fit_transform(data['provinsi'])
data['label_namaproyek'] = label_encoder.fit_transform(data['namaproyek'])

# Melihat nilai unik dari hasil encoding kolom 'provinsi_encoded'
provinsi_unik = data[['provinsi', 'label_provinsi']].drop_duplicates().sort_values(
    by='label_provinsi')

# Melihat nilai unik dari hasil encoding kolom 'namaproyek_encoded'
namaproyek_unik = data[['namaproyek', 'label_namaproyek']].drop_duplicates().sort_values(
    by='label_namaproyek')

In [15]:
provinsi_unik

Unnamed: 0,provinsi,label_provinsi
95,Bali,0
3,Banten,1
1,DKI Jakarta,2
0,Jawa Barat,3
23,Jawa Tengah,4
59,Jawa Timur,5
66,Kalimantan Timur,6
55,Kepulauan Riau,7
90,Lampung,8
25,Nusa Tenggara Barat,9


In [16]:
namaproyek_unik

Unnamed: 0,namaproyek,label_namaproyek
0,Gedung Apartemen,0
35,Gedung Asrama,1
43,Gedung Gudang,2
45,Gedung Hotel,3
49,Gedung Kampus,4
55,Gedung Kantor,5
76,Gedung Laboratorium,6
80,Gedung Olahraga,7
83,Gedung Pabrik,8
84,Gedung Parkir,9


In [17]:
# Split data into features and target
X1 = data.drop(columns=['rab', 'namaproyek', 'provinsi'])
y1 = data['rab']

# Split into training and testing sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y1, 
    test_size=0.3, 
    random_state=42)

# Split into training and testing sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X1, y1, 
    test_size=0.2, 
    random_state=42)



In [18]:
# Initialize models
models = {
    'AdaBoost': AdaBoostRegressor(),
    'Bayesian Ridge': BayesianRidge(),
    'Decision Tree': DecisionTreeRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'KNN Regressor': KNeighborsRegressor(),
    'Lasso Regression': Lasso(),
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Ridge Regression': Ridge(),
    'SVR': SVR(),
    'XGBoost': xgb.XGBRegressor()
}

# 70:30
# Train and evaluate each model with additional metrics
results1 = []
for name1, model1 in models.items():
    # Fit model to training data
    model1.fit(X_train1, y_train1)
    
    # Predict on training data
    y_train_pred1 = model1.predict(X_train1)
    
    # Predict on testing data
    y_pred1 = model1.predict(X_test1)
    
    # Calculate metrics for training data
    r2_train1 = r2_score(y_train1, y_train_pred1)
    
    # Calculate metrics for testing data
    mse1 = mean_squared_error(y_test1, y_pred1)
    mae1 = mean_absolute_error(y_test1, y_pred1)
    r21 = r2_score(y_test1, y_pred1)
    
    # Append results
    results1.append({
        'Model': name1,
        'Training R2 Score': r2_train1,
        'Testing RMSE': np.sqrt(mse1),
        'Testing MAE': mae1,
        'Testing R2 Score': r21
    })

# Convert results into a DataFrame for better visualization
results_df1 = pd.DataFrame(results1).sort_values(by='Testing R2 Score', ascending=False)

# 80:20
# Train and evaluate each model with additional metrics
results2 = []
for name2, model2 in models.items():
    # Fit model to training data
    model2.fit(X_train2, y_train2)
    
    # Predict on training data
    y_train_pred2 = model2.predict(X_train2)
    
    # Predict on testing data
    y_pred2 = model2.predict(X_test2)
    
    # Calculate metrics for training data
    r2_train2 = r2_score(y_train2, y_train_pred2)
    
    # Calculate metrics for testing data
    mse2 = mean_squared_error(y_test2, y_pred2)
    mae2 = mean_absolute_error(y_test2, y_pred2)
    r22 = r2_score(y_test2, y_pred2)
    
    # Append results
    results2.append({
        'Model': name2,
        'Training R2 Score': r2_train2,
        'Testing RMSE': np.sqrt(mse2),
        'Testing MAE': mae2,
        'Testing R2 Score': r22
    })

# Convert results into a DataFrame for better visualization
results_df2 = pd.DataFrame(results2).sort_values(by='Testing R2 Score', ascending=False)


Hasil 70:30

In [19]:
results_df1

Unnamed: 0,Model,Training R2 Score,Testing RMSE,Testing MAE,Testing R2 Score
0,AdaBoost,0.9672525,573612800000.0,199655500000.0,0.456319
2,Decision Tree,1.0,592443200000.0,240346900000.0,0.420037
3,Gradient Boosting,0.999306,623884100000.0,223188100000.0,0.356847
7,Random Forest,0.8932643,633726500000.0,196144300000.0,0.336394
8,Ridge Regression,0.6900052,661682100000.0,235704800000.0,0.276555
5,Lasso Regression,0.6900084,661888500000.0,235842600000.0,0.276104
6,Linear Regression,0.6900084,661888500000.0,235842600000.0,0.276104
4,KNN Regressor,0.348822,678155500000.0,210800000000.0,0.240085
1,Bayesian Ridge,7.12097e-13,782561400000.0,262619800000.0,-0.011914
9,SVR,-0.08377046,802312100000.0,238110100000.0,-0.063637


Hasil 80:20

In [20]:
results_df2

Unnamed: 0,Model,Training R2 Score,Testing RMSE,Testing MAE,Testing R2 Score
4,KNN Regressor,0.3882603,806085800000.0,257347400000.0,0.268514
8,Ridge Regression,0.6200637,834012400000.0,270016500000.0,0.216952
5,Lasso Regression,0.6200664,834231400000.0,270090800000.0,0.216541
6,Linear Regression,0.6200664,834231400000.0,270090800000.0,0.216541
7,Random Forest,0.8977659,862679200000.0,263953600000.0,0.162197
0,AdaBoost,0.9504328,904393400000.0,286954500000.0,0.079215
3,Gradient Boosting,0.9983627,938598900000.0,298283700000.0,0.008248
1,Bayesian Ridge,9.15712e-13,954087400000.0,332002000000.0,-0.024754
2,Decision Tree,1.0,964900200000.0,355115600000.0,-0.048113
9,SVR,-0.08530357,976148200000.0,299961000000.0,-0.072691
