<a href="https://colab.research.google.com/github/ardiusebenezer07/Machine-Learning/blob/main/XGBoost_Melb_data_csv_Ardius_Ebenezer_1103210220_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mengimpor library yang diperlukan
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

In [None]:
# Mount Google Drive so that its files are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the dataset: read the CSV stored on the mounted Drive into a DataFrame
file_path = '/content/drive/MyDrive/melb_data.csv'
data = pd.read_csv(file_path)

In [None]:
# Print a summary of the DataFrame (columns, non-null counts and dtypes)
data.info()

In [None]:
# Feature engineering (placeholder)
# e.g. fill missing values with the mean or mode, convert categorical data
# to numeric, etc.

# Visualise the data: house price (Price) plotted against land size (Landsize)
landsize_col, price_col = 'Landsize', 'Price'
plt.scatter(data[landsize_col], data[price_col])
plt.title(f'Scatter Plot of {price_col} vs {landsize_col}')
plt.xlabel(landsize_col)
plt.ylabel(price_col)
plt.show()

In [None]:
# Keep only the numeric columns (int64 / float64) and compute their
# pairwise correlation matrix
numerical_features = data.select_dtypes(include=['int64', 'float64'])
correlation_matrix = numerical_features.corr()

# Draw the correlation matrix as an annotated heatmap
heatmap_options = {
    'annot': True,       # write the coefficient inside every cell
    'cmap': 'coolwarm',
    'fmt': ".2f",        # two decimals per annotation
}
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Heatmap of Correlation Matrix')
plt.show()

In [None]:
# Split the data into train and test sets (80% train, 20% test).
# 'Price' is the regression target; every other column is a feature.
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the categorical features of each split separately.
# NOTE(review): get_dummies expands every non-numeric column; if the data
# contains high-cardinality text columns (e.g. free-form addresses) the
# encoded matrix becomes very wide -- consider dropping such columns first.
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)

# Dummy columns that exist in the training split but not in the test split
# (kept as a notebook-level name for inspection).
missing_cols = set(X_train_encoded.columns) - set(X_test_encoded.columns)

# Align the test matrix with the training matrix in one step: columns missing
# from the test split are created and filled with 0, columns seen only in the
# test split are dropped, and the column order follows the training split.
# A single reindex avoids inserting columns one by one, which fragments the
# DataFrame and is slow when many columns are missing.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Replace NaN values with the column mean. The imputer is fitted on the
# training split only and then applied to the test split, so no statistics
# leak from the test data.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_encoded)
X_test_imputed = imputer.transform(X_test_encoded)

# Train the three regressors on the imputed training data.
# random_state is fixed (matching the split above) so that the reported
# MAE values are reproducible from run to run.
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train_imputed, y_train)

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_imputed, y_train)

dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train_imputed, y_train)

# Evaluate every model on the held-out test split using mean absolute error
y_pred_xgb = xgb_model.predict(X_test_imputed)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print("MAE (XGBoost):", mae_xgb)

y_pred_rf = rf_model.predict(X_test_imputed)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print("MAE (Random Forest):", mae_rf)

y_pred_dt = dt_model.predict(X_test_imputed)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
print("MAE (Decision Tree):", mae_dt)