<a href="https://colab.research.google.com/github/bintangnabiil/Machine_Learning/blob/main/Machine_Learning_Automobile%2C_Boston_House%2C_Infrared.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#A. Automobile Dataset

##1) Import Library dan Load Dataset

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder


df = pd.read_csv('Automobile.csv')

##2) Preprocessing

In [6]:
# Ganti ? dengan NaN
df.replace('?', np.nan, inplace=True)

# Ubah semua kolom numerik menjadi float jika memungkinkan
for col in df.columns:
    try:
        df[col] = df[col].astype(float)
    except:
        pass

# Pisahkan fitur numerik dan kategorikal
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Imputasi nilai numerik dengan median
imputer = SimpleImputer(strategy='median')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Encode kolom kategorikal
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Gabungkan semua kembali
X = df.drop(columns=['price'])  # Target: price
y = df['price']

# Buang baris dengan target kosong
X = X[y.notna()]
y = y[y.notna()]

##3) Split Data

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##4) Bagging

In [8]:
bagging_model = RandomForestRegressor(random_state=42)
bagging_model.fit(X_train, y_train)
y_pred_bagging = bagging_model.predict(X_test)

##5) Boosting

In [9]:
boosting_model = GradientBoostingRegressor(random_state=42)
boosting_model.fit(X_train, y_train)
y_pred_boosting = boosting_model.predict(X_test)

##6) Evaluasi Matriks

In [10]:
# === Evaluasi Model Bagging ===
mse_bagging = mean_squared_error(y_test, y_pred_bagging)
rmse_bagging = np.sqrt(mse_bagging)
r2_bagging = r2_score(y_test, y_pred_bagging)

print("=== Bagging (Random Forest) Evaluation ===")
print(f"MSE  : {mse_bagging:.2f}")
print(f"RMSE : {rmse_bagging:.2f}")
print(f"R2   : {r2_bagging:.2f}")

# === Evaluasi Model Boosting ===
mse_boosting = mean_squared_error(y_test, y_pred_boosting)
rmse_boosting = np.sqrt(mse_boosting)
r2_boosting = r2_score(y_test, y_pred_boosting)

print("\n=== Boosting (Gradient Boosting) Evaluation ===")
print(f"MSE  : {mse_boosting:.2f}")
print(f"RMSE : {rmse_boosting:.2f}")
print(f"R2   : {r2_boosting:.2f}")

=== Bagging (Random Forest) Evaluation ===
MSE  : 6346203.87
RMSE : 2519.17
R2   : 0.92

=== Boosting (Gradient Boosting) Evaluation ===
MSE  : 7170602.17
RMSE : 2677.80
R2   : 0.91


#B. Boston House Dataset

##1) Import Library

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

##2) Load Boston House Dataset

In [13]:
url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(url, sep="\s+", skiprows=22, header=None)

##3) Parse Dataset Structure

In [14]:
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

##4) Split Data

In [15]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

##5) Bagging

In [16]:
bagging_model = RandomForestRegressor(random_state=42)
bagging_model.fit(X_train, y_train)
y_pred_bagging = bagging_model.predict(X_test)

##6) Boosting

In [17]:
boosting_model = GradientBoostingRegressor(random_state=42)
boosting_model.fit(X_train, y_train)
y_pred_boosting = boosting_model.predict(X_test)

##7) Evaluasi Matriks

In [19]:
# === Evaluasi Bagging: Random Forest ===
mse_bagging = mean_squared_error(y_test, y_pred_bagging)
rmse_bagging = np.sqrt(mse_bagging)
r2_bagging = r2_score(y_test, y_pred_bagging)

print("=== Evaluasi Bagging (Random Forest) ===")
print("MSE     :", mse_bagging)
print("RMSE    :", rmse_bagging)
print("R2 Score:", r2_bagging)

# === Evaluasi Boosting: Gradient Boosting ===
mse_boosting = mean_squared_error(y_test, y_pred_boosting)
rmse_boosting = np.sqrt(mse_boosting)
r2_boosting = r2_score(y_test, y_pred_boosting)

print("\n=== Evaluasi Boosting (Gradient Boosting) ===")
print("MSE     :", mse_boosting)
print("RMSE    :", rmse_boosting)
print("R2 Score:", r2_boosting)

=== Evaluasi Bagging (Random Forest) ===
MSE     : 7.901513892156864
RMSE    : 2.8109631609391226
R2 Score: 0.8922527442109116

=== Evaluasi Boosting (Gradient Boosting) ===
MSE     : 6.208861361528038
RMSE    : 2.491758688462436
R2 Score: 0.9153342280466539


#C. Infrared

##1) Import Library

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import pandas as pd

##2) Load Dataset dan Preprocessing

In [3]:
# === Load dataset ===
df = pd.read_csv('Infrared.csv')

# === Encode semua kolom kategorikal (string) jadi numerik ===
label_encoders = {}
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # disimpan jika nanti ingin inverse_transform

# === Imputasi nilai NaN setelah semua kolom sudah numerik ===
imputer = SimpleImputer(strategy='median')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

##3) Pisah Fitur dan Target

In [5]:
X = df_imputed.drop(columns=['aveOralM'])
y = df_imputed['aveOralM']

##4) Train-Test Split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


##5) Bagging

In [7]:
bagging_model = RandomForestRegressor(random_state=42)
bagging_model.fit(X_train, y_train)
y_pred_bagging = bagging_model.predict(X_test)

##6) Boosting

In [8]:
boosting_model = GradientBoostingRegressor(random_state=42)
boosting_model.fit(X_train, y_train)
y_pred_boosting = boosting_model.predict(X_test)

##7) Evaluasi Matriks

In [9]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# === Evaluasi Bagging (Random Forest) ===
mse_bagging = mean_squared_error(y_test, y_pred_bagging)
rmse_bagging = np.sqrt(mse_bagging)
r2_bagging = r2_score(y_test, y_pred_bagging)

print("=== Random Forest Regressor (Bagging) ===")
print(f"MSE  : {mse_bagging:.4f}")
print(f"RMSE : {rmse_bagging:.4f}")
print(f"R²   : {r2_bagging:.4f}")

# === Evaluasi Boosting (Gradient Boosting) ===
mse_boosting = mean_squared_error(y_test, y_pred_boosting)
rmse_boosting = np.sqrt(mse_boosting)
r2_boosting = r2_score(y_test, y_pred_boosting)

print("\n=== Gradient Boosting Regressor ===")
print(f"MSE  : {mse_boosting:.4f}")
print(f"RMSE : {rmse_boosting:.4f}")
print(f"R²   : {r2_boosting:.4f}")

=== Random Forest Regressor (Bagging) ===
MSE  : 0.0568
RMSE : 0.2384
R²   : 0.7302

=== Gradient Boosting Regressor ===
MSE  : 0.0491
RMSE : 0.2217
R²   : 0.7666


#D. Penjelasan Persamaan Matematika
##1) Mean Squared Error (MSE)
MSE digunakan untuk MSE mengukur rata-rata kesalahan kuadrat dari prediksi model. Semakin kecil nilainya, semakin akurat model.
<br><br>
Rumus:

$$
MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^2
$$

Keterangan:
- $y_i$ = nilai aktual (ground truth) dari data ke-$i$
- $\hat{y}_i$ = nilai prediksi oleh model untuk data ke-$i$
- $n$ = jumlah total data pada set pengujian
<br><br>

##2) Root Mean Squared Error (RMSE)
RMSE lebih sensitif terhadap kesalahan besar, dan mudah diinterpretasikan karena satuannya sama dengan target aslinya.
<br><br>
Rumus:

$$
RMSE = \sqrt{MSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^2}
$$

Keterangan:
- RMSE merupakan akar dari MSE.
- Satuan RMSE sama seperti satuan target/output (berbeda dengan MSE yang satuannya kuadrat).
<br><br>

##3) R-squared (R$^2$) atau Koefisien Determinasi
Menunjukkan seberapa besar proporsi variansi target yang bisa dijelaskan oleh model. Nilai R$^2$ = 1 artinya prediksi sempurna. Kalau R$^2$ = 0 artinya model tidak lebih baik dari rata-rata. Jika negatif maka model sangat buruk.
<br><br>

Rumus:

$$
R^2 = 1 - \frac{\sum_{i=1}^{n}(y_i-\hat{y}_i)^2}{\sum_{i=1}^{n}(y_i-\overline{y}_i)^2}
$$

Keterangan:
- $\overline{y}$ = rata-rata dari semua nilai aktual $y$
- Pembilang adalah total kesalahan model (jumlah kuadrat residual).
- Penyebut adalah total variansi data aktual (jumlah kuadrat total).