# 4️⃣ Encode Data

#### 1️⃣ Import Library yang diperlukan

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tabulate import tabulate
from sklearn.model_selection import train_test_split

#### 2️⃣ Memuat Dataset

In [2]:
# Read the CSV at the path below into the working DataFrame `df`
dataset_path = '../data/processed/noEncoded_data_harga_rumah_kota_bandung.csv'
df = pd.read_csv(dataset_path)

# Confirmation message, printed only if the read above did not raise
print("✅ Dataset berhasil dimuat!")

✅ Dataset berhasil dimuat!


In [3]:
# Show the first rows as plain text, without the index column
head_text = df.head().to_string(index=False)
print("\nBeberapa data pertama:", head_text, sep="\n")

# Show the column summary; df.info() writes its report to stdout itself
print("\nInfo Dataset:")
df.info()


Beberapa data pertama:
       price  installment       location  bedroom_count  bathroom_count  carport_count  land_area  building_area
2100000000.0    7000000.0 Andir, Bandung              3               2              2      137.0          170.0
4100000000.0   15000000.0 Andir, Bandung              3               2              3      202.0          300.0
3300000000.0   12000000.0 Andir, Bandung              5               2              1      350.0          258.0
 580000000.0    2000000.0 Andir, Bandung              2               2              1       30.0           80.0
1300000000.0    4000000.0 Andir, Bandung             11               3              0      176.0          176.0

Info Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   price           5574 non-null   float64
 1   installment     5574 non-null   float

#### 3️⃣ Melakukan Label Encoding

In [4]:
# Fit a label encoder on the 'location' column and store the resulting
# integer codes in a new 'location_encoded' column
le = LabelEncoder()
location_codes = le.fit_transform(df["location"])
df["location_encoded"] = location_codes

# Build the location -> code lookup from the classes the encoder learned
class_codes = le.transform(le.classes_)
label_mapping = {name: code for name, code in zip(le.classes_, class_codes)}

# Reshape the lookup into [location, code] rows, the layout tabulate expects
mapping_data = [[name, code] for name, code in label_mapping.items()]

# Render the lookup as a grid table
mapping_table = tabulate(mapping_data, headers=["Lokasi", "Kode"], tablefmt="grid")
print("Mapping Label Encoding:")
print(mapping_table)

Mapping Label Encoding:
+---------------------------+--------+
| Lokasi                    |   Kode |
| Andir, Bandung            |      0 |
+---------------------------+--------+
| Antapani, Bandung         |      1 |
+---------------------------+--------+
| Arcamanik, Bandung        |      2 |
+---------------------------+--------+
| Astanaanyar, Bandung      |      3 |
+---------------------------+--------+
| Babakanciparay, Bandung   |      4 |
+---------------------------+--------+
| Bandung Kidul, Bandung    |      5 |
+---------------------------+--------+
| Bandung Kulon, Bandung    |      6 |
+---------------------------+--------+
| Bandung Wetan, Bandung    |      7 |
+---------------------------+--------+
| Batununggal, Bandung      |      8 |
+---------------------------+--------+
| Bojongloa Kidul, Bandung  |      9 |
+---------------------------+--------+
| Buah Batu, Bandung        |     10 |
+---------------------------+--------+
| Cibeunying Kidul, Bandung |     11 |
+

#### 4️⃣ Menyimpan Hasil Encoding dan Membagi Dataset (Train / Validation / Test)

In [5]:
# Derive the encoded frame WITHOUT mutating `df`. The previous in-place drop
# made this cell fail with KeyError on a second run, and also broke re-running
# the encoding cell, which reads df["location"]. `df` keeps its 'location'
# column and original row order; use `encoded_df` for the encoded-only view.
encoded_df = df.drop(columns=["location"])
print("Kolom 'location' berhasil dihapus!")

# Save the label-encoded dataset (still in the original row order)
encoded_df.to_csv("../data/processed/house_prices_label_encoded.csv", index=False)

print("\n✅ Label Encoding berhasil dilakukan dan data berhasil disimpan!")

# Shuffle the rows under a new name, so a re-run starts from the same input
# instead of reshuffling an already-shuffled frame
shuffled_df = encoded_df.sample(frac=1, random_state=42).reset_index(drop=True)

# First split: 70% train, 30% held out (validation + test)
train_df, temp_df = train_test_split(shuffled_df, test_size=0.3, random_state=42)

# Second split: halve the held-out part into validation and test (~15% each)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Sanity check: the three subsets must account for every row exactly once
assert len(train_df) + len(val_df) + len(test_df) == len(shuffled_df), \
    "train/val/test sizes do not add up to the full dataset"

# Write the three subsets to disk
train_df.to_csv("../data/processed/house_prices_train.csv", index=False)
val_df.to_csv("../data/processed/house_prices_val.csv", index=False)
test_df.to_csv("../data/processed/house_prices_test.csv", index=False)

print("✅ Dataset berhasil dibagi menjadi Train (70%), Validation (15%), dan Test (15%)!")


Kolom 'location' berhasil dihapus!

✅ Label Encoding berhasil dilakukan dan data berhasil disimpan!
✅ Dataset berhasil dibagi menjadi Train (70%), Validation (15%), dan Test (15%)!
