In [1]:
# Sel 1: Import libraries
import re
import matplotlib.pyplot as plt
import logging
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Konfigurasi logging
logging.basicConfig(level=logging.INFO)

In [2]:
# Sel 2: Load data
logging.info("1. Memuat data...")
columns = ['Harga_Normalized', 'Kamar_Normalized', 'WC_Normalized', 'Parkir_Normalized',
           'Luas_Tanah_Normalized', 'Luas_Bangunan_Normalized', 'Judul_Clean', 'Lokasi_Clean',
           'Deskripsi_Clean', 'Keywords_Clean', 'Image_Link', 'Property_Link']
df = pd.read_csv('databaru.csv', encoding='utf-8')
df = df[columns]
print(df.head())
print(df.info())



INFO:root:1. Memuat data...


   Harga_Normalized  Kamar_Normalized  WC_Normalized  Parkir_Normalized  \
0          0.161530          0.333333          0.375               0.00   
1          0.235080          0.444444          0.375               0.00   
2          0.132110          0.222222          0.250               0.05   
3          0.183595          0.333333          0.125               0.05   
4          0.183595          0.333333          0.125               0.05   

   Luas_Tanah_Normalized  Luas_Bangunan_Normalized  \
0               0.410169                  0.595041   
1               0.532203                  0.652893   
2               0.433898                  0.450413   
3               0.372881                  0.690083   
4               0.372881                  0.690083   

                                         Judul_Clean           Lokasi_Clean  \
0  layout keren efisien rumah full furnished deka...      sleman yogyakarta   
1    mewah kolam renang sleman dekat resto jejamuran          slem

In [3]:
# Sel 3: Preprocessing data
logging.info("2. Mempersiapkan fitur dan target...")

# Gabungkan teks yang sudah dibersihkan
df['text_combined'] = df['Judul_Clean'] + ' ' + \
                      df['Lokasi_Clean'] + ' ' + \
                      df['Deskripsi_Clean']

# Hitung panjang dokumen
df['doc_length'] = df['text_combined'].str.len()

# Filter dokumen dengan panjang > 3 karakter
df = df[df['text_combined'].str.strip().str.len() > 3]

print("\nJumlah dokumen setelah preprocessing:", len(df))


INFO:root:2. Mempersiapkan fitur dan target...



Jumlah dokumen setelah preprocessing: 5720


In [4]:
# Sel 4: TF-IDF Vectorization
logging.info("3. Melakukan TF-IDF Vectorization...")
tfidf = TfidfVectorizer(max_features=1000, min_df=1, stop_words=None)
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
    
try:
    text_features = tfidf.fit_transform(df['text_combined']).toarray()
except ValueError as e:
    print("\n[ERROR] Terjadi error dalam TF-IDF Vectorization:", e)
    print("\n[DEBUG] Contoh isi text_combined (5 dokumen pertama):")
    print(df['text_combined'].head())
    print("\n[DEBUG] Distribusi panjang dokumen:")
    print(df['text_combined'].str.len().describe())
    raise

print("\n[DEBUG] Fitur TF-IDF berhasil dibuat:")
print("Shape:", text_features.shape)
print("Contoh fitur (baris pertama):", text_features[0])



INFO:root:3. Melakukan TF-IDF Vectorization...



[DEBUG] Fitur TF-IDF berhasil dibuat:
Shape: (5720, 1000)
Contoh fitur (baris pertama): [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0. 

In [5]:
# Sel 5: Prepare features and target
logging.info("4. Mempersiapkan fitur dan target...")
features = text_features
target = df['Harga_Normalized'].values

# Hapus sample yang memiliki NaN
mask = ~np.isnan(target)
features = features[mask]
target = target[mask]



INFO:root:4. Mempersiapkan fitur dan target...


In [6]:
# Sel 6: Split data
logging.info("5. Membagi data latih dan uji...")
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

INFO:root:5. Membagi data latih dan uji...


In [7]:
# Sel 7: Persiapan Data

# Persiapan fitur teks
tfidf = TfidfVectorizer(max_features=1000, min_df=1, stop_words=None)
text_features = tfidf.fit_transform(df['text_combined']).toarray()

# Simpan TfidfVectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

print("\n[DEBUG] Fitur TF-IDF berhasil dibuat:")
print("Shape:", text_features.shape)
print("Contoh fitur (baris pertama):", text_features[0])

# Persiapan fitur numerik
numeric_features = np.column_stack([
    df[[
        'Harga_Normalized', 'Kamar_Normalized', 'WC_Normalized',
        'Parkir_Normalized', 'Luas_Tanah_Normalized', 'Luas_Bangunan_Normalized'
    ]].values,
    df['Harga_Normalized'] / df['Luas_Bangunan_Normalized'],
    df['Luas_Bangunan_Normalized'] / df['Luas_Tanah_Normalized'],
    df['Kamar_Normalized'] * df['WC_Normalized']
])

print("\n[DEBUG] Fitur numerik berhasil dibuat:")
print("Shape:", numeric_features.shape)
print("Contoh fitur (baris pertama):", numeric_features[0])

# Target
target = df['Harga_Normalized'].values

# Hapus sample yang memiliki NaN
mask = ~np.isnan(target)
text_features = text_features[mask]
numeric_features = numeric_features[mask]
target = target[mask]


[DEBUG] Fitur TF-IDF berhasil dibuat:
Shape: (5720, 1000)
Contoh fitur (baris pertama): [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0. 

In [8]:
# Sel 8: Split data
logging.info("5. Membagi data latih dan uji...")
X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    text_features, numeric_features, target, test_size=0.2, random_state=42
)
X_text_train, X_text_val, X_num_train, X_num_val, y_train, y_val = train_test_split(
    X_text_train, X_num_train, y_train, test_size=0.2, random_state=42
)


INFO:root:5. Membagi data latih dan uji...


In [9]:
# Sel 9: Create and train text model
logging.info("6. Membuat dan melatih model teks...")
text_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_text_train.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1)
])

text_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

text_history = text_model.fit(
    X_text_train, y_train, 
    validation_data=(X_text_val, y_val),
    epochs=1000, 
    batch_size=32, 
    callbacks=[early_stopping],
    verbose=1
)

INFO:root:6. Membuat dan melatih model teks...
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0566 - mae: 0.1318 - val_loss: 0.0470 - val_mae: 0.1201
Epoch 2/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0469 - mae: 0.1210 - val_loss: 0.0439 - val_mae: 0.1104
Epoch 3/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0370 - mae: 0.1043 - val_loss: 0.0434 - val_mae: 0.1059
Epoch 4/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0299 - mae: 0.0947 - val_loss: 0.0442 - val_mae: 0.1027
Epoch 5/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0215 - mae: 0.0770 - val_loss: 0.0458 - val_mae: 0.1017
Epoch 6/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0174 - mae: 0.0701 - val_loss: 0.0457 - val_mae: 0.0974
Epoch 7/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[

In [10]:
# Sel 10: Create and train numeric model
logging.info("7. Membuat dan melatih model numerik...")
numeric_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_num_train.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1)
])

numeric_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

numeric_history = numeric_model.fit(
    X_num_train, y_train, 
    validation_data=(X_num_val, y_val),
    epochs=1000, 
    batch_size=32, 
    callbacks=[early_stopping],
    verbose=1
)

INFO:root:7. Membuat dan melatih model numerik...


Epoch 1/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0423 - mae: 0.1058 - val_loss: 0.0040 - val_mae: 0.0319
Epoch 2/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0102 - mae: 0.0519 - val_loss: 0.0024 - val_mae: 0.0235
Epoch 3/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0068 - mae: 0.0380 - val_loss: 0.0020 - val_mae: 0.0178
Epoch 4/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 992us/step - loss: 0.0054 - mae: 0.0348 - val_loss: 0.0030 - val_mae: 0.0229
Epoch 5/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 973us/step - loss: 0.0045 - mae: 0.0330 - val_loss: 0.0026 - val_mae: 0.0224
Epoch 6/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 996us/step - loss: 0.0040 - mae: 0.0293 - val_loss: 0.0026 - val_mae: 0.0213
Epoch 7/1000
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [11]:
# Sel 11: Evaluate models
logging.info("8. Mengevaluasi model...")

# Visualisasi Training Text Model
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.title('Text Model Training Performance')
plt.plot(text_history.history['loss'], label='Training Loss')
plt.plot(text_history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss (MSE)')
plt.legend()

plt.subplot(1, 2, 2)
plt.title('Text Model Training MAE')
plt.plot(text_history.history['mae'], label='Training MAE')
plt.plot(text_history.history['val_mae'], label='Validation MAE')
plt.xlabel('Epochs')
plt.ylabel('Mean Absolute Error')
plt.legend()

plt.tight_layout()
plt.savefig('text_model_training_performance.png')
plt.close()

# Visualisasi Training Numeric Model
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.title('Numeric Model Training Performance')
plt.plot(numeric_history.history['loss'], label='Training Loss')
plt.plot(numeric_history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss (MSE)')
plt.legend()

plt.subplot(1, 2, 2)
plt.title('Numeric Model Training MAE')
plt.plot(numeric_history.history['mae'], label='Training MAE')
plt.plot(numeric_history.history['val_mae'], label='Validation MAE')
plt.xlabel('Epochs')
plt.ylabel('Mean Absolute Error')
plt.legend()

plt.tight_layout()
plt.savefig('numeric_model_training_performance.png')
plt.close()

# Scatter Plots untuk Actual vs Predicted Values
# Prediksi untuk model teks
y_pred_text = text_model.predict(X_text_test).flatten()
y_pred_numeric = numeric_model.predict(X_num_test).flatten()

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.title('Text Model: Actual vs Predicted')
plt.scatter(y_test, y_pred_text, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')

plt.subplot(1, 2, 2)
plt.title('Numeric Model: Actual vs Predicted')
plt.scatter(y_test, y_pred_numeric, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')

plt.tight_layout()
plt.savefig('model_prediction_comparison.png')
plt.close()

print("Visualization images have been saved:")
print("1. text_model_training_performance.png")
print("2. numeric_model_training_performance.png")
print("3. model_prediction_comparison.png")

INFO:root:8. Mengevaluasi model...


[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Visualization images have been saved:
1. text_model_training_performance.png
2. numeric_model_training_performance.png
3. model_prediction_comparison.png


In [12]:
# Sel 12: Save models
logging.info("9. Menyimpan model...")
text_model.save('text_model.keras')
numeric_model.save('numeric_model.keras')

logging.info("Proses pelatihan model selesai.")

INFO:root:9. Menyimpan model...
INFO:root:Proses pelatihan model selesai.
