In [None]:
!pip install keras_tuner
!pip install scikeras

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks, optimizers, regularizers
import os
import shap
import keras_tuner as kt
from scikeras.wrappers import KerasRegressor, KerasClassifier
from sklearn.ensemble import VotingRegressor, VotingClassifier
from sklearn.model_selection import RandomizedSearchCV

In [7]:
# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

In [11]:
# Load and preprocess data
print("Loading and preprocessing data...")
df = pd.read_csv("RegresiUTSTelkom.csv")
df = df.dropna()
df = df.drop_duplicates()

Loading and preprocessing data...


In [12]:
# Display basic information
print("Data shape:", df.shape)
df.info()
print("\nDescriptive statistics:")
print(df.describe())

Data shape: (515130, 91)
<class 'pandas.core.frame.DataFrame'>
Index: 515130 entries, 0 to 515343
Data columns (total 91 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   2001        515130 non-null  int64  
 1   49.94357    515130 non-null  float64
 2   21.47114    515130 non-null  float64
 3   73.0775     515130 non-null  float64
 4   8.74861     515130 non-null  float64
 5   -17.40628   515130 non-null  float64
 6   -13.09905   515130 non-null  float64
 7   -25.01202   515130 non-null  float64
 8   -12.23257   515130 non-null  float64
 9   7.83089     515130 non-null  float64
 10  -2.46783    515130 non-null  float64
 11  3.32136     515130 non-null  float64
 12  -2.31521    515130 non-null  float64
 13  10.20556    515130 non-null  float64
 14  611.10913   515130 non-null  float64
 15  951.0896    515130 non-null  float64
 16  698.11428   515130 non-null  float64
 17  408.98485   515130 non-null  float64
 18  383.70912   515130 non-n

In [13]:
# Remove outliers using IQR method
print("Removing outliers...")
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
mask = ~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)
df_clean = df[mask]
print(f"Removed {df.shape[0] - df_clean.shape[0]} outliers")

Removing outliers...
Removed 335697 outliers


In [14]:
# Define target variables
REG_TARGET = df_clean.columns[0]
df_clean['target_clf'] = (df_clean[REG_TARGET] > df_clean[REG_TARGET].median()).astype(int)
CLS_TARGET = 'target_clf'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['target_clf'] = (df_clean[REG_TARGET] > df_clean[REG_TARGET].median()).astype(int)


In [15]:
# Visualize target distributions
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
sns.histplot(df_clean[REG_TARGET], kde=True, ax=axes[0])
axes[0].set_title('Distribution of Regression Target')
sns.countplot(x=df_clean[CLS_TARGET], ax=axes[1])
axes[1].set_title('Distribution of Classification Target')
plt.tight_layout()
plt.savefig('target_distributions.png')
plt.close()

In [16]:
# Create correlation matrix
plt.figure(figsize=(12, 10))
corr_matrix = df_clean.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.tight_layout()
plt.savefig('correlation_matrix.png')
plt.close()

In [17]:
# Feature engineering
print("Performing feature engineering...")
features = [c for c in df_clean.columns if c not in [REG_TARGET, CLS_TARGET]]
numeric_feats = df_clean[features].select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_feats = df_clean[features].select_dtypes(include=['object', 'category']).columns.tolist()

Performing feature engineering...


In [18]:
# Advanced preprocessing pipeline with feature selection
def create_advanced_pipeline(numeric_features, categorical_features):
    """
    Create an advanced preprocessing pipeline with:
    - PowerTransformer for numeric features to handle skewed distributions
    - StandardScaler for normalized numeric features
    - OneHotEncoder for categorical features
    - Feature selection using SelectKBest
    """
    numeric_pipeline = Pipeline([
        ('power_transform', PowerTransformer(method='yeo-johnson', standardize=False)),
        ('scaler', StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numeric_features),
            ('cat', categorical_pipeline, categorical_features)
        ],
        remainder='drop'
    )

    return preprocessor

In [19]:
# Create the advanced preprocessing pipeline
preprocessor = create_advanced_pipeline(numeric_feats, categorical_feats)

In [20]:
# Split data into train and test sets
print("Splitting data into train and test sets...")
X, y_reg, y_clf = df_clean[features], df_clean[REG_TARGET], df_clean[CLS_TARGET]
X_train, X_test, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X, y_clf, test_size=0.2, stratify=y_clf, random_state=42
)

Splitting data into train and test sets...


In [21]:
# Preprocess data
print("Preprocessing data...")
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
X_train_clf_processed = preprocessor.transform(X_train_clf)
X_test_clf_processed = preprocessor.transform(X_test_clf)

Preprocessing data...


In [22]:
# Get feature names after preprocessing
feature_names = []
if len(numeric_feats) > 0:
    feature_names.extend(numeric_feats)
if len(categorical_feats) > 0:
    ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
    cat_features = ohe.get_feature_names_out(categorical_feats)
    feature_names.extend(cat_features.tolist())

In [23]:
# Feature selection for regression
print("Performing feature selection for regression...")
k_best = min(30, X_train_processed.shape[1])  # Select top 30 features or less if fewer features
feature_selector = SelectKBest(f_regression, k=k_best)
X_train_reg_selected = feature_selector.fit_transform(X_train_processed, y_train_reg)
X_test_reg_selected = feature_selector.transform(X_test_processed)

Performing feature selection for regression...


In [24]:
# Feature selection for classification
print("Performing feature selection for classification...")
feature_selector_clf = SelectKBest(mutual_info_regression, k=k_best)
X_train_clf_selected = feature_selector_clf.fit_transform(X_train_clf_processed, y_train_clf)
X_test_clf_selected = feature_selector_clf.transform(X_test_clf_processed)

Performing feature selection for classification...


In [30]:
# Advanced Regression Model
def create_advanced_reg_model(input_dim):
    """Create an improved regression model with residual connections and regularization"""
    inputs = layers.Input(shape=(input_dim,))

    # First block
    x = layers.Dense(256, kernel_regularizer=regularizers.l2(0.001))(inputs)
    x = layers.LeakyReLU(alpha=0.1)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    # Residual block 1
    res = x
    x = layers.Dense(256, kernel_regularizer=regularizers.l2(0.001))(x)
    x = layers.LeakyReLU(alpha=0.1)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.add([x, res])  # Residual connection

    # Second block
    x = layers.Dense(128, kernel_regularizer=regularizers.l2(0.001))(x)
    x = layers.LeakyReLU(alpha=0.1)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)

    # Residual block 2
    res = layers.Dense(128)(x)  # Match dimensions
    x = layers.Dense(128, kernel_regularizer=regularizers.l2(0.001))(x)
    x = layers.LeakyReLU(alpha=0.1)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)
    x = layers.add([x, res])  # Residual connection

    # Output layer
    outputs = layers.Dense(1, activation='linear')(x)

    model = models.Model(inputs=inputs, outputs=outputs, name="advanced_regressor")

    # FIX: Use tf.keras.losses.Huber() instead of 'huber_loss' string
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.001),
        loss=tf.keras.losses.Huber(),  # Huber loss is more robust to outliers
        metrics=[
            tf.keras.metrics.RootMeanSquaredError(),
            tf.keras.metrics.MeanAbsoluteError()
        ]
    )

    return model

In [31]:
# Cross-validation for regression model
print("Performing cross-validation for regression model...")
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
reg_cv_scores = []

for train_idx, val_idx in kfold.split(X_train_reg_selected):
    # Split data
    X_train_fold, X_val_fold = X_train_reg_selected[train_idx], X_train_reg_selected[val_idx]
    y_train_fold, y_val_fold = y_train_reg.iloc[train_idx], y_train_reg.iloc[val_idx]

    # Create and train model
    reg_model = create_advanced_reg_model(X_train_reg_selected.shape[1])
    early_stop = callbacks.EarlyStopping(monitor='val_root_mean_squared_error', patience=15, restore_best_weights=True)
    reg_model.fit(
        X_train_fold, y_train_fold,
        validation_data=(X_val_fold, y_val_fold),
        epochs=100, batch_size=32,
        callbacks=[early_stop],
        verbose=0
    )

    # Evaluate
    y_pred = reg_model.predict(X_val_fold).flatten()
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
    r2 = r2_score(y_val_fold, y_pred)
    reg_cv_scores.append((rmse, r2))

Performing cross-validation for regression model...
[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step




[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step




[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step




[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step




[1m898/898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [32]:
# Print cross-validation results
print("\nRegression Cross-Validation Results:")
rmse_scores = [score[0] for score in reg_cv_scores]
r2_scores = [score[1] for score in reg_cv_scores]
print(f"Mean RMSE: {np.mean(rmse_scores):.4f} (±{np.std(rmse_scores):.4f})")
print(f"Mean R²: {np.mean(r2_scores):.4f} (±{np.std(r2_scores):.4f})")


Regression Cross-Validation Results:
Mean RMSE: 6.6476 (±0.0498)
Mean R²: 0.3313 (±0.0110)


In [34]:
# Train final regression model
print("\nTraining final regression model...")
final_reg_model = create_advanced_reg_model(X_train_reg_selected.shape[1])
reg_callbacks = [
    callbacks.EarlyStopping(monitor='val_root_mean_squared_error', patience=20, restore_best_weights=True),
    callbacks.ModelCheckpoint('advanced_regressor.h5', save_best_only=True, monitor='val_root_mean_squared_error'),
    callbacks.ReduceLROnPlateau(monitor='val_root_mean_squared_error', factor=0.5, patience=10, min_lr=1e-6)
]

history_reg = final_reg_model.fit(
    X_train_reg_selected, y_train_reg,
    validation_split=0.2,
    epochs=25, batch_size=32,
    callbacks=reg_callbacks,
    verbose=1
)


Training final regression model...




Epoch 1/25
[1m3582/3589[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - loss: 510.2239 - mean_absolute_error: 509.9566 - root_mean_squared_error: 832.1047



[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 9ms/step - loss: 509.4441 - mean_absolute_error: 509.1766 - root_mean_squared_error: 831.3063 - val_loss: 7.5780 - val_mean_absolute_error: 7.1428 - val_root_mean_squared_error: 8.6081 - learning_rate: 0.0010
Epoch 2/25
[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 8ms/step - loss: 23.5234 - mean_absolute_error: 23.0640 - root_mean_squared_error: 29.1575 - val_loss: 8.1499 - val_mean_absolute_error: 7.5785 - val_root_mean_squared_error: 10.0753 - learning_rate: 0.0010
Epoch 3/25
[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 9ms/step - loss: 19.8516 - mean_absolute_error: 19.2638 - root_mean_squared_error: 24.2295 - val_loss: 7.6566 - val_mean_absolute_error: 7.0020 - val_root_mean_squared_error: 9.2125 - learning_rate: 0.0010
Epoch 4/25
[1m3589/3



[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 10ms/step - loss: 16.6630 - mean_absolute_error: 16.4068 - root_mean_squared_error: 20.5793 - val_loss: 7.2349 - val_mean_absolute_error: 7.0194 - val_root_mean_squared_error: 8.2254 - learning_rate: 0.0010
Epoch 9/25
[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 9ms/step - loss: 16.3839 - mean_absolute_error: 16.1783 - root_mean_squared_error: 20.3417 - val_loss: 8.2883 - val_mean_absolute_error: 8.0967 - val_root_mean_squared_error: 10.1785 - learning_rate: 0.0010
Epoch 10/25
[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 9ms/step - loss: 17.2029 - mean_absolute_error: 17.0165 - root_mean_squared_error: 21.3129 - val_loss: 11.8172 - val_mean_absolute_error: 11.6445 - val_root_mean_squared_error: 13.2880 - learning_rate: 0.0010
Epoch 11/25
[1m3589/3



[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 9ms/step - loss: 16.3205 - mean_absolute_error: 16.1540 - root_mean_squared_error: 20.1891 - val_loss: 6.6693 - val_mean_absolute_error: 6.5128 - val_root_mean_squared_error: 7.7490 - learning_rate: 0.0010
Epoch 12/25
[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 8ms/step - loss: 16.2586 - mean_absolute_error: 16.1088 - root_mean_squared_error: 20.3301 - val_loss: 7.6220 - val_mean_absolute_error: 7.4648 - val_root_mean_squared_error: 8.8139 - learning_rate: 0.0010
Epoch 13/25
[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 8ms/step - loss: 16.7622 - mean_absolute_error: 16.6080 - root_mean_squared_error: 20.9200 - val_loss: 11.8119 - val_mean_absolute_error: 11.6490 - val_root_mean_squared_error: 12.7474 - learning_rate: 0.0010
Epoch 14/25
[1m3589/35



[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 8ms/step - loss: 15.9053 - mean_absolute_error: 15.7688 - root_mean_squared_error: 19.8301 - val_loss: 5.3180 - val_mean_absolute_error: 5.1716 - val_root_mean_squared_error: 6.8608 - learning_rate: 0.0010
Epoch 21/25
[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 8ms/step - loss: 16.2806 - mean_absolute_error: 16.1501 - root_mean_squared_error: 20.2075 - val_loss: 5.5200 - val_mean_absolute_error: 5.3712 - val_root_mean_squared_error: 7.4926 - learning_rate: 0.0010
Epoch 22/25
[1m3589/3589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 8ms/step - loss: 16.3411 - mean_absolute_error: 16.2194 - root_mean_squared_error: 20.3686 - val_loss: 14.1631 - val_mean_absolute_error: 14.0622 - val_root_mean_squared_error: 15.1889 - learning_rate: 0.0010
Epoch 23/25
[1m3589/35

In [35]:
# Plot training history for regression
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history_reg.history['root_mean_squared_error'])
plt.plot(history_reg.history['val_root_mean_squared_error'])
plt.title('Model RMSE')
plt.ylabel('RMSE')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

plt.subplot(1, 2, 2)
plt.plot(history_reg.history['mean_absolute_error'])
plt.plot(history_reg.history['val_mean_absolute_error'])
plt.title('Model MAE')
plt.ylabel('MAE')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.tight_layout()
plt.savefig('regression_training_history.png')
plt.close()

In [36]:
# Evaluate regression model on test set
print("\nEvaluating regression model on test set...")
y_pred_reg = final_reg_model.predict(X_test_reg_selected).flatten()
mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_reg, y_pred_reg)
print(f'MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}')


Evaluating regression model on test set...
[1m1122/1122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
MSE: 46.7271, RMSE: 6.8357, R²: 0.2973


In [37]:
# Plot actual vs predicted for regression
plt.figure(figsize=(8, 8))
plt.scatter(y_test_reg, y_pred_reg, alpha=0.5)
plt.plot(
    [y_test_reg.min(), y_test_reg.max()],
    [y_test_reg.min(), y_test_reg.max()],
    'r--'
)
plt.title('Actual vs Predicted (Regression)')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.grid(True)
plt.tight_layout()
plt.savefig('regression_actual_vs_predicted.png')
plt.close()

In [38]:
# Advanced Classification Model with Hyperparameter Tuning
print("\nPreparing data for classification model...")


Preparing data for classification model...


In [39]:
# Handle class imbalance with SMOTETomek (combines over-sampling and under-sampling)
print("Applying SMOTETomek for handling class imbalance...")
smote_tomek = SMOTETomek(random_state=42)
X_train_clf_resampled, y_train_clf_resampled = smote_tomek.fit_resample(X_train_clf_selected, y_train_clf)

def build_advanced_clf_model(hp):
    """Create an improved classification model with hyperparameter tuning"""
    inputs = layers.Input(shape=(X_train_clf_selected.shape[1],))

    # First layer
    x = layers.Dense(
        hp.Int('units_1', min_value=64, max_value=256, step=32),
        kernel_regularizer=regularizers.l2(hp.Float('l2_1', 1e-4, 1e-2, sampling='log'))
    )(inputs)
    x = layers.LeakyReLU(alpha=hp.Float('alpha_1', 0.05, 0.3, step=0.05))(x)
    if hp.Boolean('batch_norm_1'):
        x = layers.BatchNormalization()(x)
    x = layers.Dropout(hp.Float('dropout_1', 0.2, 0.5, step=0.1))(x)

    # Hidden layers
    for i in range(2, hp.Int('num_layers', 2, 5) + 1):
        x = layers.Dense(
            hp.Int(f'units_{i}', min_value=32, max_value=128, step=32),
            kernel_regularizer=regularizers.l2(hp.Float(f'l2_{i}', 1e-4, 1e-2, sampling='log'))
        )(x)
        x = layers.LeakyReLU(alpha=hp.Float(f'alpha_{i}', 0.05, 0.3, step=0.05))(x)
        if hp.Boolean(f'batch_norm_{i}'):
            x = layers.BatchNormalization()(x)
        x = layers.Dropout(hp.Float(f'dropout_{i}', 0.1, 0.4, step=0.1))(x)

    # Output layer
    outputs = layers.Dense(1, activation='sigmoid')(x)

    model = models.Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer=optimizers.Adam(
            learning_rate=hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')
        ),
        loss='binary_crossentropy',
        metrics=['accuracy',
                 tf.keras.metrics.AUC(name='auc'),
                 tf.keras.metrics.Precision(name='precision'),
                 tf.keras.metrics.Recall(name='recall')]
    )

    return model

Applying SMOTETomek for handling class imbalance...


In [44]:
print("Setting up hyperparameter tuning for classification model...")
tuner = kt.BayesianOptimization(
    build_advanced_clf_model,
    objective=kt.Objective('val_auc', direction='max'),
    max_trials=5,
    directory='kt_dir_advanced',
    project_name='advanced_mlp_clf',
    overwrite=True
)

Setting up hyperparameter tuning for classification model...




In [45]:
# Callbacks for tuner
stop_early = callbacks.EarlyStopping(monitor='val_auc', patience=10, restore_best_weights=True)

In [47]:
print("Starting hyperparameter search...")
tuner.search(
    X_train_clf_resampled, y_train_clf_resampled,
    epochs=3,
    validation_split=0.2,
    callbacks=[stop_early],
    verbose=1
)

Trial 5 Complete [00h 00m 57s]
val_auc: 0.8023454546928406

Best val_auc So Far: 0.8067225813865662
Total elapsed time: 00h 05m 44s


In [48]:
# Get best hyperparameters
best_hp = tuner.get_best_hyperparameters(1)[0]
print("\nBest hyperparameters:", best_hp.values)


Best hyperparameters: {'units_1': 256, 'l2_1': 0.0003602330794921003, 'alpha_1': 0.25, 'batch_norm_1': True, 'dropout_1': 0.2, 'num_layers': 3, 'units_2': 96, 'l2_2': 0.0019990629503147854, 'alpha_2': 0.2, 'batch_norm_2': True, 'dropout_2': 0.2, 'learning_rate': 0.0033439519045640126, 'units_3': 32, 'l2_3': 0.0001, 'alpha_3': 0.05, 'batch_norm_3': False, 'dropout_3': 0.1}


In [49]:
# Build and train best model
print("\nTraining final classification model with best hyperparameters...")
best_model = tuner.hypermodel.build(best_hp)
clf_callbacks = [
    callbacks.EarlyStopping(monitor='val_auc', patience=15, restore_best_weights=True),
    callbacks.ModelCheckpoint('advanced_classifier.h5', save_best_only=True, monitor='val_auc'),
    callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=7, min_lr=1e-6)
]

history_clf = best_model.fit(
    X_train_clf_resampled, y_train_clf_resampled,
    validation_split=0.2,
    epochs=5,
    batch_size=32,
    callbacks=clf_callbacks,
    verbose=1
)


Training final classification model with best hyperparameters...
Epoch 1/5
[1m3554/3555[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.7092 - auc: 0.7722 - loss: 0.6849 - precision: 0.6842 - recall: 0.7208



[1m3555/3555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - accuracy: 0.7092 - auc: 0.7722 - loss: 0.6848 - precision: 0.6842 - recall: 0.7208 - val_accuracy: 0.7260 - val_auc: 0.8023 - val_loss: 0.5796 - val_precision: 0.8010 - val_recall: 0.7182 - learning_rate: 0.0033
Epoch 2/5
[1m3549/3555[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.7239 - auc: 0.7898 - loss: 0.5800 - precision: 0.6944 - recall: 0.7482



[1m3555/3555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5ms/step - accuracy: 0.7239 - auc: 0.7898 - loss: 0.5800 - precision: 0.6944 - recall: 0.7482 - val_accuracy: 0.7288 - val_auc: 0.8013 - val_loss: 0.5693 - val_precision: 0.7947 - val_recall: 0.7340 - learning_rate: 0.0033
Epoch 3/5
[1m3555/3555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 6ms/step - accuracy: 0.7257 - auc: 0.7921 - loss: 0.5700 - precision: 0.6980 - recall: 0.7452 - val_accuracy: 0.7318 - val_auc: 0.8042 - val_loss: 0.5668 - val_precision: 0.7965 - val_recall: 0.7380 - learning_rate: 0.0033
Epoch 4/5
[1m3555/3555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - accuracy: 0.7252 - auc: 0.7929 - loss: 0.5661 - precision: 0.6976 - recall: 0.7442 - val_accuracy: 0.7242 - val_auc: 0.8035 - val_loss: 0.5742 - val_precision: 0.8063 - val_recall: 0.7064 

In [50]:
# Plot training history for classification
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.plot(history_clf.history['accuracy'])
plt.plot(history_clf.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')

plt.subplot(1, 3, 2)
plt.plot(history_clf.history['auc'])
plt.plot(history_clf.history['val_auc'])
plt.title('Model AUC')
plt.ylabel('AUC')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')

plt.subplot(1, 3, 3)
plt.plot(history_clf.history['loss'])
plt.plot(history_clf.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.tight_layout()
plt.savefig('classification_training_history.png')
plt.close()

In [51]:
# Evaluate classification model
print("\nEvaluating classification model on test set...")
y_proba = best_model.predict(X_test_clf_selected)
y_pred = (y_proba > 0.5).astype(int)


Evaluating classification model on test set...
[1m1122/1122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [52]:
# Calculate metrics
acc = accuracy_score(y_test_clf, y_pred)
prec = precision_score(y_test_clf, y_pred)
rec = recall_score(y_test_clf, y_pred)
f1 = f1_score(y_test_clf, y_pred)
auc = roc_auc_score(y_test_clf, y_proba)

print(f'Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}')
print("\nClassification Report:")
print(classification_report(y_test_clf, y_pred))

Accuracy: 0.7186, Precision: 0.7042, Recall: 0.7083, F1: 0.7063, AUC: 0.7910

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.73      0.73     18745
           1       0.70      0.71      0.71     17142

    accuracy                           0.72     35887
   macro avg       0.72      0.72      0.72     35887
weighted avg       0.72      0.72      0.72     35887



In [53]:
# Visualize confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test_clf, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.close()

# Plot ROC curve
plt.figure(figsize=(8, 6))
fpr, tpr, _ = roc_curve(y_test_clf, y_proba)
plt.plot(fpr, tpr, label=f'AUC = {auc:.4f}')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('roc_curve.png')
plt.close()

# Plot Precision-Recall curve
plt.figure(figsize=(8, 6))
precision, recall, _ = precision_recall_curve(y_test_clf, y_proba)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('precision_recall_curve.png')
plt.close()

In [57]:
print("\nCalculating feature importance using permutation importance...")
try:
    from sklearn.inspection import permutation_importance

    # For regression model
    result_reg = permutation_importance(
        lambda X: final_reg_model.predict(X),
        X_test_reg_selected,
        y_test_reg,
        n_repeats=10,
        random_state=42,
        n_jobs=-1
    )

    # Plot feature importance for regression
    plt.figure(figsize=(10, 8))
    sorted_idx = result_reg.importances_mean.argsort()
    plt.barh(range(len(sorted_idx)), result_reg.importances_mean[sorted_idx])
    plt.yticks(range(len(sorted_idx)), [f"Feature {i}" for i in sorted_idx])
    plt.xlabel("Permutation Importance")
    plt.title("Feature Importance (Regression)")
    plt.tight_layout()
    plt.savefig('importance_regression.png')
    plt.close()

    # For classification model
    result_clf = permutation_importance(
        lambda X: best_model.predict(X),
        X_test_clf_selected,
        y_test_clf,
        n_repeats=10,
        random_state=42,
        n_jobs=-1
    )

    # Plot feature importance for classification
    plt.figure(figsize=(10, 8))
    sorted_idx = result_clf.importances_mean.argsort()
    plt.barh(range(len(sorted_idx)), result_clf.importances_mean[sorted_idx])
    plt.yticks(range(len(sorted_idx)), [f"Feature {i}" for i in sorted_idx])
    plt.xlabel("Permutation Importance")
    plt.title("Feature Importance (Classification)")
    plt.tight_layout()
    plt.savefig('importance_classification.png')
    plt.close()

except Exception as e:
    print(f"Error calculating feature importance: {e}")
    # Fallback to simple coefficient-based feature importance
    print("Falling back to simpler feature importance visualization...")

    # For regression - Extract weights from the first dense layer
    reg_weights = np.abs(final_reg_model.layers[1].get_weights()[0]).mean(axis=1)
    plt.figure(figsize=(10, 8))
    plt.bar(range(len(reg_weights)), reg_weights)
    plt.title('Simple Feature Importance (Regression)')
    plt.xlabel('Feature Index')
    plt.ylabel('Absolute Weight')
    plt.tight_layout()
    plt.savefig('simple_importance_regression.png')
    plt.close()

    # For classification - Extract weights from the first dense layer
    clf_weights = np.abs(best_model.layers[1].get_weights()[0]).mean(axis=1)
    plt.figure(figsize=(10, 8))
    plt.bar(range(len(clf_weights)), clf_weights)
    plt.title('Simple Feature Importance (Classification)')
    plt.xlabel('Feature Index')
    plt.ylabel('Absolute Weight')
    plt.tight_layout()
    plt.savefig('simple_importance_classification.png')
    plt.close()

print("\nModel training and evaluation complete!")


Calculating feature importance using permutation importance...
Error calculating feature importance: The 'estimator' parameter of permutation_importance must be an object implementing 'fit'. Got <function <lambda> at 0x7f47e4477380> instead.
Falling back to simpler feature importance visualization...

Model training and evaluation complete!


![My image](./target_distributions.png)
![My image](./correlation_matrix.png)
![My image](./regression_training_history.png)
![My image](./regression_actual_vs_predicted.png)
![My image](./classification_training_history.png)
![My image](./confusion_matrix.png)
![My image](./roc_curve.png)
![My image](./precision_recall_curve.png)
![My image](./simple_importance_regression.png)
![My image](./simple_importance_classification.png)

## Matriks Evaluasi Regresi (MSE: 46.7271, RMSE: 6.8357, R²: 0.2973)
Model menunjukkan performa yang kurang baik, dengan MSE dan RMSE yang cukup tinggi serta R² hanya 0.2973, yang berarti model hanya menjelaskan sekitar 30% variasi data. Perbaikan dapat dilakukan dengan mencoba algoritma lain atau menambahkan fitur yang lebih relevan.

## Matriks Evaluasi Klasifikasi
Model menunjukkan performa yang cukup baik, dengan **accuracy** 71.86%, **precision** 70.42%, **recall** 70.83%, dan **F1-score** 70.63%. **AUC** 0.7910 menunjukkan model mampu membedakan kelas dengan baik. Secara keseluruhan, kinerja model solid, meski masih bisa ditingkatkan.