In [1]:
import pandas as pd
import numpy as np
import mysql.connector
from mysql.connector import Error
from datetime import datetime
import env
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def run_query(sql_name):
    # Connect to the MySQL database (replace the placeholders with your actual credentials)
    host = env.db_host
    user = env.db_user
    password = env.db_password
    database = env.db_schema
    port = 33144

    # Create a connection to the database
    connection = mysql.connector.connect(host=host, user=user, password=password, database=database, port=port)

    # Create a cursor object to execute the SQL query
    cursor = connection.cursor()

    # Define the SQL query
    current_dir = f"./"
    file_path = f'{current_dir}{sql_name}'

    # Read the content of the file into a variable
    with open(file_path, 'r') as file:
        query = file.read()

    # Execute the query
    cursor.execute(query)

    # Fetch all the results into a list of tuples
    results = cursor.fetchall()

    # Get the column names from the cursor description
    columns = [col[0] for col in cursor.description]

    # Close the cursor and the connection
    cursor.close()
    connection.close()

    # Create a DataFrame from the results and column names
    df = pd.DataFrame(results, columns=columns)
    return df

In [3]:
# mager maka auto hapus yg korelasi tinggi
def remove_highly_correlated_features(df, threshold=0.9):
    """
    Menghapus fitur yang memiliki korelasi lebih dari ambang batas tertentu.
    
    Parameters:
    - df: DataFrame yang berisi fitur-fitur yang akan diperiksa.
    - threshold: Ambang batas korelasi.
    
    Returns:
    - DataFrame yang telah dikurangi fiturnya.
    """
    # Hanya mengambil kolom numerik untuk korelasi
    numeric_df = df.select_dtypes(include=[np.number])
    correlation_matrix = numeric_df.corr().abs()
    
    # Dapatkan matriks segitiga atas dari matriks korelasi
    upper_triangle = correlation_matrix.where(
        np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
    
    #print(f"kolom dengan korelasi di atas {threshold}: {upper_triangle.columns}")
    
    # Temukan indeks kolom fitur yang memiliki korelasi lebih dari ambang batas
    to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)]
    
    return df.drop(columns=to_drop)

In [4]:
def manual_oversample(X, y):
    # Dapatkan indeks dari kelas minoritas
    minority_class_indices = y[y == 1].index
    majority_class_indices = y[y == 0].index
    
    # Hitung jumlah kelas mayoritas
    num_majority = len(y[y == 0])
    
    # Dapatkan sampel acak dari kelas minoritas
    random_minority_indices = np.random.choice(minority_class_indices, num_majority, replace=True)
    
    # Gabungkan indeks kelas mayoritas dengan sampel acak dari kelas minoritas
    over_sample_indices = np.concatenate([majority_class_indices, random_minority_indices])
    
    # Dapatkan data yang oversampled
    X_oversampled = X.loc[over_sample_indices]
    y_oversampled = y.loc[over_sample_indices]
    
    return X_oversampled, y_oversampled

In [5]:
# Ambil data dari DB
df = run_query("coba5.sql")
class_counts = df['target'].value_counts()
print(class_counts)

1    7000
0    7000
Name: target, dtype: int64


In [6]:
pd.options.display.max_rows = 150
df.head()

Unnamed: 0,reco_id_curr,contract_type_name,gender,own_car_flag,own_realty_flag,children_count,income,loan_body,annuity_payment,goods_price,...,document_20_flag,document_21_flag,requests_bki_hour,requests_bki_day,requests_bki_week,requests_bki_month,requests_bki_qrt,requests_bki_year,reco_id_curr.1,target
0,16a5efff61d95fd7f8de14d186a69c01,Cash loans,F,N,Y,2,99000.0,450000.0,30573.0,450000.0,...,0,0,0.0,0.0,0.0,0.0,0.0,1.0,16a5efff61d95fd7f8de14d186a69c01,1
1,cbb7d8fa971b58cd1b19943f2b62f6ac,Cash loans,M,Y,N,2,180000.0,761872.0,70006.5,675000.0,...,0,0,0.0,0.0,0.0,2.0,0.0,9.0,cbb7d8fa971b58cd1b19943f2b62f6ac,1
2,462a089891aa9e2f3b288835870e9dee,Cash loans,F,N,Y,0,99000.0,152820.0,9949.5,135000.0,...,0,0,0.0,0.0,0.0,0.0,1.0,0.0,462a089891aa9e2f3b288835870e9dee,1
3,0db25409d914be200df6d9acf230c9ab,Cash loans,M,N,Y,0,225000.0,746280.0,54436.5,675000.0,...,0,0,0.0,0.0,0.0,1.0,0.0,6.0,0db25409d914be200df6d9acf230c9ab,1
4,c4c32decd9635cda528435f1091d948a,Cash loans,F,N,Y,1,270000.0,753840.0,27823.5,540000.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,c4c32decd9635cda528435f1091d948a,1


In [7]:
df.isna().sum()

reco_id_curr                          0
contract_type_name                    0
gender                                0
own_car_flag                          0
own_realty_flag                       0
children_count                        0
income                                0
loan_body                             0
annuity_payment                       0
goods_price                           0
type_suite_name                       0
income_type_name                      0
education_type_name                   0
family_status_name                    0
housing_type_name                     0
population_relative_region            0
days_birth                            0
days_employed                         0
registration_timestamp                0
publication_timestamp                 0
age_own_car                           0
mobile_flag                           0
employee_phone_flag                   0
work_phone_flag                       0
mobile_contact_flag                   0


In [8]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import joblib

# Ambil data dari DB
#df = run_query("coba6.sql")

# Menghapus kolom ID
df.drop(columns=['reco_id_curr'], inplace=True)

# Mengisi missing values
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col].fillna(df[col].median(), inplace=True)
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Label Encoding kolom kategorikal
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split Train vs Test Data
X = df.drop("target", axis=1)
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Penskalaan fitur
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Resample
X_train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_train_df.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_train_resampled, y_train_resampled = manual_oversample(X_train_df, y_train)

In [9]:
# Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_resampled, y_train_resampled)

# Prediksi dengan kelas
y_pred = lr.predict(X_test_scaled)

# Prediksi dengan probabilitas
y_pred_prob = lr.predict_proba(X_test_scaled)[:, 1]

# Evaluasi model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
report = classification_report(y_test, y_pred)

print(f"Accuracy Score: {accuracy:.4f}")
print("Classification Report:\n", report)
print(f"ROC AUC Score: {roc_auc:.4f}")

# Menyimpan model, scaler, dan encoders
joblib.dump(lr, 'logistic_regression_model.pkl')
joblib.dump(scaler, 'data_scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

Accuracy Score: 0.6711
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.66      0.67      1385
           1       0.67      0.68      0.68      1415

    accuracy                           0.67      2800
   macro avg       0.67      0.67      0.67      2800
weighted avg       0.67      0.67      0.67      2800

ROC AUC Score: 0.7289




['label_encoders.pkl']

In [100]:
from sklearn.ensemble import RandomForestClassifier

# ... [kode sebelumnya]

# RandomForest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_resampled, y_train_resampled)

# Prediksi dengan kelas
y_pred_rf = rf.predict(X_test_scaled)

# Prediksi dengan probabilitas
y_pred_prob_rf = rf.predict_proba(X_test_scaled)[:, 1]

# Evaluasi model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_prob_rf)
report_rf = classification_report(y_test, y_pred_rf)

print(f"\nRandom Forest Results:")
print(f"Accuracy Score: {accuracy_rf:.4f}")
print("Classification Report:\n", report_rf)
print(f"ROC AUC Score: {roc_auc_rf:.4f}")

# Menyimpan model RandomForest
joblib.dump(rf, 'random_forest_model.pkl')


Random Forest Results:
Accuracy Score: 0.6465
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.83      0.70       986
           1       0.74      0.47      0.57      1014

    accuracy                           0.65      2000
   macro avg       0.67      0.65      0.64      2000
weighted avg       0.67      0.65      0.63      2000

ROC AUC Score: 0.7252




['random_forest_model.pkl']

In [10]:
import xgboost as xgb

# Pelatihan model XGBoost
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)

# Prediksi
y_pred_xgb = xgb_model.predict(X_test_scaled)
y_pred_prob_xgb = xgb_model.predict_proba(X_test_scaled)[:, 1]

# Evaluasi model
from sklearn.metrics import accuracy_score, classification_report

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
roc_auc_xgb = roc_auc_score(y_test, y_pred_prob_xgb)
report_xgb = classification_report(y_test, y_pred_xgb)

print(f"\nXGBoost Results:")
print(f"Accuracy Score: {accuracy_xgb:.4f}")
print("Classification Report:\n", report_xgb)
print(f"ROC AUC Score: {roc_auc_xgb:.4f}")



XGBoost Results:
Accuracy Score: 0.6486
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.73      0.67      1385
           1       0.68      0.57      0.62      1415

    accuracy                           0.65      2800
   macro avg       0.65      0.65      0.65      2800
weighted avg       0.65      0.65      0.65      2800

ROC AUC Score: 0.7023


In [40]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Pelatihan model XGBoost
xgb_model = xgb.XGBClassifier(objective="binary:logistic", eval_metric="auc", n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)

# Prediksi
y_pred_xgb = xgb_model.predict(X_test_scaled)
y_pred_prob_xgb = xgb_model.predict_proba(X_test_scaled)[:, 1]

# Evaluasi model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
roc_auc_xgb = roc_auc_score(y_test, y_pred_prob_xgb)
report_xgb = classification_report(y_test, y_pred_xgb)

print(f"\nXGBoost Results:")
print(f"Accuracy Score: {accuracy_xgb:.4f}")
print("Classification Report:\n", report_xgb)
print(f"ROC AUC Score: {roc_auc_xgb:.4f}")
# Menyimpan model XGBoost
joblib.dump(xgb_model, 'xgboost_model.pkl')



XGBoost Results:
Accuracy Score: 0.6707
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.70      0.68      1385
           1       0.69      0.64      0.66      1415

    accuracy                           0.67      2800
   macro avg       0.67      0.67      0.67      2800
weighted avg       0.67      0.67      0.67      2800

ROC AUC Score: 0.7358


['xgboost_model.pkl']

In [11]:
import lightgbm as lgb

# Pelatihan model LightGBM
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train_resampled, y_train_resampled)

# Prediksi
y_pred_lgb = lgb_model.predict(X_test_scaled)
y_pred_prob_lgb = lgb_model.predict_proba(X_test_scaled)[:, 1]

# Evaluasi model
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
roc_auc_lgb = roc_auc_score(y_test, y_pred_prob_lgb)
report_lgb = classification_report(y_test, y_pred_lgb)

print(f"\nLightGBM Results:")
print(f"Accuracy Score: {accuracy_lgb:.4f}")
print("Classification Report:\n", report_lgb)
print(f"ROC AUC Score: {roc_auc_lgb:.4f}")

[LightGBM] [Info] Number of positive: 5615, number of negative: 5615
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10105
[LightGBM] [Info] Number of data points in the train set: 11230, number of used features: 105
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

LightGBM Results:
Accuracy Score: 0.6729
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.72      0.68      1385
           1       0.69      0.63      0.66      1415

    accuracy                           0.67      2800
   macro avg       0.67      0.67      0.67      2800
weighted avg       0.67      0.67      0.67      2800

ROC AUC Score: 0.7310


In [25]:
import lightgbm as lgb

# Pelatihan model LightGBM
lgb_model = lgb.LGBMClassifier(metric="auc", n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
lgb_model.fit(X_train_resampled, y_train_resampled)

# Prediksi
y_pred_lgb = lgb_model.predict(X_test_scaled)
y_pred_prob_lgb = lgb_model.predict_proba(X_test_scaled)[:, 1]

# Evaluasi model
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
roc_auc_lgb = roc_auc_score(y_test, y_pred_prob_lgb)
report_lgb = classification_report(y_test, y_pred_lgb)

print(f"\nLightGBM Results:")
print(f"Accuracy Score: {accuracy_lgb:.4f}")
print("Classification Report:\n", report_lgb)
print(f"ROC AUC Score: {roc_auc_lgb:.4f}")

# Menyimpan model LightGBM
joblib.dump(lgb_model, 'lightgbm_model.pkl')



[LightGBM] [Info] Number of positive: 5615, number of negative: 5615
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10105
[LightGBM] [Info] Number of data points in the train set: 11230, number of used features: 105
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

LightGBM Results:
Accuracy Score: 0.6736
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.71      0.68      1385
           1       0.69      0.64      0.66      1415

    accuracy                           0.67      2800
   macro avg       0.67      0.67      0.67      2800
weighted avg       0.67      0.67      0.67      2800

ROC AUC Score: 0.7382


['lightgbm_model.pkl']

In [39]:
from sklearn.svm import SVC

# Pelatihan model SVM
svm_model = SVC(kernel='linear', probability=True, random_state=42) # 'probability=True' agar bisa memanggil predict_proba
svm_model.fit(X_train_resampled, y_train_resampled)

# Prediksi
y_pred_svm = svm_model.predict(X_test_scaled)
y_pred_prob_svm = svm_model.predict_proba(X_test_scaled)[:, 1]

# Evaluasi model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
roc_auc_svm = roc_auc_score(y_test, y_pred_prob_svm)
report_svm = classification_report(y_test, y_pred_svm)

print(f"\nSVM Results:")
print(f"Accuracy Score: {accuracy_svm:.4f}")
print("Classification Report:\n", report_svm)
print(f"ROC AUC Score: {roc_auc_svm:.4f}")




SVM Results:
Accuracy Score: 0.6679
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.65      0.66      1385
           1       0.67      0.68      0.67      1415

    accuracy                           0.67      2800
   macro avg       0.67      0.67      0.67      2800
weighted avg       0.67      0.67      0.67      2800

ROC AUC Score: 0.7283


In [13]:
import tensorflow as tf

# Membangun model dengan TensorFlow
n_features = X_train_scaled.shape[1]

model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(n_features,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Melatih model
model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluasi Model
y_pred = model.predict(X_test_scaled)
roc_score = roc_auc_score(y_test, y_pred)
print(f"ROC AUC Score: {roc_score:.4f}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [16]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import roc_auc_score

# ... (Kode sebelumnya untuk pra-pemrosesan data Anda)

# # Split Train vs Test Data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Penskalaan fitur
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# Membangun model Deep Learning dengan Keras
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Melatih model
model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Mengevaluasi model
y_pred_raw = model.predict(X_test_scaled)
y_pred_class = (y_pred_raw > 0.5).astype(int).flatten()
accuracy = np.mean(y_pred_class == y_test)
print(f"Accuracy: {accuracy:.4f}")

y_pred_prob = model.predict(X_test_scaled).flatten()
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"ROC AUC Score: {roc_auc:.4f}")

# Menyimpan model
model.save("deep_learning_model.h5")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

  saving_api.save_model(


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import roc_auc_score
from kerastuner.tuners import RandomSearch

# ... (Kode pra-pemrosesan Anda)

# Split Train vs Test Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Penskalaan fitur
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Dense(units=hp.Int('units_input',    
                                        min_value=32,    
                                        max_value=512,   
                                        step=32),
                           activation='relu',
                           input_shape=(X_train_scaled.shape[1],)))
    model.add(layers.Dropout(rate=hp.Float('dropout_input',
                                           min_value=0.0,
                                           max_value=0.5,
                                           step=0.05)))
    for i in range(hp.Int('n_layers', 1, 5)):  
        model.add(layers.Dense(units=hp.Int(f'units_{i}', 32, 512, 32),
                               activation='relu'))
        model.add(layers.Dropout(rate=hp.Float(f'dropout_{i}', 0.0, 0.5, step=0.05)))
    
    model.add(layers.Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=keras.optimizers.Adam(
                    hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

tuner = RandomSearch(
    build_model,                 
    objective='val_accuracy',    
    max_trials=5,                
    executions_per_trial=3,     
    directory='keras_tuner_dir', 
    project_name='keras_tuner_demo'
)

tuner.search_space_summary()

tuner.search(X_train_scaled, y_train, epochs=10, validation_split=0.2)

tuner.results_summary()

best_model = tuner.get_best_models(num_models=1)[0]

# Mengevaluasi model terbaik
y_pred_prob = best_model.predict(X_test_scaled).flatten()
y_pred_class = (y_pred_prob > 0.5).astype(int)

accuracy = np.mean(y_pred_class == y_test)
print(f"Accuracy: {accuracy:.4f}")

roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"ROC AUC Score: {roc_auc:.4f}")

# Menyimpan model terbaik
best_model.save("deep_learning_best_model.h5")



Reloading Tuner from keras_tuner_dir/keras_tuner_demo/tuner0.json
Search space summary
Default search space size: 14
units_input (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
dropout_input (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.05, 'sampling': 'linear'}
n_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 5, 'step': 1, 'sampling': 'linear'}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
dropout_0 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.05, 'sampling': 'linear'}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
dropout_1 (Float)
{'default': 0.0, 'conditi

  saving_api.save_model(


### Coba Data Baru

In [46]:
# Ambil data baru
df_predict = run_query("coba4.sql")

# Menghapus kolom ID
#new_data.drop(columns=['reco_id_curr'], inplace=True)

In [43]:
import pandas as pd
import joblib

# 2. Pra-pemrosesan data
# Simpan kolom 'reco_id_curr' untuk digabungkan lagi nanti
reco_id_curr = df_predict['reco_id_curr'].copy()
df_predict.drop(columns=['reco_id_curr'], inplace=True)

# Mengisi missing values
for col in df_predict.select_dtypes(include=['float64', 'int64']).columns:
    df_predict[col].fillna(df_predict[col].median(), inplace=True)
for col in df_predict.select_dtypes(include=['object']).columns:
    df_predict[col].fillna(df_predict[col].mode()[0], inplace=True)

# Label Encoding kolom kategorikal
label_encoders = joblib.load('label_encoders.pkl')
for col in df_predict.select_dtypes(include=['object']).columns:
    if col in label_encoders:
        df_predict[col] = label_encoders[col].transform(df_predict[col])

# Penskalaan fitur
scaler = joblib.load('data_scaler.pkl')
df_predict_scaled = scaler.transform(df_predict)

# 3. Prediksi menggunakan probabilitas
model_name = 'xgboost_model'
model = joblib.load(f'{model_name}.pkl')
predicted_proba = model.predict_proba(df_predict_scaled)[:, 1]

# 4. Gabungkan probabilitas prediksi dengan reco_id_curr dalam DataFrame
result_df = pd.DataFrame({
    'reco_id_curr': reco_id_curr,
    'predicted_proba': predicted_proba
})

print(result_df)

                         reco_id_curr  predicted_proba
0    38027ebd59e7c2614201a9ad29d91f33         0.098205
1    f7825cc89df0a3665baddede33a196f0         0.125623
2    73ed7df79781a75b16431fc4c9dd50e9         0.240492
3    bec10049923a0bb209a2044d06a96e86         0.332084
4    8095e640b1088676e43264eb470b4806         0.401018
..                                ...              ...
995  a613d20807ab1699d07446e7925e1d1e         0.608887
996  4d10b7b0ff7f5d4654c0a6d25661a19f         0.191180
997  b2c36586339db9b42f1f140ee10c1dee         0.534942
998  0de9581c9037f50a115908a386d859fd         0.447751
999  7a80d7cd89b7e5378e29ea8d624c392f         0.211924

[1000 rows x 2 columns]


In [47]:
import pandas as pd
import joblib
import tensorflow as tf

# 2. Pra-pemrosesan data
# Simpan kolom 'reco_id_curr' untuk digabungkan lagi nanti
reco_id_curr = df_predict['reco_id_curr'].copy()
df_predict.drop(columns=['reco_id_curr'], inplace=True)

# Mengisi missing values
for col in df_predict.select_dtypes(include=['float64', 'int64']).columns:
    df_predict[col].fillna(df_predict[col].median(), inplace=True)
for col in df_predict.select_dtypes(include=['object']).columns:
    df_predict[col].fillna(df_predict[col].mode()[0], inplace=True)

# Label Encoding kolom kategorikal
label_encoders = joblib.load('label_encoders.pkl')
for col in df_predict.select_dtypes(include=['object']).columns:
    if col in label_encoders:
        df_predict[col] = label_encoders[col].transform(df_predict[col])

# Penskalaan fitur
scaler = joblib.load('data_scaler.pkl')
df_predict_scaled = scaler.transform(df_predict)

# 3. Muat dan prediksi menggunakan model deep learning
model = tf.keras.models.load_model('deep_learning_best_model.h5')
predicted_proba = model.predict(df_predict_scaled)[:, 0]  # Mengambil probabilitas dari kelas positif

# 4. Gabungkan probabilitas prediksi dengan reco_id_curr dalam DataFrame
result_df = pd.DataFrame({
    'reco_id_curr': reco_id_curr,
    'predicted_proba': predicted_proba
})

print(result_df)





                         reco_id_curr  predicted_proba
0    38027ebd59e7c2614201a9ad29d91f33         0.280072
1    f7825cc89df0a3665baddede33a196f0         0.110867
2    73ed7df79781a75b16431fc4c9dd50e9         0.202840
3    bec10049923a0bb209a2044d06a96e86         0.163998
4    8095e640b1088676e43264eb470b4806         0.322483
..                                ...              ...
995  a613d20807ab1699d07446e7925e1d1e         0.501365
996  4d10b7b0ff7f5d4654c0a6d25661a19f         0.394263
997  b2c36586339db9b42f1f140ee10c1dee         0.439965
998  0de9581c9037f50a115908a386d859fd         0.291561
999  7a80d7cd89b7e5378e29ea8d624c392f         0.127701

[1000 rows x 2 columns]


In [48]:
len(result_df)

1000

In [50]:
# Baca example_df dari file CSV dan hanya ambil kolom reco_id_curr dan urut
example_df = pd.read_csv('./example_submission.csv', usecols=['reco_id_curr', 'urut'], sep=';')

# Rename kolom predicted_proba menjadi target pada df
result_df.rename(columns={'predicted_proba': 'target'}, inplace=True)

# Lakukan inner join antara df dan example_df berdasarkan kolom reco_id_curr
merged_df = pd.merge(result_df, example_df, on='reco_id_curr', how='inner')

# Urutkan merged_df berdasarkan kolom urut
merged_df.sort_values(by='urut', inplace=True)

current_time = datetime.now().strftime('%Y_%m_%d_%H_%M')
model_name = 'deep_learning'
filename = f"output_{model_name}_{current_time}.csv"
# Simpan DataFrame ke file CSV dengan hanya mengambil kolom reco_id_curr dan target
merged_df[['reco_id_curr', 'target']].to_csv(filename, index=False, sep=',')

print(f"CSV file telah dibuat! {filename}")

CSV file telah dibuat! output_deep_learning_2023_09_25_17_23.csv


In [31]:
import pkg_resources

installed_packages = pkg_resources.working_set
installed_packages_list = sorted(["%s==%s" % (i.key, i.version) for i in installed_packages])
for m in installed_packages_list:
    print(m)

absl-py==2.0.0
appnope==0.1.3
asttokens==2.4.0
astunparse==1.6.3
backcall==0.2.0
cachetools==5.3.1
certifi==2023.7.22
charset-normalizer==3.2.0
comm==0.1.4
contourpy==1.1.1
cycler==0.11.0
debugpy==1.8.0
decorator==5.1.1
dm-tree==0.1.8
exceptiongroup==1.1.3
executing==1.2.0
flatbuffers==23.5.26
fonttools==4.42.1
gast==0.4.0
google-auth-oauthlib==1.0.0
google-auth==2.23.0
google-pasta==0.2.0
grpcio==1.58.0
h5py==3.9.0
idna==3.4
ipykernel==6.25.2
ipython==8.15.0
jedi==0.19.0
joblib==1.3.2
jupyter-client==8.3.1
jupyter-core==5.3.1
keras-core==0.1.7
keras-tuner==1.4.0
keras==2.13.1
kiwisolver==1.4.5
kt-legacy==1.0.5
libclang==16.0.6
lightgbm==4.0.0
markdown-it-py==3.0.0
markdown==3.4.4
markupsafe==2.1.3
matplotlib-inline==0.1.6
matplotlib==3.7.1
mdurl==0.1.2
mysql-connector-python==8.1.0
namex==0.0.7
nest-asyncio==1.5.8
numpy==1.24.3
oauthlib==3.2.2
opt-einsum==3.3.0
packaging==23.1
pandas==1.5.3
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
pillow==10.0.1
pip==23.2.1
platformdirs==3.10.0
