In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticate with Kaggle before the dataset downloads in the next cell.
# NOTE(review): presumably prompts for credentials interactively — confirm
# before running this notebook unattended.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the three dataset variants (original, updated, adjusted) and keep
# whatever kagglehub.dataset_download returns for each in a module-level
# variable. Relies on `kagglehub` having been imported by the previous cell.
fairuzazaria_rtm_stuck_prediction_datasets_idt_path = kagglehub.dataset_download('fairuzazaria/rtm-stuck-prediction-datasets-idt')
fairuzazaria_rtm_stuck_prediction_datasets_idt_updated_path = kagglehub.dataset_download('fairuzazaria/rtm-stuck-prediction-datasets-idt-updated')
fairuzazaria_rtm_stuck_prediction_datasets_idt_adjusted_path = kagglehub.dataset_download('fairuzazaria/rtm-stuck-prediction-datasets-idt-adjusted')

print('Data source import complete.')


# **DATA MODELLING**

In [None]:
# --- experiment switches ---------------------------------------------------
scaling    = False   # True -> run the min-max scaling cells further down
generalize = True    # True -> concatenate every listed well; False -> first well only
additional = True    # NOTE(review): not read anywhere in the visible notebook

# --- window sizes and model selection --------------------------------------
step_in    = 60      # used in dataset file names and the saved-model name
step_out   = 60      # used in dataset file names and the saved-model name
model_nm   = "xgb"   # algorithm key handed to train_model / select_models
model_iter = 10000   # iteration budget forwarded to the estimators in select_models

# --- derived labels --------------------------------------------------------
scale_type = "minmax" if scaling else "no_scale"

# NOTE(review): the label is "generalized" whether or not `generalize` is set;
# a single-well run probably wants its own name.
well_name  = "generalized"

# **1. PREPARATION**

## **1.1. IMPORT LIBRARIES**

In [None]:
import os
import csv
import glob
import h5py
import joblib
import random

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [None]:
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
from sklearn import metrics
from sklearn.metrics import auc
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

In [None]:
from typing import Any
from json import loads, dumps
from datetime import datetime, date

In [None]:
from tqdm.notebook import tqdm
from multiprocessing import Lock

# Hand tqdm an explicit multiprocessing Lock to use for its progress bars.
# NOTE(review): presumably a workaround for progress-bar issues in this
# environment — confirm it is still required.
tqdm.set_lock(Lock())

## **1.2. PREPARE FUNCTIONS**

In [None]:
def load_h5_data(file_path, filtered=True, with_class = False):
    """Load the ``"X"`` and ``"y"`` datasets from an HDF5 file.

    Args:
        file_path: Path of an HDF5 file holding datasets named "X" and "y".
        filtered: When True, keep only the samples whose label equals 0.
        with_class: When True, return ``(X, y)``; otherwise return only ``X``.

    Returns:
        ``X`` cast to float32, or the tuple ``(X, y)`` with both cast to
        float32 when ``with_class`` is True. ``X`` and ``y`` always have the
        same number of samples.
    """
    with h5py.File(file_path, "r") as f:
        X = f["X"][:]
        y = f["y"][:]

    if filtered:
        normal_mask = y == 0
        X = X[normal_mask]
        # Apply the same mask to the labels; otherwise the returned X and y
        # would have different lengths when both flags are enabled.
        y = y[normal_mask]

    X = X.astype("float32")
    if with_class:
        return X, y.astype("float32")

    return X

In [None]:
def plot_conf_matrix(name: str, matrix: list):
    """Draw a confusion matrix, save it as a PNG and show it.

    Args:
        name: Prefix of the output file, written as ``{name}_conf_matrix.png``.
        matrix: Confusion matrix to render; the classes are labelled 0 and 1.

    Note:
        The title is built from the module-level ``well_name``, not ``name``.
    """
    figure, axis = plt.subplots(figsize=(7, 6))

    cm_display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=[0,1])
    cm_display.plot(cmap=plt.cm.Blues, ax=axis)

    plt.title(f"Confusion Matrix {well_name}")

    output_file = f'{name}_conf_matrix.png'
    plt.savefig(output_file)
    plt.show()

In [None]:
def plot_pred_difference(name: str, y_test: np.array, y_pred: np.array):
    """Overlay the true and predicted class series, save the figure and show it.

    Args:
        name: Prefix of the output file, written as ``{name}_prediction.png``.
        y_test: Ground-truth values, drawn first.
        y_pred: Predicted values, drawn semi-transparent on top.

    Note:
        The legend entry for the prediction and the title come from the
        module-level ``model_nm`` and ``well_name``.
    """
    font_size = 10

    plt.figure(figsize=(6.55,6))

    # Ground truth first so the translucent prediction is layered over it.
    plt.plot(y_test, label='Real data')
    plt.plot(y_pred, label=model_nm, alpha=0.6)

    plt.title(f'Predicted VS Test Data for {well_name}')
    plt.xlabel('Values', fontsize=font_size)
    plt.ylabel('Class', fontsize=font_size)
    plt.legend(loc='lower right', fontsize=font_size)
    plt.xticks(fontsize=font_size)
    plt.yticks(fontsize=font_size)

    target_file = f'{name}_prediction.png'
    plt.savefig(target_file)
    plt.show()

In [None]:
def plot_pred_probs(name: str, y_pred_probs: np.array):
    """Plot a histogram of the model's prediction values, save it and show it.

    Args:
        name: Prefix of the output file, written as ``{name}_prediction_prob.png``.
        y_pred_probs: Values to histogram (one per evaluated sample).

    Note:
        The title is built from the module-level ``well_name``.
    """
    plt.figure(figsize=(6.7,6))
    plt.title(f"Prediction Probabilities {well_name}")
    plt.hist(y_pred_probs)
    plt.xlabel('Predicted value')
    plt.ylabel('Number of samples')
    plt.savefig(f'{name}_prediction_prob.png')
    # Show explicitly, like the other plot helpers, so the figure appears
    # right after it is produced rather than only at the end of the cell.
    plt.show()

In [None]:
def get_model_evaluation(y_test: np.array, y_pred: np.array):
    """Compute, print and return the classification metrics for a prediction.

    Macro-averaged F1, recall and precision are computed together with plain
    accuracy and the confusion matrix. AUC-ROC and average precision are
    printed only (computed from ``y_pred`` as passed in) and are not returned.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        Tuple ``(f1, recall, accuracy, precision, conf_matrix)``.
    """
    macro = {"average": "macro"}

    f1          = f1_score(y_test, y_pred, **macro)
    recall      = recall_score(y_test, y_pred, zero_division=1, **macro)
    accuracy    = accuracy_score(y_test, y_pred)
    precision   = precision_score(y_test, y_pred, zero_division=1, **macro)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Labels are pre-padded so the colons line up in the printed report.
    report_rows = (
        ("Accuracy ", accuracy),
        ("Precision", precision),
        ("Recall   ", recall),
        ("F1       ", f1),
    )
    for row_label, row_value in report_rows:
        print(f'{row_label} : {row_value}')

    print(f"AUC-ROC   : {roc_auc_score(y_test, y_pred):.3f}")
    print(f"AP        : {average_precision_score(y_test, y_pred):.3f}")

    print(f'Conf. matrix \n {conf_matrix}')

    return f1, recall, accuracy, precision, conf_matrix

In [None]:
def select_models(algorithm):
    """Build an unfitted classifier for the given algorithm key.

    Args:
        algorithm: One of 'mlp', 'hgb', 'xgb', 'dt', 'svm', 'rf' or 'lr'.

    Returns:
        A new, unfitted estimator created with ``random_state=42``. Where the
        estimator has an iteration/boosting-round budget, it is taken from the
        module-level ``model_iter``.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported keys.
    """
    if algorithm == 'mlp':
        model = MLPClassifier(
            verbose=1,
            max_iter=model_iter,
            batch_size=128,
            random_state=42,
            hidden_layer_sizes=(8,),
            learning_rate='constant',
            learning_rate_init=1e-5
        )

    elif algorithm == 'hgb':
        model = HistGradientBoostingClassifier(
            verbose=1,
            max_leaf_nodes=5,
            max_iter=model_iter,
            random_state=42,
        )

    elif algorithm == 'xgb':
        # XGBoost has no `max_iter` / `verbose` parameters: the number of
        # boosting rounds is `n_estimators` and log level is `verbosity`.
        # Passing the scikit-learn names meant model_iter was never applied.
        model = XGBClassifier(
            verbosity=1,
            n_estimators=model_iter,
            random_state=42,
        )

    elif algorithm == 'dt':
        model = DecisionTreeClassifier(
            random_state=42,
        )

    elif algorithm == 'svm':
        model = SVC(
            verbose=1,
            max_iter=-1,
            random_state=42,
        )

    elif algorithm == 'rf':
        model = RandomForestClassifier(
            verbose=1,
            random_state=42,
        )

    elif algorithm == 'lr':
        model = LogisticRegression(
            verbose=1,
            max_iter=model_iter,
            random_state=42,
            solver='lbfgs',
        )

    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unknown algorithm {algorithm!r}; expected one of "
            "'mlp', 'hgb', 'xgb', 'dt', 'svm', 'rf', 'lr'"
        )

    return model

In [None]:
def train_model(algorithm, X_train, X_test, y_train, y_test):
    """Train a classifier, save it, evaluate it on the test set and plot results.

    Args:
        algorithm: Algorithm key understood by ``select_models``.
        X_train: Training features passed to ``model.fit``.
        X_test: Test features used for evaluation.
        y_train: Training labels passed to ``model.fit``.
        y_test: Test labels used for evaluation.

    Returns:
        Tuple ``(model, y_probs, y_pred, conf_matrix)``. ``y_probs`` holds the
        positive-class probability when the model offers ``predict_proba``,
        otherwise the raw ``predict`` output; ``y_pred`` is ``y_probs``
        thresholded at 0.5.

    Side effects:
        Writes ``{well_name}_{algorithm}_{step_in}_{step_out}.sav`` to the
        current working directory and produces the three evaluation plots.
    """
    #-- train model
    model = select_models(algorithm)
    model.fit(X_train, y_train)

    #-- save model
    model_name = f'{well_name}_{algorithm}_{step_in}_{step_out}.sav'
    model_path = os.path.join(os.getcwd(), model_name)
    joblib.dump(model, model_path)

    #-- evaluate
    # `predict` returns hard class labels, which makes the 0.5 threshold and
    # the probability histogram meaningless; use real probabilities when the
    # estimator offers them.
    if hasattr(model, "predict_proba"):
        # assumes binary labels so column 1 is the positive class — TODO confirm
        y_probs = np.asarray(model.predict_proba(X_test))[:, 1]
    else:
        y_probs = np.asarray(model.predict(X_test))
    y_pred  = (y_probs >= 0.5).astype(int)
    f1, recall, accuracy, precision, conf_matrix = get_model_evaluation(y_test, y_pred)

    #-- plot result
    plot_conf_matrix(well_name, conf_matrix)
    plot_pred_difference(well_name, y_test, y_pred)
    plot_pred_probs(well_name, y_probs)

    return model, y_probs, y_pred, conf_matrix

In [None]:
def feature_wise_minmax(X):
    """Fit one MinMaxScaler per feature and return the scaled array.

    Each feature (last axis) is scaled using all of its values across samples
    and timesteps.

    Args:
        X: 3-D array unpacked as ``(num_samples, timesteps, num_features)``.

    Returns:
        Tuple ``(X_scaled, scalers)``: the scaled array with the same shape as
        ``X``, and the fitted scalers in feature order.
    """
    num_samples, timesteps, num_features = X.shape

    # Keep floating dtypes as they are, but never allocate an integer output:
    # that would truncate the scaled values to 0 or 1.
    out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
    X_scaled = np.zeros(X.shape, dtype=out_dtype)

    scalers = []
    for i in range(num_features):
        scaler  = MinMaxScaler()
        # Stack every (sample, timestep) value of this feature into one column.
        feature = X[:, :, i].reshape(-1, 1)
        X_scaled[:, :, i] = scaler.fit_transform(feature).reshape(num_samples, timesteps)

        scalers.append(scaler)

    return X_scaled, scalers

In [None]:
def feature_wise_minmax_transform(X, scalers):
    """Scale each feature of ``X`` with its already-fitted scaler.

    Args:
        X: 3-D array unpacked as ``(num_samples, timesteps, num_features)``.
        scalers: Sequence of fitted scalers indexed by feature; ``scalers[i]``
            is applied to feature ``i``.

    Returns:
        The scaled array, with the same shape as ``X``.
    """
    num_samples, timesteps, num_features = X.shape

    # Keep floating dtypes as they are, but never allocate an integer output:
    # that would truncate the scaled values.
    out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
    X_scaled = np.zeros(X.shape, dtype=out_dtype)

    for i in range(num_features):
        # Stack every (sample, timestep) value of this feature into one column.
        feature = X[:, :, i].reshape(-1, 1)
        X_scaled[:, :, i] = scalers[i].transform(feature).reshape(num_samples, timesteps)

    return X_scaled

# **2. DATA PREPARATION**

In [None]:
# -- load datasets
# Resolve the files relative to the directory kagglehub downloaded, so the
# notebook also runs where the data is not mounted under /kaggle/input.
# Alternative source that used to sit here as commented-out code: dataset
# "rtm-stuck-prediction-datasets-idt-adjusted", folder "3.idt-adjusted", files
# named well_<x>_<split>_adds_normal_<step_in><step_out>_0_new.h5.
base_path = os.path.join(
    fairuzazaria_rtm_stuck_prediction_datasets_idt_updated_path,
    "2. idt-updated",
    f"{step_in}{step_out}",
)

wells = ["a", "b", "d"]

dataset_train = [
    os.path.join(base_path, f"well_{well}_train_normal_{step_in}{step_out}_0_new.h5")
    for well in wells
]

dataset_test = [
    os.path.join(base_path, f"well_{well}_test_normal_{step_in}{step_out}_0_new.h5")
    for well in wells
]

# Fail early, with the offending paths, instead of deep inside the HDF5 loader.
missing_files = [path for path in dataset_train + dataset_test if not os.path.exists(path)]
if missing_files:
    raise FileNotFoundError(f"Dataset files not found: {missing_files}")

dataset_train, dataset_test

(['/kaggle/input/rtm-stuck-prediction-datasets-idt-updated/2. idt-updated/6060/well_a_train_normal_6060_0_new.h5',
  '/kaggle/input/rtm-stuck-prediction-datasets-idt-updated/2. idt-updated/6060/well_b_train_normal_6060_0_new.h5',
  '/kaggle/input/rtm-stuck-prediction-datasets-idt-updated/2. idt-updated/6060/well_d_train_normal_6060_0_new.h5'],
 ['/kaggle/input/rtm-stuck-prediction-datasets-idt-updated/2. idt-updated/6060/well_a_test_normal_6060_0_new.h5',
  '/kaggle/input/rtm-stuck-prediction-datasets-idt-updated/2. idt-updated/6060/well_b_test_normal_6060_0_new.h5',
  '/kaggle/input/rtm-stuck-prediction-datasets-idt-updated/2. idt-updated/6060/well_d_test_normal_6060_0_new.h5'])

In [None]:
#-- concatenate datasets
if generalize:
    # Training and test files are paired by position, so both lists must match.
    if len(dataset_train) != len(dataset_test):
        raise ValueError("dataset_train and dataset_test must list the same wells")

    X_train, y_train, X_test, y_test = [], [], [], []
    for train_file, test_file in tqdm(list(zip(dataset_train, dataset_test))):
        X_tr, y_tr = load_h5_data(train_file, filtered=False, with_class=True)
        X_te, y_te = load_h5_data(test_file, filtered=False, with_class=True)

        # Flatten every sample to a single feature vector.
        X_train.append(X_tr.reshape(X_tr.shape[0], -1))
        y_train.append(y_tr)
        X_test.append(X_te.reshape(X_te.shape[0], -1))
        y_test.append(y_te)

    X_train = np.concatenate(X_train, axis=0)
    y_train = np.concatenate(y_train, axis=0)
    X_test  = np.concatenate(X_test, axis=0)
    y_test  = np.concatenate(y_test, axis=0)

else:
    X_train, y_train = load_h5_data(dataset_train[0], filtered=False, with_class = True)
    X_test, y_test   = load_h5_data(dataset_test[0], filtered=False, with_class = True)

    # Flatten exactly like the generalized branch: the classifiers trained
    # later expect 2-D (samples, features) input.
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_test  = X_test.reshape(X_test.shape[0], -1)

# NOTE(review): these four names are kept with their original values, but they
# are misleading — shape[0] is the number of samples and shape[1] the flattened
# feature count — and none of them is read again in the visible notebook.
timesteps, n_features = features, steps = X_train.shape[0], X_train.shape[1]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Feature / label shapes: training split first, then test split.
print(*(split.shape for split in (X_train, y_train)))
print(*(split.shape for split in (X_test, y_test)))

(1434585, 540) (1434585,)
(327389, 540) (327389,)


In [None]:
if scaling:
    # The loading cell may already have flattened the windows to 2-D; the
    # per-channel scaling below needs (samples, timesteps, channels).
    # NOTE(review): assumes every window has `step_in` timesteps — confirm
    # against the HDF5 layout.
    X_train_seq = X_train.reshape(X_train.shape[0], step_in, -1) if X_train.ndim == 2 else X_train
    X_test_seq  = X_test.reshape(X_test.shape[0], step_in, -1) if X_test.ndim == 2 else X_test

    # The first 9 channels are min-max scaled; the remaining ones pass through.
    X_train_con, X_train_cat = X_train_seq[:, :, :9], X_train_seq[:, :, 9:]
    X_test_con, X_test_cat   = X_test_seq[:, :, :9],  X_test_seq[:, :, 9:]

    print(X_train_con.shape, X_train_cat.shape)
    print(X_test_con.shape, X_test_cat.shape)

    n_train_samples, n_train_timesteps, _ = X_train_con.shape
    n_test_samples, n_test_timesteps, _   = X_test_con.shape

    # Fit the scalers on the training split only, then reuse them for test.
    X_train_con, scaler = feature_wise_minmax(X_train_con)
    X_test_con = feature_wise_minmax_transform(X_test_con, scaler)

    X_train = np.concatenate([X_train_con, X_train_cat], axis=-1)
    X_test  = np.concatenate([X_test_con, X_test_cat], axis=-1)

    # Back to 2-D (samples, features), which is what the classifiers accept.
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_test  = X_test.reshape(X_test.shape[0], -1)

    del X_train_seq, X_test_seq
    del X_train_con, X_train_cat
    del X_test_con, X_test_cat

    joblib.dump(scaler, "scaler_dep.pkl")

In [None]:
#-- make sure type is float
# copy=False returns the array unchanged when it is already float32 (which is
# what load_h5_data produces), avoiding a second full copy of the feature matrix.
X_train = X_train.astype('float32', copy=False)
X_test  = X_test.astype('float32', copy=False)
y_train = y_train.astype('float32', copy=False)
y_test  = y_test.astype('float32', copy=False)

In [None]:
#-- get shapes
# Labels are pre-padded so the colons line up in the printed report.
print("\n".join(
    f'{label} : {array.shape}'
    for label, array in (
        ("X_train", X_train),
        ("X_test ", X_test),
        ("y_train", y_train),
        ("y_test ", y_test),
    )
))

X_train : (1434585, 540)
X_test  : (327389, 540)
y_train : (1434585,)
y_test  : (327389,)


In [None]:
# Class balance (unique labels and their counts): training split, then test split.
print(
    *(np.unique(labels, return_counts=True) for labels in (y_train, y_test)),
    sep="\n",
)

In [None]:
# Fix the global RNG state of both the stdlib and NumPy generators.
seed_value = 42

random.seed(seed_value)
np.random.seed(seed_value)

# **3. DATA MODELLING**

In [None]:
#-- train model
print("training model . . .")
# Keep the results for later inspection; leaving the call as the cell's last
# expression would instead dump the whole returned tuple as cell output.
model, y_probs, y_pred, conf_matrix = train_model(model_nm, X_train, X_test, y_train, y_test)