In [58]:
# Import the necessary libraries
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so deprecation and
# version-mismatch warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

import pandas as pd
from joblib import load

# Load the trained model from a path relative to the working directory.
# NOTE(review): joblib artifacts are pickle-based, so loading one can execute
# code stored in the file — only load model files from a trusted source.
model = load('models/gbm_pipeline.joblib')

In [59]:
# CSV file to score, given relative to the working directory.
DATA_PATH = "dataset/test.csv"

# Column that is parsed as the observation timestamp.
TIMESTAMP_COL = "Timestamp"

# Sensor columns used for feature engineering. Only one is enabled;
# 'Gearbox_Temperature_C' and 'Motor_Power_kW' are currently left out.
SENSOR_COLS = [
    "Surface_Temperature_C",
]

# Categorical column that is one-hot encoded further down.
# NOTE(review): an earlier comment said "autodetect if None", but no visible
# cell implements auto-detection — treat this value as required.
CATEGORICAL_COL = "Well_Operating_Status"

In [60]:
# Read the raw CSV into memory and print its first five rows as a sanity check.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(df.head(n=5))

   Well_ID            Timestamp Well_Operating_Status Operational_Notes  \
0      101  2023-10-19 23:00:00             Operating               NaN   
1      101  2023-10-19 23:10:00             Operating               NaN   
2      101  2023-10-19 23:20:00             Operating               NaN   
3      101  2023-10-19 23:30:00             Operating               NaN   
4      101  2023-10-19 23:40:00             Operating               NaN   

   Acoustic_Sensor_Env_dB  Ambient_Temperature_C  Casing_Pressure_SensorB_psi  \
0               43.495883              22.330669                    51.428833   
1               41.019813              21.181358                    51.678282   
2               50.913909              28.858404                    53.043972   
3               52.627641              20.601888                    56.154578   
4               43.239641              28.007734                    51.343024   

   Casing_Pressure_psi  Gearbox_Temperature_C  Gearbox_Vibrati

In [61]:
# Parse the timestamp column, use it as a chronologically sorted index, and
# keep only the sensor columns plus the categorical status column.
df = (
    df.assign(**{TIMESTAMP_COL: pd.to_datetime(df[TIMESTAMP_COL])})
      .set_index(TIMESTAMP_COL)
      .sort_index()
      .loc[:, SENSOR_COLS + [CATEGORICAL_COL]]
)

# One-hot encode the status column; drop_first removes the first category.
# NOTE(review): which category is dropped depends on the values present in
# this file — confirm the resulting columns match what the model was fit on.
df = pd.get_dummies(
    df,
    columns=[CATEGORICAL_COL],
    prefix='Well_Operating_Status',
    drop_first=True,
)

# Show the resulting column set followed by the first rows.
print(df.columns, df.head(), sep="\n")

Index(['Surface_Temperature_C', 'Well_Operating_Status_Operating',
       'Well_Operating_Status_Standby', 'Well_Operating_Status_Startup'],
      dtype='object')
                     Surface_Temperature_C  Well_Operating_Status_Operating  \
Timestamp                                                                     
2023-10-19 23:00:00              63.167678                             True   
2023-10-19 23:10:00              62.450938                             True   
2023-10-19 23:20:00              62.697741                             True   
2023-10-19 23:30:00              62.839280                             True   
2023-10-19 23:40:00              62.715691                             True   

                     Well_Operating_Status_Standby  \
Timestamp                                            
2023-10-19 23:00:00                          False   
2023-10-19 23:10:00                          False   
2023-10-19 23:20:00                          False   
2023-10-19 23

In [62]:
# Downsample the sensors to hourly means, then fill short gaps by
# time-weighted interpolation (at most 5 consecutive missing hours).
# The lowercase 'h' alias replaces 'H', which pandas deprecated in 2.2.
sensors = (df[SENSOR_COLS]
      .resample('1h')
      .mean(numeric_only=True)
      .interpolate('time', limit=5))

# One-hot status columns produced by the encoding cell above.
CATEGORICAL_COL_DUMMIES = [
       'Well_Operating_Status_Operating', 'Well_Operating_Status_Standby',
       'Well_Operating_Status_Startup']


# Put the status flags on a 1-minute grid, carrying the last known value
# forward, and convert them to 0/1 integers.
flags = (
    df[CATEGORICAL_COL_DUMMIES]
      .resample("1min")
      .ffill()               # carry last known 0/1
      .astype(int)
)

# Left join on the timestamp index: only the hourly sensor rows survive, each
# picking up the flag values found at that exact timestamp.
df = sensors.join(flags)
print(f'Loaded {len(df):,} rows, sensors = {SENSOR_COLS}')
print(df.columns)

Loaded 1,753 rows, sensors = ['Surface_Temperature_C']
Index(['Surface_Temperature_C', 'Well_Operating_Status_Operating',
       'Well_Operating_Status_Standby', 'Well_Operating_Status_Startup'],
      dtype='object')


In [63]:
import numpy as np

# helper for rolling linear regression slope
def rolling_slope(x: np.ndarray) -> float:
    """Return the slope of the least-squares line through one window.

    The samples in ``x`` are treated as evenly spaced at positions
    0, 1, ..., len(x) - 1, so the slope is expressed per sample step.
    """
    positions = np.arange(len(x))
    # Degree-1 fit: coefficients come back highest power first.
    slope, _intercept = np.polyfit(positions, x, deg=1)
    return slope

def make_all_features(
    frame,
    sensor_cols,
    lags=(1, 5, 10),
    rolls=(5,15,24),
):
    """Return a copy of ``frame`` with rolling-window features appended.

    For every window ``w`` in ``rolls`` and every column ``c`` in
    ``sensor_cols`` the following columns are added, in this order:

    * ``{c}_mean{w}`` / ``{c}_std{w}`` – rolling mean and standard deviation
    * ``{c}_slope{w}``                 – rolling least-squares slope
    * ``{c}_slopechg{w}``              – row-to-row change of that slope
    * ``{c}_dslope{short}v{long}``     – short-window slope minus long-window
      slope, where ``short`` and ``long`` are the first two entries of
      ``rolls`` (skipped when fewer than two windows are given)

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    sensor_cols : str or iterable of str
        Column(s) of ``frame`` to derive features from.
    lags : iterable of int, optional
        Accepted for backward compatibility only; lag features are not
        generated, so this argument is ignored.
    rolls : iterable of int, optional
        Rolling window lengths, in rows.

    Returns
    -------
    pandas.DataFrame
        ``frame``'s columns followed by the feature columns. Rows that do not
        yet have a full window hold NaN in the corresponding features.
    """
    # Accept a bare column name or any iterable (tuple, Index, ...).
    if isinstance(sensor_cols, str):
        sensor_cols = [sensor_cols]
    else:
        sensor_cols = list(sensor_cols)
    rolls = tuple(rolls)

    X = frame.copy()

    # Rolling mean & std
    for w in rolls:
        for c in sensor_cols:
            window = frame[c].rolling(w)
            X[f'{c}_mean{w}'] = window.mean()
            X[f'{c}_std{w}'] = window.std()

    # Rolling slope & slope change. All slope columns for a window are added
    # before its slope-change columns to keep the established column order.
    for w in rolls:
        for c in sensor_cols:
            X[f'{c}_slope{w}'] = (
                frame[c]
                  .rolling(window=w, min_periods=w)
                  .apply(rolling_slope, raw=True)
            )
        # second derivative: change of slope
        for c in sensor_cols:
            X[f'{c}_slopechg{w}'] = X[f'{c}_slope{w}'].diff()

    # Multi-scale slope difference (short vs long), e.g. slope5 - slope15.
    # Needs two windows; with fewer there is nothing to compare.
    if len(rolls) >= 2:
        short, long = rolls[0], rolls[1]
        for c in sensor_cols:
            X[f'{c}_dslope{short}v{long}'] = (
                X[f'{c}_slope{short}'] - X[f'{c}_slope{long}']
            )

    return X


In [64]:
# Derive the rolling-window features for the configured sensors and preview
# the first five rows.
df_feat = make_all_features(frame=df, sensor_cols=SENSOR_COLS)
print(df_feat.head(n=5))

                     Surface_Temperature_C  Well_Operating_Status_Operating  \
Timestamp                                                                     
2023-10-19 23:00:00              62.783500                                1   
2023-10-20 00:00:00              63.363303                                1   
2023-10-20 01:00:00              64.538264                                1   
2023-10-20 02:00:00              62.913938                                1   
2023-10-20 03:00:00              63.022475                                1   

                     Well_Operating_Status_Standby  \
Timestamp                                            
2023-10-19 23:00:00                              0   
2023-10-20 00:00:00                              0   
2023-10-20 01:00:00                              0   
2023-10-20 02:00:00                              0   
2023-10-20 03:00:00                              0   

                     Well_Operating_Status_Startup  \
Timestamp    

In [65]:
# Start from the feature frame and discard columns that contain no data at all.
X_raw = df_feat.dropna(axis="columns", how="all")

# Keep only rows with no missing value; the rows removed include those where
# a rolling window was not yet full.
mask = ~X_raw.isna().any(axis="columns")
X_raw = X_raw.loc[mask]

# Report the final feature names and matrix size.
print(X_raw.columns, X_raw.shape, sep="\n")

Index(['Surface_Temperature_C', 'Well_Operating_Status_Operating',
       'Well_Operating_Status_Standby', 'Well_Operating_Status_Startup',
       'Surface_Temperature_C_mean5', 'Surface_Temperature_C_std5',
       'Surface_Temperature_C_mean15', 'Surface_Temperature_C_std15',
       'Surface_Temperature_C_mean24', 'Surface_Temperature_C_std24',
       'Surface_Temperature_C_slope5', 'Surface_Temperature_C_slopechg5',
       'Surface_Temperature_C_slope15', 'Surface_Temperature_C_slopechg15',
       'Surface_Temperature_C_slope24', 'Surface_Temperature_C_slopechg24',
       'Surface_Temperature_C_dslope5v15'],
      dtype='object')
(1729, 17)


In [66]:
# Take the second probability column as the score of interest.
# NOTE(review): assumes column 1 is the positive class — confirm against the
# model's class ordering.
y_prob = model.predict_proba(X_raw)[:, 1]

# Flag a row as 1 only when its score is strictly above 0.8, otherwise 0.
y_pred = (y_prob > 0.8).astype(int)

# Show the container type and the predicted labels.
print(type(y_pred), y_pred, sep="\n")

<class 'numpy.ndarray'>
[0 0 0 ... 0 0 0]


In [68]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_sensors(df, height_per_sensor=250, vertical_spacing=0.015):
    """
    Plot each column of df in its own subplot (shared x-axis).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is used as the x-axis and whose columns are each
        drawn as one line; expected to be a DateTimeIndex with numeric columns.
    height_per_sensor : int, optional
        Pixel height allotted to each subplot (default 250).
    vertical_spacing : float, optional
        Requested gap between subplots as a fraction of the figure height
        (default 0.015). It is reduced automatically when there are so many
        rows that the requested gap would not fit.

    Returns
    -------
    fig : plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If df has no columns.
    """
    n = len(df.columns)
    if n == 0:
        raise ValueError("plot_sensors needs at least one column to plot")

    # make_subplots rejects a spacing larger than 1 / (rows - 1). Cap the gaps
    # at half of the figure height in total so the subplots keep some height.
    if n > 1:
        vertical_spacing = min(vertical_spacing, 0.5 / (n - 1))

    fig = make_subplots(
        rows=n,
        cols=1,
        shared_xaxes=True,
        vertical_spacing=vertical_spacing
    )

    # One trace per column, each in its own row and labelled on its y-axis.
    for i, col in enumerate(df.columns, start=1):
        fig.add_trace(
            go.Scatter(
                x=df.index,
                y=df[col],
                name=col,
                line_width=1
            ),
            row=i,
            col=1
        )
        fig.update_yaxes(title_text=col, row=i, col=1)

    fig.update_layout(
        height=max(400, n * height_per_sensor),   # scale canvas height
        showlegend=False,
        hovermode="x unified",
        margin=dict(t=30, b=30, l=60, r=20)
    )
    return fig

# ------- USAGE -------
# Work on a copy so the feature matrix passed to the model stays untouched.
plot_df = X_raw.copy()

# Place the predictions in the first column so they are drawn in the top
# subplot, above every feature.
plot_df.insert(0, 'Monitoring_Status_pred', y_pred)

# Build the stacked figure (one subplot per column) and render it.
fig = plot_sensors(plot_df)
fig.show()