In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import joblib
import numpy as np

# Load Data

In [6]:
# Read the CSV, parse 'Timestamp' into datetimes, and order the rows by
# that column with a fresh 0..n-1 index.
# NOTE(review): the path is absolute and sits at the filesystem root —
# confirm this is where the file lives in the target environment.
df = (
    pd.read_csv('/water_leak_detection_1000_rows.csv')
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .sort_values(by='Timestamp')
    .reset_index(drop=True)
)

# Feature Engineering

In [7]:
# Base measurement columns and the label column.
features = ['Pressure (bar)', 'Flow Rate (L/s)', 'Temperature (°C)']
target = 'Leak Status'
# Number of consecutive rows covered by each rolling statistic.
window_size = 5

# For every base column add four derived columns: rolling mean, rolling
# standard deviation, previous-row value, and change from the previous row.
# The leading rows have no full window / no previous row, so they get NaN.
for base_col in features:
    readings = df[base_col]
    window = readings.rolling(window=window_size)
    df[f'{base_col}_roll_mean'] = window.mean()
    df[f'{base_col}_roll_std'] = window.std()
    df[f'{base_col}_lag_1'] = readings.shift(1)
    df[f'{base_col}_diff'] = readings.diff()

# Clean NaN Values

In [8]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

# Define Features (X) and Target (y)

In [9]:
# Model inputs: the base measurement columns first, then each base column's
# derived columns in the order roll_mean, roll_std, lag_1, diff.
derived_suffixes = ('roll_mean', 'roll_std', 'lag_1', 'diff')
engineered_features = list(features) + [
    f'{base}_{suffix}' for base in features for suffix in derived_suffixes
]

# X holds the input columns, y the label column.
X = df.loc[:, engineered_features]
y = df.loc[:, target]

# Split Data (Time-Series Aware)

In [10]:
# Positional hold-out without shuffling: the first 80% of rows train the
# model and the remaining rows are kept for testing.
split_percent = 0.8
split_index = int(split_percent * len(df))

X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# Report partition sizes and the class balance of the training labels.
print(f"Data split: {len(X_train)} train, {len(X_test)} test samples.")
print(f"Original training labels:\n{y_train.value_counts()}")

Data split: 796 train, 200 test samples.
Original training labels:
Leak Status
0    780
1     16
Name: count, dtype: int64


# Data Scaling

In [11]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle Imbalance with SMOTE

In [12]:
# Resample the scaled training partition with SMOTE (seeded for
# repeatability), then show the class counts after resampling.
print("Applying SMOTE to training data...")
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train_scaled, y_train)
X_train_resampled, y_train_resampled = resampled_pair
print(f"Resampled training labels:\n{y_train_resampled.value_counts()}")

Applying SMOTE to training data...
Resampled training labels:
Leak Status
0    780
1    780
Name: count, dtype: int64


# Train Model

In [14]:
# XGBoost classifier fitted on the resampled training data.
# - `use_label_encoder` is intentionally not passed: XGBoost reports it as
#   an unused parameter, so supplying it only produces a warning.
# - `eval_metric` is 'logloss' because the target has two classes;
#   'mlogloss' is the multi-class variant of the metric.
model = XGBClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss',
)
model.fit(X_train_resampled, y_train_resampled)

print("Model trained.")

Model trained.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


# Realistic Evaluation

In [15]:
# Score the held-out partition and print a per-class report.
preds = model.predict(X_test_scaled)
print("\nModel Evaluation Report (XGBoost)")
print(classification_report(y_test, preds))
print("\nConfusion Matrix (XGBoost)")
# Use one label set (every class seen in either the true or the predicted
# labels) for both the matrix and its row/column names. Deriving the names
# from y_test alone fails with a shape mismatch when the model predicts a
# class that does not occur in the test labels.
labels = np.union1d(y_test, preds)
cm = confusion_matrix(y_test, preds, labels=labels)
print(pd.DataFrame(cm,
                   index=[f'Actual {i}' for i in labels],
                   columns=[f'Pred {i}' for i in labels]))


Model Evaluation Report (XGBoost)
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       198
           1       0.40      1.00      0.57         2

    accuracy                           0.98       200
   macro avg       0.70      0.99      0.78       200
weighted avg       0.99      0.98      0.99       200


Confusion Matrix (XGBoost)
          Pred 0  Pred 1
Actual 0     195       3
Actual 1       0       2


# Save Model and Scaler

In [17]:
# Persist the fitted model and the fitted scaler to the working directory,
# model first, then scaler.
print("Saving model and scaler")
for artifact, filename in ((model, 'leak_model.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, filename)
print("Successfully saved 'leak_model.pkl' and 'scaler.pkl'.")

Saving model and scaler
Successfully saved 'leak_model.pkl' and 'scaler.pkl'.
