In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# -----------------------------------
# 1. LOAD DATA
# -----------------------------------
# Read the sensor log (the "timestamp" column is parsed as datetimes), order
# it by machine and then by time, and renumber the rows from zero.
df = (
    pd.read_csv("synthetic_sensor_data.csv", parse_dates=["timestamp"])
    .sort_values(["machine_id", "timestamp"])
    .reset_index(drop=True)
)

# -----------------------------------
# 2. CREATE TARGET: fail_within_24h
# -----------------------------------
HORIZON_HOURS = 24


def add_failure_horizon_labels(group: pd.DataFrame, horizon_hours=None) -> pd.DataFrame:
    """Label each row of one machine's history with "fails within the horizon".

    Parameters
    ----------
    group : pd.DataFrame
        Rows for a single machine. Must contain a datetime "timestamp" column
        and a "failed" column in which 1 marks a failure row.
    horizon_hours : float, optional
        Look-ahead window in hours. When omitted, the module-level
        HORIZON_HOURS is read at call time. The output column keeps the name
        "fail_within_24h" regardless of the value passed.

    Returns
    -------
    pd.DataFrame
        A time-sorted copy of ``group`` with four extra columns:
        "failure_time", "next_failure_time", "time_to_next_failure_hours"
        and the 0/1 target "fail_within_24h".
    """
    if horizon_hours is None:
        horizon_hours = HORIZON_HOURS

    group = group.sort_values("timestamp").copy()

    # Timestamp on failure rows, NaT elsewhere; back-filling then gives every
    # row the time of the next failure at or after it (NaT if there is none).
    group["failure_time"] = group["timestamp"].where(group["failed"] == 1)
    group["next_failure_time"] = group["failure_time"].bfill()

    time_diff = group["next_failure_time"] - group["timestamp"]
    group["time_to_next_failure_hours"] = time_diff.dt.total_seconds() / 3600

    # Inclusive on both ends, so the failure row itself (0 hours away) is a
    # positive. Rows with no upcoming failure hold NaN and are labelled 0.
    group["fail_within_24h"] = (
        group["time_to_next_failure_hours"].between(0, horizon_hours).astype(int)
    )

    return group

# Label every machine's history independently. The groups are iterated and
# concatenated explicitly instead of going through GroupBy.apply: apply on
# frames that include the grouping column is deprecated in recent pandas, and
# excluding it would drop "machine_id", which later steps sort and group on.
# Groups come back in sorted machine_id order with their original row index.
df = pd.concat(
    [
        add_failure_horizon_labels(machine_rows)
        for _, machine_rows in df.groupby("machine_id")
    ]
)

print(df["failed"].value_counts())
print(df["fail_within_24h"].value_counts())

# -----------------------------------
# 3. FEATURE ENGINEERING (rolling stats)
# -----------------------------------
# Readings arrive every 10 minutes, so windows are expressed as row counts.
FREQ_MIN = 10
STEPS_PER_HOUR = int(60 / FREQ_MIN)    # 6 rows per hour
WINDOW_6H = 6 * STEPS_PER_HOUR         # 36 rows
WINDOW_24H = 24 * STEPS_PER_HOUR       # 144 rows

sensor_cols = ["temp_c", "vibration_ms2", "pressure_psi", "load_pct", "rpm"]

df = df.sort_values(["machine_id", "timestamp"]).reset_index(drop=True)

grouped = df.groupby("machine_id", group_keys=False)

# (column suffix, window length in rows, rolling statistic), in the order the
# feature columns are appended to df for each sensor.
rolling_specs = [
    ("mean_6h", WINDOW_6H, "mean"),
    ("std_6h", WINDOW_6H, "std"),
    ("max_6h", WINDOW_6H, "max"),
    ("mean_24h", WINDOW_24H, "mean"),
    ("std_24h", WINDOW_24H, "std"),
]

for col in sensor_cols:
    for suffix, n_rows, stat in rolling_specs:
        # min_periods == window: rows without a full window of history get NaN.
        windowed = grouped[col].rolling(window=n_rows, min_periods=n_rows)
        # Drop the outer (group) index level so the result aligns with df's index.
        df[f"{col}_{suffix}"] = getattr(windowed, stat)().reset_index(level=0, drop=True)

    # Trend feature: current reading minus its 24-hour rolling mean.
    df[f"{col}_delta_24h"] = df[col] - df[f"{col}_mean_24h"]

# Spike feature for temperature: number of readings above 80 in the last 24h.
df["temp_high"] = (df["temp_c"] > 80).astype(int)
temp_high_windowed = grouped["temp_high"].rolling(window=WINDOW_24H, min_periods=WINDOW_24H)
df["temp_high_count_24h"] = temp_high_windowed.sum().reset_index(level=0, drop=True)

# -----------------------------------
# 4. CLEAN UP & DROP EARLY ROWS WITHOUT FULL HISTORY
# -----------------------------------
# A column counts as a model feature when its name contains one of these tags.
feature_tags = ("mean_6h", "std_6h", "max_6h", "mean_24h", "std_24h", "delta_24h", "temp_high_count_24h")

feature_cols = []
for column_name in df.columns:
    if any(tag in column_name for tag in feature_tags):
        feature_cols.append(column_name)

# Rows missing any feature or the target are removed; the rolling features
# are NaN until a full window of history exists for the machine.
required_cols = feature_cols + ["fail_within_24h"]
df_model = df.dropna(subset=required_cols).copy()

print("Rows after dropping initial warm-up period:", len(df_model))

# -----------------------------------
# 5. TRAIN-TEST SPLIT (TIME-BASED)
# -----------------------------------
# Cut at the 0.8 quantile of the timestamps: rows at or before the cut go to
# train, rows strictly after it go to test.
time_threshold = df_model["timestamp"].quantile(0.8)

in_train_period = df_model["timestamp"] <= time_threshold
in_test_period = df_model["timestamp"] > time_threshold

train = df_model.loc[in_train_period]
test = df_model.loc[in_test_period]

print("Original train distribution:")
print(train["fail_within_24h"].value_counts())

# -----------------------------------
# 5B. FIX CLASS IMBALANCE (UNDERSAMPLE MAJORITY)
# -----------------------------------
train_pos = train.loc[train["fail_within_24h"] == 1]
train_neg = train.loc[train["fail_within_24h"] == 0]

# Keep at most five negatives per positive (pos : neg = 1 : 5), capped at the
# number of negatives actually available.
neg_sample_size = min(len(train_neg), 5 * len(train_pos))
train_neg_sampled = train_neg.sample(n=neg_sample_size, random_state=42)

# Stack both classes, then shuffle the rows with a fixed seed.
train_balanced = pd.concat([train_pos, train_neg_sampled], axis=0).sample(
    frac=1.0, random_state=42
)

X_train, y_train = train_balanced[feature_cols], train_balanced["fail_within_24h"]

# The test set is left at its original class distribution.
X_test, y_test = test[feature_cols], test["fail_within_24h"]

print("\nBalanced train distribution:")
print(y_train.value_counts())

# -----------------------------------
# 6. TRAIN MODEL (NO class_weight needed now)
# -----------------------------------
# 200 trees, fixed seed, all available cores.
forest_params = {"n_estimators": 200, "random_state": 42, "n_jobs": -1}

clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

# -----------------------------------
# 7. EVALUATE WITH CUSTOM THRESHOLD
# -----------------------------------
# predict_proba columns follow the order of clf.classes_, so the column for
# the positive class (label 1) is looked up rather than assumed to be index 1.
positive_col = list(clf.classes_).index(1)
y_proba = clf.predict_proba(X_test)[:, positive_col]

# Lower threshold improves recall (important for failure prediction)
THRESHOLD = 0.20
y_pred = (y_proba >= THRESHOLD).astype(int)

# Attach probabilities and thresholded predictions to a copy of the test frame
test_with_proba = test.copy()
test_with_proba["proba_24h"] = y_proba
test_with_proba["pred_24h"] = y_pred

# Five test rows with the highest predicted failure probability
top_risky = test_with_proba.sort_values("proba_24h", ascending=False).head(5)
print(top_risky[["machine_id", "timestamp", "fail_within_24h", "proba_24h"]])

# The header is formatted from THRESHOLD so it cannot drift from the value
# actually used to produce y_pred.
print(f"\nClassification report with threshold={THRESHOLD:.2f}:")
print(classification_report(y_test, y_pred, digits=3))

print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

# -----------------------------------
# 8. SAVE MODEL FOR API/APP
# -----------------------------------
# Persist the fitted classifier together with the ordered feature column
# names it was trained on.
model_artifact = dict(model=clf, feature_cols=feature_cols)
joblib.dump(model_artifact, "pd_24h_model.joblib")

print("\nSaved model to pd_24h_model.joblib")


  df = df.groupby("machine_id", group_keys=False).apply(add_failure_horizon_labels)


Label distribution (fail_within_24h):
fail_within_24h
0    430020
1      2030
Name: count, dtype: int64
Rows after dropping initial warm-up period: 424900
Original train distribution:
fail_within_24h
0    338790
1      1160
Name: count, dtype: int64

Balanced train distribution:
fail_within_24h
0    5800
1    1160
Name: count, dtype: int64
        machine_id           timestamp  fail_within_24h  proba_24h
361423          41 2024-02-19 14:20:00                0      0.895
361438          41 2024-02-19 16:50:00                0      0.895
361437          41 2024-02-19 16:40:00                0      0.895
361424          41 2024-02-19 14:30:00                0      0.885
361420          41 2024-02-19 13:50:00                0      0.885

Classification report with threshold=0.20:
              precision    recall  f1-score   support

           0      0.995     0.910     0.951     84080
           1      0.057     0.526     0.103       870

    accuracy                          0.906     

In [5]:
import json

# First row of top_risky, i.e. the test row with the highest predicted probability
row = top_risky.iloc[0]

# Feature name -> plain Python float, keyed in feature_cols order
feature_payload = dict((name, float(row[name])) for name in feature_cols)

print("Top risky sample probability:", row["proba_24h"])
print("Label (fail_within_24h):", row["fail_within_24h"])

# Wrap the features under a "features" key and pretty-print as JSON
# (the request body intended for the FastAPI service)
json_payload = {"features": feature_payload}
print(json.dumps(json_payload, indent=2))

Top risky sample probability: 0.895
Label (fail_within_24h): 0
{
  "features": {
    "temp_c_mean_6h": 59.658417932856224,
    "temp_c_std_6h": 1.227856431125955,
    "temp_c_max_6h": 61.83235335345177,
    "temp_c_mean_24h": 59.7622625791732,
    "temp_c_std_24h": 1.4584602959242088,
    "temp_c_delta_24h": -0.05232565141596979,
    "vibration_ms2_mean_6h": 1.6945291476061946,
    "vibration_ms2_std_6h": 0.21788714455398325,
    "vibration_ms2_max_6h": 2.1976394144275138,
    "vibration_ms2_mean_24h": 1.6536826792463808,
    "vibration_ms2_std_24h": 0.2102923553458617,
    "vibration_ms2_delta_24h": -0.3635887987839963,
    "pressure_psi_mean_6h": 292.4262603557566,
    "pressure_psi_std_6h": 5.344716665226278,
    "pressure_psi_max_6h": 302.9737722082797,
    "pressure_psi_mean_24h": 291.53699760743785,
    "pressure_psi_std_24h": 4.695917341293183,
    "pressure_psi_delta_24h": -1.6470987837406028,
    "load_pct_mean_6h": 62.26964684641051,
    "load_pct_std_6h": 5.117594154391088,
