In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, mean_absolute_error
import altair as alt

In [2]:
# Load the CSV files into Pandas Dataframes
folder = "../outlier_tolerance=5_grouping_time_window=200_anomaly_threshold=6_start_date=2022-01-01_end_date=2026-01-01"

df1 = pd.read_csv(os.path.join(folder, "HTOL-09_alerts.csv"))
df1['machine_id'] = 'HTOL-09'

df2 = pd.read_csv(os.path.join(folder, "HTOL-10_alerts.csv"))
df2['machine_id'] = 'HTOL-10'

df3 = pd.read_csv(os.path.join(folder, "HTOL-11_alerts.csv"))
df3['machine_id'] = 'HTOL-11'

df4 = pd.read_csv(os.path.join(folder, "HTOL-12_alerts.csv"))
df4['machine_id'] = 'HTOL-12'

df5 = pd.read_csv(os.path.join(folder, "HTOL-13_alerts.csv"))
df5['machine_id'] = 'HTOL-13'

df6 = pd.read_csv(os.path.join(folder, "HTOL-14_alerts.csv"))
df6['machine_id'] = 'HTOL-14'

df7 = pd.read_csv(os.path.join(folder, "HTOL-15_alerts.csv"))
df7['machine_id'] = 'HTOL-15'

In [3]:
# Combine all DataFrames
df = pd.concat([df1, df2, df3, df4, df5, df6, df7], ignore_index=True)

# Convert 'Time' to datetime
df['Time'] = pd.to_datetime(df['Time'])

# Sort by 'Time' and 'machine_id'
df.sort_values(['machine_id', 'Time'], inplace=True)

In [4]:
# Feature Engineering
def create_features(group):
    # Days since last alert (for each alert type)
    for alert_type in ['LOW', 'MEDIUM', 'HIGH', 'SIGMA']:
        group[f'days_since_{alert_type.lower()}'] = group['Time'].sub(
            group.loc[group['ALERT'] == alert_type, 'Time'].shift()
        ).dt.days.fillna(method='ffill').fillna(9999)  # Large value for no prior alert

    # Rolling statistics of 'ChlPrs'
    group['ChlPrs_mean_3'] = group['ChlPrs'].rolling(window=3).mean().fillna(method='bfill')
    group['ChlPrs_std_3'] = group['ChlPrs'].rolling(window=3).std().fillna(method='bfill')

    return group

In [5]:
df = df.groupby('machine_id').apply(create_features).reset_index(drop=True)

  ).dt.days.fillna(method='ffill').fillna(9999)  # Large value for no prior alert
  ).dt.days.fillna(method='ffill').fillna(9999)  # Large value for no prior alert
  ).dt.days.fillna(method='ffill').fillna(9999)  # Large value for no prior alert
  ).dt.days.fillna(method='ffill').fillna(9999)  # Large value for no prior alert
  group['ChlPrs_mean_3'] = group['ChlPrs'].rolling(window=3).mean().fillna(method='bfill')
  group['ChlPrs_std_3'] = group['ChlPrs'].rolling(window=3).std().fillna(method='bfill')
  ).dt.days.fillna(method='ffill').fillna(9999)  # Large value for no prior alert
  ).dt.days.fillna(method='ffill').fillna(9999)  # Large value for no prior alert
  ).dt.days.fillna(method='ffill').fillna(9999)  # Large value for no prior alert
  ).dt.days.fillna(method='ffill').fillna(9999)  # Large value for no prior alert
  group['ChlPrs_mean_3'] = group['ChlPrs'].rolling(window=3).mean().fillna(method='bfill')
  group['ChlPrs_std_3'] = group['ChlPrs'].rolling(window=3).std().fillna(

In [6]:
# Target Variables
# Next alert type (one-hot encoded)
df['next_alert'] = df.groupby('machine_id')['ALERT'].shift(-1).fillna('NONE')

# Filter out 'NONE' alerts
df = df[df['next_alert'] != 'NONE'].copy()

le = LabelEncoder()
df['next_alert_encoded'] = le.fit_transform(df['next_alert'])

# Days until next alert
df['days_until_next_alert'] = df.groupby('machine_id')['Time'].diff(-1).dt.days.fillna(0).abs()

In [7]:
# Prepare data for model training
X = df[['ChlPrs', 'ChlPrs_mean_3', 'ChlPrs_std_3',
        'days_since_low', 'days_since_medium', 'days_since_high', 'days_since_sigma']]
y_alert_type = df['next_alert_encoded']
y_days_until_alert = df['days_until_next_alert']

In [8]:
# Drop rows with NaN values (introduced during feature creation)
df.dropna(inplace=True)

In [9]:
# Split data into training and testing sets
X_train, X_test, y_alert_type_train, y_alert_type_test, y_days_until_alert_train, y_days_until_alert_test = train_test_split(
    X, y_alert_type, y_days_until_alert, test_size=0.2, random_state=42
)

In [10]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Train models
# 1. Alert Type Classification
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_scaled, y_alert_type_train)

In [12]:
# 2. Days Until Next Alert Regression
# (You might explore other regression models here, e.g., Gradient Boosting, SVR, etc.)
from sklearn.ensemble import RandomForestRegressor
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train_scaled, y_days_until_alert_train)


In [13]:
# Evaluate models
# 1. Alert Type Classification
y_alert_type_pred = rf_classifier.predict(X_test_scaled)
print("Alert Type Classification Report:\n", classification_report(y_alert_type_test, y_alert_type_pred))

# 2. Days Until Next Alert Regression
y_days_until_alert_pred = rf_regressor.predict(X_test_scaled)
mae = mean_absolute_error(y_days_until_alert_test, y_days_until_alert_pred)
print(f"Mean Absolute Error (Days Until Next Alert): {mae}")

Alert Type Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.97      0.99      0.98        72
           2       0.40      0.67      0.50         3
           3       0.60      0.60      0.60         5

    accuracy                           0.92        83
   macro avg       0.49      0.56      0.52        83
weighted avg       0.89      0.92      0.90        83

Mean Absolute Error (Days Until Next Alert): 2.947598967297763


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# 1. Create the test_results dataframe
test_results = pd.DataFrame({
    'Time': X_test.index,
    'Actual Alert': y_alert_type_test,
    'Predicted Alert': y_alert_type_pred
})

# 2. Decode the alert labels
test_results['Actual Alert'] = le.inverse_transform(test_results['Actual Alert'])
test_results['Predicted Alert'] = le.inverse_transform(test_results['Predicted Alert'])

# Melt the dataframe to have 'Actual' and 'Predicted' in one column
melted_df = test_results.melt('Time', var_name='Alert Type', value_name='Alert')

# 3-8. Create the scatter plot
chart = alt.Chart(melted_df).mark_point().encode(
    x='Time:T',
    y=alt.Y('Alert:N', axis=alt.Axis(title='Alert Type')),
    color=alt.Color('Alert:N', legend=alt.Legend(title='Alert')),
    shape=alt.Shape('Alert Type:N', legend=alt.Legend(title='Alert Type')),
    tooltip=['Time', 'Alert Type', 'Alert']
).properties(
    title='Actual vs Predicted Alerts Over Time'
).interactive()

chart.save('actual_vs_predicted_alerts_over_time.json')
chart

In [15]:
# Example Prediction
new_data = pd.DataFrame({
    'ChlPrs': [34.5],
    'ChlPrs_mean_3': [34.0],
    'ChlPrs_std_3': [0.5],
    'days_since_low': [10],
    'days_since_medium': [50],
    'days_since_high': [200],
    'days_since_sigma': [30],
})

In [16]:
new_data_scaled = scaler.transform(new_data)

predicted_alert_type = le.inverse_transform(rf_classifier.predict(new_data_scaled))[0]
predicted_days_until_alert = rf_regressor.predict(new_data_scaled)[0]

print(f"Predicted Next Alert Type: {predicted_alert_type}")
print(f"Predicted Days Until Next Alert: {predicted_days_until_alert}")

Predicted Next Alert Type: LOW
Predicted Days Until Next Alert: 3.57
