In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import statistics
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [3]:
import pandas as pd
import numpy as np
import os
import joblib
from datetime import datetime
from typing import Dict, List, Union, Tuple

In [4]:

class AlertPredictor:
    def __init__(self, model_type='xgboost'):
        """
        Initializes the AlertPredictor with the specified model type ('xgboost' or 'randomforest').
        """
        self.model_type = model_type
        self.models = {}
        self.scalers = {}
        self.alert_types = ['LOW', 'MEDIUM', 'HIGH', ] # 'SIGMA']
        self.features = ['ChlPrs',
                        #  'hour',
                        #  'day_of_week',
                        #  'month',
                        #  'is_weekend',
                         'rolling_mean', 'rolling_std'] + [f'time_since_{at}' for at in self.alert_types]

    def load_and_preprocess_data(self, folder):
        """
        Loads and preprocesses data from CSV files in the specified folder.
        """
        dfs = []
        for i in range(9, 16):
            file_name = f"HTOL-{i:02d}_alerts.csv"
            df = pd.read_csv(os.path.join(folder, file_name))
            df['machine_id'] = f'HTOL-{i:02d}'
            dfs.append(df)

        combined_df = pd.concat(dfs, ignore_index=True)
        combined_df['Time'] = pd.to_datetime(combined_df['Time'])
        combined_df = combined_df.sort_values(['machine_id', 'Time'])

        return combined_df

    def engineer_features(self, df):
        """
        Engineers features from the preprocessed data.
        """
        df['hour'] = df['Time'].dt.hour
        df['day_of_week'] = df['Time'].dt.dayofweek
        df['month'] = df['Time'].dt.month
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

        # Calculate rolling statistics
        df['rolling_mean'] = df.groupby('machine_id')['ChlPrs'].rolling(window=24, min_periods=1).mean().reset_index(0, drop=True)
        df['rolling_std'] = df.groupby('machine_id')['ChlPrs'].rolling(window=24, min_periods=1).std().reset_index(0, drop=True)

        # Calculate time since last alert for each type
        for alert_type in self.alert_types:
            df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
                lambda x: x['Time'] - x[x['ALERT'] == alert_type]['Time'].shift(1)).reset_index(level=0, drop=True)
            df[f'time_since_{alert_type}'] = df[f'time_since_{alert_type}'].dt.total_seconds() / 3600  # Convert to hours

        return df

    def prepare_data_for_classification(self, df, target_alert_type, prediction_window):
        """
        Prepares the data for training the classification model.
        """
        df['target'] = df.groupby('machine_id').apply(
            lambda x: (x['ALERT'] == target_alert_type).rolling(window=prediction_window).max().shift(-prediction_window + 1)).reset_index(level=0,
                                                                                                                                           drop=True)

        X = df[self.features]
        y = df['target'].fillna(0)  # Fill NaN with 0 (no alert)

        return X, y

    def train_and_evaluate_classifier(self, X, y, test_size=0.2):
        """
        Trains and evaluates the classification model.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        if self.model_type == 'xgboost':
            # XGBoost configuration for imbalanced classification
            model = XGBClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=6,
                min_child_weight=1,
                gamma=0,
                subsample=0.8,
                colsample_bytree=0.8,
                scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),  # Handle class imbalance
                random_state=42,
                eval_metric='logloss',
                early_stopping_rounds=10,
            )
            model.fit(X_train_scaled, y_train, eval_set=[(X_test_scaled, y_test)], verbose=0)
        elif self.model_type == 'randomforest':
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            model.fit(X_train_scaled, y_train)
        else:
            raise ValueError("Invalid model_type. Choose 'xgboost' or 'randomforest'.")

        y_pred = model.predict(X_test_scaled)
        print(classification_report(y_test, y_pred))

        return model, scaler

    def train(self, folder, prediction_window=7):
        """
        Trains the models for each alert type.
        """
        df = self.load_and_preprocess_data(folder)
        df = self.engineer_features(df)

        for alert_type in self.alert_types:
            print(f"\nTraining model for {alert_type} alerts:")
            X, y = self.prepare_data_for_classification(df, alert_type, prediction_window)
            model, scaler = self.train_and_evaluate_classifier(X, y)
            self.models[alert_type] = model
            self.scalers[alert_type] = scaler

    def predict(self, new_data):
        """
        Makes predictions on new data.
        """
        predictions = {}
        for alert_type in self.alert_types:
            X_new = new_data[self.features]
            X_new_scaled = self.scalers[alert_type].transform(X_new)
            alert_probability = self.models[alert_type].predict_proba(X_new_scaled)[0, 1]
            predictions[alert_type] = alert_probability
        return predictions

    def visualize_alerts(self, df, target_alert_type, prediction_window, probability_threshold=0.7):
        """
        Visualizes actual alerts and high-risk periods.
        """
        X = df[self.features]
        X_scaled = self.scalers[target_alert_type].transform(X)

        df['alert_probability'] = self.models[target_alert_type].predict_proba(X_scaled)[:, 1]
        df['high_risk'] = df['alert_probability'] > probability_threshold

        plt.figure(figsize=(20, 15))
        machines = df['machine_id'].unique()
        n_machines = len(machines)

        for i, machine_id in enumerate(machines):
            machine_df = df[df['machine_id'] == machine_id]

            # Plot actual alerts
            alerts = machine_df[machine_df['ALERT'] == target_alert_type]
            plt.scatter(alerts['Time'], [i - 0.2] * len(alerts), marker='o', s=100,
                        label=f'Actual {target_alert_type} Alert' if i == 0 else "")

            # Plot high-risk periods
            high_risk_periods = machine_df[machine_df['high_risk']]
            plt.scatter(high_risk_periods['Time'], [i + 0.2] * len(high_risk_periods), marker='x', s=100,
                        label=f'High Risk Period ({target_alert_type})' if i == 0 else "")

            plt.text(df['Time'].min(), i, machine_id, va='center', ha='right', fontweight='bold')

        plt.yticks(range(n_machines), machines)
        plt.xlabel('Date')
        plt.ylabel('Machine ID')
        plt.title(f'Actual Alerts vs High Risk Periods for {target_alert_type} Alerts')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

In [5]:
class ProductionAlertPredictor:
    def __init__(self):
        """
        Initialize the production predictor that handles both XGBoost and Random Forest models.
        """
        self.models = {}
        self.scalers = {}
        self.alert_types = ['LOW', 'MEDIUM', 'HIGH']
        self.features = [
            'ChlPrs',
            # 'hour',
            # 'day_of_week',
            # 'month',
            # 'is_weekend',
            'rolling_mean',
            'rolling_std'
        ] + [f'time_since_{at}' for at in self.alert_types]

    def save_models(self, xgb_predictor: AlertPredictor, rf_predictor: AlertPredictor,
                   save_dir: str) -> None:
        """
        Save trained models and scalers to disk.

        Args:
            xgb_predictor: Trained XGBoost AlertPredictor instance
            rf_predictor: Trained Random Forest AlertPredictor instance
            save_dir: Directory to save the models
        """
        os.makedirs(save_dir, exist_ok=True)

        # Save metadata
        metadata = {
            'features': self.features,
            'alert_types': self.alert_types,
            'timestamp': datetime.now().isoformat(),
        }
        joblib.dump(metadata, os.path.join(save_dir, 'metadata.joblib'))

        # Save models and scalers
        for alert_type in self.alert_types:
            # Save XGBoost models and scalers
            joblib.dump(
                xgb_predictor.models[alert_type],
                os.path.join(save_dir, f'xgboost_{alert_type.lower()}_model.joblib')
            )
            joblib.dump(
                xgb_predictor.scalers[alert_type],
                os.path.join(save_dir, f'xgboost_{alert_type.lower()}_scaler.joblib')
            )

            # Save Random Forest models and scalers
            joblib.dump(
                rf_predictor.models[alert_type],
                os.path.join(save_dir, f'randomforest_{alert_type.lower()}_model.joblib')
            )
            joblib.dump(
                rf_predictor.scalers[alert_type],
                os.path.join(save_dir, f'randomforest_{alert_type.lower()}_scaler.joblib')
            )

    def load_models(self, load_dir: str) -> None:
        """
        Load saved models and scalers from disk.

        Args:
            load_dir: Directory containing the saved models
        """
        # Load metadata
        metadata = joblib.load(os.path.join(load_dir, 'metadata.joblib'))
        self.features = metadata['features']
        self.alert_types = metadata['alert_types']

        # Initialize nested dictionaries for models and scalers
        self.models = {'xgboost': {}, 'randomforest': {}}
        self.scalers = {'xgboost': {}, 'randomforest': {}}

        # Load models and scalers
        for alert_type in self.alert_types:
            # Load XGBoost
            self.models['xgboost'][alert_type] = joblib.load(
                os.path.join(load_dir, f'xgboost_{alert_type.lower()}_model.joblib')
            )
            self.scalers['xgboost'][alert_type] = joblib.load(
                os.path.join(load_dir, f'xgboost_{alert_type.lower()}_scaler.joblib')
            )

            # Load Random Forest
            self.models['randomforest'][alert_type] = joblib.load(
                os.path.join(load_dir, f'randomforest_{alert_type.lower()}_model.joblib')
            )
            self.scalers['randomforest'][alert_type] = joblib.load(
                os.path.join(load_dir, f'randomforest_{alert_type.lower()}_scaler.joblib')
            )

    def prepare_features(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Prepare features for prediction.

        Args:
            data: DataFrame containing at minimum 'Time', 'ChlPrs', and 'machine_id' columns

        Returns:
            DataFrame with engineered features
        """
        df = data.copy()

        # Time-based features
        df['Time'] = pd.to_datetime(df['Time'])
        df['hour'] = df['Time'].dt.hour
        df['day_of_week'] = df['Time'].dt.dayofweek
        df['month'] = df['Time'].dt.month
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

        # Rolling statistics
        df['rolling_mean'] = df.groupby('machine_id')['ChlPrs'].rolling(
            window=24, min_periods=1).mean().reset_index(0, drop=True)
        df['rolling_std'] = df.groupby('machine_id')['ChlPrs'].rolling(
            window=24, min_periods=1).std().reset_index(0, drop=True)

        # Time since last alert features
        for alert_type in self.alert_types:
            if 'ALERT' in df.columns:
                df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
                    lambda x: x['Time'] - x[x['ALERT'] == alert_type]['Time'].shift(1)
                ).reset_index(level=0, drop=True)
                df[f'time_since_{alert_type}'] = df[f'time_since_{alert_type}'].dt.total_seconds() / 3600
            else:
                # For new data without alert history, use a large value
                df[f'time_since_{alert_type}'] = 168  # One week in hours

        return df[self.features]

    def predict(self, data: pd.DataFrame, model_type: str = 'xgboost') -> Dict[str, Dict[str, float]]:
        """
        Make predictions using the loaded models.

        Args:
            data: DataFrame containing required features
            model_type: 'xgboost' or 'randomforest'

        Returns:
            Dictionary containing predictions for each machine and alert type
        """
        if model_type not in ['xgboost', 'randomforest']:
            raise ValueError("model_type must be 'xgboost' or 'randomforest'")

        # Prepare features
        X = self.prepare_features(data)

        # Make predictions for each machine and alert type
        predictions = {}
        for machine_id in data['machine_id'].unique():
            machine_data = X[data['machine_id'] == machine_id]
            machine_predictions = {}

            for alert_type in self.alert_types:
                # Scale the features
                X_scaled = self.scalers[model_type][alert_type].transform(machine_data)

                # Get prediction probabilities
                probs = self.models[model_type][alert_type].predict_proba(X_scaled)[:, 1]

                # Store the average probability for this alert type
                machine_predictions[alert_type] = float(probs.mean())

            predictions[machine_id] = machine_predictions

        return predictions

def save_trained_models(xgb_predictor: AlertPredictor, rf_predictor: AlertPredictor,
                       save_dir: str) -> None:
    """
    Convenience function to save trained models.
    """
    production_predictor = ProductionAlertPredictor()
    production_predictor.save_models(xgb_predictor, rf_predictor, save_dir)
    print(f"Models saved successfully to {save_dir}")

def load_production_predictor(load_dir: str) -> ProductionAlertPredictor:
    """
    Convenience function to load saved models.
    """
    production_predictor = ProductionAlertPredictor()
    production_predictor.load_models(load_dir)
    return production_predictor

In [6]:
folder = "../../../outlier_tolerance=5_grouping_time_window=200_anomaly_threshold=6_start_date=2022-01-01_end_date=2026-01-01"
prediction_window = 7  # days
output_dir = "production_models_solo"

# Train models
xgb_predictor = AlertPredictor(model_type='xgboost')
xgb_predictor.train(folder, prediction_window)

rf_predictor = AlertPredictor(model_type='randomforest')
rf_predictor.train(folder, prediction_window)

# Save the trained models
save_trained_models(xgb_predictor, rf_predictor, output_dir)

  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(


KeyboardInterrupt: 

In [29]:
def get_final_verdict(predictor, new_data):
    LOW_thresh = 0.7
    MEDIUM_thresh = 0.6
    HIGH_thresh = 0.7
    """
    Predicts alerts based on two models (XGBoost and Random Forest) and
    returns a final verdict based on a probability_threshold. An alert is raised only
    if both models predict the alert type with probability above the probability_threshold.

    Args:
      predictor: An object with a predict method that takes the data and
                 model_type as arguments and returns predictions.
      new_data: The input data for prediction.
      probability_threshold: The probability probability_threshold for an alert to be considered valid.

    Returns:
      A dictionary with machine IDs as keys and a list of alerts as values.
    """

    xgb_predictions = predictor.predict(new_data, model_type='xgboost')
    rf_predictions = predictor.predict(new_data, model_type='randomforest')
    final_verdicts = {}

    for machine_id, xgb_preds in xgb_predictions.items():
        final_verdicts[machine_id] = []
        rf_preds = rf_predictions[machine_id]

        for alert_type in xgb_preds:
            threshold = LOW_thresh if alert_type == "LOW" else MEDIUM_thresh if alert_type == "MEDIUM" else HIGH_thresh
            if  statistics.mean([
                # xgb_preds[alert_type],
                rf_preds[alert_type]]) > threshold:
                final_verdicts[machine_id].append(alert_type)

    return final_verdicts

In [47]:
# Load the production predictor
predictor = load_production_predictor(output_dir)

# Example new data
new_data = pd.DataFrame({
    'Time': ['2024-05-12 12:23:00', '2024-05-12 12:23:00', '2024-05-12 12:23:00', '2024-05-12 12:23:00', '2024-05-12 12:23:00'],
    'machine_id': ['HTOL-09', 'HTOL-09', 'HTOL-09', 'HTOL-09',  'HTOL-09'],
    'ChlPrs': [32, 34, 36, 39, 45]
})

# Get predictions from both models
xgb_predictions = predictor.predict(new_data, model_type='xgboost')
rf_predictions = predictor.predict(new_data, model_type='randomforest')

# Print predictions
for machine_id, predictions in xgb_predictions.items():
    print(f"\nPredictions for {machine_id}:")
    print("XGBoost predictions:")
    for alert_type, prob in predictions.items():
        print(f"{alert_type}: {prob:.3f}")

    print("\nRandom Forest predictions:")
    rf_preds = rf_predictions[machine_id]
    for alert_type, prob in rf_preds.items():
        print(f"{alert_type}: {prob:.3f}")

get_final_verdict(predictor, new_data)


Predictions for HTOL-09:
XGBoost predictions:
LOW: 0.941
MEDIUM: 0.094
HIGH: 0.196

Random Forest predictions:
LOW: 0.706
MEDIUM: 0.494
HIGH: 0.360


{'HTOL-09': ['LOW']}