In [16]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [17]:
import pandas as pd
import numpy as np
import os
import pickle
import json
from datetime import datetime
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [18]:

class AlertPredictor:
    def __init__(self, model_type='xgboost'):
        """
        Initializes the AlertPredictor with the specified model type ('xgboost' or 'randomforest').
        """
        self.model_type = model_type
        self.models = {}
        self.scalers = {}
        self.alert_types = ['LOW', 'MEDIUM', 'HIGH', ] # 'SIGMA']
        self.features = ['ChlPrs',
                        #  'hour',
                        #  'day_of_week',
                        #  'month',
                        #  'is_weekend',
                         'rolling_mean', 'rolling_std'] + [f'time_since_{at}' for at in self.alert_types]

    def load_and_preprocess_data(self, folder):
        """
        Loads and preprocesses data from CSV files in the specified folder.
        """
        dfs = []
        for i in range(9, 16):
            file_name = f"HTOL-{i:02d}_alerts.csv"
            df = pd.read_csv(os.path.join(folder, file_name))
            df['machine_id'] = f'HTOL-{i:02d}'
            dfs.append(df)

        combined_df = pd.concat(dfs, ignore_index=True)
        combined_df['Time'] = pd.to_datetime(combined_df['Time'])
        combined_df = combined_df.sort_values(['machine_id', 'Time'])

        return combined_df

    def engineer_features(self, df):
        """
        Engineers features from the preprocessed data.
        """
        df['hour'] = df['Time'].dt.hour
        df['day_of_week'] = df['Time'].dt.dayofweek
        df['month'] = df['Time'].dt.month
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

        # Calculate rolling statistics
        df['rolling_mean'] = df.groupby('machine_id')['ChlPrs'].rolling(window=24, min_periods=1).mean().reset_index(0, drop=True)
        df['rolling_std'] = df.groupby('machine_id')['ChlPrs'].rolling(window=24, min_periods=1).std().reset_index(0, drop=True)

        # Calculate time since last alert for each type
        for alert_type in self.alert_types:
            df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
                lambda x: x['Time'] - x[x['ALERT'] == alert_type]['Time'].shift(1)).reset_index(level=0, drop=True)
            df[f'time_since_{alert_type}'] = df[f'time_since_{alert_type}'].dt.total_seconds() / 3600  # Convert to hours

        return df

    def prepare_data_for_classification(self, df, target_alert_type, prediction_window):
        """
        Prepares the data for training the classification model.
        """
        df['target'] = df.groupby('machine_id').apply(
            lambda x: (x['ALERT'] == target_alert_type).rolling(window=prediction_window).max().shift(-prediction_window + 1)).reset_index(level=0,
                                                                                                                                           drop=True)

        X = df[self.features]
        y = df['target'].fillna(0)  # Fill NaN with 0 (no alert)

        return X, y

    def train_and_evaluate_classifier(self, X, y, test_size=0.2):
        """
        Trains and evaluates the classification model.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        if self.model_type == 'xgboost':
            # XGBoost configuration for imbalanced classification
            model = XGBClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=6,
                min_child_weight=1,
                gamma=0,
                subsample=0.8,
                colsample_bytree=0.8,
                scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),  # Handle class imbalance
                random_state=42,
                eval_metric='logloss',
                early_stopping_rounds=10,
            )
            model.fit(X_train_scaled, y_train, eval_set=[(X_test_scaled, y_test)], verbose=0)
        elif self.model_type == 'randomforest':
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            model.fit(X_train_scaled, y_train)
        else:
            raise ValueError("Invalid model_type. Choose 'xgboost' or 'randomforest'.")

        y_pred = model.predict(X_test_scaled)
        print(classification_report(y_test, y_pred))

        return model, scaler

    def train(self, folder, prediction_window=7):
        """
        Trains the models for each alert type.
        """
        df = self.load_and_preprocess_data(folder)
        df = self.engineer_features(df)

        for alert_type in self.alert_types:
            print(f"\nTraining model for {alert_type} alerts:")
            X, y = self.prepare_data_for_classification(df, alert_type, prediction_window)
            model, scaler = self.train_and_evaluate_classifier(X, y)
            self.models[alert_type] = model
            self.scalers[alert_type] = scaler

    def predict(self, new_data):
        """
        Makes predictions on new data.
        """
        predictions = {}
        for alert_type in self.alert_types:
            X_new = new_data[self.features]
            X_new_scaled = self.scalers[alert_type].transform(X_new)
            alert_probability = self.models[alert_type].predict_proba(X_new_scaled)[0, 1]
            predictions[alert_type] = alert_probability
        return predictions

    def visualize_alerts(self, df, target_alert_type, prediction_window, probability_threshold=0.7):
        """
        Visualizes actual alerts and high-risk periods.
        """
        X = df[self.features]
        X_scaled = self.scalers[target_alert_type].transform(X)

        df['alert_probability'] = self.models[target_alert_type].predict_proba(X_scaled)[:, 1]
        df['high_risk'] = df['alert_probability'] > probability_threshold

        plt.figure(figsize=(20, 15))
        machines = df['machine_id'].unique()
        n_machines = len(machines)

        for i, machine_id in enumerate(machines):
            machine_df = df[df['machine_id'] == machine_id]

            # Plot actual alerts
            alerts = machine_df[machine_df['ALERT'] == target_alert_type]
            plt.scatter(alerts['Time'], [i - 0.2] * len(alerts), marker='o', s=100,
                        label=f'Actual {target_alert_type} Alert' if i == 0 else "")

            # Plot high-risk periods
            high_risk_periods = machine_df[machine_df['high_risk']]
            plt.scatter(high_risk_periods['Time'], [i + 0.2] * len(high_risk_periods), marker='x', s=100,
                        label=f'High Risk Period ({target_alert_type})' if i == 0 else "")

            plt.text(df['Time'].min(), i, machine_id, va='center', ha='right', fontweight='bold')

        plt.yticks(range(n_machines), machines)
        plt.xlabel('Date')
        plt.ylabel('Machine ID')
        plt.title(f'Actual Alerts vs High Risk Periods for {target_alert_type} Alerts')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

In [19]:
class ProductionAlertPredictor:
    def __init__(self, model_types=['xgboost', 'randomforest']):
        """
        Initializes the ProductionAlertPredictor with specified model types.

        Args:
            model_types (list): List of model types to use ('xgboost' and/or 'randomforest')
        """
        self.model_types = model_types
        self.models = {model_type: {} for model_type in model_types}
        self.scalers = {model_type: {} for model_type in model_types}
        self.alert_types = ['LOW', 'MEDIUM', 'HIGH']
        self.features = [
            'ChlPrs',
            # 'hour',
            # 'day_of_week',
            # 'month',
            # 'is_weekend',
            'rolling_mean',
            'rolling_std'] + [f'time_since_{at}' for at in self.alert_types]

    def engineer_features(self, df):
        """
        Engineers features from the input DataFrame.
        """
        if isinstance(df['Time'].iloc[0], str):
            df['Time'] = pd.to_datetime(df['Time'])

        df['hour'] = df['Time'].dt.hour
        df['day_of_week'] = df['Time'].dt.dayofweek
        df['month'] = df['Time'].dt.month
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

        # Calculate rolling statistics
        df['rolling_mean'] = df.groupby('machine_id')['ChlPrs'].rolling(window=24, min_periods=1).mean().reset_index(0, drop=True)
        df['rolling_std'] = df.groupby('machine_id')['ChlPrs'].rolling(window=24, min_periods=1).std().reset_index(0, drop=True)

        # Calculate time since last alert for each type
        for alert_type in self.alert_types:
            df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
                lambda x: x['Time'] - x[x['ALERT'] == alert_type]['Time'].shift(1)
            ).reset_index(level=0, drop=True)
            df[f'time_since_{alert_type}'] = df[f'time_since_{alert_type}'].dt.total_seconds() / 3600

        return df[self.features]

    def save_models(self, output_dir):
        """
        Saves trained models, scalers, and metadata to disk.

        Args:
            output_dir (str): Directory to save the models and related files
        """
        os.makedirs(output_dir, exist_ok=True)

        # Save metadata
        metadata = {
            'model_types': self.model_types,
            'alert_types': self.alert_types,
            'features': self.features,
            'timestamp': datetime.now().isoformat()
        }

        with open(os.path.join(output_dir, 'metadata.json'), 'w') as f:
            json.dump(metadata, f, indent=4)

        # Save models and scalers
        for model_type in self.model_types:
            model_dir = os.path.join(output_dir, model_type)
            os.makedirs(model_dir, exist_ok=True)

            for alert_type in self.alert_types:
                # Save model
                model_path = os.path.join(model_dir, f'{alert_type}_model.pkl')
                with open(model_path, 'wb') as f:
                    pickle.dump(self.models[model_type][alert_type], f)

                # Save scaler
                scaler_path = os.path.join(model_dir, f'{alert_type}_scaler.pkl')
                with open(scaler_path, 'wb') as f:
                    pickle.dump(self.scalers[model_type][alert_type], f)

    @classmethod
    def load_models(cls, model_dir):
        """
        Loads trained models, scalers, and metadata from disk.

        Args:
            model_dir (str): Directory containing the saved models and related files

        Returns:
            ProductionAlertPredictor: Initialized instance with loaded models
        """
        # Load metadata
        with open(os.path.join(model_dir, 'metadata.json'), 'r') as f:
            metadata = json.load(f)

        # Initialize predictor with metadata
        predictor = cls(model_types=metadata['model_types'])
        predictor.alert_types = metadata['alert_types']
        predictor.features = metadata['features']

        # Load models and scalers
        for model_type in predictor.model_types:
            model_type_dir = os.path.join(model_dir, model_type)

            for alert_type in predictor.alert_types:
                # Load model
                model_path = os.path.join(model_type_dir, f'{alert_type}_model.pkl')
                with open(model_path, 'rb') as f:
                    predictor.models[model_type][alert_type] = pickle.load(f)

                # Load scaler
                scaler_path = os.path.join(model_type_dir, f'{alert_type}_scaler.pkl')
                with open(scaler_path, 'rb') as f:
                    predictor.scalers[model_type][alert_type] = pickle.load(f)

        return predictor

    def predict(self, new_data, threshold=0.7, ensemble_method='unanimous'):
        """
        Makes predictions using the ensemble of models.

        Args:
            new_data (pd.DataFrame): DataFrame containing new data to predict on
            threshold (float): Probability threshold for positive prediction
            ensemble_method (str): How to combine predictions ('unanimous' or 'majority')

        Returns:
            dict: Dictionary containing predictions and probabilities for each alert type
        """
        if not isinstance(new_data, pd.DataFrame):
            raise ValueError("new_data must be a pandas DataFrame")

        # Engineer features for the new data
        X_new = self.engineer_features(new_data)

        results = {}
        for alert_type in self.alert_types:
            model_predictions = []
            model_probabilities = []

            # Get predictions from each model
            for model_type in self.model_types:
                X_scaled = self.scalers[model_type][alert_type].transform(X_new)
                probabilities = self.models[model_type][alert_type].predict_proba(X_scaled)[:, 1]
                predictions = (probabilities >= threshold).astype(int)

                model_predictions.append(predictions)
                model_probabilities.append(probabilities)

            # Combine predictions based on ensemble method
            if ensemble_method == 'unanimous':
                final_predictions = np.all(model_predictions, axis=0)
            elif ensemble_method == 'majority':
                final_predictions = np.mean(model_predictions, axis=0) >= 0.5
            else:
                raise ValueError("ensemble_method must be 'unanimous' or 'majority'")

            # Average probabilities across models
            avg_probabilities = np.mean(model_probabilities, axis=0)

            results[alert_type] = {
                'prediction': final_predictions[0],
                'probability': avg_probabilities[0],
                'model_probabilities': {
                    model_type: probs[0]
                    for model_type, probs in zip(self.model_types, model_probabilities)
                }
            }

        return results

def train_production_models(data_folder, output_dir, prediction_window=7):
    """
    Trains and saves production models.

    Args:
        data_folder (str): Folder containing training data
        output_dir (str): Directory to save trained models
        prediction_window (int): Prediction window in days
    """
    # Initialize original predictor instances
    xgb_predictor = AlertPredictor(model_type='xgboost')
    rf_predictor = AlertPredictor(model_type='randomforest')

    # Train both models
    xgb_predictor.train(data_folder, prediction_window)
    rf_predictor.train(data_folder, prediction_window)

    # Initialize production predictor
    prod_predictor = ProductionAlertPredictor(['xgboost', 'randomforest'])

    # Transfer trained models and scalers
    for alert_type in prod_predictor.alert_types:
        prod_predictor.models['xgboost'][alert_type] = xgb_predictor.models[alert_type]
        prod_predictor.scalers['xgboost'][alert_type] = xgb_predictor.scalers[alert_type]

        prod_predictor.models['randomforest'][alert_type] = rf_predictor.models[alert_type]
        prod_predictor.scalers['randomforest'][alert_type] = rf_predictor.scalers[alert_type]

    # Save the production models
    prod_predictor.save_models(output_dir)
    return prod_predictor

In [20]:
# Train and save the models
output_dir = "production_models_comb"
folder = "../../../outlier_tolerance=5_grouping_time_window=200_anomaly_threshold=6_start_date=2022-01-01_end_date=2026-01-01"
prediction_window = 7  # days

predictor = train_production_models(
    data_folder=folder,
    output_dir=output_dir,
    prediction_window=prediction_window
)

  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(



Training model for LOW alerts:


  df['target'] = df.groupby('machine_id').apply(


              precision    recall  f1-score   support

         0.0       1.00      0.81      0.89    265420
         1.0       0.01      0.79      0.02       521

    accuracy                           0.81    265941
   macro avg       0.50      0.80      0.46    265941
weighted avg       1.00      0.81      0.89    265941


Training model for MEDIUM alerts:


  df['target'] = df.groupby('machine_id').apply(


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    265918
         1.0       0.02      0.91      0.05        23

    accuracy                           1.00    265941
   macro avg       0.51      0.95      0.52    265941
weighted avg       1.00      1.00      1.00    265941


Training model for HIGH alerts:


  df['target'] = df.groupby('machine_id').apply(


              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00    265931
         1.0       0.00      0.50      0.01        10

    accuracy                           0.99    265941
   macro avg       0.50      0.75      0.50    265941
weighted avg       1.00      0.99      1.00    265941



  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(



Training model for LOW alerts:


  df['target'] = df.groupby('machine_id').apply(


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    265420
         1.0       0.97      0.15      0.25       521

    accuracy                           1.00    265941
   macro avg       0.99      0.57      0.63    265941
weighted avg       1.00      1.00      1.00    265941


Training model for MEDIUM alerts:


  df['target'] = df.groupby('machine_id').apply(


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    265918
         1.0       0.75      0.13      0.22        23

    accuracy                           1.00    265941
   macro avg       0.87      0.57      0.61    265941
weighted avg       1.00      1.00      1.00    265941


Training model for HIGH alerts:


  df['target'] = df.groupby('machine_id').apply(


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    265931
         1.0       1.00      0.20      0.33        10

    accuracy                           1.00    265941
   macro avg       1.00      0.60      0.67    265941
weighted avg       1.00      1.00      1.00    265941



In [53]:
# Load the saved models
predictor = ProductionAlertPredictor.load_models("production_models_comb")

# Make predictions on new data
new_data = pd.DataFrame({
    'Time': ['2024-11-01 00:00:00'],
    'machine_id': ['HTOL-14'],
    'ChlPrs': [150],
    'ALERT': ['NONE']
})

# Get predictions with unanimous agreement (both models must agree)
predictions = predictor.predict(
    new_data,
    threshold=0.7,
    ensemble_method='unanimous'
)

# Print predictions
for alert_type, result in predictions.items():
    print(f"\n{alert_type} Alert:")
    print(f"Prediction: {'Yes' if result['prediction'] else 'No'}")
    print(f"Average Probability: {result['probability']:.3f}")
    print("Individual Model Probabilities:")
    for model, prob in result['model_probabilities'].items():
        print(f"  - {model}: {prob:.3f}")


LOW Alert:
Prediction: No
Average Probability: 0.000
Individual Model Probabilities:
  - xgboost: 0.000
  - randomforest: 0.000

MEDIUM Alert:
Prediction: No
Average Probability: 0.015
Individual Model Probabilities:
  - xgboost: 0.029
  - randomforest: 0.000

HIGH Alert:
Prediction: No
Average Probability: 0.354
Individual Model Probabilities:
  - xgboost: 0.468
  - randomforest: 0.240


  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
