In [70]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle
import json
from datetime import datetime
from tqdm import tqdm

In [71]:
total_files = 1
total_files += 1

In [72]:
class SequenceAlertPredictor:
    def __init__(self, model_type='xgboost', sequence_length=24):
        print(f"\nInitializing {model_type.upper()} predictor with sequence length {sequence_length}")
        self.model_type = model_type
        self.sequence_length = sequence_length
        self.models = {}
        self.scalers = {}
        self.alert_types = ['LOW', 'MEDIUM', 'HIGH']
        self.base_features = ['ChlPrs', 'rolling_mean', 'rolling_std']
        self.features = []
        for feature in self.base_features:
            self.features.extend([f'{feature}_t{i}' for i in range(sequence_length)])
        self.features.extend([f'time_since_{at}' for at in self.alert_types])
        print(f"Total features initialized: {len(self.features)}")

    def load_and_preprocess_data(self, folder):
        print("\nLoading and preprocessing data:")
        dfs = []

        for i in tqdm(range(9, 9 + total_files), desc="Loading files"):
            file_name = f"HTOL-{i:02d}_alerts.csv"
            file_path = os.path.join(folder, file_name)
            print(f"\nProcessing {file_name}...")

            df = pd.read_csv(file_path)
            print(f"- Loaded {len(df)} rows from {file_name}")
            df['machine_id'] = f'HTOL-{i:02d}'
            dfs.append(df)

        print("\nCombining datasets...")
        combined_df = pd.concat(dfs, ignore_index=True)
        combined_df['Time'] = pd.to_datetime(combined_df['Time'])
        combined_df = combined_df.sort_values(['machine_id', 'Time'])

        print(f"Total combined dataset size: {len(combined_df)} rows")
        return combined_df

    def create_sequences(self, df):
        print("\nCreating sequences...")
        sequences = []
        labels = []

        total_machines = len(df['machine_id'].unique())
        for idx, machine_id in enumerate(df['machine_id'].unique(), 1):
            print(f"\nProcessing machine {machine_id} ({idx}/{total_machines})")
            machine_data = df[df['machine_id'] == machine_id].copy()

            sequence_count = len(machine_data) - self.sequence_length
            print(f"- Creating {sequence_count} sequences for {machine_id}")

            for i in tqdm(range(sequence_count), desc="Creating sequences"):
                sequence = machine_data.iloc[i:i + self.sequence_length]
                target_row = machine_data.iloc[i + self.sequence_length]

                sequence_features = {}
                for feature in self.base_features:
                    for j in range(self.sequence_length):
                        sequence_features[f'{feature}_t{j}'] = sequence.iloc[j][feature]

                for alert_type in self.alert_types:
                    sequence_features[f'time_since_{alert_type}'] = target_row[f'time_since_{alert_type}']

                sequences.append(sequence_features)
                labels.append(1 if target_row['ALERT'] == self.current_alert_type else 0)

        print(f"\nTotal sequences created: {len(sequences)}")
        print(f"Positive samples: {sum(labels)}")
        print(f"Negative samples: {len(labels) - sum(labels)}")
        return pd.DataFrame(sequences), np.array(labels)

    def engineer_features(self, df):
        print("\nEngineering features...")

        print("- Calculating rolling statistics")
        df['rolling_mean'] = df.groupby('machine_id')['ChlPrs'].rolling(
            window=24, min_periods=1).mean().reset_index(0, drop=True)
        df['rolling_std'] = df.groupby('machine_id')['ChlPrs'].rolling(
            window=24, min_periods=1).std().reset_index(0, drop=True)

        print("- Computing time since last alerts")
        for alert_type in tqdm(self.alert_types, desc="Processing alert types"):
            df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
                lambda x: x['Time'] - x[x['ALERT'] == alert_type]['Time'].shift(1)
            ).reset_index(level=0, drop=True)
            df[f'time_since_{alert_type}'] = df[f'time_since_{alert_type}'].dt.total_seconds() / 3600

        print("Feature engineering complete")
        return df

    def train_and_evaluate_classifier(self, X, y, test_size=0.2):
        print("\nTraining and evaluating classifier:")
        print(f"- Total samples: {len(X)}")
        print(f"- Training set size: {int(len(X) * (1-test_size))}")
        print(f"- Test set size: {int(len(X) * test_size)}")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        print("Scaling features...")
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        if self.model_type == 'xgboost':
            print("\nTraining XGBoost classifier...")
            model = XGBClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=6,
                scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
                random_state=42
            )
        else:
            print("\nTraining Random Forest classifier...")
            model = RandomForestClassifier(
                n_estimators=100,
                class_weight='balanced',
                random_state=42
            )

        print("Fitting model...")
        model.fit(X_train_scaled, y_train)

        print("\nEvaluating model performance:")
        y_pred = model.predict(X_test_scaled)
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        return model, scaler

    def train(self, folder):
        print(f"\n{'='*50}")
        print(f"Starting training process for {self.model_type.upper()} model")
        print(f"{'='*50}")

        print("\nStep 1: Loading and preprocessing data")
        df = self.load_and_preprocess_data(folder)

        print("\nStep 2: Engineering features")
        df = self.engineer_features(df)

        for alert_type in self.alert_types:
            print(f"\n{'='*50}")
            print(f"Training model for {alert_type} alerts")
            print(f"{'='*50}")

            self.current_alert_type = alert_type
            print("\nStep 3: Creating sequences")
            X, y = self.create_sequences(df)

            print("\nStep 4: Training and evaluating model")
            model, scaler = self.train_and_evaluate_classifier(X, y)
            self.models[alert_type] = model
            self.scalers[alert_type] = scaler

            print(f"\nCompleted training for {alert_type} alerts")

        print(f"\n{'='*50}")
        print(f"Training complete for all alert types")
        print(f"{'='*50}")

In [73]:

class ProductionSequencePredictor:
    def __init__(self, model_types=['xgboost', 'randomforest'], sequence_length=24):
        """
        Initializes the ProductionSequencePredictor with multiple model types.
        """
        self.model_types = model_types
        self.sequence_length = sequence_length
        self.models = {model_type: {} for model_type in model_types}
        self.scalers = {model_type: {} for model_type in model_types}
        self.alert_types = ['LOW', 'MEDIUM', 'HIGH']
        self.base_features = ['ChlPrs', 'rolling_mean', 'rolling_std']
        self.features = []
        for feature in self.base_features:
            self.features.extend([f'{feature}_t{i}' for i in range(sequence_length)])
        self.features.extend([f'time_since_{at}' for at in self.alert_types])

    def prepare_sequence(self, df):
        """
        Prepares a sequence of data points for prediction.
        """
        if len(df) < self.sequence_length:
            raise ValueError(f"Input data must contain at least {self.sequence_length} points")

        sequence = df.iloc[-self.sequence_length:]
        sequence_features = {}

        for feature in self.base_features:
            for j in range(self.sequence_length):
                sequence_features[f'{feature}_t{j}'] = sequence.iloc[j][feature]

        for alert_type in self.alert_types:
            sequence_features[f'time_since_{alert_type}'] = df.iloc[-1][f'time_since_{alert_type}']

        return pd.DataFrame([sequence_features])

    def save_models(self, output_dir):
        """
        Saves trained models and metadata to disk.
        """
        os.makedirs(output_dir, exist_ok=True)

        metadata = {
            'model_types': self.model_types,
            'alert_types': self.alert_types,
            'sequence_length': self.sequence_length,
            'features': self.features,
            'timestamp': datetime.now().isoformat()
        }

        with open(os.path.join(output_dir, 'metadata.json'), 'w') as f:
            json.dump(metadata, f, indent=4)

        for model_type in self.model_types:
            model_dir = os.path.join(output_dir, model_type)
            os.makedirs(model_dir, exist_ok=True)

            for alert_type in self.alert_types:
                with open(os.path.join(model_dir, f'{alert_type}_model.pkl'), 'wb') as f:
                    pickle.dump(self.models[model_type][alert_type], f)
                with open(os.path.join(model_dir, f'{alert_type}_scaler.pkl'), 'wb') as f:
                    pickle.dump(self.scalers[model_type][alert_type], f)

    @classmethod
    def load_models(cls, model_dir):
        """
        Loads trained models from disk.
        """
        with open(os.path.join(model_dir, 'metadata.json'), 'r') as f:
            metadata = json.load(f)

        predictor = cls(
            model_types=metadata['model_types'],
            sequence_length=metadata['sequence_length']
        )

        for model_type in predictor.model_types:
            model_type_dir = os.path.join(model_dir, model_type)

            for alert_type in predictor.alert_types:
                with open(os.path.join(model_type_dir, f'{alert_type}_model.pkl'), 'rb') as f:
                    predictor.models[model_type][alert_type] = pickle.load(f)
                with open(os.path.join(model_type_dir, f'{alert_type}_scaler.pkl'), 'rb') as f:
                    predictor.scalers[model_type][alert_type] = pickle.load(f)

        return predictor

    def predict(self, df, threshold=0.7):
        """
        Makes predictions using the ensemble of models.
        """
        sequence = self.prepare_sequence(df)
        results = {}

        for alert_type in self.alert_types:
            model_predictions = []
            model_probabilities = []

            for model_type in self.model_types:
                X_scaled = self.scalers[model_type][alert_type].transform(sequence)
                probabilities = self.models[model_type][alert_type].predict_proba(X_scaled)[:, 1]
                predictions = (probabilities >= threshold).astype(int)

                model_predictions.append(predictions)
                model_probabilities.append(probabilities)

            # Unanimous ensemble prediction
            final_predictions = np.all(model_predictions, axis=0)
            avg_probabilities = np.mean(model_probabilities, axis=0)

            results[alert_type] = {
                'prediction': final_predictions[0],
                'probability': avg_probabilities[0],
                'model_probabilities': {
                    model_type: probs[0]
                    for model_type, probs in zip(self.model_types, model_probabilities)
                }
            }

        return results

In [74]:
def train_production_models(data_folder, output_dir, sequence_length=24):
    print(f"\n{'='*70}")
    print("TRAINING PRODUCTION MODELS")
    print(f"{'='*70}")
    print(f"Configuration:")
    print(f"- Data folder: {data_folder}")
    print(f"- Output directory: {output_dir}")
    print(f"- Sequence length: {sequence_length}")

    print("\nStep 1: Training XGBoost models")
    xgb_predictor = SequenceAlertPredictor(model_type='xgboost', sequence_length=sequence_length)
    xgb_predictor.train(data_folder)

    print("\nStep 2: Training Random Forest models")
    rf_predictor = SequenceAlertPredictor(model_type='randomforest', sequence_length=sequence_length)
    rf_predictor.train(data_folder)

    print("\nStep 3: Initializing production predictor")
    prod_predictor = ProductionSequencePredictor(['xgboost', 'randomforest'], sequence_length)

    print("\nStep 4: Combining models")
    for alert_type in tqdm(prod_predictor.alert_types, desc="Combining models"):
        prod_predictor.models['xgboost'][alert_type] = xgb_predictor.models[alert_type]
        prod_predictor.scalers['xgboost'][alert_type] = xgb_predictor.scalers[alert_type]
        prod_predictor.models['randomforest'][alert_type] = rf_predictor.models[alert_type]
        prod_predictor.scalers['randomforest'][alert_type] = rf_predictor.scalers[alert_type]

    print("\nStep 5: Saving models")
    prod_predictor.save_models(output_dir)

    print(f"\n{'='*70}")
    print("TRAINING COMPLETE")
    print(f"{'='*70}")
    return prod_predictor

def visualize_predictions(predictor, df, alert_type, window_size=168):  # 1 week
    """
    Visualizes predictions vs actual alerts.
    """
    predictions = []
    actual_alerts = []
    timestamps = []

    for i in range(predictor.sequence_length, len(df)):
        sequence_df = df.iloc[max(0, i-window_size):i]
        if len(sequence_df) >= predictor.sequence_length:
            pred = predictor.predict(sequence_df)
            predictions.append(pred[alert_type]['probability'])
            actual_alerts.append(1 if df.iloc[i]['ALERT'] == alert_type else 0)
            timestamps.append(df.iloc[i]['Time'])

    plt.figure(figsize=(15, 6))
    plt.plot(timestamps, predictions, label='Prediction Probability', color='blue', alpha=0.6)
    plt.scatter([t for t, a in zip(timestamps, actual_alerts) if a == 1],
                [1 for a in actual_alerts if a == 1],
                color='red', label='Actual Alerts', marker='x', s=100)
    plt.axhline(y=0.7, color='r', linestyle='--', alpha=0.3, label='Threshold (0.7)')
    plt.xlabel('Time')
    plt.ylabel('Alert Probability')
    plt.title(f'Predicted vs Actual {alert_type} Alerts')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [75]:

def train_production_models(data_folder, output_dir, sequence_length=24):
    """
    Trains and saves production models.
    """
    print("Initiating XGB")
    xgb_predictor = SequenceAlertPredictor(model_type='xgboost', sequence_length=sequence_length)
    print("Initiating RF")
    rf_predictor = SequenceAlertPredictor(model_type='randomforest', sequence_length=sequence_length)

    print("Training XGB")
    xgb_predictor.train(data_folder)
    print("Training RF")
    rf_predictor.train(data_folder)

    prod_predictor = ProductionSequencePredictor(['xgboost', 'randomforest'], sequence_length)

    for alert_type in prod_predictor.alert_types:
        prod_predictor.models['xgboost'][alert_type] = xgb_predictor.models[alert_type]
        prod_predictor.scalers['xgboost'][alert_type] = xgb_predictor.scalers[alert_type]
        prod_predictor.models['randomforest'][alert_type] = rf_predictor.models[alert_type]
        prod_predictor.scalers['randomforest'][alert_type] = rf_predictor.scalers[alert_type]

    prod_predictor.save_models(output_dir)
    return prod_predictor

def visualize_predictions(predictor, df, alert_type, window_size=168):  # 1 week
    """
    Visualizes predictions vs actual alerts.
    """
    predictions = []
    actual_alerts = []
    timestamps = []

    for i in range(predictor.sequence_length, len(df)):
        sequence_df = df.iloc[max(0, i-window_size):i]
        if len(sequence_df) >= predictor.sequence_length:
            pred = predictor.predict(sequence_df)
            predictions.append(pred[alert_type]['probability'])
            actual_alerts.append(1 if df.iloc[i]['ALERT'] == alert_type else 0)
            timestamps.append(df.iloc[i]['Time'])

    plt.figure(figsize=(15, 6))
    plt.plot(timestamps, predictions, label='Prediction Probability', color='blue', alpha=0.6)
    plt.scatter([t for t, a in zip(timestamps, actual_alerts) if a == 1],
                [1 for a in actual_alerts if a == 1],
                color='red', label='Actual Alerts', marker='x', s=100)
    plt.axhline(y=0.7, color='r', linestyle='--', alpha=0.3, label='Threshold (0.7)')
    plt.xlabel('Time')
    plt.ylabel('Alert Probability')
    plt.title(f'Predicted vs Actual {alert_type} Alerts')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [76]:
output_dir = "production_sequence_models"
data_folder = "../../../outlier_tolerance=5_grouping_time_window=200_anomaly_threshold=6_start_date=2022-01-01_end_date=2026-01-01"
sequence_length = 24  # 24 hours of data

predictor = train_production_models(
    data_folder=data_folder,
    output_dir=output_dir,
    sequence_length=sequence_length
)

Initiating XGB

Initializing XGBOOST predictor with sequence length 24
Total features initialized: 75
Initiating RF

Initializing RANDOMFOREST predictor with sequence length 24
Total features initialized: 75
Training XGB

Starting training process for XGBOOST model

Step 1: Loading and preprocessing data

Loading and preprocessing data:


Loading files:   0%|          | 0/2 [00:00<?, ?it/s]


Processing HTOL-09_alerts.csv...
- Loaded 100165 rows from HTOL-09_alerts.csv

Processing HTOL-10_alerts.csv...


Loading files: 100%|██████████| 2/2 [00:00<00:00,  8.06it/s]

- Loaded 268208 rows from HTOL-10_alerts.csv

Combining datasets...
Total combined dataset size: 368373 rows

Step 2: Engineering features

Engineering features...
- Calculating rolling statistics





- Computing time since last alerts


  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(
Processing alert types: 100%|██████████| 3/3 [00:00<00:00, 10.87it/s]


Feature engineering complete

Training model for LOW alerts

Step 3: Creating sequences

Creating sequences...

Processing machine HTOL-09 (1/2)
- Creating 100141 sequences for HTOL-09


Creating sequences: 100%|██████████| 100141/100141 [06:36<00:00, 252.54it/s]



Processing machine HTOL-10 (2/2)
- Creating 268184 sequences for HTOL-10


Creating sequences: 100%|██████████| 268184/268184 [17:24<00:00, 256.65it/s]



Total sequences created: 368325
Positive samples: 78
Negative samples: 368247

Step 4: Training and evaluating model

Training and evaluating classifier:
- Total samples: 368325
- Training set size: 294660
- Test set size: 73665
Scaling features...


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count



Training XGBoost classifier...
Fitting model...

Evaluating model performance:

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     73649
           1       1.00      1.00      1.00        16

    accuracy                           1.00     73665
   macro avg       1.00      1.00      1.00     73665
weighted avg       1.00      1.00      1.00     73665


Completed training for LOW alerts

Training model for MEDIUM alerts

Step 3: Creating sequences

Creating sequences...

Processing machine HTOL-09 (1/2)
- Creating 100141 sequences for HTOL-09


Creating sequences: 100%|██████████| 100141/100141 [06:29<00:00, 257.27it/s]



Processing machine HTOL-10 (2/2)
- Creating 268184 sequences for HTOL-10


Creating sequences: 100%|██████████| 268184/268184 [17:03<00:00, 262.00it/s]



Total sequences created: 368325
Positive samples: 4
Negative samples: 368321

Step 4: Training and evaluating model

Training and evaluating classifier:
- Total samples: 368325
- Training set size: 294660
- Test set size: 73665
Scaling features...


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count



Training XGBoost classifier...
Fitting model...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Evaluating model performance:

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     73663
           1       0.00      0.00      0.00         2

    accuracy                           1.00     73665
   macro avg       0.50      0.50      0.50     73665
weighted avg       1.00      1.00      1.00     73665


Completed training for MEDIUM alerts

Training model for HIGH alerts

Step 3: Creating sequences

Creating sequences...

Processing machine HTOL-09 (1/2)
- Creating 100141 sequences for HTOL-09


Creating sequences:   7%|▋         | 6521/100141 [00:24<05:55, 263.51it/s]


KeyboardInterrupt: 

In [None]:
# After training the models, we'll use a portion of the original data for testing
def create_test_visualization(data_folder, predictor, test_size=0.2):
    """
    Creates visualizations using a portion of the training data as a test set.

    Args:
        data_folder: Path to the data folder
        predictor: Trained ProductionSequencePredictor instance
        test_size: Fraction of data to use for testing (default 0.2 = 20%)
    """
    print("\nPreparing data for visualization...")

    # Load the original data
    dfs = []
    for i in range(9, 9 + total_files):
        file_name = f"HTOL-{i:02d}_alerts.csv"
        df = pd.read_csv(os.path.join(data_folder, file_name))
        df['machine_id'] = f'HTOL-{i:02d}'
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df['Time'] = pd.to_datetime(combined_df['Time'])

    # Sort by time and calculate features needed for prediction
    combined_df = combined_df.sort_values(['machine_id', 'Time'])

    # Calculate rolling statistics
    combined_df['rolling_mean'] = combined_df.groupby('machine_id')['ChlPrs'].rolling(
        window=24, min_periods=1).mean().reset_index(0, drop=True)
    combined_df['rolling_std'] = combined_df.groupby('machine_id')['ChlPrs'].rolling(
        window=24, min_periods=1).std().reset_index(0, drop=True)

    # Calculate time since last alerts
    for alert_type in predictor.alert_types:
        combined_df[f'time_since_{alert_type}'] = combined_df.groupby('machine_id').apply(
            lambda x: x['Time'] - x[x['ALERT'] == alert_type]['Time'].shift(1)
        ).reset_index(level=0, drop=True)
        combined_df[f'time_since_{alert_type}'] = combined_df[f'time_since_{alert_type}'].dt.total_seconds() / 3600

    # Select a random machine for visualization
    unique_machines = combined_df['machine_id'].unique()
    test_machine = np.random.choice(unique_machines)
    print(f"\nSelected machine {test_machine} for visualization")

    # Get the last 20% of data for the selected machine
    machine_df = combined_df[combined_df['machine_id'] == test_machine].copy()
    split_idx = int(len(machine_df) * (1 - test_size))
    test_df = machine_df.iloc[split_idx:]

    print(f"Test set size: {len(test_df)} records")
    print(f"Date range: {test_df['Time'].min()} to {test_df['Time'].max()}")

    # Create visualizations for each alert type
    for alert_type in predictor.alert_types:
        print(f"\nCreating visualization for {alert_type} alerts...")
        visualize_predictions(predictor, test_df, alert_type)

        # Print some statistics
        actual_alerts = sum(test_df['ALERT'] == alert_type)
        print(f"Number of actual {alert_type} alerts in test set: {actual_alerts}")

# Loading saved models and making predictions
predictor = ProductionSequencePredictor.load_models(output_dir)

# Then create visualizations using a portion of the training data
create_test_visualization(data_folder, predictor, test_size=0.2)