In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import joblib
import json
from datetime import datetime
import streamlit as st
import plotly.graph_objects as go
import plotly.express as px
from tqdm import tqdm
import os
from pathlib import Path
import time
import os

# Load and preprocess data
def load_and_preprocess_data(folder, machine_numbers=range(9, 16)):
    """Load the per-machine alert CSVs from *folder* into one sorted frame.

    For every number ``n`` in *machine_numbers* the file
    ``HTOL-<nn>_alerts.csv`` (zero-padded to two digits) is read and tagged
    with a ``machine_id`` column holding ``HTOL-<nn>``.

    Args:
        folder: Directory that contains the ``HTOL-XX_alerts.csv`` files.
        machine_numbers: Iterable of integer machine numbers to load.
            Defaults to machines 9 through 15, the set that used to be
            hard-coded.

    Returns:
        A DataFrame with all rows of all files, the ``Time`` column parsed
        to datetimes, sorted by ``machine_id`` and then ``Time``. The index
        holds the pre-sort row positions of the concatenated frame.

    Raises:
        FileNotFoundError: If one of the expected CSV files is missing
            (raised by ``pandas.read_csv``).
        ValueError: If *machine_numbers* is empty (raised by
            ``pandas.concat``).
    """
    frames = []
    for number in machine_numbers:
        machine_id = f"HTOL-{number:02d}"
        frame = pd.read_csv(os.path.join(folder, f"{machine_id}_alerts.csv"))
        frame['machine_id'] = machine_id
        frames.append(frame)

    combined_df = pd.concat(frames, ignore_index=True)
    combined_df['Time'] = pd.to_datetime(combined_df['Time'])

    # Later per-machine rolling/shift features rely on this ordering.
    return combined_df.sort_values(['machine_id', 'Time'])

# Feature engineering
def engineer_features(df):
    """Add calendar, rolling-pressure and alert-gap features to *df* in place.

    Expects the columns ``Time`` (datetime), ``machine_id``, ``ChlPrs`` and
    ``ALERT``, with rows already ordered by time within each machine.

    Columns added:
        hour, day_of_week, month, is_weekend: Calendar parts of ``Time``.
        rolling_mean, rolling_std: Mean / sample standard deviation of
            ``ChlPrs`` over the last 24 rows of the same machine (fewer at
            the start of a machine's history; ``rolling_std`` is NaN on the
            first row of each machine).
        time_since_<TYPE>: For each of LOW, MEDIUM, HIGH and SIGMA, the
            hours elapsed since the previous alert of that type on the same
            machine.

    NOTE(review): ``time_since_<TYPE>`` is only populated on rows that are
    themselves an alert of ``<TYPE>``; every other row is NaN. That is what
    the original expression computed and it is preserved here - confirm
    whether a value on every row was intended.

    Args:
        df: Frame to extend. It is modified in place.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    df['hour'] = df['Time'].dt.hour
    df['day_of_week'] = df['Time'].dt.dayofweek
    df['month'] = df['Time'].dt.month
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Build the per-machine rolling window once and reuse it for both stats.
    pressure_windows = df.groupby('machine_id')['ChlPrs'].rolling(window=24, min_periods=1)
    df['rolling_mean'] = pressure_windows.mean().reset_index(0, drop=True)
    df['rolling_std'] = pressure_windows.std().reset_index(0, drop=True)

    alert_types = ['LOW', 'MEDIUM', 'HIGH', 'SIGMA']
    for alert_type in alert_types:
        is_type = df['ALERT'] == alert_type
        # Grouping by (machine, is_type) makes shift(1) return, for an alert
        # row, the time of the previous alert of the same type on the same
        # machine. Unlike groupby.apply this also works when only one machine
        # is present and does not touch the grouping columns.
        previous_time = df['Time'].groupby([df['machine_id'], is_type]).shift(1)
        elapsed = (df['Time'] - previous_time).where(is_type)
        df[f'time_since_{alert_type}'] = elapsed.dt.total_seconds() / 3600  # hours

    return df

class AlertPredictor:
    """Train, persist and apply one binary classifier per alert type.

    For every entry of ``alert_types`` a ``StandardScaler`` and a
    ``RandomForestClassifier`` are fitted on ``feature_names``. The target
    is "an alert of this type occurs within the next ``prediction_window``
    rows of the same machine (current row included)".

    Args:
        model_dir: Directory in which models, scalers and metrics are
            written and from which they are loaded. Defaults to
            ``"models"``, the location that used to be hard-coded.
    """

    def __init__(self, model_dir="models"):
        # Fitted estimators and their scalers, keyed by alert type.
        self.models = {}
        self.scalers = {}
        self.model_dir = model_dir
        # Calendar features (hour, day_of_week, month, is_weekend) are
        # currently not part of the model input.
        self.feature_names = ['ChlPrs',
                              'rolling_mean', 'rolling_std', 'time_since_LOW',
                              'time_since_MEDIUM', 'time_since_HIGH', 'time_since_SIGMA']
        self.alert_types = ['LOW', 'MEDIUM', 'HIGH', 'SIGMA']
        # Metrics of the most recent train_models() call, keyed by alert type.
        self.metrics = {}

    def train_models(self, df, prediction_window=7, progress_bar=None):
        """Train, evaluate and save one model per alert type.

        Args:
            df: Training frame containing ``machine_id``, ``ALERT`` and all
                columns listed in ``feature_names``. It is not modified.
            prediction_window: Look-ahead size in rows per machine.
            progress_bar: Optional object exposing ``write(str)`` and
                ``progress(float)``; ignored when ``None``.

        Returns:
            Dict mapping alert type to the metrics dict produced by
            ``_calculate_metrics``.
        """
        self.metrics = {}

        Path(self.model_dir).mkdir(parents=True, exist_ok=True)

        total = len(self.alert_types)
        for position, alert_type in enumerate(self.alert_types, start=1):
            # Compare against None: progress objects such as tqdm define
            # their own truthiness, which must not decide whether to report.
            if progress_bar is not None:
                progress_bar.write(f"Training model for {alert_type} alerts...")

            X, y = self._prepare_data(df, alert_type, prediction_window)

            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)

            model = RandomForestClassifier(n_estimators=100, random_state=42, verbose=0)
            model.fit(X_scaled, y)

            self.models[alert_type] = model
            self.scalers[alert_type] = scaler

            # NOTE(review): metrics are computed on the training rows
            # themselves, so they are optimistic; consider a held-out split.
            y_pred_proba = self._positive_class_proba(model, X_scaled)
            self.metrics[alert_type] = self._calculate_metrics(y, y_pred_proba)

            self._save_model(alert_type, model, scaler)

            if progress_bar is not None:
                progress_bar.progress(position / total)

        return self.metrics

    def predict(self, new_data):
        """Return the positive-class probability per alert type.

        Models that are not in memory are loaded from ``model_dir`` first.

        Args:
            new_data: Frame containing all columns in ``feature_names``.

        Returns:
            Dict mapping alert type to a 1-D array with one probability per
            row of *new_data*.
        """
        X_new = new_data[self.feature_names]
        predictions = {}

        for alert_type in self.alert_types:
            if alert_type not in self.models:
                self._load_model(alert_type)

            X_scaled = self.scalers[alert_type].transform(X_new)
            predictions[alert_type] = self._positive_class_proba(
                self.models[alert_type], X_scaled
            )

        return predictions

    @staticmethod
    def _positive_class_proba(model, X_scaled):
        """Return P(class == 1) per row, or zeros if the model never saw class 1.

        A classifier fitted on a single class exposes only one probability
        column, so indexing column 1 unconditionally would raise IndexError.
        """
        proba = model.predict_proba(X_scaled)
        classes = list(model.classes_)
        if 1 in classes:
            return proba[:, classes.index(1)]
        return np.zeros(proba.shape[0])

    def _prepare_data(self, df, target_alert_type, prediction_window):
        """Build the feature matrix and look-ahead target for one alert type.

        The target of a row is 1.0 when any of the next *prediction_window*
        rows of the same machine (starting with the row itself) carries
        *target_alert_type*, else 0.0. The trailing rows of each machine,
        where the window is incomplete, are set to 0.0.

        *df* is left untouched; no ``target`` column is written into it.

        Returns:
            Tuple ``(X, y)`` of the feature DataFrame and the target Series
            (named ``target``), both aligned to ``df.index``.
        """
        is_target = (df['ALERT'] == target_alert_type).astype(float)
        lookahead = prediction_window - 1

        # transform keeps the original index and works for any number of
        # machines, including exactly one.
        target = is_target.groupby(df['machine_id']).transform(
            lambda flags: flags.rolling(window=prediction_window).max().shift(-lookahead)
        )

        X = df[self.feature_names]
        y = target.fillna(0).rename('target')

        return X, y

    def _calculate_metrics(self, y_true, y_pred_proba):
        """Compute ROC data, confusion matrix and classification report.

        Hard predictions for the confusion matrix and report use a fixed
        0.5 probability threshold. All values are converted to plain lists /
        dicts so the result can be written as JSON.
        """
        fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
        roc_auc = auc(fpr, tpr)

        y_pred = (y_pred_proba >= 0.5).astype(int)

        return {
            'roc_auc': roc_auc,
            'roc_curve': {'fpr': fpr.tolist(), 'tpr': tpr.tolist(), 'thresholds': thresholds.tolist()},
            'confusion_matrix': confusion_matrix(y_true, y_pred).tolist(),
            'classification_report': classification_report(y_true, y_pred, output_dict=True)
        }

    def _save_model(self, alert_type, model, scaler):
        """Write model, scaler and metrics for *alert_type* to ``model_dir``.

        Files are named ``<alert_type>_<YYYYmmdd_HHMMSS>_{model,scaler}.joblib``
        and ``..._metrics.json``. ``self.metrics[alert_type]`` must already
        be populated.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        base_path = Path(self.model_dir) / f"{alert_type}_{timestamp}"

        joblib.dump(model, f"{base_path}_model.joblib")
        joblib.dump(scaler, f"{base_path}_scaler.joblib")

        with open(f"{base_path}_metrics.json", 'w') as f:
            json.dump(self.metrics[alert_type], f)

    def _load_model(self, alert_type):
        """Load the most recently created model and scaler for *alert_type*.

        Raises:
            FileNotFoundError: If ``model_dir`` holds no saved model for
                *alert_type*.
        """
        model_files = list(Path(self.model_dir).glob(f"{alert_type}_*_model.joblib"))
        if not model_files:
            raise FileNotFoundError(
                f"No saved model for alert type {alert_type!r} in {self.model_dir!r}; "
                "run train_models() first."
            )

        latest_model = max(model_files, key=os.path.getctime)
        # Swap only the file-name suffix so directory names are never rewritten.
        stem = latest_model.name[:-len('_model.joblib')]
        latest_scaler = latest_model.with_name(f"{stem}_scaler.joblib")

        self.models[alert_type] = joblib.load(latest_model)
        self.scalers[alert_type] = joblib.load(latest_scaler)

# Example usage
if __name__ == "__main__":
    # Example run: load the alert CSVs, train one model per alert type,
    # store the metrics and print predictions for the most recent rows.
    predictor = AlertPredictor()

    folder = "../../../outlier_tolerance=5_grouping_time_window=200_anomaly_threshold=6_start_date=2022-01-01_end_date=2026-01-01"
    # Look-ahead window, counted in consecutive rows per machine. It only
    # corresponds to days if the data has one row per day - TODO confirm.
    prediction_window = 7
    probability_threshold = 0.7  # Adjust this value to balance between over-prediction and missed alerts

    df = load_and_preprocess_data(folder)
    df = engineer_features(df)

    # Train models and get performance metrics; pass the configured window
    # explicitly instead of silently relying on the method default.
    metrics = predictor.train_models(df, prediction_window=prediction_window)

    # Save metrics to JSON for visualization
    with open('model_metrics.json', 'w') as f:
        json.dump(metrics, f)

    # Example of making predictions on the last 10 rows
    new_data = df.iloc[-10:].copy()
    predictions = predictor.predict(new_data)

    print("\nPrediction Examples:")
    for alert_type, probs in predictions.items():
        print(f"{alert_type} Alert Probabilities:", probs)
        # Apply the configured threshold so the tunable above has an effect.
        print(f"{alert_type} Alerts flagged (p >= {probability_threshold}):", probs >= probability_threshold)

  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(lambda x: x['Time'] - x[x['ALERT'] == alert_type]['Time'].shift(1)).reset_index(level=0, drop=True)
  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(lambda x: x['Time'] - x[x['ALERT'] == alert_type]['Time'].shift(1)).reset_index(level=0, drop=True)
  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(lambda x: x['Time'] - x[x['ALERT'] == alert_type]['Time'].shift(1)).reset_index(level=0, drop=True)
  df[f'time_since_{alert_type}'] = df.groupby('machine_id').apply(lambda x: x['Time'] - x[x['ALERT'] == alert_type]['Time'].shift(1)).reset_index(level=0, drop=True)
  df['target'] = df.groupby('machine_id').apply(
