In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, mean_absolute_error
from datetime import timedelta, datetime
import matplotlib.pyplot as plt

In [2]:
# Load the CSV files into Pandas Dataframes
folder = "../outlier_tolerance=5_grouping_time_window=200_anomaly_threshold=6_start_date=2022-01-01_end_date=2026-01-01"

df1 = pd.read_csv(os.path.join(folder, "HTOL-09_alerts.csv"))
df1['machine_id'] = 'HTOL-09'

df2 = pd.read_csv(os.path.join(folder, "HTOL-10_alerts.csv"))
df2['machine_id'] = 'HTOL-10'

df3 = pd.read_csv(os.path.join(folder, "HTOL-11_alerts.csv"))
df3['machine_id'] = 'HTOL-11'

df4 = pd.read_csv(os.path.join(folder, "HTOL-12_alerts.csv"))
df4['machine_id'] = 'HTOL-12'

df5 = pd.read_csv(os.path.join(folder, "HTOL-13_alerts.csv"))
df5['machine_id'] = 'HTOL-13'

df6 = pd.read_csv(os.path.join(folder, "HTOL-14_alerts.csv"))
df6['machine_id'] = 'HTOL-14'

df7 = pd.read_csv(os.path.join(folder, "HTOL-15_alerts.csv"))
df7['machine_id'] = 'HTOL-15'

In [3]:
# Concatenate all DataFrames into a single DataFrame
df = pd.concat([df1, df2, df3, df4, df5, df6, df7], ignore_index=True)

# Convert 'Time' column to datetime objects
df['Time'] = pd.to_datetime(df['Time'])

# Sort the DataFrame by 'machine_id' and 'Time'
df.sort_values(by=['machine_id', 'Time'], inplace=True)

In [4]:
# **Feature Engineering**

# 1. **Time-based features:**
df['dayofweek'] = df['Time'].dt.dayofweek
df['hour'] = df['Time'].dt.hour

In [5]:
df.head()

Unnamed: 0,Time,ChlPrs,alert_index,ALERT,file_name,machine_id,dayofweek,hour
0,2024-03-14 09:50:49,32.66,0,,HTOL-09-20240314095049.csv,HTOL-09,3,9
1,2024-03-14 09:50:49,32.63,1,,HTOL-09-20240314095049.csv,HTOL-09,3,9
2,2024-03-14 09:50:50,32.58,2,,HTOL-09-20240314095049.csv,HTOL-09,3,9
3,2024-03-14 09:50:51,32.69,3,,HTOL-09-20240314095049.csv,HTOL-09,3,9
4,2024-03-14 09:50:53,32.62,4,,HTOL-09-20240314095049.csv,HTOL-09,3,9


In [6]:
# 2. **Lagged features:**
# Create lagged features for 'ChlPrs'
for i in range(1, 4):  # Create lags for the past 3 hours
    df[f'ChlPrs_lag_{i}'] = df.groupby('machine_id')['ChlPrs'].shift(i).fillna(method='bfill')

# 3. **Rolling window statistics:**
df['rolling_mean_24h'] = (
    df.groupby('machine_id')['ChlPrs']
    .rolling('24h')
    .mean()
    .reset_index(level=0, drop=True)
)
df['rolling_std_24h'] = (
    df.groupby('machine_id')['ChlPrs']
    .rolling('24h')
    .std()
    .reset_index(level=0, drop=True)
)

  df[f'ChlPrs_lag_{i}'] = df.groupby('machine_id')['ChlPrs'].shift(i).fillna(method='bfill')
  df[f'ChlPrs_lag_{i}'] = df.groupby('machine_id')['ChlPrs'].shift(i).fillna(method='bfill')
  df[f'ChlPrs_lag_{i}'] = df.groupby('machine_id')['ChlPrs'].shift(i).fillna(method='bfill')


ValueError: window must be an integer 0 or greater

In [None]:
# 4. **Time to next alert:**
# Calculate the time to the next alert of each type
for alert_type in ['LOW', 'MEDIUM', 'HIGH', 'SIGMA']:
    # Get indices where the current alert is of the specified type
    alert_indices = df[df['ALERT'] == alert_type].index

    # Calculate the time difference between each alert and the next alert
    time_to_next_alert = (
        df['Time'].iloc[alert_indices].values[:, None] - df['Time'].iloc[alert_indices + 1:].values
    )
    time_to_next_alert = np.where(time_to_next_alert > timedelta(0), np.nan, time_to_next_alert)
    time_to_next_alert = np.abs(time_to_next_alert.min(axis=0))

    # Convert time difference to days
    time_to_next_alert = [t.total_seconds() / (24 * 60 * 60) for t in time_to_next_alert]

    # Assign the time to next alert to a new column
    df.loc[alert_indices, f'time_to_next_{alert_type}'] = time_to_next_alert
