## Import Libraries

In [48]:
### Import Packages ###
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from scipy.fftpack import fft
from datetime import datetime, timedelta

#Jlab Packages
from data_utils import get_traces
from beam_settings_parser_hdf5 import BeamConfigParserHDF5
from beam_settings_prep import BeamConfigPreProcessor


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#Tensorflow 
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import (
    Input, LSTM, Dense, Bidirectional, RepeatVector, TimeDistributed, Lambda
)

pd.options.display.max_columns = None
pd.options.display.max_rows = None

### BPM Config ###

In [49]:
class BPMDataConfig:

    def __init__(self):
        self.beam_settings_data_path = "/work/data_science/suf_sns/beam_configurations_data/processed_data/clean_beam_config_processed_df.csv"
        self.beam_param_parser_cfg = {"data_location": "/work/data_science/suf_sns/beam_configurations_data/hdf5_sept2024/"}
        self.beam_settings_prep_cfg = {
            "rescale": False,
            "beam_config": [
                'FE_IS:Match:TunerPos',
                'LEBT:Chop_N:V_Set',
                'LEBT:Chop_P:V_Set',
                'LEBT:Focus_1:V_Set',
                'LEBT:Focus_2:V_Set',
                'LEBT:Steer_A:V_Set',
                'LEBT:Steer_B:V_Set',
                'LEBT:Steer_C:V_Set',
                'LEBT:Steer_D:V_Set',
                'Src:Accel:V_Set',
                'Src:H2:Flw_Set',
                'Src:Ign:Pwr_Set',
                'Src:RF_Gnd:Pwr_Set',
                'ICS_Chop:RampDown:PW',
                'ICS_Chop:RampUp:PWChange',
                'ICS_MPS:Gate_Source:Offset',
                'ICS_Tim:Chop_Flavor1:BeamOn',
                'ICS_Tim:Chop_Flavor1:OnPulseWidth',
                'ICS_Tim:Chop_Flavor1:RampUp',
                'ICS_Tim:Chop_Flavor1:StartPulseWidth',
                'ICS_Tim:Gate_BeamRef:GateWidth',
                'ICS_Tim:Gate_BeamOn:RR'
            ]
        }
        self.beam_config = [
            'timestamps',
            'FE_IS:Match:TunerPos',
            'LEBT:Chop_N:V_Set',
            'LEBT:Chop_P:V_Set',
            'LEBT:Focus_1:V_Set',
            'LEBT:Focus_2:V_Set',
            'LEBT:Steer_A:V_Set',
            'LEBT:Steer_B:V_Set',
            'LEBT:Steer_C:V_Set',
            'LEBT:Steer_D:V_Set',
            'Src:Accel:V_Set',
            'Src:H2:Flw_Set',
            'Src:Ign:Pwr_Set',
            'Src:RF_Gnd:Pwr_Set',
            'ICS_Chop:RampDown:PW',
            'ICS_Chop:RampUp:PWChange',
            'ICS_MPS:Gate_Source:Offset',
            'ICS_Tim:Chop_Flavor1:BeamOn',
            'ICS_Tim:Chop_Flavor1:OnPulseWidth',
            'ICS_Tim:Chop_Flavor1:RampUp',
            'ICS_Tim:Chop_Flavor1:StartPulseWidth',
            'ICS_Tim:Gate_BeamRef:GateWidth',
            'ICS_Tim:Gate_BeamOn:RR'
        ]

        self.column_to_add = [
    'FE_IS:Match:TunerPos',
    'LEBT:Chop_N:V_Set',
    'LEBT:Chop_P:V_Set',
    'LEBT:Focus_1:V_Set',
    'LEBT:Focus_2:V_Set',
    'LEBT:Steer_A:V_Set',
    'LEBT:Steer_B:V_Set',
    'LEBT:Steer_C:V_Set',
    'LEBT:Steer_D:V_Set',
    'Src:Accel:V_Set',
    'Src:H2:Flw_Set',
    'Src:Ign:Pwr_Set',
    'Src:RF_Gnd:Pwr_Set',
    'ICS_Tim:Gate_BeamOn:RR',
    'ICS_Chop-RampDown-PW',
    'ICS_Chop-RampUp-PWChange',
    'ICS_Tim-Gate_BeamRef-GateWidth'
]

        self.rename_mappings = {
    'ICS_Chop-RampDown-PW': 'ICS_Chop:RampDown:PW',
    'ICS_Chop-RampUp-PWChange': 'ICS_Chop:RampUp:PWChange',
    'ICS_MPS-Gate_Source-Offset': 'ICS_MPS:Gate_Source:Offset',
    'ICS_Chop-BeamOn-Width': 'ICS_Tim:Chop_Flavor1:BeamOn',
    'ICS_Chop-BeamOn-PW': 'ICS_Tim:Chop_Flavor1:OnPulseWidth',
    'ICS_Chop-RampUp-Width': 'ICS_Tim:Chop_Flavor1:RampUp',
    'ICS_Chop-RampUp-PW': 'ICS_Tim:Chop_Flavor1:StartPulseWidth',
    'ICS_Tim-Gate_BeamRef-GateWidth': 'ICS_Tim:Gate_BeamRef:GateWidth'
}


    def configs_hist(self, dataframe, timestamp):
        subset_columns = dataframe.columns.tolist()
        subset_columns.remove(timestamp)
        df_shifted = dataframe[subset_columns].shift(1)
        mask = (dataframe[subset_columns] == df_shifted).all(axis=1)
        dataframe = dataframe[~mask]

        dataframe['time_diff'] = dataframe[timestamp].diff()
        dataframe['timestamps_trm'] = dataframe[timestamp] + dataframe['time_diff'].shift(-1) - timedelta(seconds=0.000001)

        subset_columns.insert(0, timestamp)
        subset_columns.insert(1, "timestamps_trm")

        return dataframe[subset_columns]

    def summary(self, text, df):
        print(f'{text} shape: {df.shape}')

        # Filter for numeric columns only
        numeric_cols = df.select_dtypes(include=['number'])

        summ = pd.DataFrame(numeric_cols.dtypes, columns=['dtypes'])
        summ['null'] = numeric_cols.isnull().sum()
        summ['unique'] = numeric_cols.nunique()
        summ['min'] = numeric_cols.min()
        summ['median'] = numeric_cols.median()
        summ['max'] = numeric_cols.max()
        summ['mean'] = numeric_cols.mean()
        summ['std'] = numeric_cols.std()
        summ['duplicate'] = df.duplicated().sum()

        return summ


    def update_beam_config(self,beam_config_df):
        for col in self.column_to_add:
            if col not in beam_config_df.columns:
                beam_config_df[col] = np.nan

        beam_config_df.rename(columns=self.rename_mappings, inplace=True)
        return beam_config_df


# Create an instance of BPMDataConfig
dc = BPMDataConfig()


In [50]:
beam_config_df = pd.read_csv(dc.beam_settings_data_path)
beam_config_df = beam_config_df.drop("Unnamed: 0", axis=1, errors='ignore')
beam_config_df['timestamps'] = pd.to_datetime(beam_config_df['timestamps'])


In [51]:
### Sep24 hdf5 beam settings ###
parser = BeamConfigParserHDF5(dc.beam_param_parser_cfg)
data, _ = parser.run()

Provided file is not hdf5 format, skipping:  .ipynb_checkpoints
BeamParamParser: Number of samples parsed
FE_IS:Match:TunerPos 633
ICS_Chop:RampDown:PW 41
ICS_Chop:RampUp:PWChange 869
ICS_MPS:Gate_Source:Offset 262
ICS_Tim:Chop_Flavor1:BeamOn 1780
ICS_Tim:Chop_Flavor1:OnPulseWidth 2564
ICS_Tim:Chop_Flavor1:RampUp 52
ICS_Tim:Chop_Flavor1:StartPulseWidth 16
ICS_Tim:Gate_BeamOn:RR 12437
ICS_Tim:Gate_BeamRef:GateWidth 1818
LEBT:Chop_N:V_Set 6557
LEBT:Chop_P:V_Set 6561
LEBT:Focus_1:V_Set 3018
LEBT:Focus_2:V_Set 3018
LEBT:Steer_A:V_Set 6550
LEBT:Steer_B:V_Set 6557
LEBT:Steer_C:V_Set 6554
LEBT:Steer_D:V_Set 6551
Src:Accel:V_Set 3408
Src:H2:Flw_Set 2959
Src:Ign:Pwr_Set 2942
Src:RF_Gnd:Pwr_Set 5


In [52]:
### Get Prepared datasets ###
prep = BeamConfigPreProcessor(dc.beam_settings_prep_cfg)
prepared_settings, run_cfg = prep.run(data)


['FE_IS:Match:TunerPos', 'LEBT:Chop_N:V_Set', 'LEBT:Chop_P:V_Set', 'LEBT:Focus_1:V_Set', 'LEBT:Focus_2:V_Set', 'LEBT:Steer_A:V_Set', 'LEBT:Steer_B:V_Set', 'LEBT:Steer_C:V_Set', 'LEBT:Steer_D:V_Set', 'Src:Accel:V_Set', 'Src:H2:Flw_Set', 'Src:Ign:Pwr_Set', 'Src:RF_Gnd:Pwr_Set', 'ICS_Chop:RampDown:PW', 'ICS_Chop:RampUp:PWChange', 'ICS_MPS:Gate_Source:Offset', 'ICS_Tim:Chop_Flavor1:BeamOn', 'ICS_Tim:Chop_Flavor1:OnPulseWidth', 'ICS_Tim:Chop_Flavor1:RampUp', 'ICS_Tim:Chop_Flavor1:StartPulseWidth', 'ICS_Tim:Gate_BeamRef:GateWidth', 'ICS_Tim:Gate_BeamOn:RR']
Length of beam param df:  48909


In [53]:
# Convert configs to list
beam_config_df = dc.update_beam_config(beam_config_df)
bpm = pd.concat([beam_config_df, prepared_settings])

In [54]:
dc.summary('bpm_summary', bpm)

bpm_summary shape: (67764, 23)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std,duplicate
ICS_Tim:Chop_Flavor1:BeamOn,float64,0,645,0.0,886.0,1000.0,794.016616,256.706192,827
ICS_Tim:Chop_Flavor1:StartPulseWidth,float64,0,36,0.0,18.0,55.0,18.404389,3.075782,827
ICS_Chop:RampUp:PWChange,float64,0,27,0.0,4.0,4095.0,10.059028,97.181057,827
ICS_Chop:RampDown:PW,float64,0,16,0.0,18.0,45.0,18.192654,1.649764,827
ICS_MPS:Gate_Source:Offset,float64,0,140,-949.96748,-20.0,0.0,-30.24084,77.447447,827
ICS_Tim:Chop_Flavor1:RampUp,float64,0,49,0.0,98.0,99.0,90.025781,26.656475,827
ICS_Tim:Chop_Flavor1:OnPulseWidth,float64,0,39,0.0,40.0,55.0,33.201124,11.371221,827
ICS_Tim:Gate_BeamRef:GateWidth,float64,0,664,0.0,986.0,1052.0,825.438979,343.844063,827
FE_IS:Match:TunerPos,float32,18855,3,58.009998,58.040001,58.040001,58.035648,0.020661,827
LEBT:Chop_N:V_Set,float32,18855,2,2.43,2.7,2.7,2.699995,0.001783,827


### DCM Configs

In [55]:
class DCMDatConfig:

    def __init__(self):
        self.dataset1_loc = "/work/data_science/suf_sns/DCM_Errant/"
        self.dataset2_loc = "/w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024"
        self.start_date = 20220218
        self.end_date = 20220318
        self.anomaly_type = "00110000"  # --  48
        self.length_of_waveform = 10000
        self.exclude_dates = [20220220, 20220221, 20220222, 20220223, 20220301, 20220308, 20220309, 20220315]  # Fixed date

        self.filtered_normal_files = []
        self.filtered_anomaly_files = []
        self.filtered_normal_files2 = []
        self.filtered_anomaly_files2 = []
        self.traces = []
        self.timestamps = []
        

    
    def GetSepFilteredFiles(self):
        subfolders = [f.path for f in os.scandir(self.dataset2_loc) if f.is_dir()]
        for directory in subfolders:
            if "normal" in directory or "anomal" in directory:
                for root, _, files in os.walk(directory):
                    for file in files:
                        if ".gz" in file:
                            if 'normal' in directory:
                                self.filtered_normal_files2.append(os.path.join(root, file))
                            elif "anomal" in directory:
                                self.filtered_anomaly_files2.append(os.path.join(root, file))

        print('Number of available normal files:', len(self.filtered_normal_files2))
        print('Number of available anomaly files:', len(self.filtered_anomaly_files2))
        return self.filtered_normal_files2, self.filtered_anomaly_files2

    def GetTracesAndTs(self,filtered_files):
        index = np.random.randint(0, len(filtered_files))
        filepath = filtered_files[index]
        print(index)
        try:
            self.traces, self.timestamps = get_traces(filepath, var_id="Trace2", begin=3000, shift=self.length_of_waveform, data_type=0)
        except Exception as e:
            print("Error in reading the file: ", filepath)
            print("Error:", e)
        return self.traces, self.timestamps
        

dcm = DCMDatConfig()


In [56]:
### Print Sep Data Details ###
filtered_normal_files_sep,filtered_anomaly_files_sep=dcm.GetSepFilteredFiles()

Number of available normal files: 10699
Number of available anomaly files: 20592


## Get Traces and Timestamps 

In [57]:
traces_norm,timestamps_norm=dcm.GetTracesAndTs(filtered_normal_files_sep)

4021


In [58]:
traces_anorm,timestamps_anorm=dcm.GetTracesAndTs(filtered_anomaly_files_sep)

7662
Non zero alarm value (48) in a named normal file at:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/mnt/pool0/Instruments/DCML/autostream/2024_09/29_16/20240929_165914.4051_DCML.bin.gz
Next timestamp is 0:00:00.133328 sec apart


## Merge Datasets

In [59]:
class MergeDatasets:
    
    def __init__(self):
        self.length_of_waveform = 10000
    
    def process_files(self,file_list, label, flag_value, data_type, alarm=None):
        traces, timestamps, flag, file = [], [], [], []

        for dcml in file_list[:50]:
            if alarm is not None:
                tmp_trace, tmp_timestamp = get_traces(
                dcml, var_id="Trace2", begin=3000, shift=self.length_of_waveform, data_type=data_type, alarm=alarm
            )
            else:
                tmp_trace, tmp_timestamp = get_traces(
                dcml, var_id="Trace1", begin=3000, shift=self.length_of_waveform, data_type=data_type
            )

            if not tmp_trace or not tmp_timestamp:
                print(f"Skipping {dcml} due to empty trace/timestamp")
                continue

            tmp_trace = np.array(tmp_trace)
            tmp_timestamp = np.array(tmp_timestamp)

            if len(tmp_trace) > 1:
                tmp_trace = tmp_trace[1:]
            if len(tmp_timestamp) > 1:
                tmp_timestamp = tmp_timestamp[1:]

            traces.extend(tmp_trace.tolist())
            flag.extend([flag_value] * len(tmp_trace))
            file.extend([label] * len(tmp_trace))
            timestamps.extend(tmp_timestamp.tolist())

        return pd.DataFrame({
        'anomoly_flag': flag,
        'file': file,
        'timestamps': timestamps,
        'traces': traces
    })


md = MergeDatasets()

In [60]:
# Process all datasets
dcm_Sep24_N = md.process_files(file_list=filtered_normal_files_sep, label='Sep24', flag_value=0, data_type=0)

Non zero alarm value (48) in a named normal file at:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/normal/pool0/Instruments/DCML/autostream/2024_09/16_08/20240916_082726.0116_DCML.bin.gz
Next timestamp is 0:00:00.099995 sec apart
Skipping /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/normal/pool0/Instruments/DCML/autostream/2024_09/16_08/20240916_082726.0116_DCML.bin.gz due to empty trace/timestamp
Non zero alarm value (48) in a named normal file at:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/normal/pool0/Instruments/DCML/autostream/2024_09/16_08/20240916_083944.2832_DCML.bin.gz
Next timestamp is 0:00:00.099996 sec apart
Skipping /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/normal/pool0/Instruments/DCML/autostream/2024_09/16_08/20240916_083944.2832_DCML.bin.gz due to empty trace/timestamp


In [61]:
dcm_Sep24_A = md.process_files(file_list=filtered_anomaly_files_sep,label="Sep24", flag_value=1, data_type=-1,alarm=48)

File:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/03_18/20240903_183444.5928_DCML.bin.gz   bWidth below threshold: 52
Skipping /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/03_18/20240903_183444.5928_DCML.bin.gz due to empty trace/timestamp


  if not tmp_trace or not tmp_timestamp:


Skipping /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/03_18/20240903_185817.2886_DCML.bin.gz due to empty trace/timestamp
File:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/03_18/20240903_182423.6166_DCML.bin.gz   bWidth below threshold: 30
Skipping /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/03_18/20240903_182423.6166_DCML.bin.gz due to empty trace/timestamp
Skipping /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/01_15/20240901_153409.4191_DCML.bin.gz due to empty trace/timestamp
Skipping /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/01_15/20240901_151450.1136_DCML.bin.gz due to empty trace/timestamp
Skipping /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/01_15/20240901_152410.2755_DCML.bin.gz due to empty trace/timestamp
Skipping /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/01_15/20240901_153652.7629_DCML.bin.gz due to empty trace/timestamp
S

In [62]:
# Combine all into one DataFrame
dcm=pd.concat([dcm_Sep24_N, dcm_Sep24_A], ignore_index=True)

In [63]:
merged_df = pd.merge_asof(
    dcm.sort_values("timestamps"), 
    bpm.sort_values("timestamps"), 
    on="timestamps", 
    direction="nearest"
)
merged_df.columns

Index(['anomoly_flag', 'file', 'timestamps', 'traces',
       'ICS_Tim:Chop_Flavor1:BeamOn', 'ICS_Tim:Chop_Flavor1:StartPulseWidth',
       'ICS_Chop:RampUp:PWChange', 'ICS_Chop:RampDown:PW',
       'ICS_MPS:Gate_Source:Offset', 'ICS_Tim:Chop_Flavor1:RampUp',
       'ICS_Tim:Chop_Flavor1:OnPulseWidth', 'ICS_Tim:Gate_BeamRef:GateWidth',
       'FE_IS:Match:TunerPos', 'LEBT:Chop_N:V_Set', 'LEBT:Chop_P:V_Set',
       'LEBT:Focus_1:V_Set', 'LEBT:Focus_2:V_Set', 'LEBT:Steer_A:V_Set',
       'LEBT:Steer_B:V_Set', 'LEBT:Steer_C:V_Set', 'LEBT:Steer_D:V_Set',
       'Src:Accel:V_Set', 'Src:H2:Flw_Set', 'Src:Ign:Pwr_Set',
       'Src:RF_Gnd:Pwr_Set', 'ICS_Tim:Gate_BeamOn:RR'],
      dtype='object')

In [64]:
merged_df.head()

Unnamed: 0,anomoly_flag,file,timestamps,traces,ICS_Tim:Chop_Flavor1:BeamOn,ICS_Tim:Chop_Flavor1:StartPulseWidth,ICS_Chop:RampUp:PWChange,ICS_Chop:RampDown:PW,ICS_MPS:Gate_Source:Offset,ICS_Tim:Chop_Flavor1:RampUp,ICS_Tim:Chop_Flavor1:OnPulseWidth,ICS_Tim:Gate_BeamRef:GateWidth,FE_IS:Match:TunerPos,LEBT:Chop_N:V_Set,LEBT:Chop_P:V_Set,LEBT:Focus_1:V_Set,LEBT:Focus_2:V_Set,LEBT:Steer_A:V_Set,LEBT:Steer_B:V_Set,LEBT:Steer_C:V_Set,LEBT:Steer_D:V_Set,Src:Accel:V_Set,Src:H2:Flw_Set,Src:Ign:Pwr_Set,Src:RF_Gnd:Pwr_Set,ICS_Tim:Gate_BeamOn:RR
0,1,Sep24,2024-09-01 15:05:05.036102,"[-0.000244140625, -0.0005371093866415322, -0.0...",908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
1,1,Sep24,2024-09-01 15:07:16.847712,"[-9.765625145519152e-05, -0.000634765659924596...",908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
2,1,Sep24,2024-09-01 15:14:51.180287,"[-0.0005859375232830644, -0.000390625005820766...",908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
3,1,Sep24,2024-09-01 15:20:13.334597,"[-0.00048828125, 0.0003906250058207661, 9.7656...",908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
4,1,Sep24,2024-09-01 15:23:58.859281,"[9.765625145519152e-05, -0.0002929687616415322...",908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002


## Data Preprocessing

In [65]:
class DataPreprocessor:
    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()

    def check_nan(self):
        nan_counts = self.df.isna().sum()
        return nan_counts[nan_counts > 0]

    def remove_nan(self):
        self.df.dropna(inplace=True)
        return self.df

    def check_duplicates(self):
        df_copy = self.df.applymap(lambda x: tuple(x) if isinstance(x, np.ndarray) else x)
        return df_copy.duplicated().sum()


    def remove_duplicates(self):
        self.df.drop_duplicates(inplace=True)
        return self.df

    def check_outliers(self):
        outlier_dict = {}
        for col in self.df.select_dtypes(include=[np.number]).columns:
            Q1 = self.df[col].quantile(0.25)
            Q3 = self.df[col].quantile(0.75)
            IQR = Q3 - Q1
            outliers = self.df[(self.df[col] < (Q1 - 1.5 * IQR)) | (self.df[col] > (Q3 + 1.5 * IQR))].shape[0]
            if outliers > 0:
                outlier_dict[col] = outliers
        return outlier_dict

    def remove_outliers(self):
        for col in self.df.select_dtypes(include=[np.number]).columns:
            Q1 = self.df[col].quantile(0.25)
            Q3 = self.df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            self.df = self.df[(self.df[col] >= lower_bound) & (self.df[col] <= upper_bound)]
        return self.df

    def convert_float64_to_float32(self):
        float64_cols = self.df.select_dtypes(include=['float64']).columns
        self.df[float64_cols] = self.df[float64_cols].astype('float32')
        return self.df

    def rename_columns(self, rename_dict):
        self.df.rename(columns=rename_dict, inplace=True)
        return self.df

    def get_dataframe(self):
        return self.df


In [66]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2855 entries, 0 to 2854
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   anomoly_flag                          2855 non-null   int64         
 1   file                                  2855 non-null   object        
 2   timestamps                            2855 non-null   datetime64[ns]
 3   traces                                2855 non-null   object        
 4   ICS_Tim:Chop_Flavor1:BeamOn           2855 non-null   float64       
 5   ICS_Tim:Chop_Flavor1:StartPulseWidth  2855 non-null   float64       
 6   ICS_Chop:RampUp:PWChange              2855 non-null   float64       
 7   ICS_Chop:RampDown:PW                  2855 non-null   float64       
 8   ICS_MPS:Gate_Source:Offset            2855 non-null   float64       
 9   ICS_Tim:Chop_Flavor1:RampUp           2855 non-null   float64       
 10  

In [67]:
processed_df=merged_df[merged_df.columns[~merged_df.columns.isin(['timestamps','traces'])]]
processed_df.head()

Unnamed: 0,anomoly_flag,file,ICS_Tim:Chop_Flavor1:BeamOn,ICS_Tim:Chop_Flavor1:StartPulseWidth,ICS_Chop:RampUp:PWChange,ICS_Chop:RampDown:PW,ICS_MPS:Gate_Source:Offset,ICS_Tim:Chop_Flavor1:RampUp,ICS_Tim:Chop_Flavor1:OnPulseWidth,ICS_Tim:Gate_BeamRef:GateWidth,FE_IS:Match:TunerPos,LEBT:Chop_N:V_Set,LEBT:Chop_P:V_Set,LEBT:Focus_1:V_Set,LEBT:Focus_2:V_Set,LEBT:Steer_A:V_Set,LEBT:Steer_B:V_Set,LEBT:Steer_C:V_Set,LEBT:Steer_D:V_Set,Src:Accel:V_Set,Src:H2:Flw_Set,Src:Ign:Pwr_Set,Src:RF_Gnd:Pwr_Set,ICS_Tim:Gate_BeamOn:RR
0,1,Sep24,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
1,1,Sep24,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
2,1,Sep24,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
3,1,Sep24,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
4,1,Sep24,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002


In [68]:
processed_df['anomoly_flag'].value_counts()

anomoly_flag
0    2832
1      23
Name: count, dtype: int64

In [69]:
preprocessor = DataPreprocessor(processed_df)
print("NaN values before removal:\n", preprocessor.check_nan())
preprocessor.remove_nan()
print("Duplicate rows before removal:", preprocessor.check_duplicates())
#preprocessor.remove_duplicates()
#print("Outliers before removal:\n", preprocessor.check_outliers())
#preprocessor.remove_outliers()
preprocessor.convert_float64_to_float32()
cleaned_df = preprocessor.get_dataframe()

cleaned_df.head()

NaN values before removal:
 Series([], dtype: int64)
Duplicate rows before removal: 2847


Unnamed: 0,anomoly_flag,file,ICS_Tim:Chop_Flavor1:BeamOn,ICS_Tim:Chop_Flavor1:StartPulseWidth,ICS_Chop:RampUp:PWChange,ICS_Chop:RampDown:PW,ICS_MPS:Gate_Source:Offset,ICS_Tim:Chop_Flavor1:RampUp,ICS_Tim:Chop_Flavor1:OnPulseWidth,ICS_Tim:Gate_BeamRef:GateWidth,FE_IS:Match:TunerPos,LEBT:Chop_N:V_Set,LEBT:Chop_P:V_Set,LEBT:Focus_1:V_Set,LEBT:Focus_2:V_Set,LEBT:Steer_A:V_Set,LEBT:Steer_B:V_Set,LEBT:Steer_C:V_Set,LEBT:Steer_D:V_Set,Src:Accel:V_Set,Src:H2:Flw_Set,Src:Ign:Pwr_Set,Src:RF_Gnd:Pwr_Set,ICS_Tim:Gate_BeamOn:RR
0,1,Sep24,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
1,1,Sep24,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
2,1,Sep24,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
3,1,Sep24,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002
4,1,Sep24,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002


In [70]:
cleaned_df['traces']=merged_df['traces']
cleaned_df['timestamps']=merged_df['timestamps']
cleaned_df.drop(columns=['file','anomoly_flag'],inplace=True,axis=1)
cleaned_df.head()

Unnamed: 0,ICS_Tim:Chop_Flavor1:BeamOn,ICS_Tim:Chop_Flavor1:StartPulseWidth,ICS_Chop:RampUp:PWChange,ICS_Chop:RampDown:PW,ICS_MPS:Gate_Source:Offset,ICS_Tim:Chop_Flavor1:RampUp,ICS_Tim:Chop_Flavor1:OnPulseWidth,ICS_Tim:Gate_BeamRef:GateWidth,FE_IS:Match:TunerPos,LEBT:Chop_N:V_Set,LEBT:Chop_P:V_Set,LEBT:Focus_1:V_Set,LEBT:Focus_2:V_Set,LEBT:Steer_A:V_Set,LEBT:Steer_B:V_Set,LEBT:Steer_C:V_Set,LEBT:Steer_D:V_Set,Src:Accel:V_Set,Src:H2:Flw_Set,Src:Ign:Pwr_Set,Src:RF_Gnd:Pwr_Set,ICS_Tim:Gate_BeamOn:RR,traces,timestamps
0,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002,"[-0.000244140625, -0.0005371093866415322, -0.0...",2024-09-01 15:05:05.036102
1,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002,"[-9.765625145519152e-05, -0.000634765659924596...",2024-09-01 15:07:16.847712
2,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002,"[-0.0005859375232830644, -0.000390625005820766...",2024-09-01 15:14:51.180287
3,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002,"[-0.00048828125, 0.0003906250058207661, 9.7656...",2024-09-01 15:20:13.334597
4,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002,"[9.765625145519152e-05, -0.0002929687616415322...",2024-09-01 15:23:58.859281


In [71]:
# Convert 'timestamps' to numeric features
cleaned_df["timestamp_seconds"] = pd.to_datetime(cleaned_df["timestamps"], errors="coerce").astype(int) / 10**9
cleaned_df["time_diff"] = cleaned_df["timestamp_seconds"].diff().fillna(0)

In [72]:
# Convert 'traces' from string format to numerical lists
cleaned_df["traces"] = cleaned_df["traces"].apply(lambda x: np.array(eval(x)) if isinstance(x, str) else np.array(x))

In [73]:
# Handle 10K-Dimensional Traces
def extract_trace_features(trace_row):
    """Extracts key statistical features from a single trace row."""
    downsampled_trace = trace_row[::20]  # Reduce from 10,000 to 500 points
    mean_val = np.mean(downsampled_trace)
    std_val = np.std(downsampled_trace)
    peak_val = np.max(downsampled_trace)
    fft_val = np.abs(fft(downsampled_trace)[1])  # First FFT component
    return np.hstack([downsampled_trace[:50], mean_val, std_val, peak_val, fft_val]) 

In [74]:
# Extract features from all traces
trace_features = np.array(cleaned_df["traces"].apply(extract_trace_features).tolist())

In [75]:
print(trace_features)

[[-2.44140625e-04 -2.44140625e-04  5.85937523e-04 ...  1.68642052e-02
   4.53125015e-02  3.07835087e+00]
 [-9.76562515e-05 -8.30078148e-04 -2.92968762e-04 ...  1.68854003e-02
   4.48730476e-02  3.06309772e+00]
 [-5.85937523e-04 -9.27734363e-04  8.78906285e-04 ...  1.69316765e-02
   4.59472649e-02  3.09694285e+00]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  2.02897669e-02
   5.48828132e-02  3.02304540e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  2.01799932e-02
   5.41503914e-02  2.99976261e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  2.02763235e-02
   5.45410179e-02  3.01789859e+00]]


In [76]:
# Reduce dimensionality using PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
trace_features_pca = pca.fit_transform(trace_features)

In [77]:
# Merge PCA-reduced traces with dataframe
trace_feature_names = [f"PCA_Trace_{i}" for i in range(trace_features_pca.shape[1])]
df_pca = pd.DataFrame(trace_features_pca, columns=trace_feature_names)
df = pd.concat([cleaned_df.drop(columns=["traces"], errors="ignore"), df_pca], axis=1)

In [78]:
df.head()

Unnamed: 0,ICS_Tim:Chop_Flavor1:BeamOn,ICS_Tim:Chop_Flavor1:StartPulseWidth,ICS_Chop:RampUp:PWChange,ICS_Chop:RampDown:PW,ICS_MPS:Gate_Source:Offset,ICS_Tim:Chop_Flavor1:RampUp,ICS_Tim:Chop_Flavor1:OnPulseWidth,ICS_Tim:Gate_BeamRef:GateWidth,FE_IS:Match:TunerPos,LEBT:Chop_N:V_Set,LEBT:Chop_P:V_Set,LEBT:Focus_1:V_Set,LEBT:Focus_2:V_Set,LEBT:Steer_A:V_Set,LEBT:Steer_B:V_Set,LEBT:Steer_C:V_Set,LEBT:Steer_D:V_Set,Src:Accel:V_Set,Src:H2:Flw_Set,Src:Ign:Pwr_Set,Src:RF_Gnd:Pwr_Set,ICS_Tim:Gate_BeamOn:RR,timestamps,timestamp_seconds,time_diff,PCA_Trace_0,PCA_Trace_1,PCA_Trace_2,PCA_Trace_3,PCA_Trace_4,PCA_Trace_5,PCA_Trace_6,PCA_Trace_7,PCA_Trace_8,PCA_Trace_9,PCA_Trace_10,PCA_Trace_11,PCA_Trace_12,PCA_Trace_13,PCA_Trace_14,PCA_Trace_15,PCA_Trace_16,PCA_Trace_17,PCA_Trace_18,PCA_Trace_19,PCA_Trace_20,PCA_Trace_21,PCA_Trace_22,PCA_Trace_23,PCA_Trace_24,PCA_Trace_25,PCA_Trace_26,PCA_Trace_27,PCA_Trace_28,PCA_Trace_29,PCA_Trace_30,PCA_Trace_31,PCA_Trace_32,PCA_Trace_33,PCA_Trace_34,PCA_Trace_35,PCA_Trace_36,PCA_Trace_37,PCA_Trace_38,PCA_Trace_39,PCA_Trace_40,PCA_Trace_41,PCA_Trace_42,PCA_Trace_43,PCA_Trace_44,PCA_Trace_45,PCA_Trace_46,PCA_Trace_47,PCA_Trace_48,PCA_Trace_49
0,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002,2024-09-01 15:05:05.036102,1725203000.0,0.0,1.085129,-0.013925,0.004142,-0.000198,0.001542,-0.000574,-7e-06,-0.000551,0.000508,-0.000292,0.001378,0.000939,-0.000171,0.000842,0.000386,0.000415,-0.000829,6.4e-05,2.1e-05,0.000458,0.000168,0.001159,-0.000317,-0.000222,0.000661,0.000594,-0.00041,-0.00059,0.000509,-0.000611,0.00064,-8.4e-05,-2.8e-05,-6.6e-05,-5e-05,6.7e-05,-2.7e-05,1.2e-05,-6e-06,5e-06,7e-06,-1.524567e-11,-2.689521e-11,2.436059e-11,4.412525e-13,-3.685424e-12,-5.845645e-13,1.702712e-12,-1.868892e-12,-6.377615e-12
1,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002,2024-09-01 15:07:16.847712,1725203000.0,131.81161,1.069876,-0.014271,0.003903,1e-06,0.0012,-0.000777,-7e-06,0.000205,0.000851,-0.000257,0.000609,0.000903,-0.000733,-0.001168,-0.001719,-0.000417,0.000138,0.001061,-0.000126,0.001304,-0.000866,-0.001079,-7.7e-05,-0.000951,-0.000333,0.000548,-0.000227,-0.000191,-0.000424,6.2e-05,1e-05,0.000185,6.6e-05,-2e-05,-4e-06,-3.3e-05,-1.3e-05,-2.3e-05,1e-06,1e-06,2e-06,-6.59836e-12,2.820239e-11,-1.910236e-11,-7.171617e-14,2.819118e-12,2.571289e-13,-1.431392e-12,1.182208e-12,5.571759e-12
2,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002,2024-09-01 15:14:51.180287,1725204000.0,454.332575,1.103724,-0.013663,0.004861,-0.001867,0.000459,0.000237,0.000165,0.00046,0.001239,-0.001787,-0.001673,-0.001328,0.000482,9.9e-05,0.001085,0.001199,-0.000802,0.000455,-0.000555,0.00055,-0.000857,-0.000507,0.000673,-5e-06,0.000142,0.000191,0.000811,0.000433,0.000298,3.9e-05,0.000295,-0.000217,-4.9e-05,-2.2e-05,2.9e-05,9e-06,1.3e-05,1.4e-05,3.8e-05,1.4e-05,7e-06,-6.786025e-12,2.53984e-11,-3.285811e-11,-4.930654e-13,4.543896e-12,6.642436e-13,-2.443525e-12,2.137564e-12,9.563127e-12
3,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002,2024-09-01 15:20:13.334597,1725204000.0,322.15431,1.099433,-0.014245,0.004403,2.9e-05,0.00066,0.000996,0.000896,0.002398,0.000335,-0.001056,-0.00151,0.000175,-9e-05,0.000104,0.000465,0.000191,0.001148,0.000178,0.000493,-0.00034,-0.000354,0.000582,-0.00141,0.000731,-0.000151,-0.000158,-5.4e-05,4.4e-05,-0.000615,-0.000253,0.000346,0.00035,0.00014,-5.5e-05,9e-06,-4.1e-05,-3.6e-05,-1.7e-05,-2.4e-05,-1.3e-05,1e-06,-3.149995e-12,3.853334e-11,-2.535747e-11,-8.127301e-13,4.049044e-12,4.159204e-13,-2.08195e-12,1.874951e-12,8.211572e-12
4,908.0,18.0,3.0,18.0,-24.0,98.0,42.0,1008.0,58.029999,2.7,2.7,46.0,39.0,1.7,2.1,1.6,2.0,65.0,32.200001,300.0,4.717,59.900002,2024-09-01 15:23:58.859281,1725204000.0,225.524684,1.093401,-0.013828,0.003878,-0.001091,0.000268,-0.001254,-0.000322,-0.001338,0.000443,0.000995,-0.000778,0.00126,-0.001742,0.001471,-2.2e-05,-0.000945,0.0001,0.000322,0.000618,-0.001415,-0.000509,-0.000797,-0.000199,-0.000319,0.000216,-0.000105,0.000556,0.00033,0.000267,-0.000628,-0.000151,-4.1e-05,-1.5e-05,6e-06,4.2e-05,-5e-06,2e-05,-1.7e-05,-2e-05,9e-06,-2e-06,5.217367e-12,-2.652401e-11,2.668263e-11,8.964163e-14,-4.542889e-12,-4.779342e-13,2.179492e-12,-1.905503e-12,-7.833505e-12


In [79]:
# Sampling function for latent space
def sampling(args):
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

In [80]:
# Sequential Inputs for VAE-BiLSTM 
window_size = 100  # Use last 100 time steps
latent_dim = 16  # Size of VAE latent space

In [81]:
# Extract rolling sequences for pulse traces and time-based features
X_train_combined = []
for i in range(window_size, len(df)):
    past_pulses = df.iloc[i - window_size:i][trace_feature_names + ["time_diff"]]
    X_train_combined.append(past_pulses.values)

# X_train_combined = np.array(X_train_combined)  # Convert to NumPy array
X_train_combined = np.array(X_train_combined, dtype=np.float32)
X_train_combined = np.nan_to_num(X_train_combined)

In [82]:
# Build VAE-BiLSTM Model 
from tensorflow.keras import Model
class MyVAE(Model):
    def __init__(self, window_size, num_features, latent_dim=16):
        super(MyVAE, self).__init__()
        self.window_size = window_size
        self.num_features = num_features
        self.latent_dim = latent_dim

        # --- Encoder Layers ---
        self.encoder_bilstm_1 = Bidirectional(LSTM(64, return_sequences=True))
        self.encoder_bilstm_2 = Bidirectional(LSTM(32, return_sequences=False))
        self.z_mean_dense = Dense(latent_dim, name="z_mean")
        self.z_log_var_dense = Dense(latent_dim, name="z_log_var")

        # --- Decoder Layers ---
        self.repeat_vector = RepeatVector(window_size)
        self.decoder_bilstm_1 = Bidirectional(LSTM(32, return_sequences=True))
        self.decoder_bilstm_2 = Bidirectional(LSTM(64, return_sequences=True))
        self.output_dense = TimeDistributed(Dense(num_features))

    def encode(self, x):
        x = self.encoder_bilstm_1(x)
        x = self.encoder_bilstm_2(x)
        z_mean = self.z_mean_dense(x)
        z_log_var = self.z_log_var_dense(x)
        return z_mean, z_log_var

    def reparameterize(self, z_mean, z_log_var):
        epsilon = tf.random.normal(shape=(tf.shape(z_mean)[0], self.latent_dim))
        z = z_mean + tf.exp(0.5 * z_log_var) * epsilon
        return z

    def decode(self, z):
        x = self.repeat_vector(z)
        x = self.decoder_bilstm_1(x)
        x = self.decoder_bilstm_2(x)
        return self.output_dense(x)

    def call(self, inputs):
        # 1) Encode
        z_mean, z_log_var = self.encode(inputs)
        # 2) Sample z
        z = self.reparameterize(z_mean, z_log_var)
        # 3) Decode
        x_recon = self.decode(z)

        # --- Compute KL Divergence ---
        kl_loss = -0.5 * tf.reduce_mean(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var),
            axis=-1
        )
        # Add KL loss to the model
        self.add_loss(kl_loss)

        return x_recon


In [83]:
window_size = 100
num_features = 51
vae_model = MyVAE(window_size=window_size, num_features=num_features, latent_dim=16)

In [84]:
# Tensorboard
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

AttributeError: type object 'datetime.datetime' has no attribute 'datetime'

In [None]:
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-5)
vae_model.compile(optimizer='sgd', loss='mae')

In [None]:
history=vae_model.fit(X_train_combined, X_train_combined, epochs=50, batch_size=16, validation_split=0.1,callbacks=[tensorboard_callback])

In [None]:
plt.figure(figsize=(8,6))
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.title("Validation Loss per Epoch")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
plt.figure(figsize=(8,6))
plt.plot(history.history["loss"], label="Training Loss")
plt.title("Training Loss per Epoch")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Anomaly Detection 
# Compute reconstruction error for each pulse
X_pred_combined = vae_model.predict(X_train_combined)
reconstruction_errors = np.mean(np.abs(X_train_combined - X_pred_combined), axis=(1, 2))  # Mean absolute error

In [None]:
# Define anomaly threshold (e.g., 95th percentile)
threshold = np.percentile(reconstruction_errors, 95)
anomalies = reconstruction_errors > threshold

In [None]:
# Store anomaly results
df_anomalies_combined = pd.DataFrame({
    "Timestamp": df["timestamps"].iloc[window_size:],  # Align with pulse times
    "Reconstruction_Error": reconstruction_errors,
    "Anomaly": anomalies
})

In [None]:
# Time-series plot of anomaly scores
plt.figure(figsize=(12, 6))
plt.plot(df_anomalies_combined["Timestamp"], df_anomalies_combined["Reconstruction_Error"], marker='o', linestyle='-', color='r', label="Reconstruction Error")
plt.axhline(threshold, color='blue', linestyle='dashed', label="Anomaly Threshold")
plt.xlabel("Timestamp")
plt.ylabel("Anomaly Score")
plt.title("Anomaly Scores Over Time (Including Timestamps)")
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Scatter Plot: Anomaly Score vs. Timestamp
plt.figure(figsize=(12, 6))
sns.scatterplot(x=df_anomalies_combined["Timestamp"], y=df_anomalies_combined["Reconstruction_Error"], hue=df_anomalies_combined["Anomaly"], palette="coolwarm", s=100, edgecolor="k")
plt.xlabel("Timestamp")
plt.ylabel("Anomaly Score")
plt.title("Scatter Plot of Anomalous Pulses Over Time")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()


In [None]:
# Display top anomalies
print("Top 20 Anomalous Pulses:")
print(df_anomalies_combined.sort_values(by="Reconstruction_Error", ascending=False).head(20))