## Import Libraries

In [104]:
### Import Packages ###
import pandas as pd
import numpy as np
import struct
import os
import matplotlib.pyplot as plt
import datetime
from datetime import datetime, timedelta

#Jlab Packages
from data_utils import get_traces
from beam_settings_parser_hdf5 import BeamConfigParserHDF5
from beam_settings_prep import BeamConfigPreProcessor


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### BPM Config ###

In [105]:
class BPMDataConfig:

    def __init__(self):
        self.beam_settings_data_path = "/work/data_science/suf_sns/beam_configurations_data/processed_data/clean_beam_config_processed_df.csv"
        self.beam_param_parser_cfg = {"data_location": "/work/data_science/suf_sns/beam_configurations_data/hdf5_sept2024/"}
        self.beam_settings_prep_cfg = {
            "rescale": False,
            "beam_config": [
                'FE_IS:Match:TunerPos',
                'LEBT:Chop_N:V_Set',
                'LEBT:Chop_P:V_Set',
                'LEBT:Focus_1:V_Set',
                'LEBT:Focus_2:V_Set',
                'LEBT:Steer_A:V_Set',
                'LEBT:Steer_B:V_Set',
                'LEBT:Steer_C:V_Set',
                'LEBT:Steer_D:V_Set',
                'Src:Accel:V_Set',
                'Src:H2:Flw_Set',
                'Src:Ign:Pwr_Set',
                'Src:RF_Gnd:Pwr_Set',
                'ICS_Chop:RampDown:PW',
                'ICS_Chop:RampUp:PWChange',
                'ICS_MPS:Gate_Source:Offset',
                'ICS_Tim:Chop_Flavor1:BeamOn',
                'ICS_Tim:Chop_Flavor1:OnPulseWidth',
                'ICS_Tim:Chop_Flavor1:RampUp',
                'ICS_Tim:Chop_Flavor1:StartPulseWidth',
                'ICS_Tim:Gate_BeamRef:GateWidth',
                'ICS_Tim:Gate_BeamOn:RR'
            ]
        }
        self.beam_config = [
            'timestamps',
            'FE_IS:Match:TunerPos',
            'LEBT:Chop_N:V_Set',
            'LEBT:Chop_P:V_Set',
            'LEBT:Focus_1:V_Set',
            'LEBT:Focus_2:V_Set',
            'LEBT:Steer_A:V_Set',
            'LEBT:Steer_B:V_Set',
            'LEBT:Steer_C:V_Set',
            'LEBT:Steer_D:V_Set',
            'Src:Accel:V_Set',
            'Src:H2:Flw_Set',
            'Src:Ign:Pwr_Set',
            'Src:RF_Gnd:Pwr_Set',
            'ICS_Chop:RampDown:PW',
            'ICS_Chop:RampUp:PWChange',
            'ICS_MPS:Gate_Source:Offset',
            'ICS_Tim:Chop_Flavor1:BeamOn',
            'ICS_Tim:Chop_Flavor1:OnPulseWidth',
            'ICS_Tim:Chop_Flavor1:RampUp',
            'ICS_Tim:Chop_Flavor1:StartPulseWidth',
            'ICS_Tim:Gate_BeamRef:GateWidth',
            'ICS_Tim:Gate_BeamOn:RR'
        ]

        self.column_to_add = [
    'FE_IS:Match:TunerPos',
    'LEBT:Chop_N:V_Set',
    'LEBT:Chop_P:V_Set',
    'LEBT:Focus_1:V_Set',
    'LEBT:Focus_2:V_Set',
    'LEBT:Steer_A:V_Set',
    'LEBT:Steer_B:V_Set',
    'LEBT:Steer_C:V_Set',
    'LEBT:Steer_D:V_Set',
    'Src:Accel:V_Set',
    'Src:H2:Flw_Set',
    'Src:Ign:Pwr_Set',
    'Src:RF_Gnd:Pwr_Set',
    'ICS_Tim:Gate_BeamOn:RR',
    'ICS_Chop-RampDown-PW',
    'ICS_Chop-RampUp-PWChange',
    'ICS_Tim-Gate_BeamRef-GateWidth'
]

        self.rename_mappings = {
    'ICS_Chop-RampDown-PW': 'ICS_Chop:RampDown:PW',
    'ICS_Chop-RampUp-PWChange': 'ICS_Chop:RampUp:PWChange',
    'ICS_MPS-Gate_Source-Offset': 'ICS_MPS:Gate_Source:Offset',
    'ICS_Chop-BeamOn-Width': 'ICS_Tim:Chop_Flavor1:BeamOn',
    'ICS_Chop-BeamOn-PW': 'ICS_Tim:Chop_Flavor1:OnPulseWidth',
    'ICS_Chop-RampUp-Width': 'ICS_Tim:Chop_Flavor1:RampUp',
    'ICS_Chop-RampUp-PW': 'ICS_Tim:Chop_Flavor1:StartPulseWidth',
    'ICS_Tim-Gate_BeamRef-GateWidth': 'ICS_Tim:Gate_BeamRef:GateWidth'
}


    def configs_hist(self, dataframe, timestamp):
        subset_columns = dataframe.columns.tolist()
        subset_columns.remove(timestamp)
        df_shifted = dataframe[subset_columns].shift(1)
        mask = (dataframe[subset_columns] == df_shifted).all(axis=1)
        dataframe = dataframe[~mask]

        dataframe['time_diff'] = dataframe[timestamp].diff()
        dataframe['timestamps_trm'] = dataframe[timestamp] + dataframe['time_diff'].shift(-1) - timedelta(seconds=0.000001)

        subset_columns.insert(0, timestamp)
        subset_columns.insert(1, "timestamps_trm")

        return dataframe[subset_columns]

    def summary(self, text, df):
        print(f'{text} shape: {df.shape}')

        # Filter for numeric columns only
        numeric_cols = df.select_dtypes(include=['number'])

        summ = pd.DataFrame(numeric_cols.dtypes, columns=['dtypes'])
        summ['null'] = numeric_cols.isnull().sum()
        summ['unique'] = numeric_cols.nunique()
        summ['min'] = numeric_cols.min()
        summ['median'] = numeric_cols.median()
        summ['max'] = numeric_cols.max()
        summ['mean'] = numeric_cols.mean()
        summ['std'] = numeric_cols.std()
        summ['duplicate'] = df.duplicated().sum()

        return summ


    def update_beam_config(self,beam_config_df):
        for col in self.column_to_add:
            if col not in beam_config_df.columns:
                beam_config_df[col] = np.nan

        beam_config_df.rename(columns=self.rename_mappings, inplace=True)
        return beam_config_df


# Create an instance of BPMDataConfig
dc = BPMDataConfig()


In [106]:
beam_config_df = pd.read_csv(dc.beam_settings_data_path)
beam_config_df = beam_config_df.drop("Unnamed: 0", axis=1, errors='ignore')
beam_config_df['timestamps'] = pd.to_datetime(configs['timestamps'])


In [107]:
### Sep24 hdf5 beam settings ###
parser = BeamConfigParserHDF5(dc.beam_param_parser_cfg)
data, _ = parser.run()

Provided file is not hdf5 format, skipping:  .ipynb_checkpoints
BeamParamParser: Number of samples parsed
FE_IS:Match:TunerPos 633
ICS_Chop:RampDown:PW 41
ICS_Chop:RampUp:PWChange 869
ICS_MPS:Gate_Source:Offset 262
ICS_Tim:Chop_Flavor1:BeamOn 1780
ICS_Tim:Chop_Flavor1:OnPulseWidth 2564
ICS_Tim:Chop_Flavor1:RampUp 52
ICS_Tim:Chop_Flavor1:StartPulseWidth 16
ICS_Tim:Gate_BeamOn:RR 12437
ICS_Tim:Gate_BeamRef:GateWidth 1818
LEBT:Chop_N:V_Set 6557
LEBT:Chop_P:V_Set 6561
LEBT:Focus_1:V_Set 3018
LEBT:Focus_2:V_Set 3018
LEBT:Steer_A:V_Set 6550
LEBT:Steer_B:V_Set 6557
LEBT:Steer_C:V_Set 6554
LEBT:Steer_D:V_Set 6551
Src:Accel:V_Set 3408
Src:H2:Flw_Set 2959
Src:Ign:Pwr_Set 2942
Src:RF_Gnd:Pwr_Set 5


In [108]:
### Get Prepared datasets ###
prep = BeamConfigPreProcessor(dc.beam_settings_prep_cfg)
prepared_settings, run_cfg = prep.run(data)


['FE_IS:Match:TunerPos', 'LEBT:Chop_N:V_Set', 'LEBT:Chop_P:V_Set', 'LEBT:Focus_1:V_Set', 'LEBT:Focus_2:V_Set', 'LEBT:Steer_A:V_Set', 'LEBT:Steer_B:V_Set', 'LEBT:Steer_C:V_Set', 'LEBT:Steer_D:V_Set', 'Src:Accel:V_Set', 'Src:H2:Flw_Set', 'Src:Ign:Pwr_Set', 'Src:RF_Gnd:Pwr_Set', 'ICS_Chop:RampDown:PW', 'ICS_Chop:RampUp:PWChange', 'ICS_MPS:Gate_Source:Offset', 'ICS_Tim:Chop_Flavor1:BeamOn', 'ICS_Tim:Chop_Flavor1:OnPulseWidth', 'ICS_Tim:Chop_Flavor1:RampUp', 'ICS_Tim:Chop_Flavor1:StartPulseWidth', 'ICS_Tim:Gate_BeamRef:GateWidth', 'ICS_Tim:Gate_BeamOn:RR']
Length of beam param df:  48909


In [109]:
# Convert configs to list
beam_config_df = dc.update_beam_config(beam_config_df)
bpm = pd.concat([beam_config_df, prepared_settings])

In [110]:
dc.summary('bpm_summary', bpm)

bpm_summary shape: (67764, 23)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std,duplicate
ICS_Tim:Chop_Flavor1:BeamOn,float64,0,645,0.0,886.0,1000.0,794.016616,256.706192,827
ICS_Tim:Chop_Flavor1:StartPulseWidth,float64,0,36,0.0,18.0,55.0,18.404389,3.075782,827
ICS_Chop:RampUp:PWChange,float64,0,27,0.0,4.0,4095.0,10.059028,97.181057,827
ICS_Chop:RampDown:PW,float64,0,16,0.0,18.0,45.0,18.192654,1.649764,827
ICS_MPS:Gate_Source:Offset,float64,0,140,-949.96748,-20.0,0.0,-30.24084,77.447447,827
ICS_Tim:Chop_Flavor1:RampUp,float64,0,49,0.0,98.0,99.0,90.025781,26.656475,827
ICS_Tim:Chop_Flavor1:OnPulseWidth,float64,0,39,0.0,40.0,55.0,33.201124,11.371221,827
ICS_Tim:Gate_BeamRef:GateWidth,float64,0,664,0.0,986.0,1052.0,825.438979,343.844063,827
FE_IS:Match:TunerPos,float32,18855,3,58.009998,58.040001,58.040001,58.035648,0.020661,827
LEBT:Chop_N:V_Set,float32,18855,2,2.43,2.7,2.7,2.699995,0.001783,827


### DCM Configs

In [123]:
class DCMDatConfig:

    def __init__(self):
        self.dataset1_loc = "/work/data_science/suf_sns/DCM_Errant/"
        self.dataset2_loc = "/w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024"
        self.start_date = 20220218
        self.end_date = 20220318
        self.anomaly_type = "00110000"  # --  48
        self.length_of_waveform = 10000
        self.exclude_dates = [20220220, 20220221, 20220222, 20220223, 20220301, 20220308, 20220309, 20220315]  # Fixed date

        self.filtered_normal_files = []
        self.filtered_anomaly_files = []
        self.filtered_normal_files2 = []
        self.filtered_anomaly_files2 = []
        self.traces = []
        self.timestamps = []
        

    def GetFebFilteredFiles(self):
        for root, _, files in os.walk(self.dataset1_loc):
            for file in files:
                if '.bin' in file and 'DCML' in file:
                    try:
                        date = int(file[:8])  # Extract date from filename
                        if self.start_date <= date <= self.end_date and date not in self.exclude_dates:
                            if '00000000' in file:
                                self.filtered_normal_files.append(os.path.join(root, file))
                            elif self.anomaly_type in file:
                                self.filtered_anomaly_files.append(os.path.join(root, file))
                    except ValueError:
                        print(f"Error in filename: {file} - Date could not be read!")

        print('Number of available normal files:', len(self.filtered_normal_files))
        print('Number of available anomaly files:', len(self.filtered_anomaly_files))
        return self.filtered_normal_files, self.filtered_anomaly_files

    def GetSepFilteredFiles(self):
        subfolders = [f.path for f in os.scandir(self.dataset2_loc) if f.is_dir()]
        for directory in subfolders:
            if "normal" in directory or "anomal" in directory:
                for root, _, files in os.walk(directory):
                    for file in files:
                        if ".gz" in file:
                            if 'normal' in directory:
                                self.filtered_normal_files2.append(os.path.join(root, file))
                            elif "anomal" in directory:
                                self.filtered_anomaly_files2.append(os.path.join(root, file))

        print('Number of available normal files:', len(self.filtered_normal_files2))
        print('Number of available anomaly files:', len(self.filtered_anomaly_files2))
        return self.filtered_normal_files2, self.filtered_anomaly_files2

    def GetTracesAndTs(self,filtered_files):
        index = np.random.randint(0, len(filtered_files))
        filepath = filtered_files[index]
        print(index)
        try:
            self.traces, self.timestamps = get_traces(filepath, var_id="Trace2", begin=3000, shift=self.length_of_waveform, data_type=0)
        except Exception as e:
            print("Error in reading the file: ", filepath)
            print("Error:", e)
        return self.traces, self.timestamps
        

dcm = DCMDatConfig()


In [124]:
### Print Feb Data Details ###
filtered_normal_files,filtered_anomaly_files=dcm.GetFebFilteredFiles()

Number of available normal files: 2173
Number of available anomaly files: 2953


In [125]:
### Print Sep Data Details ###
filtered_normal_files2,filtered_anomaly_files2=dcm.GetSepFilteredFiles()

Number of available normal files: 10699
Number of available anomaly files: 20592


## Get Traces and Timestamps 

In [127]:
traces,timestamps=dcm.GetTracesAndTs(filtered_normal_files)

540


## Merge Datasets

In [135]:
class MergeDatasets:
    def __init__(self):
        self.traces = []
        self.timestamps = []
        self.flag = []
        self.file = []
        self.length_of_waveform = 10000

    def IterateFiles(self, file_paths, MonthName):
        for dcml in file_paths[:10]:  
            tmp_trace, tmp_timestamp = get_traces(dcml, var_id="Trace2", begin=3000, shift=self.length_of_waveform, data_type=0)
            tmp_trace = np.array(tmp_trace[1:])
            tmp_timestamp = np.array(tmp_timestamp[1:])

            self.traces.extend(tmp_trace)
            self.flag.extend([0] * len(tmp_trace))
            self.file.extend([MonthName] * len(tmp_trace))
            self.timestamps.extend(tmp_timestamp)
            dcm = pd.DataFrame({'anomoly_flag':self.flag, 'file':self.file, 'timestamps':self.timestamps, 'traces':self.traces})
        return dcm

    def MergeDatasets(self, dcm, bpm):
        merged_df = pd.merge_asof(
            dcm.sort_values("timestamps"),
            bpm.sort_values("timestamps"),
            on="timestamps",
            direction="nearest"
        )
        return merged_df

md = MergeDatasets()


In [136]:
dcm_FebMar22_N = md.IterateFiles(filtered_normal_files,'FebMar22')
dcm_FebMar22_A = md.IterateFiles(filtered_anomaly_files,'FebMar22')
dcm_Sep24_N = md.IterateFiles(filtered_normal_files2,'Sep24')
dcm_Sep24_A = md.IterateFiles(filtered_anomaly_files2,'Sep24')

File:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/03_18/20240903_183444.5928_DCML.bin.gz   bWidth below threshold: 52
Non zero alarm value (48) in a named normal file at:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/03_18/20240903_185832.2880_DCML.bin.gz
Next timestamp is 0:00:00.199993 sec apart
File:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/03_18/20240903_182423.6166_DCML.bin.gz   bWidth below threshold: 30
Non zero alarm value (48) in a named normal file at:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/01_15/20240901_155622.2514_DCML.bin.gz
Next timestamp is 0:00:00.099997 sec apart
Non zero alarm value (48) in a named normal file at:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/01_15/20240901_150505.0027_DCML.bin.gz
Next timestamp is 0:00:00.099996 sec apart
Non zero alarm value (48) in a named normal file at:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/an

In [143]:
dcm_FebMar22=pd.merge_asof(
    dcm_FebMar22_N.sort_values("timestamps"), 
    dcm_FebMar22_A.sort_values("timestamps"), 
    on="timestamps", 
    direction="nearest"
)

dcm_Sep24=pd.merge_asof(
    dcm_Sep24_N.sort_values("timestamps"), 
    dcm_Sep24_A.sort_values("timestamps"), 
    on="timestamps", 
    direction="nearest"
)
dcm=pd.merge_asof(
    dcm_FebMar22.sort_values("timestamps"), 
    dcm_Sep24.sort_values("timestamps"), 
    on="timestamps", 
    direction="nearest"
)


In [144]:
merged_df = pd.merge_asof(
    dcm.sort_values("timestamps"), 
    bpm.sort_values("timestamps"), 
    on="timestamps", 
    direction="nearest"
)
merged_df.head()

Unnamed: 0,anomoly_flag_x_x,file_x_x,timestamps,traces_x_x,anomoly_flag_y_x,file_y_x,traces_y_x,anomoly_flag_x_y,file_x_y,traces_x_y,...,LEBT:Focus_2:V_Set,LEBT:Steer_A:V_Set,LEBT:Steer_B:V_Set,LEBT:Steer_C:V_Set,LEBT:Steer_D:V_Set,Src:Accel:V_Set,Src:H2:Flw_Set,Src:Ign:Pwr_Set,Src:RF_Gnd:Pwr_Set,ICS_Tim:Gate_BeamOn:RR
0,0,FebMar22,2022-02-28 08:30:00.586558,"[0.00063476566, 0.0009765625, -9.765625e-05, -...",0,FebMar22,"[0.00063476566, 0.0009765625, -9.765625e-05, -...",0,FebMar22,"[0.00063476566, 0.0009765625, -9.765625e-05, -...",...,,,,,,,,,,
1,0,FebMar22,2022-02-28 08:30:00.603224,"[0.000390625, 9.765625e-05, -0.000390625, -0.0...",0,FebMar22,"[0.000390625, 9.765625e-05, -0.000390625, -0.0...",0,FebMar22,"[0.000390625, 9.765625e-05, -0.000390625, -0.0...",...,,,,,,,,,,
2,0,FebMar22,2022-02-28 08:30:00.619889,"[-9.765625e-05, 0.00024414062, -0.00048828125,...",0,FebMar22,"[-9.765625e-05, 0.00024414062, -0.00048828125,...",0,FebMar22,"[-9.765625e-05, 0.00024414062, -0.00048828125,...",...,,,,,,,,,,
3,0,FebMar22,2022-02-28 08:30:00.636554,"[-0.00029296876, 0.00068359374, 0.00034179687,...",0,FebMar22,"[-0.00029296876, 0.00068359374, 0.00034179687,...",0,FebMar22,"[-0.00029296876, 0.00068359374, 0.00034179687,...",...,,,,,,,,,,
4,0,FebMar22,2022-02-28 08:30:00.653220,"[0.00014648438, -4.8828126e-05, -0.00078125, -...",0,FebMar22,"[0.00014648438, -4.8828126e-05, -0.00078125, -...",0,FebMar22,"[0.00014648438, -4.8828126e-05, -0.00078125, -...",...,,,,,,,,,,


## Data Preprocessing

In [None]:
## Data Preprocessing ##

class DataPreprocessor:
    def __init__(self, df: pd.DataFrame):
      
        self.df = df.copy()  # Create a copy to avoid modifying the original DataFrame

    def check_nan(self):
       
        nan_counts = self.df.isna().sum()
        return nan_counts[nan_counts > 0]

    def remove_nan(self):
      
        self.df.dropna(inplace=True)
        return self.df

    def check_null(self):
        
        null_counts = self.df.isnull().sum()
        return null_counts[null_counts > 0]

    def remove_null(self):
        
        self.df.dropna(inplace=True)
        return self.df

    def check_duplicates(self):
        "
        return self.df.duplicated().sum()

    def remove_duplicates(self):
        
        self.df.drop_duplicates(inplace=True)
        return self.df

    def check_outliers(self):
        
        outlier_dict = {}
        for col in self.df.select_dtypes(include=[np.number]).columns:
            Q1 = self.df[col].quantile(0.25)
            Q3 = self.df[col].quantile(0.75)
            IQR = Q3 - Q1
            outliers = self.df[(self.df[col] < (Q1 - 1.5 * IQR)) | (self.df[col] > (Q3 + 1.5 * IQR))].shape[0]
            if outliers > 0:
                outlier_dict[col] = outliers
        return outlier_dict

    def remove_outliers(self):
        
        for col in self.df.select_dtypes(include=[np.number]).columns:
            Q1 = self.df[col].quantile(0.25)
            Q3 = self.df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            self.df = self.df[(self.df[col] >= lower_bound) & (self.df[col] <= upper_bound)]
        return self.df

    def convert_float64_to_float32(self):
       
        float64_cols = self.df.select_dtypes(include=['float64']).columns
        self.df[float64_cols] = self.df[float64_cols].astype('float32')
        return self.df.dtypes

    def rename_columns(self, rename_dict):
       
        self.df.rename(columns=rename_dict, inplace=True)
        return self.df.head()

    def get_dataframe(self):
       
        return self.df
