## Import Libraries

In [1]:
### Direct Nearest Timestamp Matching ###
import pandas as pd
import numpy as np
import struct
import os
import matplotlib.pyplot as plt
import datetime

from data_utils import get_traces
from beam_settings_parser_hdf5 import BeamConfigParserHDF5
from beam_settings_prep import BeamConfigPreProcessor


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

ModuleNotFoundError: No module named 'data_utils'

## Load and Merge Beam Position Monitor (BPM) datasets

In [3]:
### FebMar22 .csv beam settings
# Read the processed CSV, discard the stray index column if the file has one,
# and parse the timestamp strings into datetimes.
beam_settings_data_path = "/work/data_science/suf_sns/beam_configurations_data/processed_data/clean_beam_config_processed_df.csv"
configs = (
    pd.read_csv(beam_settings_data_path)
    .drop(columns="Unnamed: 0", errors='ignore')
)
configs['timestamps'] = pd.to_datetime(configs['timestamps'])

### Sep24 hdf5 beam settings
# Parse every HDF5 file found under data_location; the second return value is not used.
beam_param_parser_cfg = dict(
    data_location="/work/data_science/suf_sns/beam_configurations_data/hdf5_sept2024/",
)
parser = BeamConfigParserHDF5(beam_param_parser_cfg)
data, _ = parser.run()

# Pre-processor configuration: no rescaling, and the list of settings to keep.
# Trailing comments give the corresponding column name in the FebMar22 CSV.
beam_settings_prep_cfg = dict(
    rescale=False,
    beam_config=[
        'FE_IS:Match:TunerPos',
        'LEBT:Chop_N:V_Set',
        'LEBT:Chop_P:V_Set',
        'LEBT:Focus_1:V_Set',
        'LEBT:Focus_2:V_Set',
        'LEBT:Steer_A:V_Set',
        'LEBT:Steer_B:V_Set',
        'LEBT:Steer_C:V_Set',
        'LEBT:Steer_D:V_Set',
        'Src:Accel:V_Set',
        'Src:H2:Flw_Set',
        'Src:Ign:Pwr_Set',
        'Src:RF_Gnd:Pwr_Set',
        'ICS_Chop:RampDown:PW',                  # ICS_Chop-RampDown-PW
        'ICS_Chop:RampUp:PWChange',              # ICS_Chop-RampUp-PWChange
        'ICS_MPS:Gate_Source:Offset',            # ICS_MPS-Gate_Source-Offset
        'ICS_Tim:Chop_Flavor1:BeamOn',           # ICS_Chop-BeamOn-Width
        'ICS_Tim:Chop_Flavor1:OnPulseWidth',     # ICS_Chop-BeamOn-PW
        'ICS_Tim:Chop_Flavor1:RampUp',           # ICS_Chop-RampUp-Width
        'ICS_Tim:Chop_Flavor1:StartPulseWidth',  # ICS_Chop-RampUp-PW
        'ICS_Tim:Gate_BeamRef:GateWidth',        # ICS_Tim-Gate_BeamRef-GateWidth
        'ICS_Tim:Gate_BeamOn:RR',
    ],
)
prep = BeamConfigPreProcessor(beam_settings_prep_cfg)
prepared_settings, run_cfg = prep.run(data)

Provided file is not hdf5 format, skipping:  .ipynb_checkpoints
BeamParamParser: Number of samples parsed
FE_IS:Match:TunerPos 633
ICS_Chop:RampDown:PW 41
ICS_Chop:RampUp:PWChange 869
ICS_MPS:Gate_Source:Offset 262
ICS_Tim:Chop_Flavor1:BeamOn 1780
ICS_Tim:Chop_Flavor1:OnPulseWidth 2564
ICS_Tim:Chop_Flavor1:RampUp 52
ICS_Tim:Chop_Flavor1:StartPulseWidth 16
ICS_Tim:Gate_BeamOn:RR 12437
ICS_Tim:Gate_BeamRef:GateWidth 1818
LEBT:Chop_N:V_Set 6557
LEBT:Chop_P:V_Set 6561
LEBT:Focus_1:V_Set 3018
LEBT:Focus_2:V_Set 3018
LEBT:Steer_A:V_Set 6550
LEBT:Steer_B:V_Set 6557
LEBT:Steer_C:V_Set 6554
LEBT:Steer_D:V_Set 6551
Src:Accel:V_Set 3408
Src:H2:Flw_Set 2959
Src:Ign:Pwr_Set 2942
Src:RF_Gnd:Pwr_Set 5
['FE_IS:Match:TunerPos', 'LEBT:Chop_N:V_Set', 'LEBT:Chop_P:V_Set', 'LEBT:Focus_1:V_Set', 'LEBT:Focus_2:V_Set', 'LEBT:Steer_A:V_Set', 'LEBT:Steer_B:V_Set', 'LEBT:Steer_C:V_Set', 'LEBT:Steer_D:V_Set', 'Src:Accel:V_Set', 'Src:H2:Flw_Set', 'Src:Ign:Pwr_Set', 'Src:RF_Gnd:Pwr_Set', 'ICS_Chop:RampDown:PW', 'IC

  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)


Length of beam param df:  48909


  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)
  master_df.fillna(method='ffill', inplace=True)


In [4]:
def configs_hist(dataframe, timestamp):
    """Collapse runs of identical settings and add the end of each row's validity window.

    A row is dropped when every non-timestamp column equals the value in the
    row directly above it. NaN never compares equal, so rows containing NaN
    are always kept. For each remaining row, ``timestamps_trm`` is the
    timestamp of the next remaining row minus one microsecond; the last row
    has no successor and gets ``NaT``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Settings table, expected to be ordered by ``timestamp``. It is not modified.
    timestamp : str
        Name of the datetime column in ``dataframe``.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``[timestamp, "timestamps_trm", <other columns>]``,
        keeping the original index labels of the retained rows.

    Raises
    ------
    ValueError
        If ``timestamp`` is not one of the columns of ``dataframe``.
    """
    subset_columns = dataframe.columns.tolist()
    subset_columns.remove(timestamp)

    # A row is "unchanged" when all of its settings match the previous row.
    previous_row = dataframe[subset_columns].shift(1)
    unchanged = (dataframe[subset_columns] == previous_row).all(axis=1)

    # Copy explicitly: adding a column to a filtered slice raises
    # SettingWithCopyWarning and may not write where expected.
    changes = dataframe.loc[~unchanged].copy()

    # timestamp + (gap to next kept row) == next kept row's timestamp.
    gap_to_next = changes[timestamp].diff().shift(-1)
    changes["timestamps_trm"] = changes[timestamp] + gap_to_next - pd.Timedelta(microseconds=1)

    return changes[[timestamp, "timestamps_trm"] + subset_columns]

def summary(text, df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters
    ----------
    text : str
        Label printed in front of the shape.
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``dtypes``, ``null``,
        ``unique``, ``min``, ``median``, ``max``, ``mean``, ``std`` and
        ``duplicate``. ``duplicate`` is the number of fully duplicated rows in
        ``df``, repeated on every line. A statistic that a column does not
        support (for example the mean of a string column) is reported as NaN
        instead of aborting the whole summary.
    """

    def _column_stat(name):
        """Return ``df.<name>()``, falling back to column-by-column evaluation."""
        try:
            # Fast path: identical to calling the reduction on the frame directly.
            return getattr(df, name)()
        except (TypeError, ValueError):
            # At least one column cannot be reduced; compute what we can.
            values = []
            for _, column in df.items():
                try:
                    values.append(getattr(column, name)())
                except (TypeError, ValueError):
                    values.append(np.nan)
            return pd.Series(values, index=df.columns, dtype=object)

    print(f'{text} shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summ['null'] = df.isnull().sum()
    summ['unique'] = _column_stat('nunique')
    for stat in ('min', 'median', 'max', 'mean', 'std'):
        summ[stat] = _column_stat(stat)
    try:
        summ['duplicate'] = df.duplicated().sum()
    except TypeError:
        # Unhashable cell values (e.g. arrays) cannot be checked for duplicates.
        summ['duplicate'] = np.nan
    return summ

In [5]:
# configs = configs_hist(configs,"timestamps")
# prepared_settings = configs_hist(prepared_settings,"timestamps")

In [6]:
# Settings that are filled with NaN in the FebMar22 table so that it ends up
# with the same columns as the Sep24 table.
column_to_add = [
    'FE_IS:Match:TunerPos',
    'LEBT:Chop_N:V_Set',
    'LEBT:Chop_P:V_Set',
    'LEBT:Focus_1:V_Set',
    'LEBT:Focus_2:V_Set',
    'LEBT:Steer_A:V_Set',
    'LEBT:Steer_B:V_Set',
    'LEBT:Steer_C:V_Set',
    'LEBT:Steer_D:V_Set',
    'Src:Accel:V_Set',
    'Src:H2:Flw_Set',
    'Src:Ign:Pwr_Set',
    'Src:RF_Gnd:Pwr_Set',
    'ICS_Tim:Gate_BeamOn:RR',
]

# First add the NaN placeholder columns, then rename the dash-separated CSV
# names to the colon-separated names.
configs = (
    configs
    .assign(**dict.fromkeys(column_to_add, np.nan))
    .rename(columns={
        'ICS_Chop-RampDown-PW': 'ICS_Chop:RampDown:PW',
        'ICS_Chop-RampUp-PWChange': 'ICS_Chop:RampUp:PWChange',
        'ICS_MPS-Gate_Source-Offset': 'ICS_MPS:Gate_Source:Offset',
        'ICS_Chop-BeamOn-Width': 'ICS_Tim:Chop_Flavor1:BeamOn',
        'ICS_Chop-BeamOn-PW': 'ICS_Tim:Chop_Flavor1:OnPulseWidth',
        'ICS_Chop-RampUp-Width': 'ICS_Tim:Chop_Flavor1:RampUp',
        'ICS_Chop-RampUp-PW': 'ICS_Tim:Chop_Flavor1:StartPulseWidth',
        'ICS_Tim-Gate_BeamRef-GateWidth': 'ICS_Tim:Gate_BeamRef:GateWidth',
    })
)

In [7]:
# Column order used to select from the FebMar22 settings table.
# Trailing comments give the dash-separated name noted for that entry.
beam_config = (
    ['timestamps']
    + [
        'FE_IS:Match:TunerPos',
        'LEBT:Chop_N:V_Set',
        'LEBT:Chop_P:V_Set',
        'LEBT:Focus_1:V_Set',
        'LEBT:Focus_2:V_Set',
        'LEBT:Steer_A:V_Set',
        'LEBT:Steer_B:V_Set',
        'LEBT:Steer_C:V_Set',
        'LEBT:Steer_D:V_Set',
    ]
    + [
        'Src:Accel:V_Set',
        'Src:H2:Flw_Set',
        'Src:Ign:Pwr_Set',
        'Src:RF_Gnd:Pwr_Set',
    ]
    + [
        'ICS_Chop:RampDown:PW',                  # ICS_Chop-RampDown-PW
        'ICS_Chop:RampUp:PWChange',              # ICS_Chop-RampUp-PWChange
        'ICS_MPS:Gate_Source:Offset',            # ICS_MPS-Gate_Source-Offset
        'ICS_Tim:Chop_Flavor1:BeamOn',           # ICS_Chop-BeamOn-Width
        'ICS_Tim:Chop_Flavor1:OnPulseWidth',     # ICS_Chop-BeamOn-PW
        'ICS_Tim:Chop_Flavor1:RampUp',           # ICS_Chop-RampUp-Width
        'ICS_Tim:Chop_Flavor1:StartPulseWidth',  # ICS_Chop-RampUp-PW
        'ICS_Tim:Gate_BeamRef:GateWidth',        # ICS_Tim-Gate_BeamRef-GateWidth
        'ICS_Tim:Gate_BeamOn:RR',
    ]
)

In [8]:
# Keep only the columns listed in beam_config, in that order (KeyError if one is missing).
configs = configs[beam_config]

In [9]:
# Stack the FebMar22 and Sep24 settings into one table.
# ignore_index=True renumbers the rows 0..n-1; without it both inputs keep
# their own index, so index labels would be duplicated in the result.
bpm = pd.concat([configs, prepared_settings], ignore_index=True)

  bpm = pd.concat([configs, prepared_settings])


In [10]:
# Show per-column dtype, null count, unique count and basic statistics of the combined settings table.
summary('bpm_summary', bpm)

bpm_summary shape: (67764, 23)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std,duplicate
timestamps,datetime64[ns],0,66937,2021-02-11 08:18:29.948852,2024-07-16 09:15:27.486576640,2024-10-01 03:49:33.901234468,2023-11-08 09:27:51.632123904,452 days 05:38:38.317375472,827
FE_IS:Match:TunerPos,float64,18855,3,58.009998,58.040001,58.040001,58.035648,0.007849,827
LEBT:Chop_N:V_Set,float64,18855,2,2.43,2.7,2.7,2.699995,0.001221,827
LEBT:Chop_P:V_Set,float64,18855,4,0.54,2.7,2.7,2.699851,0.015682,827
LEBT:Focus_1:V_Set,float64,18855,5,2.8848,46.0,46.077202,45.998471,0.220181,827
LEBT:Focus_2:V_Set,float64,18855,4,6.0,39.0,39.077202,38.991222,0.528146,827
LEBT:Steer_A:V_Set,float64,18855,1,1.7,1.7,1.7,1.7,0.0,827
LEBT:Steer_B:V_Set,float64,18855,1,2.1,2.1,2.1,2.1,0.0,827
LEBT:Steer_C:V_Set,float64,18855,1,1.6,1.6,1.6,1.6,0.0,827
LEBT:Steer_D:V_Set,float64,18855,1,2.0,2.0,2.0,2.0,0.0,827


## Load and Merge Beam Tracings from Differential Current Monitor (DCM)

### Feb/Mar22 Data

In [11]:
# Root directory that is walked for the FebMar22 DCML .bin files.
dataset1_loc = "/work/data_science/suf_sns/DCM_Errant/"

# Inclusive date window (YYYYMMDD as an integer), compared against the integer
# parsed from the first 8 characters of each file name.
start_date = 20220218
end_date = 20220318

# Substring that marks an anomaly file name; "00110000" is 48 in binary.
anomaly_type = "00110000" #--  48
# Passed to get_traces as `shift`.
length_of_waveform = 10000

# Dates inside the window whose files are skipped.
# FIXME(review): 2022015 has only 7 digits, so it can never equal a date parsed
# from an 8-character prefix. It is probably a typo for 20220215 or 20220315 —
# confirm the intended date with the data owner.
exclude_dates = [20220220, 20220221, 20220222, 20220223, 20220301, 20220308, 20220309, 2022015]

In [12]:
# Walk dataset1_loc and split the DCML .bin files inside the date window into
# normal files ('00000000' in the name) and anomaly files (anomaly_type in the name).
filtered_normal_files = []
filtered_anomaly_files = []
for root, _subfolders, files in os.walk(dataset1_loc):
    for file in files:
        if '.bin' not in file or 'DCML' not in file:
            continue

        # The first 8 characters of the file name are expected to be YYYYMMDD.
        try:
            date = int(file[:8])
        except ValueError:
            # Only a malformed date prefix is tolerated; anything else should surface.
            print("Error in filename: ", file)
            print("-- Date could not be read!")
            continue

        if not (start_date <= date <= end_date) or date in exclude_dates:
            continue

        full_name = os.path.join(root, file)
        if '00000000' in file:
            filtered_normal_files.append(full_name)
        elif anomaly_type in file:
            filtered_anomaly_files.append(full_name)

print('Number of available normal files: ', len(filtered_normal_files))
print('Number of available anomaly files: ', len(filtered_anomaly_files))

Number of available normal files:  2173
Number of available anomaly files:  2953


Number of available normal files: 2173\
Number of available anomaly files: 2953\
Total files to work with: **5126**

**Question** - Why do the traces have length 26 while the timestamps have length 27?\
*Thought* --> The first array in the traces has "unusual data" compared to the rest. We were supplied code that ignored the first record of the traces; should we do the same for the timestamps?

### Sept24 Data

In [13]:
# Root directory of the Sept 2024 DCML dataset; its sub-directories are scanned for .gz files.
dataset2_loc = "/w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024"

In [14]:
# Collect the Sep24 .gz files. Each immediate sub-directory of dataset2_loc is
# classified by its own name: "normal" -> normal files, "anomal" -> anomaly files.
# The directory name is used rather than the full path, so a parent folder that
# happens to contain "normal" cannot misclassify every file.
filtered_normal_files2 = []
filtered_anomaly_files2 = []

with os.scandir(dataset2_loc) as entries:
    category_dirs = [(entry.name, entry.path) for entry in entries if entry.is_dir()]

for dir_name, dir_path in category_dirs:
    # "normal" is tested first, as before.
    if 'normal' in dir_name:
        target = filtered_normal_files2
    elif 'anomal' in dir_name:
        target = filtered_anomaly_files2
    else:
        continue

    # A separate loop variable avoids rebinding the list that is being iterated.
    for root, _nested_dirs, files in os.walk(dir_path):
        target.extend(os.path.join(root, name) for name in files if ".gz" in name)

print('Number of available normal files: ', len(filtered_normal_files2))
print('Number of available anomaly files: ', len(filtered_anomaly_files2))

Number of available normal files:  10699
Number of available anomaly files:  20592


In [15]:
# Spot check: read one randomly chosen FebMar22 normal file.
index = np.random.randint(0, len(filtered_normal_files))
filepath = filtered_normal_files[index]
print(index)

# Start from empty results so a failed read leaves both names defined.
traces, timestamps = [], []
try:
    traces, timestamps = get_traces(
        filepath,
        var_id="Trace2",
        begin=3000,
        shift=length_of_waveform,
        data_type=0,
    )
except Exception as e:
    print("Error in reading the file: ", filepath)
    print("Error:", e)

1041


In [16]:
# import tensorflow as tf

### Merge Two datasets together

Can we switch from list to dictionary?

In [28]:
# Number of files read from each of the four groups (a small subset for quick iteration).
N_FILES_PER_GROUP = 10


def _read_dcml_group(paths, skip_first, **reader_kwargs):
    """Read "Trace2" from every file in ``paths`` and return ``(samples, stamps)``.

    ``skip_first`` drops the first trace and the first timestamp of each file.
    ``reader_kwargs`` are forwarded to ``get_traces`` unchanged.

    Raises ``ValueError`` when a file yields a different number of traces and
    timestamps, because appending them anyway would pair traces with the
    wrong timestamps.
    """
    samples, stamps = [], []
    for dcml in paths:
        tmp_trace, tmp_timestamp = get_traces(
            dcml, var_id="Trace2", begin=3000, shift=length_of_waveform, **reader_kwargs
        )
        if skip_first:
            tmp_trace = np.array(tmp_trace[1:])
            tmp_timestamp = np.array(tmp_timestamp[1:])

        file_samples = list(tmp_trace)
        file_stamps = list(tmp_timestamp)
        if len(file_samples) != len(file_stamps):
            raise ValueError(
                f"{dcml}: {len(file_samples)} traces but {len(file_stamps)} timestamps"
            )

        samples.extend(file_samples)
        stamps.extend(file_stamps)
    return samples, stamps


traces = []
timestamps = []
flag = []   # 0 = normal, 1 = anomaly
file = []   # dataset label: 'FebMar22' or 'Sep24'

# (paths, dataset label, flag, drop first record?, extra get_traces arguments)
# Only the normal files drop their first record; the anomaly reads keep everything.
trace_groups = [
    (filtered_normal_files, 'FebMar22', 0, True, {"data_type": 0}),
    (filtered_anomaly_files, 'FebMar22', 1, False, {"data_type": -1}),
    (filtered_normal_files2, 'Sep24', 0, True, {"data_type": 0}),
    (filtered_anomaly_files2, 'Sep24', 1, False, {"data_type": -1, "alarm": 48}),
]

for group_paths, group_source, group_flag, skip_first, reader_kwargs in trace_groups:
    group_samples, group_stamps = _read_dcml_group(
        group_paths[:N_FILES_PER_GROUP], skip_first, **reader_kwargs
    )
    traces.extend(group_samples)
    timestamps.extend(group_stamps)
    flag.extend([group_flag] * len(group_samples))
    file.extend([group_source] * len(group_samples))

File:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/03_18/20240903_183444.5928_DCML.bin.gz   bWidth below threshold: 52
File:  /w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024/anomalies/03_18/20240903_182423.6166_DCML.bin.gz   bWidth below threshold: 30


There should be **698,438** entries in each array

&nbsp; &nbsp; &nbsp;2173 * 26 = 56,498\
10,699 * 60 = 641,940

The actual count is **xxxxx**, due to drops caused by non-zero alarms present in the data

In [29]:
# One row per trace: anomaly flag, dataset label, timestamp and the trace itself.
dcm = pd.DataFrame(
    dict(
        anomoly_flag=flag,
        file=file,
        timestamps=timestamps,
        traces=traces,
    )
)

## Join DCM & BPM into single DataFrame

In [30]:
# Attach to every DCM trace the settings row whose timestamp is closest to it
# (earlier or later). Both inputs are sorted on the key before the as-of merge.
dcm_by_time = dcm.sort_values("timestamps")
bpm_by_time = bpm.sort_values("timestamps")
merged_df = pd.merge_asof(
    left=dcm_by_time,
    right=bpm_by_time,
    on="timestamps",
    direction="nearest",
)

In [31]:
# Column dtypes and non-null counts of the merged table.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864 entries, 0 to 863
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   anomoly_flag                          864 non-null    int64         
 1   file                                  864 non-null    object        
 2   timestamps                            864 non-null    datetime64[ns]
 3   traces                                864 non-null    object        
 4   FE_IS:Match:TunerPos                  594 non-null    float64       
 5   LEBT:Chop_N:V_Set                     594 non-null    float64       
 6   LEBT:Chop_P:V_Set                     594 non-null    float64       
 7   LEBT:Focus_1:V_Set                    594 non-null    float64       
 8   LEBT:Focus_2:V_Set                    594 non-null    float64       
 9   LEBT:Steer_A:V_Set                    594 non-null    float64       
 10  LE

In [32]:
# Display the merged table (the notebook shows a truncated view).
merged_df

Unnamed: 0,anomoly_flag,file,timestamps,traces,FE_IS:Match:TunerPos,LEBT:Chop_N:V_Set,LEBT:Chop_P:V_Set,LEBT:Focus_1:V_Set,LEBT:Focus_2:V_Set,LEBT:Steer_A:V_Set,...,Src:RF_Gnd:Pwr_Set,ICS_Chop:RampDown:PW,ICS_Chop:RampUp:PWChange,ICS_MPS:Gate_Source:Offset,ICS_Tim:Chop_Flavor1:BeamOn,ICS_Tim:Chop_Flavor1:OnPulseWidth,ICS_Tim:Chop_Flavor1:RampUp,ICS_Tim:Chop_Flavor1:StartPulseWidth,ICS_Tim:Gate_BeamRef:GateWidth,ICS_Tim:Gate_BeamOn:RR
0,0,FebMar22,2022-02-28 08:30:00.586558,"[0.00063476566, 0.0009765625, -9.765625e-05, -...",,,,,,,...,,18.0,2.0,-10.000000,920.0,50.0,98.0,18.0,1024.0,
1,0,FebMar22,2022-02-28 08:30:00.603224,"[0.000390625, 9.765625e-05, -0.000390625, -0.0...",,,,,,,...,,18.0,2.0,-10.000000,920.0,50.0,98.0,18.0,1024.0,
2,0,FebMar22,2022-02-28 08:30:00.619889,"[-9.765625e-05, 0.00024414062, -0.00048828125,...",,,,,,,...,,18.0,2.0,-10.000000,920.0,50.0,98.0,18.0,1024.0,
3,0,FebMar22,2022-02-28 08:30:00.636554,"[-0.00029296876, 0.00068359374, 0.00034179687,...",,,,,,,...,,18.0,2.0,-10.000000,920.0,50.0,98.0,18.0,1024.0,
4,0,FebMar22,2022-02-28 08:30:00.653220,"[0.00014648438, -4.8828126e-05, -0.00078125, -...",,,,,,,...,,18.0,2.0,-10.000000,920.0,50.0,98.0,18.0,1024.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,Sep24,2024-09-18 10:57:38.721789,"[-9.765625e-05, -4.8828126e-05, 0.0005371094, ...",58.029999,2.7,2.7,46.0,39.0,1.7,...,4.717,18.0,6.0,-21.666666,884.0,33.0,98.0,18.0,984.0,59.900002
860,0,Sep24,2024-09-18 10:57:38.738455,"[0.0005371094, 0.0, 4.8828126e-05, -0.00068359...",58.029999,2.7,2.7,46.0,39.0,1.7,...,4.717,18.0,6.0,-21.666666,884.0,33.0,98.0,18.0,984.0,59.900002
861,0,Sep24,2024-09-18 10:57:38.755121,"[0.0001953125, -0.00083007815, 0.00092773436, ...",58.029999,2.7,2.7,46.0,39.0,1.7,...,4.717,18.0,6.0,-21.666666,884.0,33.0,98.0,18.0,984.0,59.900002
862,0,Sep24,2024-09-18 10:57:38.771787,"[-0.00014648438, 0.00029296876, 0.000390625, -...",58.029999,2.7,2.7,46.0,39.0,1.7,...,4.717,18.0,6.0,-21.666666,884.0,33.0,98.0,18.0,984.0,59.900002


In [33]:
# Number of non-null traces per (anomaly flag, dataset) combination.
merged_df.groupby(['anomoly_flag','file'])['traces'].count()

anomoly_flag  file    
0             FebMar22    260
              Sep24       590
1             FebMar22     10
              Sep24         4
Name: traces, dtype: int64

In [37]:
# Summarise `bpm` again, this time printed under the label 'test'.
summary('test',bpm)

test shape: (67764, 23)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std,duplicate
timestamps,datetime64[ns],0,66937,2021-02-11 08:18:29.948852,2024-07-16 09:15:27.486576640,2024-10-01 03:49:33.901234468,2023-11-08 09:27:51.632123904,452 days 05:38:38.317375472,827
FE_IS:Match:TunerPos,float64,18855,3,58.009998,58.040001,58.040001,58.035648,0.007849,827
LEBT:Chop_N:V_Set,float64,18855,2,2.43,2.7,2.7,2.699995,0.001221,827
LEBT:Chop_P:V_Set,float64,18855,4,0.54,2.7,2.7,2.699851,0.015682,827
LEBT:Focus_1:V_Set,float64,18855,5,2.8848,46.0,46.077202,45.998471,0.220181,827
LEBT:Focus_2:V_Set,float64,18855,4,6.0,39.0,39.077202,38.991222,0.528146,827
LEBT:Steer_A:V_Set,float64,18855,1,1.7,1.7,1.7,1.7,0.0,827
LEBT:Steer_B:V_Set,float64,18855,1,2.1,2.1,2.1,2.1,0.0,827
LEBT:Steer_C:V_Set,float64,18855,1,1.6,1.6,1.6,1.6,0.0,827
LEBT:Steer_D:V_Set,float64,18855,1,2.0,2.0,2.0,2.0,0.0,827
