## Import Libraries

In [None]:
### Import Packages ###
import pandas as pd
import numpy as np
import struct
import os
import matplotlib.pyplot as plt
import datetime

#Jlab Packages
from data_utils import get_traces
from beam_settings_parser_hdf5 import BeamConfigParserHDF5
from beam_settings_prep import BeamConfigPreProcessor


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

ModuleNotFoundError: No module named 'data_utils'

### BPM Config ###

In [None]:
### Config paths ###
class BPMDataConfig:
    """Paths, column lists and helpers for the BPM beam-settings data.

    Two sources are combined in this notebook: a FebMar22 CSV and a Sep24
    HDF5 export. The CSV uses hyphenated names for the ICS timing settings
    and is filled with NaN for the columns listed in ``column_to_add``;
    call ``harmonize_configs`` on the CSV frame so it exposes every column
    named in ``beam_config`` before selecting or concatenating.
    """

    def __init__(self):
        self.beam_settings_data_path = "/work/data_science/suf_sns/beam_configurations_data/processed_data/clean_beam_config_processed_df.csv"
        self.beam_param_parser_cfg = {"data_location": "/work/data_science/suf_sns/beam_configurations_data/hdf5_sept2024/"}

        # Beam-setting columns shared by both datasets. The trailing comments
        # give the hyphenated name the same quantity has in the CSV.
        settings_columns = [
            'FE_IS:Match:TunerPos',
            'LEBT:Chop_N:V_Set',
            'LEBT:Chop_P:V_Set',
            'LEBT:Focus_1:V_Set',
            'LEBT:Focus_2:V_Set',
            'LEBT:Steer_A:V_Set',
            'LEBT:Steer_B:V_Set',
            'LEBT:Steer_C:V_Set',
            'LEBT:Steer_D:V_Set',
            'Src:Accel:V_Set',
            'Src:H2:Flw_Set',
            'Src:Ign:Pwr_Set',
            'Src:RF_Gnd:Pwr_Set',
            'ICS_Chop:RampDown:PW',  # ICS_Chop-RampDown-PW
            'ICS_Chop:RampUp:PWChange',  # ICS_Chop-RampUp-PWChange
            'ICS_MPS:Gate_Source:Offset',  # ICS_MPS-Gate_Source-Offset
            'ICS_Tim:Chop_Flavor1:BeamOn',  # ICS_Chop-BeamOn-Width
            'ICS_Tim:Chop_Flavor1:OnPulseWidth',  # ICS_Chop-BeamOn-PW
            'ICS_Tim:Chop_Flavor1:RampUp',  # ICS_Chop-RampUp-Width
            'ICS_Tim:Chop_Flavor1:StartPulseWidth',  # ICS_Chop-RampUp-PW
            'ICS_Tim:Gate_BeamRef:GateWidth',  # ICS_Tim-Gate_BeamRef-GateWidth
            'ICS_Tim:Gate_BeamOn:RR',
        ]

        # Settings handed to BeamConfigPreProcessor.
        self.beam_settings_prep_cfg = {
            "rescale": False,
            "beam_config": list(settings_columns),
        }

        # Columns created as all-NaN when a frame does not provide them.
        self.column_to_add = [
            'FE_IS:Match:TunerPos',
            'LEBT:Chop_N:V_Set',
            'LEBT:Chop_P:V_Set',
            'LEBT:Focus_1:V_Set',
            'LEBT:Focus_2:V_Set',
            'LEBT:Steer_A:V_Set',
            'LEBT:Steer_B:V_Set',
            'LEBT:Steer_C:V_Set',
            'LEBT:Steer_D:V_Set',
            'Src:Accel:V_Set',
            'Src:H2:Flw_Set',
            'Src:Ign:Pwr_Set',
            'Src:RF_Gnd:Pwr_Set',
            'ICS_Tim:Gate_BeamOn:RR',
        ]

        # Hyphenated CSV column name -> colon-separated name used in beam_config.
        self.column_rename_map = {
            'ICS_Chop-RampDown-PW': 'ICS_Chop:RampDown:PW',
            'ICS_Chop-RampUp-PWChange': 'ICS_Chop:RampUp:PWChange',
            'ICS_MPS-Gate_Source-Offset': 'ICS_MPS:Gate_Source:Offset',
            'ICS_Chop-BeamOn-Width': 'ICS_Tim:Chop_Flavor1:BeamOn',
            'ICS_Chop-BeamOn-PW': 'ICS_Tim:Chop_Flavor1:OnPulseWidth',
            'ICS_Chop-RampUp-Width': 'ICS_Tim:Chop_Flavor1:RampUp',
            'ICS_Chop-RampUp-PW': 'ICS_Tim:Chop_Flavor1:StartPulseWidth',
            'ICS_Tim-Gate_BeamRef-GateWidth': 'ICS_Tim:Gate_BeamRef:GateWidth',
        }

        # Final column order of a harmonized frame: timestamp first, then settings.
        self.beam_config = ['timestamps', *settings_columns]

        # Last frame produced by harmonize_configs (None until it is called).
        self.configs = None

    def harmonize_configs(self, configs):
        """Return a copy of ``configs`` that uses the ``beam_config`` column names.

        Hyphenated columns are renamed via ``column_rename_map`` and every
        entry of ``column_to_add`` that is still missing is created as an
        all-NaN column. The input frame is not modified. The result is also
        stored on ``self.configs``.

        Parameters
        ----------
        configs : pandas.DataFrame
            Beam-settings frame, e.g. the one read from
            ``beam_settings_data_path``.

        Returns
        -------
        pandas.DataFrame
        """
        harmonized = configs.rename(columns=self.column_rename_map)
        for entry in self.column_to_add:
            # Never overwrite a column that already carries data.
            if entry not in harmonized.columns:
                harmonized[entry] = np.nan
        self.configs = harmonized
        return harmonized

    def configs_hist(self, dataframe, timestamp):
        """Collapse runs of consecutive rows with identical settings.

        A row is dropped when every non-timestamp column equals the row
        directly above it; two NaN values count as equal, so all-NaN columns
        do not prevent rows from being collapsed. The first row is always
        kept because it has no predecessor.

        Parameters
        ----------
        dataframe : pandas.DataFrame
            Frame holding one timestamp column plus the settings columns.
        timestamp : str
            Name of the timestamp column.

        Returns
        -------
        pandas.DataFrame
            The kept rows with columns ``[timestamp, 'timestamps_trm',
            <settings...>]``. ``timestamps_trm`` is the next kept row's
            timestamp minus one microsecond (NaT for the last row).

        Raises
        ------
        ValueError
            If ``timestamp`` is not a column of ``dataframe``.
        """
        subset_columns = dataframe.columns.tolist()
        subset_columns.remove(timestamp)

        current = dataframe[subset_columns]
        previous = current.shift(1)
        unchanged = (current == previous) | (current.isna() & previous.isna())
        mask = unchanged.all(axis=1)
        if len(mask):
            # shift() pads the first row with NaN; it must never be dropped.
            mask.iloc[0] = False

        # Work on a copy so adding a column does not write into a slice.
        changed = dataframe.loc[~mask].copy()
        changed['timestamps_trm'] = (
            changed[timestamp]
            + changed[timestamp].diff().shift(-1)
            - datetime.timedelta(microseconds=1)
        )
        return changed[[timestamp, 'timestamps_trm', *subset_columns]]

    def summary(self, text, df):
        """Print the shape of ``df`` and return a per-column summary table.

        ``median``, ``mean`` and ``std`` are computed for numeric columns
        only (other columns get NaN there), so text columns do not raise on
        pandas >= 2.0. ``duplicate`` is the number of fully duplicated rows
        and is repeated on every line of the table.

        Parameters
        ----------
        text : str
            Label printed in front of the shape.
        df : pandas.DataFrame
            Frame to summarise.

        Returns
        -------
        pandas.DataFrame
            One row per column of ``df``.
        """
        print(f'{text} shape: {df.shape}')
        summ = pd.DataFrame(df.dtypes, columns=['dtypes'])
        summ['null'] = df.isnull().sum()
        summ['unique'] = df.nunique()
        summ['min'] = df.min()
        summ['median'] = df.median(numeric_only=True)
        summ['max'] = df.max()
        summ['mean'] = df.mean(numeric_only=True)
        summ['std'] = df.std(numeric_only=True)
        summ['duplicate'] = df.duplicated().sum()
        return summ


dc = BPMDataConfig()

In [None]:
### FebMar22 .csv beam settings ###
# Read the FebMar22 beam settings, discard the index column a previous
# to_csv() may have left behind, and parse the timestamps as datetimes.
configs = (
    pd.read_csv(dc.beam_settings_data_path)
    .drop("Unnamed: 0", axis=1, errors='ignore')
    .assign(timestamps=lambda frame: pd.to_datetime(frame['timestamps']))
)

In [None]:
### Sep24 hdf5 beam settings ###
# Build the HDF5 parser from the data location stored on the config object.
parser = BeamConfigParserHDF5(dc.beam_param_parser_cfg)
# run() yields two values; only the first is kept, the second is discarded.
data, _ = parser.run()

In [None]:
### Get Prepared datasets ###
# Configure the pre-processor with the rescale flag and beam_config column list.
prep = BeamConfigPreProcessor(dc.beam_settings_prep_cfg)
# run() yields the prepared settings plus a second value kept as run_cfg.
# NOTE(review): prepared_settings is concatenated with `configs` further down,
# so it is presumably a DataFrame with the beam_config columns - confirm.
prepared_settings, run_cfg = prep.run(data)


In [None]:
# Restrict the FebMar22 frame to the shared beam-setting columns, in the
# order defined on the config object (a bare `beam_config` name does not
# exist at module level). Assumes `configs` already exposes every column
# in dc.beam_config; .loc raises KeyError otherwise.
configs = configs.loc[:, dc.beam_config]
# Stack the FebMar22 and Sep24 settings into a single frame.
bpm = pd.concat([configs, prepared_settings])

In [None]:
# Print the shape of the combined beam-settings frame and show its per-column summary.
dc.summary('bpm_summary', bpm)

### DCM Configs

In [None]:
class DCMDatConfig:
    """Locations and filters used to pick DCM waveform files from disk.

    ``GetFebFilteredFiles`` scans ``dataset1_loc`` for ``.bin`` files whose
    name starts with a YYYYMMDD date; ``GetSepFilteredFiles`` scans the
    direct sub-folders of ``dataset2_loc`` for ``.gz`` files.
    """

    def __init__(self):
        self.dataset1_loc = "/work/data_science/suf_sns/DCM_Errant/"
        self.dataset2_loc = "/w/data_science-sciwork24/suf_sns/DCML_dataset_Sept2024"
        # Inclusive date window (YYYYMMDD integers) applied to dataset 1.
        self.start_date = 20220218
        self.end_date = 20220318
        self.anomaly_type = "00110000" #--  48
        self.length_of_waveform = 10000
        # NOTE(review): 2022015 has only seven digits, so it can never match
        # a YYYYMMDD date; it looks like a typo - confirm the intended day.
        self.exclude_dates = [20220220, 20220221, 20220222, 20220223, 20220301, 20220308, 20220309, 2022015]
        self.filtered_normal_files = []
        self.filtered_anomaly_files = []
        self.filtered_normal_files2 = []
        self.filtered_anomaly_files2 = []

    def GetFebFilteredFiles(self):
        """Collect normal and anomaly ``.bin`` files from ``dataset1_loc``.

        A file is considered when its name contains ``.bin`` and ``DCML``,
        its first eight characters parse as an integer date inside
        ``[start_date, end_date]`` and that date is not in ``exclude_dates``.
        Names containing ``00000000`` are normal files; names containing
        ``anomaly_type`` are anomaly files. Files with an unreadable date are
        reported and skipped.

        The stored lists are rebuilt on every call, so calling the method
        twice does not duplicate entries.

        Returns
        -------
        tuple[list[str], list[str]]
            ``(filtered_normal_files, filtered_anomaly_files)``.
        """
        self.filtered_normal_files = []
        self.filtered_anomaly_files = []

        for root, _dirs, files in os.walk(self.dataset1_loc):
            for file in files:
                if '.bin' not in file or 'DCML' not in file:
                    continue
                try:
                    date = int(file[:8])
                except ValueError:
                    print("Error in filename: ", file)
                    print("-- Date could not be read!")
                    continue
                if not (self.start_date <= date <= self.end_date):
                    continue
                if date in self.exclude_dates:
                    continue

                if '00000000' in file:
                    self.filtered_normal_files.append(os.path.join(root, file))
                elif self.anomaly_type in file:
                    self.filtered_anomaly_files.append(os.path.join(root, file))

        print('Number of available normal files: ', len(self.filtered_normal_files))
        print('Number of available anomaly files: ', len(self.filtered_anomaly_files))
        return self.filtered_normal_files,self.filtered_anomaly_files

    def GetSepFilteredFiles(self):
        """Collect normal and anomaly ``.gz`` files from ``dataset2_loc``.

        Only direct sub-folders whose own name contains ``normal`` or
        ``anomal`` are walked (recursively). The folder name, not the full
        path, decides the class, so a parent directory that happens to
        contain one of those words cannot misclassify files.

        The stored lists are rebuilt on every call.

        Returns
        -------
        tuple[list[str], list[str]]
            ``(filtered_normal_files2, filtered_anomaly_files2)``.
        """
        self.filtered_normal_files2 = []
        self.filtered_anomaly_files2 = []

        top_level_dirs = [f.path for f in os.scandir(self.dataset2_loc) if f.is_dir()]
        for directory in top_level_dirs:
            folder_name = os.path.basename(os.path.normpath(directory))
            if 'normal' in folder_name:
                target = self.filtered_normal_files2
            elif 'anomal' in folder_name:
                target = self.filtered_anomaly_files2
            else:
                continue

            for root, _dirs, files in os.walk(directory):
                for file in files:
                    if ".gz" in file:
                        target.append(os.path.join(root, file))

        print('Number of available normal files: ', len(self.filtered_normal_files2))
        print('Number of available anomaly files: ', len(self.filtered_anomaly_files2))
        return self.filtered_normal_files2,self.filtered_anomaly_files2


dcm = DCMDatConfig()

In [None]:
### Print Feb Data Details ###
# Scan the Feb/Mar 2022 dataset; the call also prints how many files of each kind were found.
filtered_normal_files,filtered_anomaly_files=dcm.GetFebFilteredFiles()

In [None]:
### Print Sep Data Details ###
# Scan the Sep 2024 dataset and unpack the (normal, anomaly) file lists.
# This must be a plain assignment: with `==` the line only builds a
# comparison expression and the two names are never bound.
filtered_normal_files2, filtered_anomaly_files2 = dcm.GetSepFilteredFiles()

## Get Traces and Timestamps 

In [None]:
def GetTracesAndTs(filtered_files, length_of_waveform=None):
    """Read the traces and timestamps of one randomly chosen file.

    Parameters
    ----------
    filtered_files : sequence of str
        Candidate file paths; one of them is picked with
        ``np.random.randint``.
    length_of_waveform : int, optional
        Value forwarded to ``get_traces`` as ``shift``. Defaults to
        ``dcm.length_of_waveform`` from the module-level DCM config.

    Returns
    -------
    tuple
        ``(traces, timestamps)`` as returned by ``get_traces``. Two empty
        lists are returned when ``filtered_files`` is empty or the file
        cannot be read; the read error is printed, not raised.
    """
    if len(filtered_files) == 0:
        # randint(0, 0) would raise; there is simply nothing to sample.
        print("No files available to read traces from.")
        return [], []

    if length_of_waveform is None:
        length_of_waveform = dcm.length_of_waveform

    index = np.random.randint(0, len(filtered_files))
    filepath = filtered_files[index]
    print(index)
    try:
        traces, timestamps = get_traces(filepath, var_id="Trace2", begin=3000, shift=length_of_waveform, data_type=0)
    except Exception as e:
        # Best effort: report the unreadable file and hand back empty results.
        traces = []
        timestamps = []
        print("Error in reading the file: ", filepath)
        print("Error:", e)
    return traces, timestamps

In [None]:
# Try reading one randomly selected normal file from the Feb/Mar 2022 list.
GetTracesAndTs(filtered_normal_files)

## Merge Datasets

In [None]:
class MergeDatasets:
    """Accumulate DCM waveform samples and join them with beam settings.

    ``IterateFiles`` appends to four parallel lists (``traces``,
    ``timestamps``, ``flag``, ``file``); ``MergeDatasets`` turns them into a
    frame and attaches the nearest-in-time beam-settings row to each sample.
    """

    def __init__(self):
        self.traces = []
        self.timestamps = []
        self.flag = []
        self.file = []

    def IterateFiles(self, file_paths, MonthName, flag=0, max_files=10, length_of_waveform=None):
        """Read waveform files and append their samples to the instance lists.

        The first trace/timestamp of every file is skipped.

        Parameters
        ----------
        file_paths : sequence of str
            Files passed one by one to ``get_traces``.
        MonthName : str
            Label stored in ``self.file`` for every sample read.
        flag : int, optional
            Label stored in ``self.flag`` for every sample (default 0).
        max_files : int or None, optional
            Only the first ``max_files`` paths are read (default 10);
            ``None`` reads all of them.
        length_of_waveform : int, optional
            Value forwarded to ``get_traces`` as ``shift``. Defaults to
            ``dcm.length_of_waveform`` from the module-level DCM config.
        """
        if length_of_waveform is None:
            length_of_waveform = dcm.length_of_waveform

        selected = file_paths if max_files is None else file_paths[:max_files]
        for dcml in selected:
            tmp_trace, tmp_timestamp = get_traces(dcml, var_id="Trace2", begin=3000, shift=length_of_waveform, data_type=0)
            tmp_trace = np.array(tmp_trace[1:])
            tmp_timestamp = np.array(tmp_timestamp[1:])
            # One flag and one source label per trace keeps the lists parallel.
            self.traces.extend(tmp_trace)
            self.flag.extend([flag] * len(tmp_trace))
            self.file.extend([MonthName] * len(tmp_trace))
            self.timestamps.extend(tmp_timestamp)

    def MergeDatasets(self, dcm_df=None, bpm_df=None):
        """Attach the nearest beam-settings row to every DCM sample.

        Parameters
        ----------
        dcm_df : pandas.DataFrame, optional
            DCM samples with a ``timestamps`` column. When omitted, a frame
            with columns ``timestamps``, ``traces``, ``flag`` and ``file``
            is built from the samples collected by ``IterateFiles``.
        bpm_df : pandas.DataFrame, optional
            Beam settings with a ``timestamps`` column. Defaults to the
            module-level ``bpm`` frame.

        Returns
        -------
        pandas.DataFrame
            Result of ``pandas.merge_asof`` on ``timestamps`` with
            ``direction="nearest"``. Both ``timestamps`` columns must share
            one dtype, as ``merge_asof`` requires.

        Raises
        ------
        ValueError
            If ``dcm_df`` is omitted and no samples have been collected.
        """
        if dcm_df is None:
            if len(self.timestamps) == 0:
                raise ValueError("No DCM samples collected; call IterateFiles first or pass dcm_df.")
            # Store each waveform as one object cell instead of letting the
            # constructor spread equal-length arrays over many columns.
            trace_column = np.empty(len(self.traces), dtype=object)
            for position, trace in enumerate(self.traces):
                trace_column[position] = trace
            dcm_df = pd.DataFrame({
                "timestamps": list(self.timestamps),
                "traces": trace_column,
                "flag": list(self.flag),
                "file": list(self.file),
            })
        if bpm_df is None:
            bpm_df = bpm

        merged_df = pd.merge_asof(dcm_df.sort_values("timestamps"), bpm_df.sort_values("timestamps"), on="timestamps", direction="nearest")
        return merged_df


md = MergeDatasets()

In [None]:
# Join the DCM samples with the beam settings and preview the first rows.
raw_data = md.MergeDatasets()
raw_data.head()

## Data Preprocessing

In [None]:
## Data Preprocessing ##

class DataPreprocessor:
    """Cleaning helpers that operate on a private copy of a DataFrame.

    Every ``remove_*``/``convert_*``/``rename_*`` method updates the
    internal copy; the frame passed to the constructor is never modified.
    Fetch the current state with ``get_dataframe``.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()  # Create a copy to avoid modifying the original DataFrame

    def check_nan(self) -> pd.Series:
        """Return the NaN count of every column that has at least one NaN."""
        nan_counts = self.df.isna().sum()
        return nan_counts[nan_counts > 0]

    def remove_nan(self) -> pd.DataFrame:
        """Drop every row containing a NaN and return the cleaned frame."""
        self.df.dropna(inplace=True)
        return self.df

    def check_null(self) -> pd.Series:
        """Return the null count of every column that has at least one null.

        pandas treats ``isnull`` as an alias of ``isna``, so this reports
        the same numbers as ``check_nan``.
        """
        null_counts = self.df.isnull().sum()
        return null_counts[null_counts > 0]

    def remove_null(self) -> pd.DataFrame:
        """Drop every row containing a null value (same effect as ``remove_nan``)."""
        return self.remove_nan()

    def check_duplicates(self):
        """Return the number of rows that duplicate an earlier row."""
        return self.df.duplicated().sum()

    def remove_duplicates(self) -> pd.DataFrame:
        """Drop duplicated rows, keeping the first occurrence."""
        self.df.drop_duplicates(inplace=True)
        return self.df

    def _iqr_bounds(self) -> dict:
        """Map each numeric column to its ``(Q1 - 1.5*IQR, Q3 + 1.5*IQR)`` fences."""
        bounds = {}
        for col in self.df.select_dtypes(include=[np.number]).columns:
            q1 = self.df[col].quantile(0.25)
            q3 = self.df[col].quantile(0.75)
            iqr = q3 - q1
            bounds[col] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)
        return bounds

    def check_outliers(self) -> dict:
        """Count IQR outliers per numeric column.

        Returns
        -------
        dict
            ``{column: count}`` for columns with at least one value outside
            the 1.5 * IQR fences. NaN values are not counted.
        """
        outlier_dict = {}
        for col, (lower_bound, upper_bound) in self._iqr_bounds().items():
            outliers = int(((self.df[col] < lower_bound) | (self.df[col] > upper_bound)).sum())
            if outliers > 0:
                outlier_dict[col] = outliers
        return outlier_dict

    def remove_outliers(self) -> pd.DataFrame:
        """Drop every row that holds an IQR outlier in any numeric column.

        The fences of all columns are computed once, on the data as it is
        before any row is removed, so the rows dropped are exactly those
        ``check_outliers`` counts and the result does not depend on column
        order. Rows with NaN are kept; use ``remove_nan`` to drop them.
        """
        is_outlier = np.zeros(len(self.df), dtype=bool)
        for col, (lower_bound, upper_bound) in self._iqr_bounds().items():
            is_outlier |= ((self.df[col] < lower_bound) | (self.df[col] > upper_bound)).to_numpy()
        # Copy so later in-place operations do not act on a slice.
        self.df = self.df[~is_outlier].copy()
        return self.df

    def convert_float64_to_float32(self) -> pd.Series:
        """Downcast all float64 columns to float32 and return the new dtypes."""
        float64_cols = self.df.select_dtypes(include=['float64']).columns
        self.df = self.df.astype({col: 'float32' for col in float64_cols})
        return self.df.dtypes

    def rename_columns(self, rename_dict) -> pd.DataFrame:
        """Rename columns via ``rename_dict`` (old -> new) and return the first rows."""
        self.df.rename(columns=rename_dict, inplace=True)
        return self.df.head()

    def get_dataframe(self) -> pd.DataFrame:
        """Return the current working DataFrame."""
        return self.df
