# Pre-processing the ERS error messages and Run metadata in 2023

Most of the cells below do not need to be re-executed; they are kept to document the usual process.

In [1]:
# From this page: https://atlasop.cern.ch/tdaq/ers/en-US/static/app/ers/ers.html
# We can export the messages we're interested in. 
# In this case I filtered all the ERS messages (of the ATLAS partition) with severity !INFO of 2023, with:
# - AppID: CHIP-ATLAS
# - MessageID: *TestFailed*
# - Msg Text filter: *tpu*

import pandas as pd
import re
from datetime import datetime
import pytz

# Folder containing the ERS CSV export read below.
ers_folder = "/eos/user/j/jhoya/DAQ/AnomalyDetection/strada/dataset_generation/ers_errors/"
# Alternative input used previously, kept for reference:
#df = pd.read_csv('/eos/user/j/jhoya/DAQ/AnomalyDetection/strada/dataset_generation/ers_errors/tpu-failures-2023-andrei.csv')
raw_ers_csv = f'{ers_folder}2023_ERS_CHIP_TestFailed_tpu.csv'
df = pd.read_csv(raw_ers_csv)


# Extract the TPU number from the 'text' column
def extract_tpu(text):
    """Return the TPU number found in an ERS message text.

    Searches ``text`` for the first ``pc-tdq-tpu-<digits>`` host name and
    returns the digits as a string (leading zeros are preserved).

    Args:
        text: Message text. Non-string values (for example the NaN that
            pandas uses for an empty CSV field) are treated as "no match".

    Returns:
        The digit string following ``pc-tdq-tpu-``, or None when ``text`` is
        not a string or contains no such host name.
    """
    # re.search raises TypeError on non-strings, which would abort the whole
    # Series.apply call because of a single empty cell.
    if not isinstance(text, str):
        return None
    # A stricter pattern that also required a leading "tpu-...:" prefix was
    # used before: r"tpu-[a-zA-Z0-9-]+:pc-tdq-tpu-(\d+)"
    match = re.search(r"pc-tdq-tpu-(\d+)", text)
    return match.group(1) if match else None

# Extract the failure source from the 'text' column
def extract_failure_source(text):
    """Return the application named in a "Test failure for application" message.

    Captures everything after ``"Test failure for application: "`` up to (not
    including) the next colon.

    Args:
        text: Message text. Non-string values (for example the NaN that
            pandas uses for an empty CSV field) are treated as "no match".

    Returns:
        The captured application name, or None when ``text`` is not a string
        or does not contain the expected phrase.
    """
    # re.search raises TypeError on non-strings, which would abort the whole
    # Series.apply call because of a single empty cell.
    if not isinstance(text, str):
        return None
    # Earlier variants anchored on "Component: " or required a trailing ":HLT";
    # the current pattern accepts any application name.
    match = re.search(r"Test failure for application: ([^:]+)", text)
    return match.group(1) if match else None


# Convert 'time' to Europe/Berlin timezone. This is really annoying, but needed if we start using this in LCRC...
def convert_to_cern_timezone(time_str):
    """Parse an ERS time string as UTC and express it in Europe/Berlin time.

    Args:
        time_str: Timestamp formatted as ``"%H:%M:%S %b %d %Y"``; it is
            interpreted as UTC.

    Returns:
        A timezone-aware ``datetime`` in the ``Europe/Berlin`` zone.
    """
    # The parsed value carries no timezone information yet.
    parsed = datetime.strptime(time_str, "%H:%M:%S %b %d %Y")

    # Attach UTC first, then shift to the target zone.
    as_utc = pytz.timezone('UTC').localize(parsed)
    return as_utc.astimezone(pytz.timezone('Europe/Berlin'))


# Create the new columns
# Derive the TPU number and the failing application from the message text
df['tpu_number'] = df['text'].apply(extract_tpu)
df['failure_source'] = df['text'].apply(extract_failure_source)

# Timestamps expressed in the Europe/Berlin timezone
df['time_cern'] = df['time'].apply(convert_to_cern_timezone)

# Keep only the columns we want: 'time_cern', 'utime', 'tpu_number' and 'failure_source'
df_processed = df[['time_cern', 'utime', 'tpu_number', 'failure_source']]


def _join_unique_sources(sources):
    """Merge the distinct non-null failure sources of one group into a '; '-separated string."""
    return '; '.join(sources.dropna().unique())


# One row per ('time_cern', 'utime', 'tpu_number'): many messages share the same
# time and TPU and a single row is enough. The distinct failure sources are kept,
# concatenated, in case more detail about the issue is needed later.
df_merged = (
    df_processed
    .groupby(['time_cern', 'utime', 'tpu_number'])
    .agg({'failure_source': _join_unique_sources})
    .reset_index()
)

df_merged.to_csv(f"{ers_folder}processed_2023_ERS.csv", index=False)


In [27]:
# Now we want to know all the available runs in 2023. For this we use the file: "../../../datasets/atlas-data-summary-runs-2023.html"

###### Important note: to use this, you need to change the configuration of this SWAN session and add a configuration script, for example:
###### /eos/user/j/jhoya/DAQ/AnomalyDetection/strada/setup_lxplus.sh   -> We need the TDAQ release to use beauty

import argparse
import sys
import datetime as dt
import json

import numpy as np
import pandas as pd

sys.path.append('/eos/user/j/jhoya/DAQ/AnomalyDetection/strada/detection_combined/')

from utils.offlinepbeastdataloader import OfflinePBeastDataLoader



# Loader built from the 2023 ATLAS data-summary HTML file.
offline_pbeast_data_loader = OfflinePBeastDataLoader('/eos/user/j/jhoya/DAQ/AnomalyDetection/strada/datasets/atlas-data-summary-runs-2023.html')
run_numbers = offline_pbeast_data_loader.get_run_numbers()
# Use the built-in len() instead of calling the __len__ dunder directly.
runs_amount = len(offline_pbeast_data_loader)

# Per-run table; later cells read its 'start', 'end' and 'duration' columns.
df_runs = offline_pbeast_data_loader.get_runs_df()

print(runs_amount, run_numbers)


123 [461096, 461002, 460945, 460944, 460916, 460913, 460860, 460837, 460493, 460477, 460447, 460438, 460436, 460348, 460009, 456749, 456729, 456714, 456685, 456665, 456522, 456409, 456386, 456346, 456316, 456314, 456303, 456273, 456225, 456164, 456151, 456126, 456118, 456110, 456016, 455975, 455924, 455899, 455870, 455857, 455838, 455837, 455818, 455814, 455795, 455702, 455623, 455589, 454322, 454222, 454188, 454163, 454129, 454083, 454054, 453981, 453858, 453816, 453795, 453754, 453733, 453713, 453657, 453644, 453617, 453556, 453530, 453353, 453319, 452872, 452843, 452799, 452787, 452785, 452726, 452696, 452669, 452640, 452624, 452573, 452533, 452463, 452266, 452241, 452202, 452163, 452028, 451936, 451896, 451866, 451804, 451794, 451735, 451618, 451611, 451595, 451587, 451569, 451557, 451543, 451295, 451140, 451094, 451063, 451046, 451037, 451022, 450997, 450894, 450518, 450445, 450427, 450360, 450271, 450227, 448990, 448984, 448982, 448715, 448637, 448546, 448543, 448519]


In [28]:
# Start, end and duration for each of the runs in 2023.
# NOTE(review): 'duration' looks like it is expressed in minutes — TODO confirm.
df_runs

Unnamed: 0_level_0,start,end,duration
Run Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
461096,2023-09-19 22:16:00,2023-09-20 08:37:00,610
461002,2023-09-18 17:12:00,2023-09-19 10:23:00,1037
460945,2023-09-18 01:18:00,2023-09-18 06:53:00,305
460944,2023-09-17 15:51:00,2023-09-18 01:08:00,549
460916,2023-09-17 07:15:00,2023-09-17 09:13:00,61
460913,2023-09-16 19:45:00,2023-09-17 06:58:00,671
460860,2023-09-15 13:59:00,2023-09-16 10:21:00,1220
460837,2023-09-14 21:12:00,2023-09-15 07:43:00,610
460493,2023-09-11 07:05:00,2023-09-11 15:16:00,488
460477,2023-09-10 17:19:00,2023-09-11 03:15:00,549


In [21]:
# Use this only to get a DataFrame with the information. For a JSON file look below.

ers_folder = "/eos/user/j/jhoya/DAQ/AnomalyDetection/strada/dataset_generation/ers_errors/"
# parse_dates=True only asks pandas to parse the index, so 'time_cern' is still
# read as plain strings here and is converted explicitly below.
df_ers = pd.read_csv(f"{ers_folder}processed_2023_ERS.csv", parse_dates=True)


# 'time_cern' was written by the pre-processing cell as timezone-aware
# Europe/Berlin timestamps, so its offsets vary with DST and it has to be
# parsed with utc=True to obtain a single datetime dtype.
# Convert back to Europe/Berlin *before* dropping the timezone: calling
# tz_localize(None) directly on the UTC values would leave naive UTC timestamps,
# i.e. shifted by 1-2 hours with respect to the local times written upstream.
# NOTE(review): this assumes df_runs 'start'/'end' are naive CERN local times — confirm.
df_ers['time_cern'] = (
    pd.to_datetime(df_ers['time_cern'], utc=True, errors='coerce')
    .dt.tz_convert('Europe/Berlin')
    .dt.tz_localize(None)
)
df_runs['start'] = pd.to_datetime(df_runs['start'], errors='coerce')
df_runs['end'] = pd.to_datetime(df_runs['end'], errors='coerce')

# Including the TPU errors in the dataframe (empty string = no issue found)
df_runs['tpu_issues'] = ''

# Iterate to find the corresponding errors
for index, run in df_runs.iterrows():
    # Filter issues that occurred during the run (both bounds inclusive)
    issues_in_run = df_ers[
        (df_ers['time_cern'] >= run['start']) & (df_ers['time_cern'] <= run['end'])
    ]

    # If there are issues, concatenate them as "<tpu> (<failure sources>); ..."
    if not issues_in_run.empty:
        tpu_issues = issues_in_run.apply(lambda row: f"{row['tpu_number']} ({row['failure_source']})", axis=1)
        df_runs.at[index, 'tpu_issues'] = '; '.join(tpu_issues)


print(df_runs)

                         start                 end  duration tpu_issues
Run Number                                                             
461096     2023-09-19 22:16:00 2023-09-20 08:37:00       610           
461002     2023-09-18 17:12:00 2023-09-19 10:23:00      1037           
460945     2023-09-18 01:18:00 2023-09-18 06:53:00       305           
460944     2023-09-17 15:51:00 2023-09-18 01:08:00       549           
460916     2023-09-17 07:15:00 2023-09-17 09:13:00        61           
...                        ...                 ...       ...        ...
448715     2023-04-08 17:07:00 2023-04-09 17:29:00      1441           
448637     2023-04-07 14:29:00 2023-04-08 01:28:00       610           
448546     2023-04-06 18:07:00 2023-04-06 19:07:00        59           
448543     2023-04-06 16:49:00 2023-04-06 17:56:00        61           
448519     2023-04-06 09:14:00 2023-04-06 14:56:00       305           

[123 rows x 4 columns]


In [24]:
from IPython.display import display

# Show every row and the full 'tpu_issues' text, but only for this display call:
# option_context restores the previous settings on exit, so the display options
# of the rest of the notebook are left untouched.
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    display(df_runs)

Unnamed: 0_level_0,start,end,duration,tpu_issues
Run Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
461096,2023-09-19 22:16:00,2023-09-20 08:37:00,610,
461002,2023-09-18 17:12:00,2023-09-19 10:23:00,1037,
460945,2023-09-18 01:18:00,2023-09-18 06:53:00,305,
460944,2023-09-17 15:51:00,2023-09-18 01:08:00,549,
460916,2023-09-17 07:15:00,2023-09-17 09:13:00,61,
460913,2023-09-16 19:45:00,2023-09-17 06:58:00,671,
460860,2023-09-15 13:59:00,2023-09-16 10:21:00,1220,
460837,2023-09-14 21:12:00,2023-09-15 07:43:00,610,
460493,2023-09-11 07:05:00,2023-09-11 15:16:00,488,
460477,2023-09-10 17:19:00,2023-09-11 03:15:00,549,


In [42]:
import pandas as pd
import json


ers_folder = "/eos/user/j/jhoya/DAQ/AnomalyDetection/strada/dataset_generation/ers_errors/"
# parse_dates=True only asks pandas to parse the index, so 'time_cern' is still
# read as plain strings here and is converted explicitly below.
df_ers = pd.read_csv(f"{ers_folder}processed_2023_ERS.csv", parse_dates=True)


# 'time_cern' was written by the pre-processing cell as timezone-aware
# Europe/Berlin timestamps, so its offsets vary with DST and it has to be
# parsed with utc=True to obtain a single datetime dtype.
# Convert back to Europe/Berlin *before* dropping the timezone: calling
# tz_localize(None) directly on the UTC values would leave naive UTC timestamps,
# i.e. shifted by 1-2 hours with respect to the local times written upstream.
# NOTE(review): this assumes df_runs 'start'/'end' are naive CERN local times — confirm.
df_ers['time_cern'] = (
    pd.to_datetime(df_ers['time_cern'], utc=True, errors='coerce')
    .dt.tz_convert('Europe/Berlin')
    .dt.tz_localize(None)
)
df_runs['start'] = pd.to_datetime(df_runs['start'], errors='coerce')
df_runs['end'] = pd.to_datetime(df_runs['end'], errors='coerce')

# One dictionary per run, in the order of df_runs
runs_list = []

# Iterate over each run and find corresponding issues
for run_number, run in df_runs.iterrows():
    # Filter issues that occurred during the run (both bounds inclusive)
    issues_in_run = df_ers[
        (df_ers['time_cern'] >= run['start']) & (df_ers['time_cern'] <= run['end'])
    ]

    # Create a dictionary to represent the run
    run_dict = {
        "Run Number": int(run_number),
        "start": run['start'].isoformat(),
        "end": run['end'].isoformat(),
        "duration": int(run['duration']),
        "tpu_issues": []
    }

    # Add the corresponding issues to the run
    for _, issue in issues_in_run.iterrows():
        failure_source = issue['failure_source']
        issue_dict = {
            "time": issue['time_cern'].isoformat(),
            "tpu": issue['tpu_number'],
            # An empty failure source comes back from the CSV as NaN, which
            # json.dump would emit as the non-standard token NaN; store null instead.
            "failure_source": None if pd.isna(failure_source) else failure_source
        }
        run_dict["tpu_issues"].append(issue_dict)

    # Append the run dictionary to the list
    runs_list.append(run_dict)

# Write the list of runs to a JSON file
with open(f'{ers_folder}runs_with_tpu_issues.json', 'w') as json_file:
    json.dump(runs_list, json_file, indent=4)

print("Data has been successfully written to 'runs_with_tpu_issues.json'")






Data has been successfully written to 'runs_with_tpu_issues.json'


# Adding anomaly information from the Clustering algorithm 

Now that we have a JSON file with all the runs and the TPU issues that we can extract from ERS, we also include any anomalies found by the clustering algorithm. <br/>
We do this in order to be able to select a small set of runs with, in principle, no issues (or at least no drastic ones) that we can use to train the Deep Learning algorithms.


In [None]:
# Path of the clustering-anomaly JSON file for a single run.
# NOTE(review): this cell only builds a path string; nothing is loaded here.
# It reads `run_number`, which is not defined in this cell (presumably left over
# from a loop in a previously executed cell), and `json_file` is rebound by the
# `with open(...) as json_file` statement in the next cell — confirm whether this
# cell is still needed.
json_file = f"/eos/user/j/jhoya/DAQ/AnomalyDetection/ML_resources/Strada_parts/results/run_{run_number}.json"

In [46]:
import json
import os

ers_folder = "/eos/user/j/jhoya/DAQ/AnomalyDetection/strada/dataset_generation/ers_errors/"

# Runs with their ERS TPU issues, as a list of dictionaries
with open(f'{ers_folder}runs_with_tpu_issues.json', 'r') as json_file:
    runs_data = json.load(json_file)

# Directory where anomaly JSON files are stored
anomalies_dir = '/eos/user/j/jhoya/DAQ/AnomalyDetection/ML_resources/Strada_parts/results/'

# Add the clustering anomalies to each run, when an anomaly file exists for it
for run in runs_data:
    run_number = run["Run Number"]
    anomaly_file = os.path.join(anomalies_dir, f'run_{run_number}.json')

    # Every run gets an "anomalies" dictionary, empty when nothing is found:
    # {tpu: {anomaly type: [{"timestamp": ..., "duration": ...}, ...]}}
    run_anomalies = run["anomalies"] = {}

    # Nothing to add when there is no anomaly file for this run
    if not os.path.exists(anomaly_file):
        continue

    with open(anomaly_file, 'r') as anomaly_json_file:
        anomalies = json.load(anomaly_json_file)

    # Keep only the anomalies reported by the clustering algorithm
    for tpu, anomaly_data in anomalies.items():
        for timestamp, details in anomaly_data.items():
            for anomaly_type in details["types"]:
                if anomaly_type not in ["ClusteringDropToZero", "ClusteringGeneral"]:
                    continue

                # Create the per-TPU and per-type containers on first use,
                # then record when the anomaly happened and how long it lasted
                run_anomalies.setdefault(tpu, {}).setdefault(anomaly_type, []).append({
                    "timestamp": timestamp,
                    "duration": details["duration"]
                })

# Write the updated runs data to a new JSON file
with open(f'{ers_folder}updated_runs_with_tpu_issues.json', 'w') as updated_json_file:
    json.dump(runs_data, updated_json_file, indent=4)

print("Data has been successfully updated and written to 'updated_runs_with_tpu_issues.json'")


Data has been successfully updated and written to 'updated_runs_with_tpu_issues.json'
