In [None]:
from bisect import bisect_right
from datetime import datetime, timedelta
from pathlib import Path
import re

import pandas as pd

from das_anomaly.settings import SETTINGS


# Location of the events CSV -- point this at your own file.
file_path = 'events.csv'
df = pd.read_csv(file_path)

# The event identifiers are read positionally from the second column (index 1).
second_column = df.iloc[:, 1]


def _first_15(cell):
    """Return the first 15 characters of *cell* as text, or '' when it is missing."""
    if pd.notnull(cell):
        return str(cell)[:15]
    return ''


# Keep only the leading 15 characters (e.g. "20220627_220846") of every entry,
# then discard the entries that came out empty (the originally-missing cells).
first_15_chars = second_column.apply(_first_15)
first_15_chars = first_15_chars[first_15_chars.ne('')]

# Re-label "20220627_220846" as "YMD20220627-HMS220846".
converted_format = 'YMD' + first_15_chars.str.replace('_', '-HMS', regex=False)

# Show the converted labels and how many there are.
print(converted_format)
print(len(converted_format))

In [None]:
# No trimming is applied here: the labels are carried over unchanged (the
# original `x[:]` slice kept every character). The full "HMS%H%M%S" part must
# stay intact because the labels are later parsed with the format
# "YMD%Y%m%d-HMS%H%M%S". `trimmed_format` is an independent copy so later
# cells can keep using this name.
trimmed_format = converted_format.copy()

# Print one label per line, followed by the number of labels.
for label in trimmed_format:
    print(label)

print(len(trimmed_format))

In [None]:
# Marker identifying the entries to discard.
substring_to_remove = "YMDYMD20220603-H"

# Keep every entry that does NOT contain the marker; an entry containing it is
# dropped as a whole (the marker is not stripped out of the string).
modified_list = []
for entry in trimmed_format:
    if substring_to_remove in entry:
        continue
    modified_list.append(entry)

# Show the surviving entries and their count.
print(modified_list)
print(len(modified_list))


In [None]:
# Discard every entry whose text contains "vent"; all other entries are kept
# in their original order.
filtered_list = list(filter(lambda entry: "vent" not in entry, modified_list))

# Show the remaining entries and their count.
print(filtered_list)
print(len(filtered_list))

In [None]:
def find_png_files(directory: str | Path) -> list[str]:
    """
    Recursively collect *.png files under *directory* and return
    a sorted list of absolute paths (as strings).
    """
    directory = Path(directory).expanduser().resolve()
    return sorted(str(p) for p in directory.rglob("*.png"))

# Image roots to scan: the training and test image folders from SETTINGS.
directory_paths = [SETTINGS.TRAIN_IMAGES_PATH, SETTINGS.TEST_IMAGES_PATH]

# One flat list with the PNG paths of every root, in root order
# (each root's paths are already sorted by find_png_files).
png_file_paths_total: list[str] = [
    png_path
    for image_dir in directory_paths
    for png_path in find_png_files(image_dir)
]

print("number of all files =", len(png_file_paths_total))

In [None]:
# ---------- helpers ----------------------------------------------------------
def _event_str_to_dt(event_str: str) -> datetime:
    """'YMD20220511-HMS025717'  ➜  2022‑05‑11 02:57:17"""
    return datetime.strptime(event_str, "YMD%Y%m%d-HMS%H%M%S")

_ts_re = re.compile(r"_(\d{4}_\d{2}_\d{2}T\d{2}_\d{2}_\d{2})__")

def _png_start_dt(png_path: str) -> datetime | None:
    """
    '/.../DAS_________2022_06_04T03_24_33__2022_06_04T03_24_35.png'
                        ^^^^^^^^^^^^^^^^^^^^
    """
    m = _ts_re.search(png_path)
    if m:
        return datetime.strptime(m.group(1), "%Y_%m_%dT%H_%M_%S")
    return None
# -----------------------------------------------------------------------------

def find_matching_pngs(
    png_paths: list[str],
    event_start_strings: list[str],
    seconds_front: int = 4,
) -> list[str]:
    """Return PNGs whose *first* timestamp falls in [event, event+seconds_front]."""
    # pre‑compute event windows to speed things up
    windows = [
        (evt_dt, evt_dt + timedelta(seconds=seconds_front))
        for evt_dt in map(_event_str_to_dt, event_start_strings)
    ]

    matched = []
    for path in png_paths:
        start_dt = _png_start_dt(path)
        if start_dt is None:
            continue
        # stop at first hit – each PNG only needs one match
        if any(lo <= start_dt <= hi for lo, hi in windows):
            matched.append(path)
    return matched


# ---------------------- example invocation -----------------------------------
# Match every collected PNG against the event labels obtained from the CSV.
# seconds_front=3 gives the inclusive window [t, t + 3 s] after each event start.
all_matched_files = find_matching_pngs(
    png_file_paths_total,
    filtered_list,
    seconds_front=3,
)

# One matched path per line, then the total.
for matched_path in all_matched_files:
    print(matched_path)
print(f"Total matched = {len(all_matched_files)}")
