In [3]:
# 1. IMPORTS
import pandas as pd
import numpy as np
from pathlib import Path
from constant import Neo4jTimeConstant

# Output-file prefix -> exported metric CSV. Flattened into (prefix, path)
# tuples, in declaration order, so each entry can be splatted into process().
inputs = list({
    'cpu-usage': '../thesis-data/neo4j-cpu.csv',
    'memory-usage': '../thesis-data/neo4j-memory_usage.csv',
    'memory-working': '../thesis-data/neo4j-memory_working.csv',
}.items())

# 2. SETUP AND DATA LOADING
# The input CSV locations are configured in `inputs` above; adjust those paths to where your files live.
def _default_repetition_windows():
    """Return the four repetition windows declared on ``Neo4jTimeConstant``.

    The result maps the repetition number to its ``(start, end)`` pair.
    """
    return {
        1: (Neo4jTimeConstant.utc_start_1, Neo4jTimeConstant.utc_end_1),
        2: (Neo4jTimeConstant.utc_start_2, Neo4jTimeConstant.utc_end_2),
        3: (Neo4jTimeConstant.utc_start_3, Neo4jTimeConstant.utc_end_3),
        4: (Neo4jTimeConstant.utc_start_4, Neo4jTimeConstant.utc_end_4),
    }


def _annotate_repetitions(data, windows):
    """Return a copy of *data* with repetition, delta-time and measure columns.

    *data* must carry a timezone-aware ``date`` column. Rows are ordered by
    ``date`` first so that ``measure_#`` counts samples chronologically.
    """
    # Stable sort: rows sharing a timestamp keep their original relative order,
    # and already-sorted input comes out unchanged.
    data = data.sort_values('date', kind='stable').copy()

    # A row belongs to a repetition when start <= date < end; rows outside
    # every window are labelled 0.
    conditions = [
        (data['date'] >= start) & (data['date'] < end)
        for start, end in windows.values()
    ]
    data['repetion'] = np.select(conditions, list(windows), default=0)

    # Seconds elapsed since the start of the row's repetition. Repetition 0 has
    # no start time, so the mapping yields NaT and delta_time becomes NaN.
    starts = {rep: start for rep, (start, _end) in windows.items()}
    start_times_col = data['repetion'].map(starts)
    data['delta_time'] = (data['date'] - start_times_col).dt.total_seconds()

    # 1-based sample counter within each repetition; 0 outside any repetition.
    data['measure_#'] = data.groupby('repetion').cumcount() + 1
    data.loc[data['repetion'] == 0, 'measure_#'] = 0
    return data


def process(prefix: str, file: str, *, output_dir="./separated-data", windows=None) -> None:
    """Split a metrics CSV into one annotated CSV per pod.

    The input must provide the columns ``pod``, ``Time`` (parsed as epoch
    milliseconds, UTC) and ``Value #A``. For every pod a file named
    ``{prefix}-{pod.lower()}.csv`` is written with the columns ``date``,
    ``value``, ``repetion``, ``measure_#`` and ``delta_time``, and a short
    sample of repetition 1 is printed.

    Args:
        prefix: Prefix of every output file name.
        file: Path of the CSV file to read.
        output_dir: Directory receiving the per-pod files; created if missing.
        windows: Mapping of repetition number to a ``(start, end)`` pair of
            timezone-aware timestamps, with ``start`` inclusive and ``end``
            exclusive. Defaults to the four windows of ``Neo4jTimeConstant``.

    Raises:
        ValueError: If *windows* is given but empty.
    """
    if windows is None:
        windows = _default_repetition_windows()
    if not windows:
        raise ValueError("windows must define at least one repetition")

    df = pd.read_csv(file)
    # Parse timestamps and normalise the value column once for the whole file
    # instead of once per pod.
    df['date'] = pd.to_datetime(df['Time'], unit='ms', utc=True)
    df = df.rename(columns={'Value #A': 'value'})

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # groupby partitions the frame in a single pass; sort=False keeps pods in
    # order of first appearance, and rows without a pod label are dropped
    # instead of crashing on `pod.lower()`.
    for pod, pod_rows in df.groupby('pod', sort=False):
        # --- Data Preparation ---
        data = _annotate_repetitions(pod_rows, windows)

        # --- Save Processed Data ---
        out_df = data[['date', 'value', 'repetion', 'measure_#', 'delta_time']]
        file_path = output_dir / f"{prefix}-{pod.lower()}.csv"
        out_df.to_csv(file_path, index=False)

        print(f"Saved data for pod {pod} to {file_path}")
        print("\nSample of data with the new 'delta_time' column:")
        # Display rows from a repetition to see the new column in action
        print(out_df[out_df['repetion'] == 1].head())


# Run the split for every configured (prefix, file) pair. The pair is unpacked
# into named variables rather than bound to `input`, which would shadow the
# builtin for the rest of the notebook session.
for prefix, file in inputs:
    process(prefix, file)

Saved data for pod custom-collector-849864d698-glb2p to separated-data/cpu-usage-custom-collector-849864d698-glb2p.csv

Sample of data with the new 'delta_time' column:
                        date         value  repetion  measure_#  delta_time
33 2025-09-21 06:29:10+00:00  8.979316e-07         1          1         0.0
34 2025-09-21 06:29:20+00:00  2.624173e-04         1          2        10.0
35 2025-09-21 06:29:30+00:00  2.624173e-04         1          3        20.0
36 2025-09-21 06:29:40+00:00  2.624173e-04         1          4        30.0
37 2025-09-21 06:29:50+00:00  2.631392e-03         1          5        40.0
Saved data for pod sample-service-custom-5f89c5dc6c-hqjq4 to separated-data/cpu-usage-sample-service-custom-5f89c5dc6c-hqjq4.csv

Sample of data with the new 'delta_time' column:
                         date     value  repetion  measure_#  delta_time
180 2025-09-21 06:29:10+00:00  0.000586         1          1         0.0
181 2025-09-21 06:29:20+00:00  0.001865         1 