Example of input file paths:

/home/ana/Documents/2023/ldms-darshan-analysis/ior/eclipse/darshan-ldms-output/csv/8m-4m/17901504-IOR_pscratch_1024_none.csv
/home/ana/Documents/2023/ldms-darshan-analysis/ior/eclipse/darshan-ldms-output/csv/8m-4m/17893042-IOR_pscratch_1024_cpu.csv
/home/ana/Documents/2023/ldms-darshan-analysis/ior/eclipse/darshan-ldms-output/csv/8m-4m/17895745-IOR_pscratch_1024_memory.csv

Log with multiple jobs:
/home/ana/Documents/2023/ldms-darshan-analysis/ior/eclipse/darshan-ldms-output/csv/8m-4m/test_all_jobs.csv

### TODOs
- [ ] Visualizations code in R and python
- [ ] Identify app phase:
- [ ] Identify longer operations 
- [ ] Identify long intervals between last read/write and a met operation open/close 
- [ ] Identify distance between the first rank to finish and others
- [ ] Identify long intervals between operations in the same rank


In [1]:
import os, csv, time, glob, argparse, psutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt 
import rpy2.robjects as ro
import seaborn as sns
from datetime import datetime

class Job:

    def __init__(self, job, ranks, nodes, users, filename, exe):
        
        self.job = job
        self.ranks = ranks
        self.nodes = nodes
        self.users = users
        self.filename = filename
        self.exe = exe

def app_phase(df, output_file, self):
    write_to_file("---------------------------------------")
    write_to_file("EXECUTION SUMMARY PER APPLICATION PHASE:")
    write_to_file("---------------------------------------")

# Calculate and write general statistics in a file
def get_statistics(df, output_file, self):

    with open(output_file, 'w') as f:

        def write_to_file(*args):
            print(" ".join(map(str, args)), file=f, flush=True)

        write_to_file("---------------------------------------")
        write_to_file("JOB CHARACTERISTICS:")
        write_to_file("---------------------------------------")
        write_to_file("Job ID:", self.job)
        write_to_file(len(self.ranks), "Rank (s):", sorted(self.ranks))
        write_to_file(len(self.nodes), "Node (s):", sorted(self.nodes))
        write_to_file("User ID:", self.users)
        write_to_file("Directory:", self.exe)
        write_to_file("Modules collected:", df['module'].unique())
        write_to_file("Module data (MOD):", list(df.type).count('MOD'))
        write_to_file("Meta data (MET):", list(df.type).count('MET'))
        exec_time = round(df['end'].max() - df['start'].min(), 5)
        write_to_file("I/O runtime:", exec_time, "seconds")
        write_to_file("Bandwidth (MiB/second):", round((df['len'].sum() / exec_time) / (1024 ** 2), 2))

        df_rw = df[df['op'].isin(["read", "write"])]
        df_read = df[df['op'] == "read"]
        df_write = df[df['op'] == "write"]

        write_to_file("---------------------------------------")
        write_to_file("I/O OPERATIONS:")
        write_to_file("---------------------------------------")

        current_op = None
        phase_start = None
        total_durations = {'read': 0, 'write': 0, 'open': 0, 'close': 0}

        def update_total_duration(op, phase_start, phase_end, length):
            if current_op is not None and current_op == op:
                total_durations[op] += (phase_end - phase_start)

        for index, row in df.iterrows():
            if current_op is None or current_op != row['op']:
                update_total_duration(current_op, phase_start, row['end'], row['len'])
                current_op = row['op']
                phase_start = row['start']

        # Get the last phase
        update_total_duration(current_op, phase_start, row['end'], row['len'])

        pivot_df = df.pivot_table(index=None, columns='op', values='len', aggfunc='sum')
        for op, duration in total_durations.items():
            write_to_file(f'Duration {op}s: {round(duration, 4)} seconds')
            if op == "read" or op == "write":
                bytesproc = round((pivot_df[op].max() / (1024 ** 2)) / duration, 4)
                write_to_file("Bandwidth:", bytesproc, "(MiB/second)")

        write_to_file("\nREADS:", round(df_read['len'].sum() / (1024 ** 2)),  "(MiB)")
        write_to_file("Max size per rank:", round(df_read.groupby('rank')['len'].agg('sum').max() / (1024 ** 2)), "MiB")
        write_to_file("Min size per rank:", round(df_read.groupby('rank')['len'].agg('sum').min() / (1024 ** 2)), "MiB")
        write_to_file("Bandwidth (MiB/second):", round((df_read['len'].sum() / (df_read['end'].max() - df_read['start'].min())) / (1024 ** 2), 2))
        
        write_to_file("\nWRITES:", round(df_write['len'].sum() / (1024 ** 2)), "(MiB)")
        write_to_file("Max size per rank:", round(df_write.groupby('rank')['len'].agg('sum').max() / (1024 ** 2)), "MiB")
        write_to_file("Min size per rank:", round(df_write.groupby('rank')['len'].agg('sum').min() / (1024 ** 2)), "MiB")
        write_to_file("Bandwidth (MiB/second):", round((df_write['len'].sum() / (df_write['end'].max() - df_write['start'].min())) / (1024 ** 2),2))

        # IMBALANCE METRICS:
        # Average = sum time computing / number of ranks
        # Imbalance time = time that would be saved if the load was perfectly balanced across resources
        # Percent Imbalance = performance that could be gained if load was perfectly balanced
        # Imbalance Percentage = percentage of time that resources (excluding the slowest one) are
        # not involved in computing
        write_to_file("---------------------------------------")
        write_to_file("LOAD IMBALANCE METRICS:")
        write_to_file("---------------------------------------")
        # Get difference between execution time and time processing I/O per rank
        df_idle = df.groupby('rank')['dur'].sum().reset_index()
        df_idle.columns = ['Rank', 'I/O Time']
        df_idle['Total time - I/O Time'] = exec_time - df_idle['I/O Time']
        df_idle = df_idle.sort_values(by='Total time - I/O Time', ascending=False)

        write_to_file("Total execution time:", exec_time)
        num_ranks = df_idle['I/O Time'].nunique()
        average = df_idle['I/O Time'].sum() / num_ranks
        write_to_file("- Average:", round(average), "seconds")
        it = df_idle['I/O Time'].max() - average
        write_to_file("- Imbalance Time:", round(it, 2), "seconds")
        pi = ((df_idle['I/O Time'].max() / average) - 1) * 100
        write_to_file("- Percent Imbalance:", round(pi, 2), "%")
        ip = (it / df_idle['I/O Time'].max()) * (num_ranks / (num_ranks - 1))
        write_to_file("- Imbalance Percentage:", round(ip, 2), "%")
        std = np.std(df_idle['I/O Time'])
        write_to_file("- Standard deviation", round(std, 2))

        write_to_file("---------------------------------------")
        write_to_file("SUMMARY PER RANK: ")
        write_to_file("---------------------------------------")
        write_to_file("Total time without executing I/O operations:")
        df['start'] = pd.to_datetime(df['start'], unit='s').dt.round('S')
        df['end'] = pd.to_datetime(df['end'], unit='s').dt.round('S')
        write_to_file(df_idle)
    
# Define jobs characteristics
def main(filename): 

    df = pd.read_csv(filename, engine="pyarrow")

    # Get basic info about each Job:
    local_df = pd.DataFrame()
    for i in df.job_id.unique():
        
        local_df = df[df['job_id'] == i]
        job = Job(i, local_df['rank'].unique(), local_df['ProducerName'].unique(),local_df['uid'].unique(), 
            local_df['file'].unique(), local_df['exe'].unique())

        local_df['start'] = local_df['timestamp'] - local_df['dur']
        local_df['end'] = local_df['timestamp']
        
        # Job characteristics and statistics:  
        output_file = filename.replace(".csv", ".txt")
        get_statistics(local_df, output_file, job)
       
        # Job visualizations
        # get_visualizations_R(args.input, "./figures/ior/teste.png")
        # get_visualizations_py(local_df, "./figures/ior/teste.png")

    # if(system):
    #     correlate_system(args.input, args.system)

In [2]:
if __name__ == '__main__':

    start_time_exec = time.time()
    # filename = input('Insert absolut LDMS-Darshan log path:')
    main("/home/ana/Documents/2023/ldms-darshan-analysis/ior/eclipse/darshan-ldms-output/csv/8m-4m/test_all_jobs.csv")   
    end_time_exec = time.time()
    print("Execution:", end_time_exec - start_time_exec)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  local_df['start'] = local_df['timestamp'] - local_df['dur']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  local_df['end'] = local_df['timestamp']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start'] = pd.to_datetime(df['start'], unit='s').dt.round('S')
A value is trying to be set on a copy o

Execution: 14.834666728973389


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['start'] = pd.to_datetime(df['start'], unit='s').dt.round('S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['end'] = pd.to_datetime(df['end'], unit='s').dt.round('S')


# Visualizations in R

Load libraries