In [1]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

# DTX Darshan logs

In [2]:
# Name of the application under study; it also selects the directory
# that holds the log.
application_name = "nwchem"
# Relative path of the Darshan log that the rest of the notebook reads.
darshan_file = "../%s/darshan/nwchem.log" % application_name

## Read the metadata for the log

In [3]:
def read_metadata(filename):
    """Parse the leading ``#`` comment header of a text log into a dict.

    Reading stops at the first non-blank line that does not start with
    ``#``. Header lines are split at their first ``:`` into a key and a
    value; header lines without a ``:`` are skipped.

    Parameters
    ----------
    filename : str
        Path of the log file to read.

    Returns
    -------
    dict
        Maps each key to the list of values found for it, in file order.
        A key that appears on several header lines gets one entry per line.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    metadata = {}
    # The context manager closes the file even if parsing raises part-way.
    with open(filename) as inf:
        for line in inf:
            # ignore blank lines
            if len(line) < 2:
                continue
            # stop when the header section is finished
            if line[0] != "#":
                break
            delimiter = line.find(":")
            if delimiter == -1:
                continue
            # Skip the two-character "# " marker in front of the key.
            key = line[2:delimiter].strip()
            # Slice to the end of the line and let strip() drop the newline:
            # slicing to -1 would cut a real character off a final line
            # that has no trailing newline.
            value = line[delimiter + 1:].strip()
            metadata.setdefault(key, []).append(value)
    return metadata

In [4]:
# Parse the '#' header of the log into a dict of key -> list of values.
metadata = read_metadata(filename=darshan_file)

In [5]:
# Show which header keys were found in the log.
print("Metadata available for %s" % (list(metadata),))



In [6]:
# Every metadata entry is a list of values, so this prints the list form.
print("Darshan version: %s" % (metadata["darshan log version"],))

Darshan version: ['3.10']


## Read the Darshan log

In [7]:
# Each data row of the Darshan log holds eight tab-separated fields:
# <module> <rank> <record id> <counter> <value> <file name> <mount pt> <fs type>
# The log carries no column header row, so the names are supplied here.
# comment="#" makes pandas ignore everything from a '#' to the end of a line,
# which is what drops the metadata header lines.

df = pd.read_csv(
    darshan_file,
    sep="\t",
    comment="#",
    header=None,
    names=[
        "IOType",
        "Rank",
        "RecordID",
        "Counter",
        "Value",
        "FileName",
        "MountPt",
        "FSType",
    ],
)
df.head()

Unnamed: 0,IOType,Rank,RecordID,Counter,Value,FileName,MountPt,FSType
0,POSIX,0,1828374797521024576,POSIX_OPENS,2.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
1,POSIX,0,1828374797521024576,POSIX_FILENOS,0.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
2,POSIX,0,1828374797521024576,POSIX_DUPS,0.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
3,POSIX,0,1828374797521024576,POSIX_READS,9.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
4,POSIX,0,1828374797521024576,POSIX_WRITES,0.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs


## Log Summary

In [8]:
# Gather job-level facts from the header metadata and the record table.
log_summary = {}
# int(float(...)) leaves whole-second values unchanged and also accepts a
# fractional "run time" string, which int() alone would reject.
log_summary["Walltime"] = int(float(metadata["run time"][0]))
log_summary["Total_ranks"] = int(metadata["nprocs"][0])
log_summary["IO_Types"] = df['IOType'].unique()
log_summary["FS_Types"] = df['FSType'].unique()
log_summary["#ranks_involved"] = len(df['Rank'].unique())
log_summary["#files_accessed"] = len(df['FileName'].unique())

# Get access size for each IO type, read from the "<IO type> module" header
# entry (first occurrence).
# NOTE(review): the header value looks like the module's size in the log
# (e.g. "37034 bytes (compressed), ...") - confirm "access size" is the
# intended meaning.
log_summary["Access_size_bytes"] = {}
for io_type in log_summary["IO_Types"]:
    module_info = metadata["%s module" % io_type][0]
    # Assuming all logs record bytes. The size is the first
    # whitespace-separated token; split() is used instead of slicing up to
    # find(" ") because find() returns -1 when there is no space, which
    # would silently drop the last digit.
    log_summary["Access_size_bytes"][io_type] = int(module_info.split()[0])

In [9]:
# Print the summary, one "name value" pair per line.
print("Stats for %s" % application_name)
print("---------")
for stat_name, stat_value in log_summary.items():
    print(stat_name, stat_value)

Stats for nwchem
---------
Walltime 177
Total_ranks 224
IO_Types ['POSIX' 'STDIO']
FS_Types ['gpfs' 'UNKNOWN']
#ranks_involved 224
#files_accessed 468
Access_size_bytes {'POSIX': 37034, 'STDIO': 414}


# Plot Darshan statistics