In [1]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import re

In [2]:
application_name = "nwchem"
postfix_name = ""
darshan_file = "../%s/darshan/%s%s" %(application_name, application_name, postfix_name)
feature_list = {}

In [3]:
# Aggregaed log information
df_agg = pd.read_csv(darshan_file + ".log", delimiter='\t', comment='#',
                     names=['IOType', 'Rank', 'RecordID', 'Counter', 'Value',
                            'FileName', 'MountPt', 'FSType'])
df_agg.head()

Unnamed: 0,IOType,Rank,RecordID,Counter,Value,FileName,MountPt,FSType
0,POSIX,0,1828374797521024576,POSIX_OPENS,2.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
1,POSIX,0,1828374797521024576,POSIX_FILENOS,0.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
2,POSIX,0,1828374797521024576,POSIX_DUPS,0.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
3,POSIX,0,1828374797521024576,POSIX_READS,9.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
4,POSIX,0,1828374797521024576,POSIX_WRITES,0.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs


In [4]:
# Metadata information
def read_metadata(filename):
    metadata = {}
    inf = open(filename)
    for line in inf:
        # ignore blank lines
        if len(line) < 2:
            continue
        # stop when the header section is finished
        if line[0] != "#":
            break
        delimiter = line.find(":")
        if delimiter == -1:
            continue
        key = line[2 : delimiter].lstrip().rstrip()
        value = line[delimiter + 1 : -1].lstrip().rstrip()
        if key not in metadata:
            metadata[key] = []
        metadata[key].append(value)
    inf.close()
    return metadata

metadata = read_metadata(darshan_file + ".log")
print("Metadata available for %s" %([i for i in metadata]))



In [5]:
# DXT information

def get_accessed_files(file):
    inf = open(file)
    file_list = []
    cfile = 0
    for line in inf:
        if "DXT, file_id:" in line:
            cfile = line[50:-1]
        if "DXT, write_count:" in line:
            access_cnt = sum([int(i) for i in re.findall(r'\d+', line)])
            file_list += [cfile] * access_cnt
    inf.close()
    return file_list


df = pd.read_csv(darshan_file + ".dxt.log", delimiter='\t', comment='#',
                 names=["Module", "Rank", "IOType", "Segment", "Offset",
                        "Length", "Start", "End"])
# Add information about the added files
df["File"] = get_accessed_files(darshan_file + ".dxt.log")
# Filter out entries with length 0
df = df[df.Length > 0]
df.head()

Unnamed: 0,Module,Rank,IOType,Segment,Offset,Length,Start,End,File
0,X_POSIX,0,read,0,0,269,0.0475,0.0475,pfs/alpine/csc143/proj-shared/againaru/nwchem/...
3,X_POSIX,0,read,3,0,269,0.128,0.128,pfs/alpine/csc143/proj-shared/againaru/nwchem/...
6,X_POSIX,0,read,6,0,269,0.3202,0.3202,pfs/alpine/csc143/proj-shared/againaru/nwchem/...
9,X_POSIX,0,read,0,0,3425,0.2738,0.2738,gpfs/alpine/csc143/proj-shared/againaru/nwchem...
11,X_POSIX,0,read,0,0,8192,0.4403,0.442,pfs/alpine/csc143/proj-shared/againaru/nwchem/...


# Gather features according to ANL list

In [6]:
type_op = ["READ", "WRITE"]
minperf = {}
for op in type_op:
    # the temp dataframes will have one entry per file
    temp_time = df_agg[(df_agg.Counter == "POSIX_F_MAX_%s_TIME" %(op)) & (df_agg.Value > 0)]
    temp_size = df_agg[(df_agg.Counter == "POSIX_MAX_%s_TIME_SIZE" %(op)) & (df_agg.Value > 0)]

    fast_recordid = temp_time['RecordID'].unique()
    perf = {}
    for record in fast_recordid:
        # time / size
        time = temp_time[(temp_time.RecordID == record)]["Value"].values[0]
        size = temp_size[(temp_size.RecordID == record)]["Value"].values[0]
        if record not in perf:
            perf[record] = size / time
        if size / time < perf[record]:
            perf[record] = size / time
    minperf[op] = min([perf[i] for i in perf])
    print(minperf[op])
    
feature_list["POSIX_RAW_agg_perf_by_slowest"] = min(minperf[i] for i in minperf)

2288907.5160659403
1073.1952712333361


In [7]:
type_op = ["READ", "WRITTEN"]
feature_list["POSIX_RAW_total_bytes"] = 0
for op in type_op:
    feature_list["POSIX_RAW_total_bytes"] += df_agg[(df_agg.Counter == "POSIX_BYTES_%s" %(op)) &
                                                    (df_agg.Value > 0)]["Value"].sum()

In [8]:
feature_list["RAW_nprocs"] = int(metadata["nprocs"][0])
feature_list["RAW_runtime"] = int(metadata["run time"][0])
feature_list["users"] = len(metadata["uid"][0].split(" "))
feature_list["apps"] = len(metadata["jobid"][0].split(" "))
feature_list["apps_short"] = 0

In [9]:
feature_list["POSIX_RAW_OPENS"] = df_agg[df_agg.Counter == "POSIX_OPENS"]["Value"].sum()

In [10]:
type_op = ['READS', 'WRITES', 'OPENS', 'SEEKS', 'STATS', 'MMAPS', 'SYNCS']
total_ops = 0
for op in type_op:
    total_ops += df_agg[(df_agg.Counter == "POSIX_" + op) & (df_agg.Value > 0)]["Value"].sum()

In [35]:
feature_list["POSIX_RAW_total_accesses"] = total_ops
feature_list["POSIX_RAW_total_files"] = len(df_agg[df_agg.Value > 0]["FileName"].unique())

In [12]:
type_ops = ["READ", "WRITTEN"]
for op in type_ops:
    feature_list["POSIX_BYTES_%s_PERC" %(op)] = df_agg[df_agg.Counter == "POSIX_BYTES_%s" %(op)]["Value"].sum() * 100 /\
                                                feature_list["POSIX_RAW_total_bytes"]

In [36]:
# Shared files are the ones accessed by more than one rank
temp = []
for _, group in df_agg[df_agg.Value > 0].groupby('FileName'):
    temp.append(len(group.Rank.unique()))
feature_list["POSIX_unique_files_perc"] = len([i for i in temp if i == 1]) * 100 / feature_list["POSIX_RAW_total_files"]
feature_list["POSIX_shared_files_perc"] = len([i for i in temp if i > 1]) * 100 / feature_list["POSIX_RAW_total_files"]

In [80]:
# Read only files are the ones that appear in only READ operations
write_set = set(df_agg[(df_agg.Value > 0) & (df_agg.Counter.str.contains("WRITE|WRITTEN"))]["FileName"].unique())
read_set = set(df_agg[(df_agg.Value > 0) & (df_agg.Counter.str.contains("READ"))]["FileName"].unique())

feature_list["POSIX_read_only_files_perc"] = len(read_set - write_set) * 100 / len(read_set | write_set)
feature_list["POSIX_read_write_files_perc"] = len(write_set & read_set) * 100 / len(read_set | write_set)
feature_list["POSIX_write_only_files_perc"] = len(write_set - read_set) * 100 / len(read_set | write_set)

In [91]:
# Percentage is defined as sum of each counter over total number of accesses
type_op = ["POSIX_WRITES_PERC", "POSIX_RW_SWITCHES_PERC",
                "POSIX_SEQ_READS_PERC", "POSIX_SEQ_WRITES_PERC",
                "POSIX_CONSEC_READS_PERC", "POSIX_CONSEC_WRITES_PERC",
                "POSIX_FILE_NOT_ALIGNED_PERC", "POSIX_MEM_NOT_ALIGNED_PERC",
                "POSIX_SIZE_READ_0_100_PERC", "POSIX_SIZE_READ_100_1K_PERC",
                "POSIX_SIZE_READ_1K_10K_PERC", "POSIX_SIZE_READ_10K_100K_PERC",
                "POSIX_SIZE_READ_100K_1M_PERC", "POSIX_SIZE_READ_1M_4M_PERC",
                "POSIX_SIZE_READ_4M_10M_PERC", "POSIX_SIZE_READ_10M_100M_PERC",
                "POSIX_SIZE_READ_100M_1G_PERC", "POSIX_SIZE_READ_1G_PLUS_PERC",
                "POSIX_SIZE_WRITE_0_100_PERC", "POSIX_SIZE_WRITE_100_1K_PERC",
                "POSIX_SIZE_WRITE_1K_10K_PERC", "POSIX_SIZE_WRITE_10K_100K_PERC",
                "POSIX_SIZE_WRITE_100K_1M_PERC", "POSIX_SIZE_WRITE_1M_4M_PERC",
                "POSIX_SIZE_WRITE_4M_10M_PERC", "POSIX_SIZE_WRITE_10M_100M_PERC",
                "POSIX_SIZE_WRITE_100M_1G_PERC", "POSIX_SIZE_WRITE_1G_PLUS_PERC",
                "POSIX_ACCESS1_COUNT_PERC", "POSIX_ACCESS2_COUNT_PERC",
                "POSIX_ACCESS3_COUNT_PERC", "POSIX_ACCESS4_COUNT_PERC"]
for op_perc in type_op:
    op = op_perc[:-5]
    feature_list[op_perc] = df_agg[(df_agg.Value > 0) & (df_agg.Counter == op)]["Value"].sum() * \
                            100 / feature_list["POSIX_RAW_total_accesses"]

In [18]:
feature_list["POSIX_unique_bytes_perc"] = -1
feature_list["POSIX_shared_bytes_perc"] = -1
feature_list["POSIX_read_only_bytes_perc"] = -1
feature_list["POSIX_read_write_bytes_perc"] = -1
feature_list["POSIX_write_only_bytes_perc"] = -1

In [93]:
extra = ["POSIX_LOG10_agg_perf_by_slowest", "POSIX_LOG10_MODE",
    "POSIX_LOG10_total_bytes", "LOG10_nprocs", "POSIX_LOG10_SEEKS",
    "LOG10_runtime", "POSIX_LOG10_STATS", "POSIX_LOG10_MMAPS",
    "POSIX_LOG10_FSYNCS", "POSIX_LOG10_MEM_ALIGNMENT",
    "POSIX_LOG10_FILE_ALIGNMENT", "POSIX_LOG10_OPENS",
    "POSIX_LOG10_total_accesses", "POSIX_LOG10_total_files"]
for i in extra:
    feature_list[i] = -1

In [94]:
feature_list

{'POSIX_RAW_agg_perf_by_slowest': 1073.1952712333361,
 'POSIX_RAW_total_bytes': 38031035998.0,
 'RAW_nprocs': 224,
 'RAW_runtime': 177,
 'users': 1,
 'apps': 1,
 'apps_short': 0,
 'POSIX_RAW_OPENS': 469.0,
 'POSIX_RAW_total_accesses': 230064.0,
 'POSIX_RAW_total_files': 468,
 'POSIX_BYTES_READ_PERC': 0.05875323512400521,
 'POSIX_BYTES_WRITTEN_PERC': 99.941246764876,
 'POSIX_unique_bytes_perc': -1,
 'POSIX_shared_bytes_perc': -1,
 'POSIX_read_only_bytes_perc': -1,
 'POSIX_read_write_bytes_perc': -1,
 'POSIX_write_only_bytes_perc': -1,
 'POSIX_unique_files_perc': 100.0,
 'POSIX_shared_files_perc': 0.0,
 'POSIX_read_only_files_perc': 21.05263157894737,
 'POSIX_read_write_files_perc': 10.526315789473685,
 'POSIX_write_only_files_perc': 68.42105263157895,
 'POSIX_WRITES_PERC': 98.40001043187982,
 'POSIX_RW_SWITCHES_PERC': 0.0,
 'POSIX_SEQ_READS_PERC': 1.187060991724042,
 'POSIX_SEQ_WRITES_PERC': 98.39392516864872,
 'POSIX_CONSEC_READS_PERC': 1.187060991724042,
 'POSIX_CONSEC_WRITES_PERC': 9

In [None]:
## TODEL
# There are files that only read/write 0 bytes and are still accessed

temp = set(df_agg[(df_agg.Value > 0) & (~df_agg.Counter.str.contains("READ")) & (~df_agg.Counter.str.contains("WRITE|WRITTEN"))]["FileName"].unique())
read_set = set(df_agg[(df_agg.Value > 0) & (df_agg.Counter.str.contains("READ"))]["FileName"].unique())
write_set = set(df_agg[(df_agg.Value > 0) & (df_agg.Counter.str.contains("WRITE|WRITTEN"))]["FileName"].unique())

print("/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_s.182" in read_set)
x = temp - read_set
print(x - write_set)
df_agg[(df_agg.Value > 0) & (df_agg.FileName == "/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_s.182")]