In [1]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import re
import csv
import os.path

In [2]:
# Application whose Darshan logs are analysed, plus an optional suffix that
# is appended to the log file name.
application_name = "nwchem"
postfix_name = ""
# Common path prefix of the log files; the cells below append ".log" and
# ".dxt.log" to it.
darshan_file = "../{0}/darshan/{0}{1}".format(application_name, postfix_name)
# Every feature computed in this notebook is stored here, keyed by name.
feature_list = dict()

In [3]:
# Aggregated log information: tab-separated rows, lines starting with '#'
# (the header section) are skipped by the parser.
agg_columns = ['IOType', 'Rank', 'RecordID', 'Counter', 'Value',
               'FileName', 'MountPt', 'FSType']
df_agg = pd.read_csv(
    darshan_file + ".log",
    delimiter='\t',
    comment='#',
    names=agg_columns,
)
df_agg.head()

Unnamed: 0,IOType,Rank,RecordID,Counter,Value,FileName,MountPt,FSType
0,POSIX,0,1828374797521024576,POSIX_OPENS,2.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
1,POSIX,0,1828374797521024576,POSIX_FILENOS,0.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
2,POSIX,0,1828374797521024576,POSIX_DUPS,0.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
3,POSIX,0,1828374797521024576,POSIX_READS,9.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
4,POSIX,0,1828374797521024576,POSIX_WRITES,0.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs


In [4]:
# Metadata information
def read_metadata(filename):
    """Collect the ``# key: value`` entries found at the top of a text log.

    Reading stops at the first non-blank line that does not start with ``#``.
    Header lines without a ``:`` are skipped. A key may occur several times;
    all of its values are kept, in file order.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    dict
        Maps each key (the text between the leading ``"# "`` and the first
        ``:``, stripped) to the list of values (the text after that ``:``,
        stripped).
    """
    metadata = {}
    # The context manager closes the file even if parsing raises.
    with open(filename) as inf:
        for line in inf:
            # ignore blank (empty or whitespace-only) lines
            if not line.strip():
                continue
            # stop when the header section is finished
            if line[0] != "#":
                break
            delimiter = line.find(":")
            if delimiter == -1:
                continue
            # Skip the two leading characters ("# ") before the key.
            key = line[2:delimiter].strip()
            # strip() removes the newline when present, so the last line of a
            # file without a trailing newline keeps its final character.
            value = line[delimiter + 1:].strip()
            metadata.setdefault(key, []).append(value)
    return metadata

# Parse the header of the aggregated log and list the keys it provides
metadata = read_metadata(darshan_file + ".log")
print("Metadata available for %s" % list(metadata))



In [5]:
# DXT information

def get_accessed_files(file):
    """Return one file name per access announced in a DXT text log.

    The log is scanned line by line. A line containing ``"DXT, file_id:"``
    sets the current file name (the text following ``"file_name: "``). A line
    containing ``"DXT, write_count:"`` appends the current file name once for
    every access, where the number of accesses is the sum of all integers
    found on that line.

    Parameters
    ----------
    file : str
        Path of the DXT text log.

    Returns
    -------
    list
        File names repeated per access, in the order the count lines appear.
        Entries are ``0`` for count lines seen before any file-id line.
    """
    # The file name starts right after "file_name: " (11 characters).
    name_offset = len("file_name: ")
    file_list = []
    cfile = 0
    # The context manager closes the log even if a line fails to parse.
    with open(file) as inf:
        for line in inf:
            if "DXT, file_id:" in line:
                idx = line.find("file_name")
                cfile = line[idx + name_offset:].rstrip("\n")
            if "DXT, write_count:" in line:
                access_cnt = sum(int(i) for i in re.findall(r'\d+', line))
                file_list += [cfile] * access_cnt
    return file_list


# One row per traced access; '#' lines (the per-file DXT headers) are skipped
# by the parser and handled separately by get_accessed_files.
dxt_columns = ["Module", "Rank", "IOType", "Segment", "Offset",
               "Length", "Start", "End"]
df = pd.read_csv(darshan_file + ".dxt.log", delimiter='\t', comment='#',
                 names=dxt_columns)
# Add the file each access belongs to (one list entry per row; pandas raises
# if the list length does not match the number of rows)
df["File"] = get_accessed_files(darshan_file + ".dxt.log")
df["Offset_end"] = df["Offset"] + df["Length"]
# Filter out entries with length 0. The explicit copy makes df an independent
# frame, so later cells can add columns without SettingWithCopyWarning.
df = df[df.Length > 0].copy()
df.head()

Unnamed: 0,Module,Rank,IOType,Segment,Offset,Length,Start,End,File,Offset_end
0,X_POSIX,0,read,0,0,269,0.0475,0.0475,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,269
3,X_POSIX,0,read,3,0,269,0.128,0.128,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,269
6,X_POSIX,0,read,6,0,269,0.3202,0.3202,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,269
9,X_POSIX,0,read,0,0,3425,0.2738,0.2738,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,3425
11,X_POSIX,0,read,0,0,8192,0.4403,0.442,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,8192


# Gather features according to ANL list

## From the aggregated logs

In [6]:
# For every record, divide the POSIX_MAX_<op>_TIME_SIZE counter by the
# POSIX_F_MAX_<op>_TIME counter (bytes / seconds, presumably describing the
# slowest single operation on that file -- confirm against the Darshan
# counter documentation), then keep the smallest ratio per operation type.
type_op = ["READ", "WRITE"]
minperf = {}
for op in type_op:
    # the temp dataframes will have one entry per file
    slowest_time = df_agg[(df_agg.Counter == "POSIX_F_MAX_%s_TIME" % op) & (df_agg.Value > 0)]
    slowest_size = df_agg[(df_agg.Counter == "POSIX_MAX_%s_TIME_SIZE" % op) & (df_agg.Value > 0)]

    perf = {}
    for record in slowest_time['RecordID'].unique():
        elapsed = slowest_time[slowest_time.RecordID == record]["Value"].values[0]
        sizes = slowest_size[slowest_size.RecordID == record]["Value"].values
        # A record whose size counter is 0 was removed by the Value > 0
        # filter; skip it instead of failing on an empty selection.
        if len(sizes) == 0:
            continue
        perf[record] = sizes[0] / elapsed
    # Runs that only read (or only write) have no entry for the other type.
    if not perf:
        continue
    minperf[op] = min(perf.values())
    print(minperf[op])

if not minperf:
    raise ValueError("no POSIX read or write with a positive duration and size "
                     "was found in the aggregated log")
feature_list["POSIX_RAW_agg_perf_by_slowest"] = min(minperf.values())

2288907.5160659403
1073.1952712333361


In [7]:
# Total bytes moved through POSIX: POSIX_BYTES_READ plus POSIX_BYTES_WRITTEN,
# summed over all rows with a positive value.
type_op = ["READ", "WRITTEN"]
feature_list["POSIX_RAW_total_bytes"] = sum(
    df_agg.loc[(df_agg.Counter == "POSIX_BYTES_%s" % op) & (df_agg.Value > 0), "Value"].sum()
    for op in type_op
)

In [8]:
# Job-level features taken from the log header (first value of each key).
feature_list["RAW_nprocs"] = int(metadata["nprocs"][0])
# The run time may be printed with a fractional part; go through float so
# both "177" and "177.1234" are accepted, then truncate to whole seconds.
feature_list["RAW_runtime"] = int(float(metadata["run time"][0]))
# Number of space-separated entries in the uid / jobid header values.
feature_list["users"] = len(metadata["uid"][0].split(" "))
feature_list["apps"] = len(metadata["jobid"][0].split(" "))
feature_list["apps_short"] = 0

In [9]:
# Sum of the POSIX_OPENS counter over every row of the aggregated log
is_open_counter = df_agg.Counter == "POSIX_OPENS"
feature_list["POSIX_RAW_OPENS"] = df_agg.loc[is_open_counter, "Value"].sum()

In [10]:
# Total number of POSIX operations: sum of the positive values of the listed
# operation counters.
# "POSIX_SYNCS" matches no counter, so it silently contributed 0; the sync
# counter is named POSIX_FSYNCS (the name used in the LOG10 placeholder list
# later in this notebook).
# NOTE(review): Darshan also reports POSIX_FDSYNCS; add 'FDSYNCS' here if
# fdatasync calls should count as accesses too.
type_op = ['READS', 'WRITES', 'OPENS', 'SEEKS', 'STATS', 'MMAPS', 'FSYNCS']
total_ops = 0
for op in type_op:
    total_ops += df_agg[(df_agg.Counter == "POSIX_" + op) & (df_agg.Value > 0)]["Value"].sum()

In [11]:
feature_list["POSIX_RAW_total_accesses"] = total_ops
# Distinct file names that appear in at least one row with a positive value
touched_files = df_agg.loc[df_agg.Value > 0, "FileName"].unique()
feature_list["POSIX_RAW_total_files"] = len(touched_files)

In [12]:
# Share of the total byte volume that was read / written, in percent
type_ops = ["READ", "WRITTEN"]
for op in type_ops:
    counter_bytes = df_agg.loc[df_agg.Counter == "POSIX_BYTES_%s" % op, "Value"].sum()
    feature_list["POSIX_BYTES_%s_PERC" % op] = counter_bytes * 100 / feature_list["POSIX_RAW_total_bytes"]

In [13]:
# Shared files are the ones accessed by more than one rank.
# temp holds, for every file name, the number of distinct ranks that have at
# least one positive counter for it.
temp = [len(group.Rank.unique())
        for _, group in df_agg[df_agg.Value > 0].groupby('FileName')]
single_rank_files = sum(1 for ranks in temp if ranks == 1)
multi_rank_files = sum(1 for ranks in temp if ranks > 1)
feature_list["POSIX_unique_files_perc"] = single_rank_files * 100 / feature_list["POSIX_RAW_total_files"]
feature_list["POSIX_shared_files_perc"] = multi_rank_files * 100 / feature_list["POSIX_RAW_total_files"]

In [14]:
# Read only files are the ones that appear in only READ operations.
# A file counts as written (read) when any counter whose name contains
# WRITE/WRITTEN (READ) has a positive value for it.
has_value = df_agg.Value > 0
is_write_counter = df_agg.Counter.str.contains("WRITE|WRITTEN")
is_read_counter = df_agg.Counter.str.contains("READ")
write_set = set(df_agg.loc[has_value & is_write_counter, "FileName"].unique())
read_set = set(df_agg.loc[has_value & is_read_counter, "FileName"].unique())

# Percentages are relative to the files that were read or written at all
accessed_file_count = len(read_set | write_set)
feature_list["POSIX_read_only_files_perc"] = len(read_set - write_set) * 100 / accessed_file_count
feature_list["POSIX_read_write_files_perc"] = len(write_set & read_set) * 100 / accessed_file_count
feature_list["POSIX_write_only_files_perc"] = len(write_set - read_set) * 100 / accessed_file_count

In [15]:
# Percentage is defined as sum of each counter over total number of accesses
type_op = [
    "POSIX_WRITES_PERC", "POSIX_RW_SWITCHES_PERC", "POSIX_READS_PERC",
    "POSIX_FILE_NOT_ALIGNED_PERC", "POSIX_MEM_NOT_ALIGNED_PERC",
    "POSIX_SIZE_READ_0_100_PERC", "POSIX_SIZE_READ_100_1K_PERC",
    "POSIX_SIZE_READ_1K_10K_PERC", "POSIX_SIZE_READ_10K_100K_PERC",
    "POSIX_SIZE_READ_100K_1M_PERC", "POSIX_SIZE_READ_1M_4M_PERC",
    "POSIX_SIZE_READ_4M_10M_PERC", "POSIX_SIZE_READ_10M_100M_PERC",
    "POSIX_SIZE_READ_100M_1G_PERC", "POSIX_SIZE_READ_1G_PLUS_PERC",
    "POSIX_SIZE_WRITE_0_100_PERC", "POSIX_SIZE_WRITE_100_1K_PERC",
    "POSIX_SIZE_WRITE_1K_10K_PERC", "POSIX_SIZE_WRITE_10K_100K_PERC",
    "POSIX_SIZE_WRITE_100K_1M_PERC", "POSIX_SIZE_WRITE_1M_4M_PERC",
    "POSIX_SIZE_WRITE_4M_10M_PERC", "POSIX_SIZE_WRITE_10M_100M_PERC",
    "POSIX_SIZE_WRITE_100M_1G_PERC", "POSIX_SIZE_WRITE_1G_PLUS_PERC",
    "POSIX_ACCESS1_COUNT_PERC", "POSIX_ACCESS2_COUNT_PERC",
    "POSIX_ACCESS3_COUNT_PERC", "POSIX_ACCESS4_COUNT_PERC",
]
has_value = df_agg.Value > 0
for op_perc in type_op:
    # The counter name is the feature name without its "_PERC" suffix
    op = op_perc[:-len("_PERC")]
    counter_total = df_agg.loc[has_value & (df_agg.Counter == op), "Value"].sum()
    feature_list[op_perc] = counter_total * 100 / feature_list["POSIX_RAW_total_accesses"]

In [16]:
# Show every row that reports at least one POSIX write
df_agg.loc[(df_agg.Counter == "POSIX_WRITES") & (df_agg.Value > 0)]

Unnamed: 0,IOType,Rank,RecordID,Counter,Value,FileName,MountPt,FSType
434,POSIX,0,15975224631563294251,POSIX_WRITES,58.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
520,POSIX,0,9878691345467816968,POSIX_WRITES,315.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
606,POSIX,0,9400558082968463591,POSIX_WRITES,42000.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
692,POSIX,0,6885232322663352530,POSIX_WRITES,8.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
778,POSIX,0,9388230125761083179,POSIX_WRITES,1000.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
864,POSIX,0,7498566071233774021,POSIX_WRITES,1001.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
1122,POSIX,0,4784925612406122468,POSIX_WRITES,1.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
8432,POSIX,42,7571701736965779869,POSIX_WRITES,42000.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
15742,POSIX,84,269207620607162164,POSIX_WRITES,42000.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
23052,POSIX,126,15629356577228947362,POSIX_WRITES,42000.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs


In [17]:
# Percentage is defined by the sum of each counter over total writes or total reads
type_op = ["POSIX_SEQ_READS_PERC", "POSIX_SEQ_WRITES_PERC",
           "POSIX_CONSEC_READS_PERC", "POSIX_CONSEC_WRITES_PERC"]
for op_perc in type_op:
    # The counter name is the feature name without its "_PERC" suffix
    op = op_perc[:-len("_PERC")]
    # Read counters are normalised by POSIX_READS, all others by POSIX_WRITES
    denominator_counter = "POSIX_READS" if "READ" in op else "POSIX_WRITES"
    total_access_type = df_agg.loc[df_agg.Counter == denominator_counter, "Value"].sum()
    counter_total = df_agg.loc[(df_agg.Value > 0) & (df_agg.Counter == op), "Value"].sum()
    feature_list[op_perc] = counter_total * 100 / total_access_type

In [18]:
# Features that are not computed in this notebook; they are stored with the
# placeholder value -1.
extra = [
    "POSIX_LOG10_agg_perf_by_slowest", "POSIX_LOG10_MODE",
    "POSIX_LOG10_total_bytes", "LOG10_nprocs", "POSIX_LOG10_SEEKS",
    "LOG10_runtime", "POSIX_LOG10_STATS", "POSIX_LOG10_MMAPS",
    "POSIX_LOG10_FSYNCS", "POSIX_LOG10_MEM_ALIGNMENT",
    "POSIX_LOG10_FILE_ALIGNMENT", "POSIX_LOG10_OPENS",
    "POSIX_LOG10_total_accesses", "POSIX_LOG10_total_files",
]
feature_list.update(dict.fromkeys(extra, -1))

## From the DXT logs

In [19]:
# Making sure the information in the aggregated and the DXT logs are consistent:
# for every file in the DXT log, the POSIX bytes read / written there must
# equal the corresponding byte counter of the aggregated log. Mismatches are
# printed; nothing is printed when the two logs agree.
filelist = df["File"].unique()
is_posix = df.Module.str.contains("POSIX")
byte_checks = [("READ", "read", "POSIX_BYTES_READ"),
               ("WRITE", "write", "POSIX_BYTES_WRITTEN")]
for i in filelist:
    for label, io_type, counter in byte_checks:
        dxt = df[(df.File == i) & (df.IOType == io_type) & is_posix]["Length"].sum()
        # Match the file name exactly for both directions. A regex/substring
        # match would also pick up other files whose name merely contains
        # this one (e.g. "x.1" inside "x.10") and report false mismatches.
        agg = df_agg[(df_agg.FileName == i) & (df_agg.Counter == counter)]["Value"].sum()
        if dxt != agg:
            print(label, i, dxt, agg)

In [20]:
# for now do not check for overlapping intervals (only unique start end)
# Each (file, start offset, end offset) extent gets the total bytes moved
# over it ('sum') and the number of accesses that touched it ('count').
extent_keys = ["File", "Offset", "Offset_end"]
temp = (
    df[df.Length > 0]
    .groupby(extent_keys)["Length"]
    .agg(['sum', 'count'])
    .reset_index()
)

extent_bytes_total = temp["sum"].sum()
touched_once = temp['count'] == 1
touched_repeatedly = temp['count'] > 1
feature_list["POSIX_unique_bytes_perc"] = sum(temp[touched_once]["sum"]) * 100 / extent_bytes_total
feature_list["POSIX_shared_bytes_perc"] = sum(temp[touched_repeatedly]["sum"]) * 100 / extent_bytes_total

In [21]:
# for now do not check for overlapping intervals (only unique start end)
# An extent is identified by (file, start offset, end offset); the features
# count distinct extents, not bytes.
extent_columns = ["File", "Offset", "Offset_end"]
write_set = set(map(tuple, df.loc[df.IOType == "write", extent_columns].values.tolist()))
read_set = set(map(tuple, df.loc[df.IOType == "read", extent_columns].values.tolist()))

extent_count = len(read_set | write_set)
feature_list["POSIX_read_only_bytes_perc"] = len(read_set - write_set) * 100 / extent_count
feature_list["POSIX_read_write_bytes_perc"] = len(write_set & read_set) * 100 / extent_count
feature_list["POSIX_write_only_bytes_perc"] = len(write_set - read_set) * 100 / extent_count

# Additional features

In [22]:
# Transition features between consecutive rows of the DXT table.
# "<A>_after_<B>" counts rows whose operation is A while the previous row, for
# the same file, has operation B. READ_after_* is a percentage of all reads,
# WRITE_after_* a percentage of all writes.
current_op = df['IOType']
previous_op = df['IOType'].shift(1)
same_file = df['File'] == df['File'].shift(1)
op_totals = {
    "read": int((current_op == "read").sum()),
    "write": int((current_op == "write").sum()),
}

# (feature name, printed tag, operation of this row, operation of previous row)
transitions = [
    ("READ_after_WRITE", "RAW", "read", "write"),
    ("READ_after_READ", "RAR", "read", "read"),
    ("WRITE_after_READ", "WAR", "write", "read"),
    ("WRITE_after_WRITE", "WAW", "write", "write"),
]
for feature, tag, this_op, prev_op in transitions:
    # The masks stay local Series; no helper column is written into df.
    matches = int(((current_op == this_op) & (previous_op == prev_op) & same_file).sum())
    total = op_totals[this_op]
    # A run without reads (or writes) has no defined percentage: store NaN
    # instead of dividing by zero.
    feature_list[feature] = matches * 100 / total if total else float("nan")
    print(tag, matches, total)

# RAW and RAR do not add up to 100 because the first read of each file is not counted toward either

RAW 0 2732
RAR 2728 2732
WAR 0 226383
WAW 226371 226383


In [23]:
# Compare every DXT row with the row directly before it
same_file = df['File'] == df['File'].shift(1)
same_rank = df['Rank'] == df['Rank'].shift(1)

# consecutive accesses to the same file by the same rank
df['subgroup'] = same_rank & same_file
feature_list["Rank_consecutive_RAW"] = int(df['subgroup'].sum()) * 100 / feature_list["POSIX_RAW_total_accesses"]

# consecutive accesses to the same file by different ranks
df['subgroup'] = (~same_rank) & same_file
feature_list["Rank_switched_RAW"] = int(df['subgroup'].sum()) * 100 / feature_list["POSIX_RAW_total_accesses"]

In [24]:
# Append this run's features as one row of feature_list.csv.
csv_path = 'feature_list.csv'
# The header is written only when the file is missing or still empty.
write_header = not (os.path.isfile(csv_path) and os.path.getsize(csv_path) > 0)
# newline='' lets the csv module control line endings itself (otherwise
# extra blank lines appear on platforms that translate "\n").
with open(csv_path, 'a', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=list(feature_list.keys()))
    if write_header:
        writer.writeheader()
    # NOTE(review): rows are appended using this run's key order; an existing
    # file written with a different feature set would get misaligned columns.
    writer.writerow(feature_list)

In [25]:
# Display every feature collected in the cells above
feature_list

{'POSIX_RAW_agg_perf_by_slowest': 1073.1952712333361,
 'POSIX_RAW_total_bytes': 38031035998.0,
 'RAW_nprocs': 224,
 'RAW_runtime': 177,
 'users': 1,
 'apps': 1,
 'apps_short': 0,
 'POSIX_RAW_OPENS': 469.0,
 'POSIX_RAW_total_accesses': 230064.0,
 'POSIX_RAW_total_files': 468,
 'POSIX_BYTES_READ_PERC': 0.05875323512400521,
 'POSIX_BYTES_WRITTEN_PERC': 99.941246764876,
 'POSIX_unique_files_perc': 100.0,
 'POSIX_shared_files_perc': 0.0,
 'POSIX_read_only_files_perc': 21.05263157894737,
 'POSIX_read_write_files_perc': 10.526315789473685,
 'POSIX_write_only_files_perc': 68.42105263157895,
 'POSIX_WRITES_PERC': 98.40001043187982,
 'POSIX_RW_SWITCHES_PERC': 0.0,
 'POSIX_READS_PERC': 1.1909729466583212,
 'POSIX_FILE_NOT_ALIGNED_PERC': 99.5818554836915,
 'POSIX_MEM_NOT_ALIGNED_PERC': 0.0,
 'POSIX_SIZE_READ_0_100_PERC': 0.0034772932749148064,
 'POSIX_SIZE_READ_100_1K_PERC': 0.0017386466374574032,
 'POSIX_SIZE_READ_1K_10K_PERC': 1.1857570067459489,
 'POSIX_SIZE_READ_10K_100K_PERC': 0.0,
 'POSIX_SI

In [26]:
## TODEL
# There are files that only read/write 0 bytes and are still accessed

# File inspected in detail at the end of this cell
probe_file = "/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_s.182"

has_value = df_agg.Value > 0
mentions_read = df_agg.Counter.str.contains("READ")
mentions_write = df_agg.Counter.str.contains("WRITE|WRITTEN")

# Files with a positive counter that is neither a read nor a write counter
temp = set(df_agg.loc[has_value & (~mentions_read) & (~mentions_write), "FileName"].unique())
read_set = set(df_agg.loc[has_value & mentions_read, "FileName"].unique())
write_set = set(df_agg.loc[has_value & mentions_write, "FileName"].unique())

print(probe_file in read_set)
# Files that show activity but were neither read nor written
x = temp - read_set
print(x - write_set)
df_agg[has_value & (df_agg.FileName == probe_file)]

False
{'/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_p.072', '/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_s.217', '/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_s.140', '/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_p.005', '/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_s.168', '/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_p.060', '/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_s.041', '/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_s.100', '/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_p.104', '/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_s.107', '/gpfs/alpine/csc143/proj-shared/againaru/nwchem/summit_submit/copro.dir_check_s.148', '/gpfs/alpine/csc143/proj-shared/aga

Unnamed: 0,IOType,Rank,RecordID,Counter,Value,FileName,MountPt,FSType
32766,POSIX,182,6247817917741009117,POSIX_OPENS,1.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
32772,POSIX,182,6247817917741009117,POSIX_STATS,1.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
32779,POSIX,182,6247817917741009117,POSIX_MODE,438.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
32790,POSIX,182,6247817917741009117,POSIX_MEM_ALIGNMENT,8.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
32792,POSIX,182,6247817917741009117,POSIX_FILE_ALIGNMENT,16777220.0,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
32835,POSIX,182,6247817917741009117,POSIX_F_OPEN_START_TIMESTAMP,0.222083,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
32838,POSIX,182,6247817917741009117,POSIX_F_CLOSE_START_TIMESTAMP,0.241579,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
32839,POSIX,182,6247817917741009117,POSIX_F_OPEN_END_TIMESTAMP,0.233141,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
32842,POSIX,182,6247817917741009117,POSIX_F_CLOSE_END_TIMESTAMP,0.241681,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
32845,POSIX,182,6247817917741009117,POSIX_F_META_TIME,0.011161,/gpfs/alpine/csc143/proj-shared/againaru/nwche...,/gpfs/alpine,gpfs
