In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

## Finding duplicate rows

Load dataset without timing features, otherwise there will be no duplicates.

In [2]:
df_no_time = pd.read_csv(Path(r"../data/blue_waters_posix_with_paths_no_negative_outliers_no_time.csv"))

### Try with vasp_gam as test app

In [3]:
test_app = df_no_time[df_no_time.exe.str.strip() == 'vasp_gam']
test_app.head()

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,WRITE_4M_10M,WRITE_10M_100M,WRITE_100M_1G,WRITE_1G_PLUS,rank,POSIX_TOTAL_TIME,nprocs,exe,lustre,path
4050,16646,-17,-17,253920,264620,218648,16646,0,0,-17,...,0,0,0,0,-1,90.58713,96,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4056,16344,-17,-17,253920,271720,218044,16344,0,0,-17,...,0,0,0,0,-1,96.198886,96,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4060,13706,-17,-17,312320,32569,255106,13706,0,0,-17,...,0,0,0,0,-1,42.093948,256,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4062,13266,-17,-17,308480,22241,251299,13266,0,0,-17,...,0,0,0,0,-1,38.330144,256,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4067,13260,-17,-17,308480,19644,251095,13260,0,0,-17,...,0,0,0,0,-1,28.921739,256,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...


In [4]:
test_app_dups = test_app[test_app.POSIX_BYTES_READ.duplicated() & test_app.POSIX_BYTES_WRITTEN.duplicated()]
len(test_app_dups)

45840

In [5]:
test_app_no_dups = test_app[(test_app.POSIX_BYTES_READ.duplicated() == False) & (test_app.POSIX_BYTES_WRITTEN.duplicated() == False)]
test_app_no_dups.head()

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,WRITE_4M_10M,WRITE_10M_100M,WRITE_100M_1G,WRITE_1G_PLUS,rank,POSIX_TOTAL_TIME,nprocs,exe,lustre,path
4050,16646,-17,-17,253920,264620,218648,16646,0,0,-17,...,0,0,0,0,-1,90.58713,96,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4056,16344,-17,-17,253920,271720,218044,16344,0,0,-17,...,0,0,0,0,-1,96.198886,96,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4060,13706,-17,-17,312320,32569,255106,13706,0,0,-17,...,0,0,0,0,-1,42.093948,256,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4062,13266,-17,-17,308480,22241,251299,13266,0,0,-17,...,0,0,0,0,-1,38.330144,256,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4067,13260,-17,-17,308480,19644,251095,13260,0,0,-17,...,0,0,0,0,-1,28.921739,256,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...


In [6]:
test_dup_set = test_app[(test_app.POSIX_BYTES_READ == test_app_no_dups.iloc[2].POSIX_BYTES_READ)
         & (test_app.POSIX_BYTES_WRITTEN == test_app_no_dups.iloc[2].POSIX_BYTES_WRITTEN)]
test_dup_set

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,WRITE_4M_10M,WRITE_10M_100M,WRITE_100M_1G,WRITE_1G_PLUS,rank,POSIX_TOTAL_TIME,nprocs,exe,lustre,path
4060,13706,-17,-17,312320,32569,255106,13706,0,0,-17,...,0,0,0,0,-1,42.093948,256,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...


Specify columns that may differ such that the corresponding rows are still considered as duplicate jobs:
- POSIX_MAX_(READ/WRITE)\_Time_Size: which operation is the slowest/fastest might be affected by concurrent jobs running at the same time, possibly on the same node as well as IO weather.
- POSIX_(FASTEST/SLOWEST)\_RANK: same as above.
- POSIX_TOTAL_TIME: is the output feature
- rank: duplicate jobs may have been scheduled to different nodes.

In [7]:
columns_dissimilar = ["POSIX_MAX_READ_TIME_SIZE", "POSIX_MAX_WRITE_TIME_SIZE", "POSIX_FASTEST_RANK", "POSIX_SLOWEST_RANK",
                      "rank","POSIX_TOTAL_TIME"]

In [8]:
test_app[test_app.duplicated(test_app.columns.difference(columns_dissimilar))]

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,WRITE_4M_10M,WRITE_10M_100M,WRITE_100M_1G,WRITE_1G_PLUS,rank,POSIX_TOTAL_TIME,nprocs,exe,lustre,path
4441,2314,-13,-13,4608,16,194,2314,0,0,-13,...,0,0,0,0,-1,2.733955,192,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4455,3578,-17,-17,77120,26022,64139,3578,0,0,-17,...,0,0,0,0,-1,12.016318,64,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4518,3182,-20,-20,75713,58079393,61515,3246,0,0,-20,...,0,0,0,0,-1,296.457484,64,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4598,3182,-20,-20,75713,59738273,61515,3246,0,0,-20,...,0,0,0,0,-1,313.086068,64,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
4649,3182,-20,-20,75713,58079393,61515,3246,0,0,-20,...,0,0,0,0,-1,298.274991,64,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777677,882,-20,-20,147408,34572112,16006,898,0,0,-20,...,0,0,0,0,-1,256.205289,16,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
777680,822,-20,-20,91970,29500157,15637,838,0,0,-20,...,0,0,0,0,-1,150.464894,16,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
777681,818,-20,-20,94089,20325966,15318,834,0,0,-20,...,0,0,0,0,-1,158.052334,16,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
810837,3150,-20,-20,67648,12445960,60106,3214,0,0,-20,...,0,0,0,0,-1,63.009715,64,vasp_gam,1,/hpcwork/noco0056/io_transfer_learning/data/bl...


### Now find duplicates for the entire dataset

In [9]:
df_no_time_dups = df_no_time[df_no_time.duplicated(df_no_time.columns.difference(columns_dissimilar))]
df_no_time_dups.head()

Unnamed: 0,POSIX_OPENS,POSIX_FILENOS,POSIX_DUPS,POSIX_READS,POSIX_WRITES,POSIX_SEEKS,POSIX_STATS,POSIX_MMAPS,POSIX_FSYNCS,POSIX_RENAME_SOURCES,...,WRITE_4M_10M,WRITE_10M_100M,WRITE_100M_1G,WRITE_1G_PLUS,rank,POSIX_TOTAL_TIME,nprocs,exe,lustre,path
106,2540,-2541,-2541,3499,0,0,5081,0,0,-2541,...,0,0,0,0,7,2.461696,423,./Hsigma,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
143,1027,-1024,-1024,33,20,14,3,0,0,-1024,...,0,0,0,0,0,0.08574,768,./nek5000,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
159,1922,-1923,-1923,2713,0,0,3845,0,0,-1923,...,0,0,0,0,289,1.563848,320,./Hsigma,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
203,2234,-2235,-2235,3134,0,0,4469,0,0,-2235,...,0,0,0,0,76,1.116228,372,./Hsigma,1,/hpcwork/noco0056/io_transfer_learning/data/bl...
204,2522,-2523,-2523,3491,0,0,5045,0,0,-2523,...,0,0,0,0,415,2.611028,420,./Hsigma,1,/hpcwork/noco0056/io_transfer_learning/data/bl...


In [10]:
df_no_time_no_dups = df_no_time[df_no_time.duplicated(df_no_time.columns.difference(columns_dissimilar)) == False]

In [11]:
df_no_time_no_dups.to_csv(r"../data/blue_waters_posix_with_paths_no_negative_outliers_no_time_no_dups.csv",index=False)