In [5]:
#!conda install -y -c conda-forge pyarrow

In [1]:
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.transforms import blended_transform_factory
import dask.dataframe as dd
import pandas as pd
import re

%matplotlib inline
%config Completer.use_jedi = False

**Load GPU failures**

In [2]:
# Location of the GPU failure log and the column-name constants used by this notebook.
# (TIME is not referenced in the cells shown here; it is kept for any later use.)
FAILURES = '/gpfs/alpine/stf218/proj-shared/data/lake/summit_gpu_failures/gpu_failures.csv'
NODE = 'hostname'
TIME = 'timestamp'
XID = 'xid'

# Only the node and XID columns are kept, so let the parser skip every other column instead
# of materialising the whole file first. `usecols` yields columns in file order, hence the
# explicit reselection to pin the [NODE, XID] column order.
failures = pd.read_csv(FAILURES, usecols=[NODE, XID])[[NODE, XID]]

In [3]:
# Drop every record whose hostname starts with 'login' or 'batch'; all other hosts are kept.
failures = failures.loc[
    lambda df: ~(df[NODE].str.startswith('login') | df[NODE].str.startswith('batch'))
]
# Sanity check: list any login/batch hostnames still present (expected to be an empty array).
failures.loc[
    lambda df: df[NODE].str.startswith('login') | df[NODE].str.startswith('batch'),
    NODE,
].unique()

array([], dtype=object)

In [4]:
# Lookup table from numeric XID code to a human-readable failure label.
# Entry order is kept exactly as originally written.
xid_names = {
    31: 'Memory page fault',
    13: 'Graphics engine exception',
    43: 'Stopped processing',
    74: 'NVLINK error',
    63: 'Page retirement event',
    64: 'Page retirement failure',
    48: 'Double-bit error',
    45: 'Preemptive cleanup',
    61: 'Internal microcontroller warning',
    44: 'Graphics engine fault',
    79: 'Fallen off the bus',
    62: 'Internal microcontroller halt',
    38: 'Driver firmware error',
    32: 'Corrupted push buffer stream',
    12: 'Driver error handling exception',
    69: 'Graphics engine class error',
}


In [5]:
# Attach a readable label for each XID code. `assign` builds a new frame instead of writing a
# column into `failures`, which at this point is a boolean-filtered selection of the loaded
# data; writing into such a selection is what triggers pandas' SettingWithCopyWarning.
# Codes that are absent from `xid_names` get None, because `dict.get` returns None for them.
failures = failures.assign(name=failures[XID].apply(xid_names.get))

In [6]:
# Number of failure records currently held in `failures`.
failures.shape[0]

251859

In [13]:
# Count of records per failure label, most frequent first.
(
    failures
    .groupby('name')[XID]
    .count()
    .sort_values(ascending=False)
)

name
Memory page fault                   186496
Graphics engine exception            32339
Stopped processing                   22649
NVLINK error                          8736
Page retirement event                  851
Page retirement failure                210
Double-bit error                       179
Preemptive cleanup                     162
Graphics engine fault                   44
Fallen off the bus                      31
Internal microcontroller halt           29
Driver firmware error                   26
Driver error handling exception         21
Corrupted push buffer stream            11
Graphics engine class error              1
Name: xid, dtype: int64

**Compute failure frequencies per node**

In [8]:
# Column names of the per-node frequency table built below.
FREQ = 'freq'
FAILURE = 'failure'

# One row per (XID, node) pair with the number of matching records; the XID column is
# renamed to FAILURE and its codes are then replaced by their readable labels.
freq_per_node = (
    failures
    .groupby([XID, NODE], as_index=False)
    .size()
    .rename(columns={'size': FREQ, XID: FAILURE})
    .assign(**{FAILURE: lambda counts: counts[FAILURE].apply(xid_names.get)})
)
freq_per_node.head()

Unnamed: 0,failure,hostname,freq
0,Driver error handling exception,e26n13,21
1,Graphics engine exception,a01n01,20
2,Graphics engine exception,a01n02,11
3,Graphics engine exception,a01n03,4
4,Graphics engine exception,a01n04,7


In [12]:
# For each failure label, the largest count observed on any single node, highest first.
(
    freq_per_node
    .groupby(FAILURE)[FREQ]
    .max()
    .sort_values(ascending=False)
)

failure
NVLINK error                        8462
Memory page fault                   1189
Graphics engine exception            259
Stopped processing                   118
Page retirement failure               89
Page retirement event                 37
Preemptive cleanup                    34
Double-bit error                      33
Driver error handling exception       21
Corrupted push buffer stream           9
Fallen off the bus                     8
Graphics engine fault                  5
Internal microcontroller halt          4
Driver firmware error                  2
Graphics engine class error            1
Name: freq, dtype: int64