In [1]:
import pickle
import h5py
import numpy as np
import matplotlib.pyplot as plt

np.set_printoptions(suppress=True, precision=3)

In [7]:
# Load the list of selected feature names.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced locally / come from a trusted source.
# The `with` block closes the file handle, which the bare open() call leaked.
with open("data/selected_features.pkl", "rb") as fh:
    features = pickle.load(fh)

In [15]:
features

['Raw Reported Uncorrectable Errors',
 'Raw SATA Downshift Errors',
 'Raw Reallocated Sectors',
 'Raw Current Pending Sectors',
 'Raw Load Cycles',
 'Raw Head Flying Hours',
 'Raw Power On Hours',
 'Seek Count',
 'Raw Start/Stops',
 'Raw Power Off Retracts',
 'Raw Power Cycles',
 'Raw Total LBA Written',
 'Raw Total LBA Read']

In [2]:
# Open the prepared dataset read-only; later cells read `df['X']` from it.
# NOTE(review): no visible cell closes this handle before `df` is rebound to
# "data/backblaze_full_normalized.h5" further down — confirm it is closed.
df = h5py.File("data/backblaze_prepared.h5", "r")

In [3]:
!ls data

backblaze_compact.h5	      selected_features.pkl
backblaze_full.h5	      smart_attr_names_for_prepared.pkl
backblaze_full_normalized.h5  smart_attrs_compact.pkl
backblaze_prepared.h5	      toyset.hdf5
HDD_selected.h5


In [4]:
# Load the attribute names stored next to the prepared dataset (this rebinds
# `features`, replacing whatever an earlier cell loaded under that name).
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced locally / come from a trusted source.
# The `with` block closes the file handle, which the bare open() call leaked.
with open("data/smart_attr_names_for_prepared.pkl", "rb") as fh:
    features = pickle.load(fh)

In [5]:
# Show every attribute name together with its positional index.
list(enumerate(features))

[(0, 'smart_1__read_error_rate'),
 (1, 'raw_smart_1__read_error_rate'),
 (2, 'smart_3__spin_up_time'),
 (3, 'smart_4__start_stop_count'),
 (4, 'raw_smart_4__start_stop_count'),
 (5, 'smart_5__reallocated_sector_count'),
 (6, 'raw_smart_5__reallocated_sector_count'),
 (7, 'smart_7__seek_error_rate'),
 (8, 'raw_smart_7__seek_error_rate'),
 (9, 'smart_9__power_on_hours'),
 (10, 'raw_smart_9__power_on_hours'),
 (11, 'smart_12__power_cycle_count'),
 (12, 'raw_smart_12__power_cycle_count'),
 (13, 'smart_183__sata_downshift_error_count'),
 (14, 'raw_smart_183__sata_downshift_error_count'),
 (15, 'smart_184__end_to_end_error_IOEDC'),
 (16, 'raw_smart_184__end_to_end_error_IOEDC'),
 (17, 'smart_187__reported_uncorrectable_errors'),
 (18, 'raw_smart_187__reported_uncorrectable_errors'),
 (19, 'smart_188__command_timeout_count'),
 (20, 'raw_smart_188__command_timeout_count'),
 (21, 'smart_189__high_fly_writes'),
 (22, 'raw_smart_189__high_fly_writes'),
 (23, 'smart_190__airflow_temperature'),
 (2

In [6]:
# Human-readable labels for the seven features kept for modelling.
# NOTE(review): presumably parallel, position by position, to the `ids` list
# defined in the next cell — confirm.
selective_7_feautures = [
    "Current Pending Sector Count",
    "Total Logical Blocks Written",
    "Total Logical Blocks Read",
    "Load Cycle Count",  # label was previously truncated to "Load Cycle Coun"
    "Power On Hours",
    "Temperature",
    "Seek Count"
]
selective_7_feautures, len(selective_7_feautures)

(['Current Pending Sector Count',
  'Total Logical Blocks Written',
  'Total Logical Blocks Read',
  'Load Cycle Coun',
  'Power On Hours',
  'Temperature',
  'Seek Count'],
 7)

In [7]:
# Indices along the last axis of df['X'] that are copied into the reduced array.
# NOTE(review): presumably these index the attribute-name listing printed
# above (e.g. 10 -> 'raw_smart_9__power_on_hours') — confirm against the file.
ids = [32, 38, 39, 28, 10, 30, 41]
ids

[32, 38, 39, 28, 10, 30, 41]

In [9]:
# Allocate the reduced array: same leading axes as df['X'], but only
# len(ids) entries along the last axis. Contents are uninitialised until the
# copy loop in the next cell fills every column.
X = np.empty(df['X'].shape[:-1] + (len(ids),))
X.shape

(35666, 1277, 7)

In [11]:
# Copy each selected column of the HDF5 dataset into its slot in X.
# One column is read per iteration, so only that slice is pulled from the file.
for dst_col, src_col in enumerate(ids):
    X[:, :, dst_col] = df['X'][:, :, src_col]

In [12]:
# Sanity check: the first copied column must equal its source column.
# np.all replaces np.alltrue, which was deprecated and then removed in
# NumPy 2.0; ids[0] replaces the hard-coded 32 so the check follows `ids`.
np.all(X[:, :, 0] == df['X'][:, :, ids[0]])

True

In [13]:
# Per-feature value that the -100 sentinel maps to under the scaling
# (v - 1) / (max - 1) applied to X in a later cell.
mask = -101 / (X.max(axis=(0, 1)) - 1)
mask

array([-101.   , -101.   , -101.   , -101.   ,  -63.378, -101.   ,
        -39.274])

In [14]:
X.max(axis=0).max(axis=0)

array([2.   , 2.   , 2.   , 2.   , 2.594, 2.   , 3.572])

In [12]:
# Rescale every feature so that its maximum over the first two axes becomes 1.
# Sentinel entries are rescaled too; a later cell resets them to -100.
_X = (X - 1) / (X.max(axis=(0, 1)) - 1)

In [13]:
_X.max(axis=0).max(axis=0)

array([1., 1., 1., 1., 1., 1., 1.])

In [14]:
# Put the -100 sentinel back wherever the rescaled value equals the
# per-feature rescaled sentinel; `mask` broadcasts along the last axis.
_X[_X == mask] = -100

In [16]:
np.histogram(_X)

(array([ 65718975,         0,         0,         0,         0,         0,
                0,         0,         0, 253099399]),
 array([-100. ,  -89.9,  -79.8,  -69.7,  -59.6,  -49.5,  -39.4,  -29.3,
         -19.2,   -9.1,    1. ]))

In [None]:
# NOTE(review): this cell has no execution count, and `X[:, ]` selects the
# whole array. `one_year` is reassigned to an HDF5 group in a later cell, so
# this looks like leftover scratch code — candidate for deletion.
one_year = X[:, ]

In [20]:
# Uninitialised buffer with the shape and dtype of _X; the loop two cells
# below writes every row of it.
__X = np.empty_like(_X)
__X.shape

(35666, 1277, 7)

In [21]:
# For each series, count the time steps whose every feature equals the -100
# sentinel. np.all replaces np.alltrue, which was removed in NumPy 2.0.
offs = np.sum(np.all(_X == -100, axis=-1), axis=-1)
offs.shape

(35666,)

In [22]:
# Rotate each series left by its number of all-sentinel steps, so the first
# `shift` steps end up at the tail; a shift of 0 is a plain copy.
# NOTE(review): this only moves the padding to the end if the sentinel steps
# are the leading ones — confirm against how the prepared file was built.
for row, shift in enumerate(offs):
    __X[row] = np.roll(_X[row], -shift, axis=0)

In [2]:
# (Re)bind `df` to the normalized output file, opened in append mode ("a") so
# that the cells below can add datasets and groups to it.
df = h5py.File("data/backblaze_full_normalized.h5", "a")

# List the top-level keys currently present in the file.
df.keys()

<KeysViewHDF5 ['Names', 'X', 'Y', 'one_year', 'two_year']>

In [23]:
# Store the rotated, normalized array as top-level dataset "X", as float32.
df.create_dataset(name="X", data=__X, dtype=np.float32)

<HDF5 dataset "X": shape (35666, 1277, 7), type "<f4">

In [4]:
# Read-only handle on the prepared file; the cells below read `df2['Y']`.
# NOTE(review): no visible cell closes `df2` — confirm it is closed.
df2 = h5py.File("data/backblaze_prepared.h5", "r")

In [24]:
df2['Y'].shape

(35666,)

In [25]:
df2['Y'][:10]

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [26]:
# Copy the labels from the prepared file into the output file as float32.
df.create_dataset(name="Y", data=df2['Y'][:], dtype=np.float32)

<HDF5 dataset "Y": shape (35666,), type "<f4">

In [27]:
df.flush()

In [29]:
y = df2['Y'][:10]
y

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [42]:
# One fixed-width byte-string identifier per row, "HDD:<row index>:<label>",
# where the label is the corresponding df2['Y'] value cast to int.
names = np.array(
    [f'HDD:{row}:{int(label)}'.encode() for row, label in enumerate(df2['Y'][:])],
    dtype='S13',
)

names

array([b'HDD:0:0', b'HDD:1:0', b'HDD:2:0', ..., b'HDD:35663:0',
       b'HDD:35664:0', b'HDD:35665:1'], dtype='|S13')

In [43]:
# Store the per-row identifiers as top-level dataset "Names".
df.create_dataset(name='Names', data=names)

<HDF5 dataset "Names": shape (35666,), type "|S13">

In [44]:
df.close()

In [3]:
X = df['X'][:]

In [19]:
def slice_seq(x, days, mask):
    """Keep the last `days` non-padding steps of every sequence in `x`.

    A step counts as padding when all of its features equal `mask`. The
    number of padding steps per sequence is subtracted from the sequence
    length to find where the data ends, so padding is expected to sit at the
    tail of each sequence.

    Parameters
    ----------
    x : ndarray, indexed as (sequence, step, feature)
        Input sequences.
    days : int
        Number of trailing non-padding steps to keep per sequence.
    mask : scalar
        Padding value, also used to fill the unused tail of the output.

    Returns
    -------
    ndarray of float64, shape (x.shape[0], days, x.shape[2])
        Row i holds the last min(days, valid_i) non-padding steps of x[i],
        left-aligned; the remaining steps are filled with `mask`.
    """
    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    offs = np.sum(np.all(x == mask, axis=-1), axis=-1)
    end = x.shape[1] - offs
    # Clamp at 0 for sequences with fewer than `days` non-padding steps.
    start = np.maximum(end - days, 0)
    # Size the output from the argument `x`; the previous version read the
    # notebook-global `X` here, which broke for any other input array.
    _x = np.full((x.shape[0], days, x.shape[2]), mask, dtype=np.float64)
    for i, (s, e) in enumerate(zip(start, end)):
        _x[i, :e - s] = x[i, s:e]
    return _x

In [45]:
# Build the "one_year" group: the last 365 non-padding steps of every series
# (padding value -100), plus copies of the top-level labels and names.
one_year_X = slice_seq(X, 365, -100)

one_year = df.create_group("one_year")

one_year.create_dataset(name="X", data=one_year_X, dtype=np.float32)
one_year.create_dataset(name="Y", data=df['Y'][:], dtype=np.float32)
one_year.create_dataset(name="Names", data=df['Names'][:], dtype=df['Names'].dtype)

<HDF5 dataset "Names": shape (35666,), type "|S13">

In [46]:
# Build the "two_year" group: the last 2*365 non-padding steps of every series
# (padding value -100), plus copies of the top-level labels and names.
two_year_X = slice_seq(X, 365*2, -100)

two_year = df.create_group("two_year")

# Store two_year_X here: the previous version wrote one_year_X, so the
# "two_year" group silently contained the 365-step data.
two_year.create_dataset(name="X", data=two_year_X, dtype=np.float32)
two_year.create_dataset(name="Y", data=df['Y'][:], dtype=np.float32)
two_year.create_dataset(name="Names", data=df['Names'][:], dtype=df['Names'].dtype)

<HDF5 dataset "Names": shape (35666,), type "|S13">

In [4]:
Y = df['Y'][:]
Y.shape

(35666,)

In [6]:
Y = Y.reshape((-1, 1))

In [7]:
# Replace the top-level "Y" dataset with the reshaped (N, 1) labels: delete
# the existing dataset, then recreate it under the same name.
del df['Y']
df.create_dataset(name="Y", data=Y, dtype=np.float32)

<HDF5 dataset "Y": shape (35666, 1), type "<f4">

In [9]:
# Same replacement for the "two_year" group, so its "Y" also becomes (N, 1).
# NOTE(review): no visible cell does this for the "one_year" group, whose "Y"
# was created from the 1-D labels — confirm whether it should be reshaped too.
gr = df['two_year']
del gr['Y']
gr.create_dataset(name="Y", data=Y, dtype=np.float32)

<HDF5 dataset "Y": shape (35666, 1), type "<f4">

In [13]:
df['two_year']['Y'].shape

(35666, 1)

In [14]:
df.flush()
df.close()

In [3]:
# Create the "90_days" group and seed it with a full copy of the labels.
# NOTE(review): this "Y" is deleted and recreated from the filtered labels a
# few cells below, so this initial copy appears to be redundant — confirm.
grp = df.create_group("90_days")

grp.create_dataset(name="Y", data=df['Y'][:], dtype=df['Y'].dtype)

<HDF5 dataset "Y": shape (35666, 1), type "<f4">

In [4]:
X = df['X'][:]

In [6]:
# For each series, count the time steps whose every feature equals the -100
# sentinel. np.all replaces np.alltrue, which was removed in NumPy 2.0.
offs = np.sum(np.all(X == -100, axis=-1), axis=-1)
offs.shape

(35666,)

In [7]:
offs[0]

16

In [14]:
_X = []

In [15]:
# Collect the last WINDOW non-padding steps of every series that is long
# enough, remembering which rows were kept so Names/Y can be filtered to match.
WINDOW = 90
# Reset both accumulators here: previously only `idx` was reset in this cell,
# so re-running it appended duplicates to `_X` and left it misaligned with idx.
_X = []
idx = []
for j, off in enumerate(offs):
    end = X.shape[1] - off
    # NOTE(review): the strict `>` also drops series with exactly WINDOW
    # non-padding steps, although X[j, 0:WINDOW] would be a full window.
    # Kept as-is to reproduce the stored dataset — confirm the intent.
    if end > WINDOW:
        _X.append(X[j, end - WINDOW:end])
        idx.append(j)

In [17]:
_X = np.array(_X)

In [18]:
_X.shape

(35462, 90, 7)

In [19]:
names = df['Names'][idx]

In [20]:
names.shape

(35462,)

In [21]:
y = df['Y'][idx]
y.shape

(35462, 1)

In [22]:
del grp['Y']

In [23]:
# Write the filtered windows together with the labels and names of the kept
# rows (all three were selected with the same `idx` list) into "90_days".
grp.create_dataset(name="X", data=_X, dtype=_X.dtype)
grp.create_dataset(name="Y", data=y, dtype=y.dtype)
grp.create_dataset(name="Names", data=names, dtype=names.dtype)

<HDF5 dataset "Names": shape (35462,), type "|S13">

In [24]:
df.flush(); df.close()