### Extract relevant features and prepare learning.
 * Most of the data columns are S.M.A.R.T. values whose meaning can vary with the manufacturer and model. For this reason, for now we limit the more in-depth analysis to a single drive model at a time.
 * Sometimes we don't have contiguous sets of data. This is important because an ML pipeline pretty much always requires input vectors/tensors of constant size. For nominal drives this loss doesn't matter much because we have many examples of 'good drives', but for drives that fail it would be best not to lose their statistics. For this reason, if we seek DAY_BACK contiguous days but can only get DAY_BACK - CONTIG_TRYS, we still keep that drive by allowing up to CONTIG_TRYS non-contiguous days. For example: for a particular disk there are not N=30 straight days of observation before failure, but there are days 1 to 20 and then days 22 to 153; we would then take days 1 to 20 followed by days 22 to 32, so our total number of observation days is still N=30. This is a kind of data-munging hyperparameter: what effect does it have on the results?

In [1]:
import os
import sys
import math
import pickle
import struct
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Exact set of columns every per-drive CSV is expected to contain; the loading
# loop compares each file's header against this list.
ORIG_COLS = [
    u'date', u'failure',
    u'smart_1_raw', u'smart_3_raw', u'smart_4_raw', u'smart_5_raw',
    u'smart_7_raw', u'smart_9_raw', u'smart_10_raw', u'smart_12_raw',
    u'smart_183_raw', u'smart_184_raw', u'smart_187_raw', u'smart_188_raw',
    u'smart_189_raw', u'smart_190_raw', u'smart_191_raw', u'smart_192_raw',
    u'smart_193_raw', u'smart_194_raw', u'smart_197_raw', u'smart_198_raw',
    u'smart_199_raw', u'smart_240_raw', u'smart_241_raw', u'smart_242_raw',
    u'NaNs',
]

# Look-back window in days; an accepted run holds DAY_BACK + 1 rows.
DAY_BACK = 60
# When True, the loading loop stops after roughly 2000 drive files.
TEST = False
# Stand-in "failure date" used to compute day2fail for drives that never failed.
DISTANT_DATE = pd.to_datetime("2020-01-01", format='%Y-%m-%d')
# A non-failed drive needs more than DAY_BUFFER_FACTOR * DAY_BACK rows to be sampled.
DAY_BUFFER_FACTOR = 1.5
# When truthy, every row of a failed drive has its 'failure' column set to 1.
FAIL_PAST = True
# Directory holding one CSV per drive, and the directory the outputs go to.
DIR_PATH = "ST4000DM000_data"
OUT_PATH = "data/" + DIR_PATH + "/"
directory = os.listdir(DIR_PATH)
# Number of attempts made to find a contiguous run of days for one drive.
CONTIG_TRYS = 5

# Feature columns kept when the final tensor is built.
SELECTED_COLS = [
    u'smart_1_raw', u'smart_4_raw', u'smart_5_raw', u'smart_7_raw',
    u'smart_9_raw', u'smart_10_raw', u'smart_12_raw', u'smart_183_raw',
    u'smart_184_raw', u'smart_187_raw', u'smart_188_raw',
    u'smart_189_raw', u'smart_190_raw', u'smart_192_raw', u'smart_193_raw',
    u'smart_194_raw', u'smart_197_raw', u'smart_198_raw', u'smart_199_raw',
    u'smart_240_raw', u'smart_241_raw', u'smart_242_raw',
]

In [3]:
def get_run(disk, day_back=None, buffer_factor=None, max_tries=None):
    """
    Try to sample a contiguous run of day_back + 1 daily rows from a drive.

    A random end row is drawn, and the window covering the day_back calendar
    days before that row's date (inclusive of both ends) is sliced out of
    disk. The window is accepted only when it holds exactly day_back + 1
    rows, i.e. no day inside it is missing. At most max_tries random end rows
    are attempted.

    Parameters:
        disk          -- DataFrame indexed by date (one row per observed day).
        day_back      -- window length in days; defaults to DAY_BACK.
        buffer_factor -- the end row is drawn from a range of
                         int(len(disk) - buffer_factor * day_back) rows;
                         defaults to DAY_BUFFER_FACTOR.
        max_tries     -- number of random end rows to try; defaults to
                         CONTIG_TRYS.

    Returns:
        The accepted window as a DataFrame with its index reset, or an empty
        list when no contiguous window was found.
    """
    if day_back is None:
        day_back = DAY_BACK
    if buffer_factor is None:
        buffer_factor = DAY_BUFFER_FACTOR
    if max_tries is None:
        max_tries = CONTIG_TRYS

    # The sampling range does not change between attempts. A frame that is
    # too short can never yield a run, so give up immediately instead of
    # retrying (the previous `continue` skipped the counter and never ended).
    ran_range = int(len(disk) - buffer_factor * day_back)
    if ran_range < 1:
        return []

    window = pd.Timedelta(days=day_back)
    for _attempt in range(max_tries):
        # Offset by day_back so at least day_back rows precede the end row.
        last_day = disk.index[np.random.randint(ran_range) + day_back]
        try:
            disk_run = disk.loc[last_day - window:last_day].reset_index()
        except Exception:
            # The slice could not be resolved on this index; draw again.
            continue
        if len(disk_run) == day_back + 1:
            return disk_run
    return []

def get_failure_run(disk, day_back=None, max_tries=None):
    """
    Try to extract the run of days that ends on a failed drive's last row.

    Failures are rare, so instead of discarding a drive whose last day_back
    days contain gaps, the window is widened one day at a time (at most
    max_tries times) until it holds day_back + 1 rows or cannot grow further.

    Parameters:
        disk      -- DataFrame with a 'date' column; the last row is taken as
                     the end of the run.
        day_back  -- requested window length in days; defaults to DAY_BACK.
        max_tries -- number of widening attempts; defaults to CONTIG_TRYS.

    Returns:
        A (run, status) tuple:
          (DataFrame, None) -- a run was extracted; it may hold fewer than
                               day_back + 1 rows when the window could not be
                               widened any further.
          ([], "D")         -- slicing raised and the window could not be
                               widened any further.
          ([], "M")         -- max_tries attempts were used up without
                               reaching day_back + 1 rows.
    """
    if day_back is None:
        day_back = DAY_BACK
    if max_tries is None:
        max_tries = CONTIG_TRYS

    disk_len = len(disk)
    max_day_back = min(day_back, disk_len)
    disk = disk.set_index('date')
    last_day = disk.index[len(disk) - 1]

    avail_day_back = max_day_back
    for _attempt in range(max_tries):
        try:
            disk_run = disk.loc[
                last_day - pd.Timedelta(days=avail_day_back):last_day
            ].reset_index()
        except Exception:
            # The slice could not be resolved on this index: widen and retry
            # if possible, otherwise report the drive as unusable.
            if avail_day_back < disk_len:
                avail_day_back += 1
                continue
            return [], "D"

        if len(disk_run) < max_day_back + 1 and avail_day_back < disk_len:
            # Days are missing inside the window but the window can still
            # grow: widen by one day and try again.
            avail_day_back += 1
            continue

        # Explicit "no error" status. The previous `_` only existed as an
        # IPython convenience variable; elsewhere it raised NameError, which
        # the surrounding bare except turned into a bogus failure.
        return disk_run, None
    return [], "M"
                    

In [4]:
col_len = len(ORIG_COLS)
n_lost_failures = 0
n_lost_nominals = 0
n_incomplete_failures = 0
n_discontig_failures = 0
disk_bin = []
y = []
z = []
n = 0
for disk_file in directory:
    
    if disk_file.split('.')[1] != 'csv':
        continue
        
    n += 1
    if TEST and n > 2001:
        break
        
    if n % 2000 == 0 :
        print 'disk: ', n
        print 'n_lost_nominals', n_lost_nominals
        print 'n_lost_failures', n_lost_failures 
        print 'incomplete failures: ', n_incomplete_failures 
        print 'disctoninous failures: ', n_discontig_failures
        print 'total number of failures resolved: ', np.sum(y)
    
    disk = pd.read_csv(DIR_PATH + '/' + disk_file, comment="#", header=0)
    if len(disk.columns) != col_len:
        print "Oh no! ", disk_file , " has too few/many columns!"
        continue
    else:
        for col in disk.columns:
            if col not in ORIG_COLS:
                print "Oh no! ", disk_file , " has strange col: ", col
        
    disk['date'] = pd.to_datetime(disk['date'], format='%Y-%m-%d')        
    fail_day = disk['date'][disk['failure']>=1].values
    if len(fail_day) > 1:
        print "Oh no! ", disk_file , " is a zombie hard drive returning from dead!"
        continue
        
    if len(fail_day) == 0:
        #fake_day = DISTANT_DATE
        disk['day2fail'] = DISTANT_DATE - disk['date']
        if len(disk) > DAY_BUFFER_FACTOR * DAY_BACK:
            disk = disk.set_index('date')
            disk_run = get_run(disk) 
            if any(disk_run):
                y.append(0)
                z.append(disk_run['day2fail'].values)
                #for col in disk_run.columns:
                #    if col not in SELECTED_COLS:
                #        del disk_run[col]
                disk_bin.append(disk_run)
            else:
                n_lost_nominals +=1
            
    if len(fail_day) == 1:
        disk['day2fail'] = fail_day[0] - disk['date']
        if FAIL_PAST == 1:
            disk['failure'] = 1
        disk_run, s = get_failure_run(disk)
        if s == "D":
            n_discontig_failures += 1
            continue
        if s == "M":
            n_incomplete_failures += 1
            continue
        if any(disk_run):
            y.append(1)
            z.append(disk_run['day2fail'].values)
            #for col in disk_run.columns:
            #    if col not in SELECTED_COLS:
            #        del disk_run[col]
            disk_bin.append(disk_run)
        else:
            n_lost_failures +=1

disk:  2000
n_lost_nominals 1
n_lost_failures 0
incomplete failures:  1
disctoninous failures:  0
total number of failures resolved:  101
disk:  4000
n_lost_nominals 1
n_lost_failures 0
incomplete failures:  2
disctoninous failures:  0
total number of failures resolved:  195
disk:  6000
n_lost_nominals 2
n_lost_failures 0
incomplete failures:  3
disctoninous failures:  0
total number of failures resolved:  280
disk:  8000
n_lost_nominals 4
n_lost_failures 0
incomplete failures:  4
disctoninous failures:  0
total number of failures resolved:  368
disk:  10000
n_lost_nominals 6
n_lost_failures 0
incomplete failures:  4
disctoninous failures:  0
total number of failures resolved:  442
disk:  12000
n_lost_nominals 6
n_lost_failures 0
incomplete failures:  4
disctoninous failures:  0
total number of failures resolved:  523
disk:  14000
n_lost_nominals 6
n_lost_failures 0
incomplete failures:  4
disctoninous failures:  2
total number of failures resolved:  614
disk:  16000
n_lost_nominals 9


In [5]:
print 'n_lost_nominals', n_lost_nominals
print 'n_lost_failures', n_lost_failures 
print 'incomplete failures: ', n_incomplete_failures 
print 'disctoninous failures: ', n_discontig_failures
print 'total number of failures resolved: ', np.sum(y)
print 'disk bin length: ', len(disk_bin)

n_lost_nominals 19
n_lost_failures 0
incomplete failures:  31
disctoninous failures:  4
total number of failures resolved:  1532
disk bin length:  36066


In [6]:
# Persist the extracted runs. The context manager closes the file even when
# pickle.dump raises part-way through.
with open(OUT_PATH + 'disk_bin_d' + str(DAY_BACK) + '.pkl', 'wb') as output:
    pickle.dump(disk_bin, output)

Examining individual drives' behavior is interesting. For example, the first data munge found:
 * Z300XJJ2.csv, Z300XH5X.csv had no values just commas for first 30 lines or so

 * Z30149QL.csv, has skips in data, no data for several days after 2014-07-20, then resumes normally

 * Z3025KZV.csv has a gap from 2014 to 2015 and smart9 resets

 * S300XCP4.csv had dates that didn't match the smart9 drive hours
 
Finally we make a non-ragged clean tensor from the data:

In [1]:
x_ = []
y_ = []
z_ = []

n = 0
for item in disk_bin:
    tmp = z[n]/np.timedelta64(1, 'D')
    if len(tmp) == DAY_BACK+1:
        for col in item.columns:
            if col not in SELECTED_COLS:
                del item[col]
        x_.append(item.values)
        z_.append(tmp)
        y_.append(y[n])
    n += 1
    
print np.shape(x_)
print np.shape(z_)
print np.shape(y_)
print np.sum(y_)

np.save(OUT_PATH + 'train_d' + str(DAY_BACK) + '.npy', x_)
np.save(OUT_PATH + 'label_d' + str(DAY_BACK) + '.npy', y_)
np.save(OUT_PATH + 'lookback_d'+ str(DAY_BACK) + '.npy', z_)

NameError: name 'disk_bin' is not defined