# Hard Drive Survival Table Generator
* creates a summary table of all 'serial_number' for each model
* 
* writes one .csv (into OUTPUT_DIR) for each target model, combining the data from every folder in DATA_FOLDERS
* each unique drive (a given 'serial_number') will be one line
* the output data is ready for survival analysis 

In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from time import localtime, strftime, time
import logging
import sys

In [2]:
# When True, the main loop reads only the first 20 rows of every daily CSV
# (quick smoke test); when False, the files are read in full.
TEST = False
# Root directory that contains one sub-folder of daily CSV files per entry
# in DATA_FOLDERS.
DATA_DIR = "data/"
# Sub-folders of DATA_DIR to process. The same names (with '.csv' appended)
# are used to locate the per-folder summary tables inside SUMMARY_DIR.
DATA_FOLDERS = ["2014","2015","data_Q1_2016","data_Q2_2016","data_Q3_2016","data_Q4_2016"]
#DATA_FOLDERS =  ["2014","2015","data_Q1_2016","data_Q2_2016","data_Q3_2016"]
# Columns retained by sift_data(); every other column is discarded.
FEATURE_COLS = ['smart_9_raw','model','failure','serial_number']
# Timestamp taken once at start-up; used as the log file's base name.
THE_TIME = strftime("%Y-%m-%d-%H-%M", localtime())
# Directory holding the summary CSVs used to choose the target models.
SUMMARY_DIR = "summary_data/"
# A model is selected when its 'percent_total' in a summary CSV is >= this.
# NOTE(review): unclear from this file whether .05 means 5% or 0.05% --
# confirm against the code that produces the summary tables.
PERCENT_TOTAL_REQ = .05
# A model is also selected when its 'failure_rate' is >= FAILURE_RATE_REQ
# and its 'drive_count' is >= MIN_NUMBER_REQ (both conditions together).
# NOTE(review): the units of 'failure_rate' are not visible here -- confirm.
FAILURE_RATE_REQ = 5
MIN_NUMBER_REQ = 100
# Directory that receives the log file and the per-model survival CSVs.
OUTPUT_DIR = "survival_data"

In [3]:
def start_logger():
    """Create the run's log file in OUTPUT_DIR and return a logger writing to it.

    The output directory is created if it does not exist.  The root logger is
    set to DEBUG, and a file handler at INFO level is attached to this
    module's logger, writing to ``OUTPUT_DIR/<THE_TIME>.log`` (the file is
    truncated on open).  Every configuration constant is then recorded at the
    top of the log so a run can be reproduced later.

    Calling this function again (e.g. re-running the notebook cell) replaces
    any handler previously attached for the same log file instead of stacking
    a second one, which would duplicate every log line.

    Returns:
        logging.Logger: the module-level logger with the file handler attached.
    """
    # os.makedirs avoids spawning a shell and works for nested / odd paths.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    logger = logging.getLogger(__name__)
    logfile = OUTPUT_DIR + '/' + THE_TIME + ".log"

    # FileHandler stores its target as an absolute path in `baseFilename`.
    logfile_abs = os.path.abspath(logfile)
    for stale in list(logger.handlers):
        if isinstance(stale, logging.FileHandler) and stale.baseFilename == logfile_abs:
            logger.removeHandler(stale)
            stale.close()

    handler = logging.FileHandler(logfile, 'w')
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    # Label strings (including their tab padding) are part of the log format.
    settings = [
        ("TEST\t\t=\t", TEST),
        ("DATA_DIR\t=\t", DATA_DIR),
        ("DATA_FOLDERS\t=\t", DATA_FOLDERS),
        ("FEATURE_COLS\t=\t", FEATURE_COLS),
        ("THE_TIME\t=\t", THE_TIME),
        ("SUMMARY_DIR\t=\t", SUMMARY_DIR),
        ("PERCENT_TOTAL_REQ\t=\t", PERCENT_TOTAL_REQ),
        ("FAILURE_RATE_REQ\t=\t", FAILURE_RATE_REQ),
        ("MIN_NUMBER_REQ\t=\t", MIN_NUMBER_REQ),
        ("OUTPUT_DIR\t=\t", OUTPUT_DIR),
    ]
    for label, value in settings:
        logger.info(label + str(value))
    return logger

In [4]:
def sift_data(data, model, feature_cols=None):
    """Return the rows of *data* for one drive model, reduced to the feature columns.

    Unlike a column-by-column ``del``, this does not modify the caller's
    DataFrame; a new frame is returned.

    Args:
        data: DataFrame that must contain a 'model' column.
        model: model name to keep; rows whose 'model' differs are dropped.
        feature_cols: iterable of column names to retain.  Defaults to the
            module-level FEATURE_COLS when omitted.

    Returns:
        A new DataFrame holding only the matching rows and the retained
        columns (in their original order), with the 'model' column removed.
        The original row index is preserved.

    Raises:
        KeyError: if *data* has no 'model' column.
    """
    if feature_cols is None:
        feature_cols = FEATURE_COLS
    wanted = set(feature_cols)
    # 'model' is only needed for filtering; it is constant in the result.
    keep = [col for col in data.columns if col in wanted and col != 'model']
    return data.loc[data['model'] == model, keep].copy()

def tb_capacity(x):
    """Convert a capacity in bytes to terabytes.

    Uses the decimal (SI) definition, 1 TB = 10**12 bytes, rather than the
    binary 2**40.

    Args:
        x: a number of bytes; a scalar, NumPy array or pandas Series.

    Returns:
        *x* divided by 10**12, as floating point.
    """
    # Build the divisor from a Python int: np.power(10, 12) overflows on
    # platforms where NumPy's default integer is 32-bit.
    tb = np.float64(10 ** 12)
    return x / tb

In [5]:
def aggregate_by_serial(hd):
    """Collapse daily drive records into one row per serial number.

    Args:
        hd: DataFrame with at least the columns 'serial_number',
            'smart_9_raw' and 'failure'.

    Returns:
        DataFrame with one row per 'serial_number' and the columns
        (in this order):
            serial_number -- the grouping key
            runtime_max   -- largest 'smart_9_raw' seen for the drive
            runtime_min   -- smallest 'smart_9_raw' seen for the drive
            uptime        -- runtime_max - runtime_min
            n_obs         -- number of non-null 'failure' observations
            failure       -- sum of the 'failure' column
    """
    # Operate on the argument, not on a notebook-level global.
    grouped = hd.groupby('serial_number')
    runtime = grouped['smart_9_raw']  # smart 9 is the disk uptime
    failures = grouped['failure']

    runtime_max = runtime.max()
    runtime_min = runtime.min()

    # Columns are added one at a time so their order is explicit and does not
    # depend on dict ordering or on nested-dict renaming in .agg().
    survival = runtime_max.to_frame('runtime_max')
    survival['runtime_min'] = runtime_min
    survival['uptime'] = runtime_max - runtime_min
    survival['n_obs'] = failures.count()
    survival['failure'] = failures.sum()
    return survival.reset_index()

In [6]:
#logger = start_logger()
#SUMMARY_DIR = "summary_data/"

# Choose the drive models to analyse.  For every folder's summary table a
# model qualifies when EITHER
#   * its 'percent_total' is at least PERCENT_TOTAL_REQ, OR
#   * its 'failure_rate' is at least FAILURE_RATE_REQ and its 'drive_count'
#     is at least MIN_NUMBER_REQ.
# A model that qualifies in any folder is kept.
target_models = []
for df in DATA_FOLDERS:
    summary_path = SUMMARY_DIR + df + '.csv'
    print(summary_path)
    # Only the first 200 rows of each summary table are considered.
    summary_dats = pd.read_csv(summary_path, header=0, nrows=200)
    summary_dats = summary_dats.sort_values(by="percent_total", ascending=False)

    is_common = summary_dats['percent_total'] >= PERCENT_TOTAL_REQ
    is_failure_prone = (
        (summary_dats['failure_rate'] >= FAILURE_RATE_REQ)
        & (summary_dats['drive_count'] >= MIN_NUMBER_REQ)
    )
    clipped1 = summary_dats[is_common]
    clipped2 = summary_dats[is_failure_prone]

    # extend() rather than a list comprehension used only for side effects.
    target_models.extend(clipped1['model'])
    target_models.extend(clipped2['model'])

# np.unique both de-duplicates and sorts the model names.
unique_target_models = np.unique(target_models)
print(unique_target_models)

summary_data/2014.csv
summary_data/2015.csv
summary_data/data_Q1_2016.csv
summary_data/data_Q2_2016.csv
summary_data/data_Q3_2016.csv
summary_data/data_Q4_2016.csv
['HGST HMS5C4040ALE640' 'HGST HMS5C4040BLE640' 'Hitachi HDS5C3030ALA630'
 'Hitachi HDS5C4040ALE630' 'Hitachi HDS722020ALA330' 'ST3000DM001'
 'ST31500341AS' 'ST31500541AS' 'ST32000542AS' 'ST4000DM000' 'ST4000DX000'
 'ST500LM012 HN' 'ST8000DM002' 'WDC WD10EACS' 'WDC WD10EADS'
 'WDC WD1600AAJS' 'WDC WD20EFRX' 'WDC WD30EFRX' 'WDC WD30EZRX'
 'WDC WD5000LPVX' 'WDC WD60EFRX']


In [7]:
# Build one survival table per target model.
#
# For every model, every daily CSV in every folder of DATA_FOLDERS is read,
# reduced to that model's rows by sift_data(), and the pieces are combined
# into `hd_all`, which is then aggregated to one row per serial number and
# written to OUTPUT_DIR/survival_<model>.csv.
#
# NOTE: the full data set is re-read once per model; this keeps memory
# bounded to one model's rows at the cost of repeated I/O.
start_time = time()
logger = start_logger()
logger.info("Unique models working over: %s", unique_target_models)

for model in unique_target_models:
    logger.info(" ### ### ### ### ### ### ### ### ### ### #### ### ###")
    logger.info("Working with: %s", model)
    logger.info("Elapsed time: %s seconds", np.round(time() - start_time, 1))
    print("Working with: %s" % model)

    # Frames are collected in lists and concatenated once; growing a
    # DataFrame with repeated .append() copies all rows on every call.
    subset_frames = []
    for data_dir in DATA_FOLDERS:
        data_path = DATA_DIR + data_dir + "/"
        logger.info(" * * *")
        logger.info("Loading and working with: %s", data_path)
        print("Loading and working with: %s" % data_path)

        file_frames = []
        # sorted() makes the processing order reproducible; the aggregation
        # performed afterwards does not depend on row order.
        for data_file in sorted(os.listdir(data_path)):
            # endswith() is safe for names without a dot (sub-directories,
            # README) and for names containing several dots.
            if not data_file.endswith('.csv'):
                continue
            # In TEST mode only the first 20 rows of each file are read.
            temp = pd.read_csv(
                data_path + data_file,
                header=0,
                nrows=20 if TEST else None,
            )
            file_frames.append(sift_data(temp, model))

        if not file_frames:
            logger.warning("No .csv files found in: %s", data_path)
            continue

        hd_subset = pd.concat(file_frames)
        logger.info("There are %d unique drives in the subset. ", hd_subset['serial_number'].nunique())
        logger.info("There are %d failures in the subset.", hd_subset['failure'].sum())
        subset_frames.append(hd_subset)

    if not subset_frames:
        logger.warning("No data loaded for model: %s", model)
        continue

    # The name `hd_all` is kept at notebook level on purpose: other cells may
    # refer to it.
    hd_all = pd.concat(subset_frames)
    logger.info("There are %d unique drives in the complete set. ", hd_all['serial_number'].nunique())
    logger.info("There are %d failures in the complete set.", hd_all['failure'].sum())

    survives = aggregate_by_serial(hd_all)

    # Spaces in model names are replaced so the file name has none.
    model_ns = model.replace(" ", "_")
    survives.to_csv(OUTPUT_DIR + "/survival_" + model_ns + ".csv", index = False)
        
        
#summary = summarize(hd)
#summary.to_csv(OUTPUT_DIR + "/" + THE_TIME + data_dir + ".csv", index = False)

Working with: HGST HMS5C4040ALE640
Loading and working with: data/2014/
Loading and working with: data/2015/
Loading and working with: data/data_Q1_2016/
Loading and working with: data/data_Q2_2016/
Loading and working with: data/data_Q3_2016/
Loading and working with: data/data_Q4_2016/
Working with: HGST HMS5C4040BLE640
Loading and working with: data/2014/
Loading and working with: data/2015/
Loading and working with: data/data_Q1_2016/
Loading and working with: data/data_Q2_2016/
Loading and working with: data/data_Q3_2016/
Loading and working with: data/data_Q4_2016/
Working with: Hitachi HDS5C3030ALA630
Loading and working with: data/2014/
Loading and working with: data/2015/
Loading and working with: data/data_Q1_2016/
Loading and working with: data/data_Q2_2016/
Loading and working with: data/data_Q3_2016/
Loading and working with: data/data_Q4_2016/
Working with: Hitachi HDS5C4040ALE630
Loading and working with: data/2014/
Loading and working with: data/2015/
Loading and workin