# Hard Drive Model Summary Generator
* makes a summary of hard drive models for each data period
* writes a .csv into OUTPUT_DIR for each folder in DATA_FOLDERS 
* the output data summary columns might include:
 ['model', 'size_tb', 'drive_count', 'days_observed', 'days_elapsed','runtime_total', 'runtime_elapsed', 'failure_rate', 'percent_total', 'failures']
 
* count is the number of drives of a given model, in particular the total count of unique serial_numbers that pandas aggregated over
* obs_days is the sum over all drives of max(date) - min(date)
* obs_runtime is the sum over all drives of max(runtime) - min(runtime)
* runtime is the sum over all drives of max(runtime)
* obs_failure_rate is `100.0*nfailures/(obs_runtime/8760.0)` 
* failure rate is `100.0*nfailures/(runtime/8760.0)`, which really makes no sense because it assumes complete observation coverage
* a unique drive is a unique serial_number
* if we had complete observation coverage of a drive since the beginning, then obs_runtime would be equal to runtime
* the annual failure rate of a drive can be greater than 100%, for example in Q1 of 2016 model ST320LT007 had 31 failures out of 73 drives in the 64 days of the quarter, thus the failure rate was approximately 242% `(100*31*365/(73*64)=242)` if we assume that each drive had 24 hour uptime and we had complete observation coverage.

 Observation coverage would be the number of days in the obs period times the number of drives / days, which is often greater than 1. This should not be so; do duplicates need to be removed? Likely!
 
An outstanding confusion in the data is why the number of drives, count, times the number of days in the obs period is less than both obs_days and days

Why is `count*24*(number of days in obs period)` less than both obs_runtime and runtime?

In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from time import localtime, strftime
import logging
import sys

In [2]:
# When True, only the first 1000 rows of every CSV file are read (quick dry run).
TEST = False

# Root directory; each entry of DATA_FOLDERS is a sub-folder of it.
DATA_DIR = "data/"
DATA_FOLDERS = [
    "2014",
    "2015",
    "data_Q1_2016",
    "data_Q2_2016",
    "data_Q3_2016",
    "data_Q4_2016",
]
# To process a single period only, use e.g.: DATA_FOLDERS = ["data_Q2_2016"]

# Columns kept from the raw CSV files; every other column is dropped.
FEATURE_COLS = [
    'date',
    'capacity_bytes',
    'smart_9_raw',
    'model',
    'failure',
    'serial_number',
]

# Timestamp taken once at start-up; used to name the log file.
THE_TIME = strftime("%Y-%m-%d-%H-%M", localtime())

# Directory receiving the log file and the summary files.
OUTPUT_DIR = "summary_data"

In [3]:
def start_logger():
    """Create the output directory, attach a log file and log the settings.

    The log file is ``OUTPUT_DIR/THE_TIME.log``. The root logger is set to
    DEBUG; the file handler itself only records INFO and above.

    Calling this function more than once (for example when re-running a
    notebook cell) does not attach a second handler for the same file, so
    log lines are not duplicated.

    Returns:
        The module-level ``logging.Logger`` with the file handler attached.
    """
    # Create the directory through the os module instead of shelling out to
    # ``mkdir``: portable, no shell quoting issues, and failures raise.
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    logging.getLogger().setLevel(logging.DEBUG)
    logger = logging.getLogger(__name__)

    logfile = OUTPUT_DIR + '/' + THE_TIME + ".log"
    # FileHandler stores the absolute path in ``baseFilename``.
    logfile_abs = os.path.abspath(logfile)
    already_attached = any(
        isinstance(existing, logging.FileHandler)
        and existing.baseFilename == logfile_abs
        for existing in logger.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(logfile, 'w')
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)

    # Record the run configuration at the top of the log.
    logger.info("TEST\t\t=\t%s", TEST)
    logger.info("DATA_DIR\t=\t%s", DATA_DIR)
    logger.info("DATA_FOLDERS\t=\t%s", DATA_FOLDERS)
    logger.info("FEATURE_COLS\t=\t%s", FEATURE_COLS)
    logger.info("THE_TIME\t=\t%s", THE_TIME)
    logger.info("OUTPUT_DIR\t=\t%s", OUTPUT_DIR)
    return logger

In [4]:
def sift_data(data):
    """Drop, in place, every column of ``data`` that is not in FEATURE_COLS.

    Args:
        data: A mapping-like table (e.g. a pandas DataFrame) supporting
            ``keys()`` and ``del data[key]``.

    Returns:
        The same ``data`` object, with the unwanted columns removed.
    """
    # Take a snapshot of the keys to remove first: deleting entries while
    # iterating the live key view is unsafe (a plain dict raises
    # RuntimeError, and other containers only work by accident).
    unwanted = [key for key in data.keys() if key not in FEATURE_COLS]
    for key in unwanted:
        del data[key]
    return data


def tb_capacity(x):
    """Convert a capacity in bytes to terabytes (1 TB = 10**12 bytes).

    The decimal definition of a terabyte is used here, not the binary
    2**40 one.

    Args:
        x: Capacity in bytes (int or float; NaN is passed through as NaN).

    Returns:
        The capacity in terabytes as a float.
    """
    # A float literal avoids the integer overflow np.power(10, 12) suffers
    # where NumPy's default integer is 32-bit, and avoids rebuilding the
    # constant through NumPy on every per-row call.
    bytes_per_tb = 1e12
    return x / bytes_per_tb

In [5]:
def summarize(data):
    """Build a per-model summary table from per-day drive observations.

    ``data`` needs the columns 'model', 'serial_number', 'date',
    'capacity_tb', 'smart_9_raw' and 'failure'. Rows are first reduced to
    one record per (model, serial_number) drive, then the drives of each
    model are combined into one output row.

    Output columns:
        model            -- the model name
        size_tb          -- most common per-drive mean of 'capacity_tb'
                            (smallest value on ties, NaN if none available)
        drive_count      -- number of distinct drives of the model
        days_observed    -- sum over drives of non-null 'date' rows
        days_elapsed     -- sum over drives of max(date) - min(date)
        runtime_total    -- sum over drives of max(smart_9_raw)
        runtime_elapsed  -- sum over drives of max - min of smart_9_raw
        failure_rate     -- 100 * failures / (runtime_elapsed / 8760), or
                            NaN when runtime_elapsed is zero
        percent_total    -- drive_count divided by the number of distinct
                            serial numbers in ``data`` (a fraction)
        failures         -- sum of 'failure' over the model's drives

    Args:
        data: pandas DataFrame of observations.

    Returns:
        A pandas DataFrame with one row per model, ordered by model name.
    """
    summary_cols = ['model', 'size_tb', 'drive_count', 'days_observed',
                    'days_elapsed', 'runtime_total', 'runtime_elapsed',
                    'failure_rate', 'percent_total', 'failures']
    if len(data) == 0:
        return pd.DataFrame([], columns=summary_cols)

    # One record per drive. Each reduction is computed on its own column so
    # this works on every pandas version (the nested-dict renaming form of
    # ``agg`` was removed in pandas 1.0), and pandas' max/min skip missing
    # values, unlike the Python built-ins.
    grouped = data.groupby(['model', 'serial_number'])
    runtime_max = grouped['smart_9_raw'].max()
    per_drive = pd.DataFrame({
        'failure': grouped['failure'].sum(),
        'mean_cap': grouped['capacity_tb'].mean(),
        'days_obs': grouped['date'].count(),
        'days_elap': grouped['date'].max() - grouped['date'].min(),
        'runtime_max': runtime_max,
        'runtime_elap': runtime_max - grouped['smart_9_raw'].min(),
    })

    ntot_drives = float(data['serial_number'].nunique())

    # Collect plain rows and build the frame once; appending to a DataFrame
    # inside the loop is quadratic and DataFrame.append no longer exists in
    # pandas 2.x.
    rows = []
    for model, drives in per_drive.groupby(level='model'):
        drive_count = len(drives)
        nfailures = drives['failure'].sum()
        runtime_elap = drives['runtime_elap'].sum()
        # Annualised failure rate in percent: 8760 hours per year.
        if runtime_elap != 0:
            failrate = 100.0 * nfailures / (runtime_elap / 8760.0)
        else:
            # A real NaN keeps the column numeric (the string 'NaN' did not).
            failrate = np.nan
        # Series.mode() returns the modes sorted ascending and ignores NaN.
        cap_modes = drives['mean_cap'].mode()
        size_tb = cap_modes.iloc[0] if len(cap_modes) else np.nan
        rows.append([model, size_tb, drive_count,
                     drives['days_obs'].sum(), drives['days_elap'].sum(),
                     drives['runtime_max'].sum(), runtime_elap, failrate,
                     drive_count / ntot_drives, nfailures])

    return pd.DataFrame(rows, columns=summary_cols)

In [6]:
# Driver: for every data folder, load its CSV files, log a few totals and
# write one summary CSV per folder (or one per quarter when the folder spans
# several quarters). Finally write the per-folder totals to a text file.
logger = start_logger()

total_summary = []
for data_dir in DATA_FOLDERS:
    data_path = DATA_DIR + data_dir + "/"
    logger.info(" * * *")
    logger.info("Loading and working with: %s", data_path)
    logger.info("Current time: %s", strftime("%Y-%m-%d-%H-%M", localtime()))

    # In TEST mode only the first 1000 rows of each file are read.
    row_limit = 1000 if TEST else None
    frames = []
    for data_file in os.listdir(data_path):
        # endswith() is safe for names without a dot, where indexing the
        # result of split('.') would raise IndexError.
        if not data_file.endswith('.csv'):
            continue
        temp = pd.read_csv(data_path + data_file, header=0, nrows=row_limit)
        frames.append(sift_data(temp))
    if not frames:
        logger.warning("No .csv files found in %s; skipping.", data_path)
        continue
    # Concatenate once: appending frame by frame is quadratic, and
    # DataFrame.append no longer exists in pandas 2.x.
    hd = pd.concat(frames)

    hd.capacity_bytes = hd.capacity_bytes.map(tb_capacity)
    hd.rename(columns={'capacity_bytes': 'capacity_tb'}, inplace=True)
    hd['date'] = pd.to_datetime(hd['date'], format='%Y-%m-%d')
    _sn = hd['serial_number'].nunique()
    _models = hd['model'].nunique()
    _dates = hd['date'].nunique()
    _fail = hd['failure'].sum()
    logger.info("There are %d unique drives. ", _sn)
    logger.info("There are %d unique models. ", _models)
    logger.info("There are %d unique dates. ", _dates)
    logger.info("There are %d failures.", _fail)
    # Plain ints keep the text output free of NumPy scalar reprs.
    total_summary.append([data_dir, int(_sn), int(_dates), int(_models), int(_fail)])

    hd["quarter"] = hd['date'].dt.quarter
    quarters = hd['quarter'].unique()
    logger.info("There are %d quarters.", len(quarters))
    if len(quarters) > 1:
        # Folder spans several quarters: one summary file per quarter.
        for q in quarters:
            qhd = hd[hd['quarter'] == q]
            summary = summarize(qhd)
            summary.to_csv(OUTPUT_DIR + "/" + "Q" + str(q) + "_" + data_dir + ".csv", index=False)
    else:
        summary = summarize(hd)
        summary.to_csv(OUTPUT_DIR + "/" + data_dir + ".csv", index=False)

# One record per line; the context manager closes the file even on error.
with open(OUTPUT_DIR + "/total_summary.txt", 'w') as f:
    for item in total_summary:
        f.write(str(item) + "\n")