# Hard Drive Model Summary Generator
* Builds a summary of hard drive models for each data period.
* Writes one .csv file (into OUTPUT_DIR) for each folder listed in DATA_FOLDERS.
* The output summary has one row per drive model, with these columns:
 ['model', 'size', 'count', 'days', 'obs_days', 'runtime', 'obs_runtime','failure_rate', 'obs_failure_rate', 'percent_total', 'failures']

In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from time import localtime, strftime
import logging
import sys

In [2]:
# When True, only the first 10000 rows of every .csv file are loaded.
TEST = False
# Root directory; each entry of DATA_FOLDERS is a sub-folder of it.
DATA_DIR = "data/"
# One summary .csv is produced per folder listed here.
DATA_FOLDERS = ["2014","2015","data_Q1_2016","data_Q2_2016","data_Q3_2016","data_Q4_2016"]
#DATA_FOLDERS = ["data_Q1_2016","data_Q2_2016"]
# Columns kept from the raw files; every other column is dropped on load.
FEATURE_COLS = ['date','capacity_bytes','smart_9_raw','model','failure','serial_number']
# Run timestamp used to name the log file. Minutes (%M) are included so that
# two runs started within the same hour cannot end up with colliding or
# mis-ordered names (the previous format went straight from hour to second).
THE_TIME = strftime("%Y-%m-%d-%H-%M-%S", localtime())
# Directory receiving the log file and the per-folder summary .csv files.
OUTPUT_DIR = "summary_data"

In [3]:
def start_logger():
    """
    Create OUTPUT_DIR if needed and return a logger writing to a per-run file.

    The log file is OUTPUT_DIR/<THE_TIME>.log and is opened in 'w' mode. The
    current configuration constants are written to it as the first entries.

    Calling this function again (e.g. re-running the notebook cell) replaces
    the file handler from the previous call instead of stacking a second one,
    so each message is written once and the old file handle is closed.

    Returns:
        logging.Logger: the module-level logger with the file handler attached.
    """
    # Create the directory from Python rather than shelling out to `mkdir`,
    # which is not portable and breaks on names containing spaces.
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    logger = logging.getLogger(__name__)
    logfile = OUTPUT_DIR + '/' + THE_TIME + ".log"

    # Drop any handler left over from an earlier call that targets this file.
    target = os.path.abspath(logfile)
    for stale in list(logger.handlers):
        if isinstance(stale, logging.FileHandler) and stale.baseFilename == target:
            logger.removeHandler(stale)
            stale.close()

    handler = logging.FileHandler(logfile, 'w')
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    logger.info("TEST\t\t=\t" + str(TEST))
    logger.info("DATA_DIR\t=\t" + DATA_DIR)
    logger.info("DATA_FOLDERS\t=\t" + str(DATA_FOLDERS))
    logger.info("FEATURE_COLS\t=\t" + str(FEATURE_COLS))
    logger.info("THE_TIME\t=\t" + THE_TIME)
    logger.info("OUTPUT_DIR\t=\t" + OUTPUT_DIR)
    return logger

In [4]:
def sift_data(data):
    """
    Remove, in place, every column of `data` that is not in FEATURE_COLS.

    Args:
        data: a DataFrame (or any mapping supporting keys() and `del`).

    Returns:
        The same `data` object, now holding only the FEATURE_COLS columns
        that were present in it.
    """
    # Decide what to drop before deleting anything: removing entries while
    # iterating over the live keys() view is fragile and fails on plain dicts.
    unwanted = [key for key in data.keys() if key not in FEATURE_COLS]
    for key in unwanted:
        del data[key]
    return data


def tb_capacity(x):
    """
    Convert a capacity in bytes to decimal terabytes (1 TB = 10**12 bytes).

    Args:
        x: capacity in bytes (a number, or anything supporting division by
           a float). NaN is passed through as NaN.

    Returns:
        x divided by 10**12, as a float.
    """
    # A plain float constant avoids np.power(10, 12), which overflows on
    # platforms where numpy's default integer is 32 bits wide, and it also
    # guarantees true (non-truncating) division.
    bytes_per_tb = 1e12
    return x / bytes_per_tb

In [None]:
def _annualized_failure_rate(failures, hours):
    """Return 100 * failures / (hours / 8760), or NaN when `hours` is zero."""
    if hours == 0:
        return np.nan
    return 100.0 * failures / (hours / 8760.0)


def summarize(data):
    """
    Build a per-model summary of the drive observations in `data`.

    Observations are first reduced to one record per (model, serial_number)
    drive, then the drives of each model are combined into one output row.

    Args:
        data: DataFrame with the columns 'model', 'serial_number', 'failure',
              'capacity_tb', 'date' and 'smart_9_raw'. 'date' must support
              max() - min() (e.g. a datetime column).

    Returns:
        DataFrame with one row per model (sorted by model) and the columns:
            model            - the model name
            size             - most common per-drive mean of 'capacity_tb'
            count            - number of drives of this model
            days             - total number of dated observations
            obs_days         - sum over drives of (last date - first date)
            runtime          - sum over drives of the largest 'smart_9_raw'
            obs_runtime      - sum over drives of (max - min) 'smart_9_raw'
            failure_rate     - 100 * failures / (runtime / 8760)
            obs_failure_rate - 100 * failures / (obs_runtime / 8760)
            percent_total    - count / number of distinct serial numbers
                               (a fraction between 0 and 1)
            failures         - sum of 'failure'
        A rate whose denominator is zero is NaN (a real float NaN, so the
        rate columns stay numeric). Empty input gives an empty frame with
        the same columns.
    """
    summary_cols = ['model', 'size', 'count', 'days', 'obs_days', 'runtime',
                    'obs_runtime', 'failure_rate', 'obs_failure_rate',
                    'percent_total', 'failures']
    if len(data) == 0:
        return pd.DataFrame([], columns=summary_cols)

    # One record per physical drive. Each statistic is computed with a plain
    # column-wise groupby reduction; the nested "renaming" dict form of
    # .agg() is rejected by current pandas releases.
    drives = data.groupby(['model', 'serial_number'])
    dates = drives['date']
    hours = drives['smart_9_raw']
    per_drive = pd.DataFrame({
        'failure': drives['failure'].sum(),
        'mean_cap': drives['capacity_tb'].mean(),
        'days': dates.count(),
        'obs_days': dates.max() - dates.min(),
        'runtime': hours.max(),
        'obs_runtime': hours.max() - hours.min(),
    })

    ntot_drives = float(data['serial_number'].nunique())

    # Collect the rows and build the frame once; growing a DataFrame row by
    # row is quadratic and DataFrame.append no longer exists in pandas 2.x.
    rows = []
    for model, model_drives in per_drive.groupby(level='model'):
        drive_count = len(model_drives)
        nfailures = model_drives['failure'].sum()
        runtime = model_drives['runtime'].sum()
        obs_runtime = model_drives['obs_runtime'].sum()

        # Series.mode() is sorted, so on ties the smallest capacity wins;
        # it is empty when every capacity is missing.
        sizes = model_drives['mean_cap'].mode()
        size = sizes.iloc[0] if len(sizes) else np.nan

        rows.append([
            model,
            size,
            drive_count,
            model_drives['days'].sum(),
            model_drives['obs_days'].sum(),
            runtime,
            obs_runtime,
            # Annualized failure rate: 100 * Failures / (Drive Hours / 8760)
            _annualized_failure_rate(nfailures, runtime),
            _annualized_failure_rate(nfailures, obs_runtime),
            drive_count / ntot_drives,
            nfailures,
        ])

    return pd.DataFrame(rows, columns=summary_cols)

In [None]:
# Driver: for every folder in DATA_FOLDERS, load all of its .csv files,
# reduce them to FEATURE_COLS, and write a per-model summary .csv.
logger = start_logger()

# In TEST mode only the first 10000 rows of each file are read.
row_limit = 10000 if TEST else None

for data_dir in DATA_FOLDERS:
    data_path = DATA_DIR + data_dir + "/"
    logger.info(" * * *")
    logger.info("Loading and working with: %s", data_path)

    # Gather the per-file frames and concatenate once: appending to a growing
    # DataFrame copies all prior rows each time, and DataFrame.append has
    # been removed from pandas 2.x.
    frames = []
    for data_file in sorted(os.listdir(data_path)):
        # Skip hidden entries and anything that is not a .csv file
        # (including names that contain no '.' at all).
        if data_file.startswith('.') or not data_file.endswith('.csv'):
            continue
        frame = pd.read_csv(os.path.join(data_path, data_file), header=0, nrows=row_limit)
        frames.append(sift_data(frame))

    if not frames:
        logger.warning("No .csv files found in %s; skipping.", data_path)
        continue

    hd = pd.concat(frames, ignore_index=True)
    hd['capacity_bytes'] = hd['capacity_bytes'].map(tb_capacity)
    hd.rename(columns={'capacity_bytes': 'capacity_tb'}, inplace=True)
    # Parse the whole column at once rather than one element at a time.
    hd['date'] = pd.to_datetime(hd['date'])

    logger.info("There are %d unique drives. ", hd['serial_number'].nunique())
    logger.info("There are %d unique models. ", hd['model'].nunique())
    logger.info("There are %d unique dates. ", hd['date'].nunique())
    logger.info("There are %d failures.", hd['failure'].sum())
    summary = summarize(hd)

    # The output name depends only on the folder, so a re-run overwrites the
    # previous summary. Prefix the name with THE_TIME to keep one per run:
    #summary.to_csv(OUTPUT_DIR + "/" + THE_TIME + "-" + data_dir + ".csv", index = False)
    summary.to_csv(OUTPUT_DIR + "/" + data_dir + ".csv", index=False)

