# Hard Drive Model Data Generator
* pulls out many features over all times for all drives of a given MODEL
* prepares data for time series machine learning
* writes a .csv (into OUTPUT_DIR) for each unique 'serial_number' of a given MODEL
* the output data features include all columns that contain the string FEATURE_COL
* generates many files! a file for each unique serial # of a given model!

In [26]:
import os
import pandas as pd
import numpy as np
from time import gmtime, strftime, time
import logging
import sys

In [None]:
# Drive model whose rows are extracted from the raw data.
MODEL = 'ST4000DM000'
# When True the main loop reads only the first 200 rows of every CSV file.
TEST = False
# Root directory that contains the folders listed in DATA_FOLDERS.
DATA_DIR = "data/"
DATA_FOLDERS = ["2014", "2015", "data_Q1_2016", "data_Q2_2016", "data_Q3_2016", "data_Q4_2016"]
# NaN threshold used by the cleaning functions: clean_data compares it with the
# number of NaNs in a row, clean_data_aggresive with the NaN fraction of a column.
NAN_FRAC = 12
# Substring that marks a column as a feature column (used by sift_data_dep).
FEATURE_COL = "raw"
# Columns kept by sift_data.
COLS = [u'date', u'failure', u'serial_number', u'smart_1_raw', u'smart_3_raw', u'smart_4_raw',
       u'smart_5_raw', u'smart_7_raw', u'smart_9_raw', u'smart_10_raw',
       u'smart_12_raw', u'smart_183_raw', u'smart_184_raw', u'smart_187_raw',
       u'smart_188_raw', u'smart_189_raw', u'smart_190_raw', u'smart_191_raw',
       u'smart_192_raw', u'smart_193_raw', u'smart_194_raw', u'smart_197_raw',
       u'smart_198_raw', u'smart_199_raw', u'smart_240_raw', u'smart_241_raw',
       u'smart_242_raw']
# UTC timestamp used to name the log file. Minutes (%M) are included so that
# two runs within the same hour cannot be confused with each other.
THE_TIME = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
# Directory that receives the log file and the per-serial-number CSV files.
OUTPUT_DIR = MODEL + "_data"

Sets up a custom logger that works with IPython. Note: unless stale handlers are removed first, re-running this code without restarting the IPython kernel attaches yet another handler to the logger on every run, so each log call writes a growing number of duplicate messages to the file.

In [None]:
def start_logger():
    """Recreate OUTPUT_DIR and return a logger that writes to a file inside it.

    Any existing OUTPUT_DIR is removed together with everything in it, then an
    empty directory is created. The logger named after this module gets a
    single file handler (level INFO) writing to ``OUTPUT_DIR/THE_TIME.log``,
    and the run configuration is logged as the first lines of that file.

    Calling the function again (for example when re-running the notebook cell
    without restarting the kernel) replaces the previous handler instead of
    adding another one, so messages are never written more than once.

    Returns:
        logging.Logger: the configured logger.
    """
    import shutil  # local import: only this function needs it

    logger = logging.getLogger(__name__)

    # Close and detach handlers left over from an earlier call before the
    # directory holding their log file is deleted.
    for stale_handler in list(logger.handlers):
        stale_handler.close()
        logger.removeHandler(stale_handler)

    # Start from an empty output directory. rmtree also removes hidden files
    # and sub-directories and does not depend on a POSIX shell.
    if os.path.isdir(OUTPUT_DIR):
        shutil.rmtree(OUTPUT_DIR)
    os.makedirs(OUTPUT_DIR)

    # The root logger level is lowered so that it does not filter out the
    # INFO records emitted through `logger`.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    logfile = os.path.join(OUTPUT_DIR, THE_TIME + ".log")
    handler = logging.FileHandler(logfile, 'w')
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    # Record the configuration of this run at the top of the log file.
    logger.info("MODEL\t=\t%s", MODEL)
    logger.info("TEST\t=\t%s", TEST)
    logger.info("DATA_DIR\t=\t%s", DATA_DIR)
    logger.info("OUTPUT_DIR\t=\t%s", OUTPUT_DIR)
    logger.info("THE_TIME\t=\t%s", THE_TIME)
    logger.info("COLS\t=\t%s", COLS)
    logger.info("NAN_FRAC\t=\t%s", NAN_FRAC)
    return logger

### Select the rows of the given MODEL and keep only the `date`, `serial_number`, `failure` and feature columns (names containing FEATURE_COL, or listed in COLS)
    

In [None]:
def sift_data_dep(data):
    """Return the rows of ``data`` for MODEL, restricted to feature columns.

    A column is kept when its name contains FEATURE_COL or when it is one of
    'serial_number', 'failure' or 'date'; every other column is left out.
    The input frame is not modified; a new frame is returned.

    Args:
        data: DataFrame with a 'model' column.

    Returns:
        DataFrame holding a copy of the selected rows and columns.
    """
    always_kept = ('serial_number', 'failure', 'date')
    model_rows = data[data['model'] == MODEL]
    wanted = [column for column in model_rows.columns
              if FEATURE_COL in column or column in always_kept]
    return model_rows[wanted].copy()

In [None]:
def sift_data(data):
    """Return the rows of ``data`` for MODEL, restricted to the columns in COLS.

    Columns whose name is not listed in COLS are left out. The input frame is
    not modified; a new frame is returned.

    Args:
        data: DataFrame with a 'model' column.

    Returns:
        DataFrame holding a copy of the selected rows and columns.
    """
    kept_columns = [name for name in data.columns if name in COLS]
    is_model = data['model'] == MODEL
    return data.loc[is_model, kept_columns].copy()

### Drop columns that are all NaN or whose NaN fraction reaches NAN_FRAC

In [None]:
 def clean_data_aggresive(data):
    """Drop uninformative columns and fill the remaining NaNs with 666.

    Steps, in order:
      1. drop columns that are entirely NaN;
      2. drop columns whose NaN fraction is at least NAN_FRAC;
      3. drop columns holding a single unique value (except 'failure');
      4. replace every remaining NaN with 666.

    Progress is reported through the module-level ``logger``.

    NOTE(review): a fraction never exceeds 1, so with a NAN_FRAC above 1
    step 2 never drops anything - confirm the intended threshold.

    Args:
        data: DataFrame to clean; it is not modified.

    Returns:
        DataFrame: the cleaned copy.
    """
    hd_clean = data.dropna(axis=1, how='all')

    # Dropping columns does not change the row count, so compute it once.
    row_count = len(hd_clean)
    for ikey in hd_clean.keys():
        column_nan_count = hd_clean[ikey].isnull().sum()
        # Explicit float division; an empty frame has no NaNs by definition.
        nan_fraction = float(column_nan_count) / row_count if row_count else 0.0
        logger.info("%.3f NaN fraction of:\t %s" % (nan_fraction, ikey))
        if nan_fraction >= NAN_FRAC:
            logger.info("Dropping column: %s" % ikey)
            hd_clean = hd_clean.drop(ikey, axis=1)

    for ikey in hd_clean.keys():
        column_unique_count = len(hd_clean[ikey].unique())
        logger.info("%d unique value count of:\t %s" % (column_unique_count, ikey))
        # A constant column carries no information; 'failure' is kept anyway.
        if column_unique_count == 1 and ikey != 'failure':
            logger.info("Dropping column: %s" % ikey)
            hd_clean = hd_clean.drop(ikey, axis=1)

    hd_clean = hd_clean.fillna(666)

    # Sanity check: nothing should be NaN after the fill above.
    for ikey in hd_clean.keys():
        column_nan_count = hd_clean[ikey].isnull().sum()
        if column_nan_count > 0:
            logger.warning("Warning! There shouldn't be any NaNs but there are in %s" % ikey)
    return hd_clean


In [None]:
 def clean_data(data):
    """Drop all-NaN columns and NaN-heavy rows, then fill remaining NaNs with 0.

    The critical cleaning question is how to deal with NaN. Here:
      1. columns that are entirely NaN are dropped;
      2. the number of NaNs in each row is stored in a new 'NaNs' column;
      3. rows with NAN_FRAC or more NaNs are dropped;
      4. every remaining NaN is replaced by 0.

    Rows are filtered by position rather than by index label, so frames
    whose index contains repeated labels (e.g. several CSV files
    concatenated together) only lose the rows that really exceed the limit.

    Args:
        data: DataFrame to clean; it is not modified.

    Returns:
        DataFrame: the cleaned copy, including the extra 'NaNs' column.
    """
    data = data.dropna(axis=1, how='all')  # if all NaN, really bad, just drop
    nan_counts = data.isnull().sum(axis=1).values  # NaNs per row, by position
    data = data.assign(NaNs=nan_counts)  # save the number of NaNs for later
    data = data[nan_counts < NAN_FRAC]  # NAN_FRAC or more NaNs: drop the row
    hd_clean = data.fillna(0)  # else fill with 666 or zero?
    return hd_clean


### main loop

In [None]:
# Main loop: for every data folder, load the CSV files, keep the rows and
# columns of interest, clean them and append the rows of each drive to
# OUTPUT_DIR/<serial_number>.csv.
logger = start_logger()
start_time = time()

# In TEST mode only the first 200 rows of each file are read.
row_limit = 200 if TEST else None

for data_dir in DATA_FOLDERS:
    data_dir = DATA_DIR + data_dir + "/"
    logger.info(" ### ### ### ### ### ### ### ### ### ### #### ### ###")
    logger.info("Working with: %s" % data_dir)
    logger.info("Elapsed time: %s seconds" % np.round(time() - start_time, 1))

    # Collect the sifted frames and concatenate once; growing a DataFrame
    # file by file is quadratic and DataFrame.append no longer exists in
    # pandas 2.x.
    frames = []
    for data_file in sorted(os.listdir(data_dir)):  # sorted: reproducible order
        # Skip hidden files and anything that is not a csv file.
        if data_file.startswith('.') or not data_file.endswith('.csv'):
            continue
        temp = pd.read_csv(data_dir + data_file, header=0, nrows=row_limit)
        frames.append(sift_data(temp))

    if not frames:
        logger.warning("No csv files found in %s, skipping" % data_dir)
        continue

    hd = pd.concat(frames)
    frames = None  # release the per-file frames before cleaning

    logger.info("Working on: %s" % data_dir)
    logger.info("Shape of this data: %s" % str(np.shape(hd)))
    logger.info("There are %d unique drives. " % len(hd['serial_number'].unique()))
    logger.info("There are %d failures." % hd['failure'].sum())

    clean = clean_data(hd)
    hd = None  # release the raw frame; only the cleaned copy is needed
    logger.info("There are %d unique drives after cleaning. " % len(clean['serial_number'].unique()))
    logger.info("There are %d failures after cleaning." % clean['failure'].sum())

    logger.info("Writing serial csv files from %s" % data_dir)

    # Grouping by a scalar key yields scalar group names on every pandas
    # version (a one-element list yields 1-tuples on pandas 2.x).
    for serial_number, individual_serial_data in clean.groupby('serial_number'):
        fname = OUTPUT_DIR + "/" + str(serial_number) + '.csv'
        individual_serial_data = individual_serial_data.drop('serial_number', axis=1)
        individual_serial_data = individual_serial_data.sort_values(by=['smart_9_raw'])
        # Append mode: a drive seen in several folders accumulates in one file.
        individual_serial_data.to_csv(path_or_buf=fname, sep=',', header=True, index=False, mode='a',
                                      encoding='utf-8')
        # The trailing "#" has no newline, so the header line written by a
        # later append starts with "#" - presumably so that readers can skip
        # the repeated headers as comments.
        with open(fname, 'a') as f:
            f.write("#")


