# Hard Drive Model Data Generator
* pulls out many features over all times for all drives of a given MODEL
* prepares data for time series machine learning
* writes a .csv (into OUTPUT_DIR) for each unqiue 'serial_number' of a given MODEL
* the output data features include all columns that contain the string FEATURE_COL
* note: generates many files!

In [1]:
import os
import pandas as pd
import numpy as np
from time import gmtime, strftime, time
import logging
import sys

In [2]:
MODEL = 'ST4000DM000'
TEST = False
DATA_DIR = "data/"
DATA_FOLDERS = ["2014","2015","data_Q1_2016","data_Q2_2016","data_Q3_2016","data_Q4_2016"]
NAN_FRAC = .25 
FEATURE_COL = "raw"
THE_TIME = strftime("%Y-%m-%d-%H-%S", gmtime())
OUTPUT_DIR = MODEL + "_data"

Sets up a custom logger that will work with ipython, repeated calls to this piece of code without restarting the IPython kernel will result in creating and attaching yet another handler to the logger on every run and the number of messages being logged to the file with each log call will grow

In [3]:
def start_logger():
    if os.path.isdir(OUTPUT_DIR):
        os.system("rm " + OUTPUT_DIR + "/*")
        os.system("rmdir " + OUTPUT_DIR)
    os.system("mkdir " + OUTPUT_DIR)
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    logger = logging.getLogger(__name__)
    logfile = OUTPUT_DIR + '/' + THE_TIME + ".log"
    handler = logging.FileHandler(logfile, 'w')
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)
    logger.info("MODEL\t=\t" + MODEL)
    logger.info("TEST\t=\t" + str(TEST))
    logger.info("DATA_DIR\t=\t" + DATA_DIR)
    logger.info("OUTPUT_DIR\t=\t" + OUTPUT_DIR)
    logger.info("THE_TIME\t=\t" + THE_TIME)
    logger.info("FEATURE_COL\t=\t" + FEATURE_COL)
    logger.info("NAN_FRAC\t=\t" + str(NAN_FRAC))
    return logger

### select specific MODEL and drop columns that do not contain FEATURE_COL, are date or are serial
    

In [4]:
def sift_data(data):
    sifted_data = pd.DataFrame.copy(data[data['model'] == MODEL])
    for ikey in sifted_data.keys():
        if FEATURE_COL not in ikey and ikey != 'serial_number' and ikey != 'failure' and ikey != 'date':
                del sifted_data[ikey]
    return sifted_data

### drop columns that are all NaN or have more than NAN_FRACTION

In [5]:
 def clean_data(data):
    hd_clean = data.dropna(axis=1, how='all')
    for ikey in hd_clean.keys():
        column_nan_count = hd_clean[ikey].isnull().sum()
        logger.info("%d NaN fraction of:\t %s" % (column_nan_count/len(hd_clean), ikey))
        if column_nan_count / len(hd_clean) >= NAN_FRAC:
            logger.info("Dropping column: %s" % ikey)
            hd_clean = hd_clean.drop(ikey, 1)#, inplace=True)

    for ikey in hd_clean.keys():
        column_unique_count = len(hd_clean[ikey].unique())
        logging.info("%d unique value count of:\t %s" % (column_unique_count, ikey))
        #if column_unique_count == 1 and ikey != 'failure':
        #        logging.info("Dropping column: %s" % ikey)
        #        hd_clean = hd_clean.drop(ikey, 1)#, inplace=True)

    hd_clean = hd_clean.fillna(666)

    for ikey in hd_clean.keys():
        column_nan_count = hd_clean[ikey].isnull().sum()
        if column_nan_count > 0:
            logger.warning("Warning! There shouldn't be any NaNs but there are in %s" % ikey)
    return hd_clean


### main loop

In [6]:
logger = start_logger()
start_time = time()

for data_dir in DATA_FOLDERS:
    data_dir = DATA_DIR + data_dir + "/"
    hd = pd.DataFrame()
    #logger.info("* * * \nLoading: %s" % data_dir)
    logger.info(" ### ### ### ### ### ### ### ### ### ### #### ### ###")
    logger.info("Working with: %s" % data_dir)
    logger.info("Elapsed time: %s seconds" % np.round(time() - start_time,1))
    #print("Working with: %s" % model)
    
    
    
    for data_file in os.listdir(data_dir):
        if data_file.split('.')[1] == 'csv':
            if TEST:
                temp = pd.read_csv(data_dir+data_file, header=0, nrows=200)
                temp = sift_data(temp)
            else:
                temp = pd.read_csv(data_dir + data_file, header=0)
                temp = sift_data(temp)
            hd = hd.append(temp)

    logger.info("Working on: %s" % data_dir)
    logger.info("Shape of this data: %s" % str(np.shape(hd)))
    logger.info("There are %d unique drives. " % len(hd['serial_number'].unique()))    
    logger.info("There are %d failures." % hd['failure'].sum())

    clean = clean_data(hd)
    
    logger.info("There are %d unique drives after cleaning. " % len(clean['serial_number'].unique()))
    logger.info("There are %d failures after cleaning." % clean['failure'].sum())
    
    serial_grouped = clean.groupby(['serial_number'])
    serial_grouped = list(serial_grouped)

    logger.info("Writing serial csv files from %s" % data_dir)

    for i in range((len(serial_grouped))):
        fname = OUTPUT_DIR + "/" + serial_grouped[i][0] + '.csv'
        individual_serial_data = serial_grouped[i][1]
        if os.path.exists(fname):
            f = open(fname, 'a')
        else:
            f = open(fname, 'w')
        f.write("#" + str(np.shape(individual_serial_data)[0]) + ",")
        f.write(str(np.shape(individual_serial_data)[1]) + ",")
        f.write(data_dir + "\n#")
        f.close()
        del individual_serial_data['serial_number']
        individual_serial_data = individual_serial_data.sort_values(by=['smart_9_raw'])
        individual_serial_data.to_csv(path_or_buf=fname, sep=',', header=True, index=False, mode='a',
                                      encoding='utf-8')
        #if TEST:
        #    if i >= 10:
        #        break


#log_file.close()

INFO:root:364 unique value count of:	 date
INFO:root:12920 unique value count of:	 serial_number
INFO:root:2 unique value count of:	 failure
INFO:root:3186706 unique value count of:	 smart_1_raw
INFO:root:2 unique value count of:	 smart_3_raw
INFO:root:123 unique value count of:	 smart_4_raw
INFO:root:557 unique value count of:	 smart_5_raw
INFO:root:3111730 unique value count of:	 smart_7_raw
INFO:root:13646 unique value count of:	 smart_9_raw
INFO:root:2 unique value count of:	 smart_10_raw
INFO:root:103 unique value count of:	 smart_12_raw
INFO:root:3 unique value count of:	 smart_15_raw
INFO:root:429 unique value count of:	 smart_183_raw
INFO:root:7 unique value count of:	 smart_184_raw
INFO:root:463 unique value count of:	 smart_187_raw
INFO:root:1731 unique value count of:	 smart_188_raw
INFO:root:671 unique value count of:	 smart_189_raw
INFO:root:29 unique value count of:	 smart_190_raw
INFO:root:2 unique value count of:	 smart_191_raw
INFO:root:73 unique value count of:	 smart