<b style="font-size:150%;"> This script makes a statistical analysis of the hard drive disks data. <br>
</b> 

In [1]:
import numpy as np
import pandas as pd
import scipy.stats
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import ticker
import seaborn
%matplotlib inline

In [2]:
## matplotlib settings
matplotlib.style.use('ggplot')
#plt.rcParams['figure.figsize']=16,10
plt.rcParams.update({'font.size': 12, 
                            'font.family': 'STIXGeneral', 
                            'mathtext.fontset': 'stix'})

In [3]:
# seaborn settings
seaborn.set_palette('deep', desat=.6)

In [4]:
# reading data set
data= pd.read_csv('hard_drive_data_2015.csv')
data= data.drop('Unnamed: 0', 1)

<p style= "font-size: 120%;">
Failed disks:
</p>


In [5]:
fail= pd.DataFrame(data[data.failure==1]).reset_index()
fail= fail.drop('index', 1)

<p style= "font-size: 120%;">
Success disks:
</p>



In [6]:
success= data.groupby(['serial_number']).filter(lambda x: x['failure'].sum()==0) 
succ= pd.DataFrame(success.groupby(['serial_number']).tail(n=1)).reset_index()
succ= succ.drop('index', 1)

<b style="font-size:150%;"> I PART: Quantifying difference between distributions  <br>
of failed and success disks
</b> 

<p style="font-size:120%;"> The Cohen's distance is the best measurement to quantify differences between the distributions.
</p> 



In [7]:
class Cohen_distance(object):
    '''
    Input: specific S.M.A.R.T attribute of failed and success disks
    '''
    #def __init__(self, dist1, dist2):
    #    self.dist1= dist1
    #    self.dist2= dist2

    def distance(self, dist1, dist2):
        diff= abs(dist1.mean() - dist2.mean() )
        n1, n2= len(dist1), len(dist2)
        var1, var2= dist1.var(), dist2.var()
        denominator= ((n1-1)*var1 + (n2-1)*var2)/ (n1+n2)
        d= diff/np.sqrt(denominator)
        return d

    def overlap_superiority(self, dist1, dist2):
        control_sample= dist1
        treatment_sample= dist2
        thresh = (control_sample.mean() + treatment_sample.mean()) / 2
        control_above= sum(control_sample>thresh)/len(control_sample)
        treatment_below= sum(treatment_sample <thresh)/ len(treatment_sample)
        overlap= control_above+ treatment_below
        superiority= np.size([x for x,y in zip(treatment_sample, control_sample) if x>y ])/len(treatment_sample)
        return overlap, superiority
    
    def get_effect_size(self, dist1, dist2):
        distance= self.distance(dist1, dist2)
        overl_prob, sup_prob= self.overlap_superiority(dist1, dist2)
        return distance, overl_prob, sup_prob

In [8]:
statistics= []
stats= Cohen_distance()

statistics.append( stats.get_effect_size(fail.smart_1_normalized, succ.smart_1_normalized))
statistics.append( stats.get_effect_size(fail.smart_3_raw/1000., succ.smart_3_raw/1000.)) # secs
statistics.append( stats.get_effect_size(fail.smart_5_raw, succ.smart_5_raw))
statistics.append( stats.get_effect_size(fail.smart_9_raw/(24*365.), succ.smart_9_raw/(24*365.) )) # hours
statistics.append( stats.get_effect_size(fail.smart_12_raw, succ.smart_12_raw ))
statistics.append( stats.get_effect_size(fail.smart_194_raw, succ.smart_194_raw ))

statistics= pd.DataFrame(statistics, columns=['Cohens_distance', 'overlap_prob', 'superiority_prob'])
statistics['SMART_attribute']= ['error_rate', 'spin-up_time', 'bad_sectors_count', 'running_time', 'power_cycle_count', 'internal_temp']
statistics.sort_values(by='Cohens_distance', ascending= False)

Unnamed: 0,Cohens_distance,overlap_prob,superiority_prob,SMART_attribute
2,0.76192,1.138154,0.003416,bad_sectors_count
0,0.410913,1.07595,0.013486,error_rate
1,0.220472,1.038011,0.003221,spin-up_time
3,0.09597,1.052287,0.016837,running_time
5,0.052604,0.967871,0.010444,internal_temp
4,0.033592,1.159403,0.018545,power_cycle_count


<p style="font-size:120%;"> According to these results, the SMART attributes where there is more difference between failed and success disks are: <br> 
-- the bad sectors count <br> 
-- error rate <br>
-- the spin-up time. 
</p> 



<b style="font-size:150%;"> PART II: Quantifying the precision of the measurement  <br>
of the Cohen's distances.
</b> 

<p style="font-size:120%;"> Here I will compute the confidence intervals.
</p> 

In [9]:
class CohenSampler(Cohen_distance):
    '''
    Makes a sampling distribution of sampling distances.
    Uses Boostraap
    '''
    def resample(self, dist1, dist2):
        group1= np.random.choice(dist1, len(dist1), replace=True)
        group2= np.random.choice(dist2, len(dist2), replace=True)
        return self.distance(group1, group2)
         
    
    def compute_sample_stats(self, dist1, dist2, niters= 1000):
        sampling_dist= [self.resample(dist1, dist2) for i in range(niters)]
        return np.array(sampling_dist)

In [10]:
sampling= CohenSampler()
s= sampling.compute_sample_stats(fail.smart_194_raw, succ.smart_194_raw )
s

array([             nan,   7.64628398e-02,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
         5.15807399e-02,              nan,              nan,
                    nan,              nan,              nan,
         1.90880212e-02,              nan,   1.70588761e-02,
                    nan,              nan,   2.22725722e-02,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,              nan,              nan,
                    nan,