# HD_Survival_Analysis
Generates survival fits with the lifelines package and saves those fits as data files.

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import lifelines as lil
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
import os
import sys
from bokeh.plotting import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
from bokeh.models import Range1d
output_notebook()

In [None]:
# Notebook-wide configuration.
# TEST and DATA_DIR are not referenced by the cells visible in this notebook.
TEST = True
DATA_DIR = "data/"
# Base names of the per-period summary files read from SUMMARY_DIR
# (each is opened as SUMMARY_DIR + name + '.csv').
# An earlier, immediately overwritten assignment of this list was removed
# because it had no effect.
# NOTE(review): "Q1_2014" is listed four times and "Q2_2015" twice -- this
# looks like a copy-paste slip for Q1..Q4 of each year; confirm against the
# files actually present in SUMMARY_DIR before changing it.
DATA_FOLDERS = ["Q1_2014","Q1_2014","Q1_2014","Q1_2014",
                "Q1_2015","Q2_2015","Q2_2015","Q4_2015",
                "data_Q1_2016","data_Q2_2016","data_Q3_2016","data_Q4_2016"]
SUMMARY_DIR = "summary_data/"
# Selection thresholds applied to the summary tables:
#   a model is kept if its 'percent_total' is at least PERCENT_TOTAL_REQ, or if
#   its 'failure_rate' is at least FAILURE_RATE_REQ while its 'drive_count' is
#   at least MIN_NUMBER_REQ.
# NOTE(review): the units of percent_total / failure_rate are defined by the
# summary files, not here -- confirm whether .05 means 5% or 0.05%.
PERCENT_TOTAL_REQ = .05
FAILURE_RATE_REQ = 5
MIN_NUMBER_REQ = 100
# Directory holding the per-model survival_<model>.csv inputs.
INPUT_DIR = "survival_data"
# Directory prefix for the generated kmf_/naf_/avg_ CSV files; it is joined by
# plain string concatenation elsewhere, so the trailing slash matters.
OUTPUT_DIR = "data/survival/"

In [None]:
# Collect the drive models worth analysing from every summary file.
# A model is selected when it makes up a large enough share of the fleet
# (percent_total) or when it fails often enough on a large enough population
# (failure_rate together with drive_count).
target_models = []
for folder in DATA_FOLDERS:
    summary_path = SUMMARY_DIR + folder + '.csv'
    print(summary_path)
    # Only the first 200 rows of each summary file are considered.
    summary_dats = pd.read_csv(summary_path, header=0, nrows=200)
    summary_dats = summary_dats.sort_values(by="percent_total", ascending=False)
    clipped1 = summary_dats[summary_dats['percent_total'] >= PERCENT_TOTAL_REQ]
    clipped2 = summary_dats[(summary_dats['failure_rate'] >= FAILURE_RATE_REQ)
                            & (summary_dats['drive_count'] >= MIN_NUMBER_REQ)]
    # extend() instead of a list comprehension used only for its side effect.
    target_models.extend(clipped1['model'])
    target_models.extend(clipped2['model'])

# np.unique both de-duplicates and sorts the model names.
unique_target_models = np.unique(target_models)
print(unique_target_models)

In [None]:
# Fit a Kaplan-Meier survival curve for every selected drive model and write
# each curve to OUTPUT_DIR as kmf_<model>.csv.
#   survival_data -- one DataFrame per model with rescaled time columns
#   km_data       -- the fitters whose fit succeeded, in loop order
# NOTE(review): a model whose fit fails is left out of km_data, so km_data can
# be shorter than unique_target_models and positions in the two sequences need
# not line up.
survival_data = []
km_data = []
# The time columns are divided by 365*24 (hours per year), so the raw values
# are presumably hours -- TODO confirm against whatever writes survival_*.csv.
hours_per_year = 365. * 24.
for model in unique_target_models:
    model_ns = model.replace(" ", "_")
    pf = INPUT_DIR + "/survival_" + model_ns + '.csv'
    print(pf)
    frame = pd.read_csv(pf, header=0)
    for column in ('runtime_max', 'uptime', 'runtime_min'):
        frame[column] = frame[column] / hours_per_year
    survival_data.append(frame)
    km = lil.KaplanMeierFitter()
    try:
        # runtime_min is handed to the fitter as the entry time of each drive.
        km.fit(durations=frame['runtime_max'], event_observed=frame['failure'],
               entry=frame['runtime_min'])
        km_data.append(km)
        surv = km.survival_function_
        ci = km.confidence_interval_
        # Select the bounds by column label rather than by position: the
        # labels contain "upper"/"lower", whereas the positional order of the
        # two columns is not something this notebook controls.
        upper_name = [c for c in ci.columns if 'upper' in str(c)][0]
        lower_name = [c for c in ci.columns if 'lower' in str(c)][0]
        curve = pd.DataFrame(
            np.column_stack([surv.index, surv.iloc[:, 0].values,
                             ci[upper_name].values, ci[lower_name].values]),
            columns=['time', 'surv', 'surv_upper', 'surv_lower'])
        curve.to_csv(OUTPUT_DIR + "kmf_" + model_ns + ".csv", index=False)
    except Exception as err:
        # Best effort: report the real error and move on to the next model.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        print("Probably too few early truncation times and too many events. Try BreslowFlemingHarringtonFitter?")
        print("Error was: %r" % (err,))
        print("Skipping ...")

In [None]:
def km_bokeh_plot(kms, models):
    """Show Kaplan-Meier survival curves with confidence bands in one bokeh figure.

    Parameters
    ----------
    kms : sequence of fitted estimators
        Each item must expose ``survival_function_`` and
        ``confidence_interval_`` DataFrames sharing the same index.
    models : sequence of str
        Label for each entry of ``kms`` (same order); used for the legend and
        the hover tooltip.

    Returns
    -------
    None
        The figure is displayed with ``show``; nothing is returned.

    Notes
    -----
    Curves are paired with labels positionally and iteration stops at the
    shorter of the two sequences. Colours are reused cyclically when more
    curves than palette entries are supplied.
    """
    surv_plt1 = figure(title="Survival Analysis", tools=['hover,box_zoom,wheel_zoom,save,reset'])
    hover = surv_plt1.select(dict(type=HoverTool))
    hover.tooltips = [
        ("Model ", "@model"),
        ("Time ", "@timeline"),
        ("survival fraction ", "@km_surv"),
        ("upper 95% bound ", "@surv_upper"),
        ("lower 95% bound ", "@surv_lower")
        ]
    # With several curves follow the mouse; with a single curve snap to the
    # vertical line under the cursor.
    hover.mode = 'mouse' if len(kms) > 1 else 'vline'
    colors = ['#1a1334', '#03c383', '#fbbf45', '#ed0345', '#26294a', '#aad962',
              '#01545a', '#ef6a32', '#017351', '#a12a5e', '#710162', '#110141']
    for n, (km, label) in enumerate(zip(kms, models)):
        # Wrap around the palette instead of failing past the 12th curve.
        color = colors[n % len(colors)]
        s = km.survival_function_
        ci = km.confidence_interval_
        time = s.index
        surv = s.iloc[:, 0].values
        # Pick the bounds by column label ("upper"/"lower") rather than by
        # position so the tooltip values cannot be swapped by column order.
        upper_name = [c for c in ci.columns if 'upper' in str(c)][0]
        lower_name = [c for c in ci.columns if 'lower' in str(c)][0]
        surv_upper = ci[upper_name].values
        surv_lower = ci[lower_name].values

        # Closed polygon: along the upper bound, then back along the lower.
        band_x = np.append(time, time[::-1])
        band_y = np.append(surv_upper, surv_lower[::-1])
        source = ColumnDataSource(
            data=dict(
                timeline=list(time),
                km_surv=list(surv),
                model=[label] * len(time),
                surv_lower=list(surv_lower),
                surv_upper=list(surv_upper)
            )
        )
        surv_plt1.patch(band_x, band_y, color=color, fill_alpha=0.2)
        # NOTE(review): `legend=`, `plot_height` and `plot_width` follow the
        # bokeh API this notebook was written against; confirm before
        # upgrading bokeh.
        surv_plt1.line('timeline', 'km_surv', line_width=2, alpha=.8, source=source,
                       legend=label, color=color)
    surv_plt1.xaxis.axis_label = 'Time (Years)'
    surv_plt1.yaxis.axis_label = 'Kaplan-Meier Estimation (survival fraction)'
    surv_plt1.grid.grid_line_alpha = 0
    surv_plt1.ygrid.band_fill_color = "grey"
    surv_plt1.ygrid.band_fill_alpha = 0.2
    surv_plt1.x_range.range_padding = 0
    surv_plt1.legend.location = "bottom_left"
    surv_plt1.plot_height = 500
    surv_plt1.plot_width = 700
    surv_plt1.min_border_left = 80
    surv_plt1.outline_line_width = 1
    surv_plt1.outline_line_alpha = 0.3
    surv_plt1.y_range = Range1d(0.0, 1.02)
    show(surv_plt1)
         
# Disabled alternative: plot every fitted model in a single figure.
#km_bokeh_plot(km_data, unique_target_models)
# Plot only the first six fitted curves, labelled with the first six model names.
# NOTE(review): km_data only holds fitters whose fit succeeded, so if any fit
# was skipped the two slices may not refer to the same models -- verify.
km_bokeh_plot(km_data[0:6], unique_target_models[0:6])

## Hazard rates with Nelson-Aalen
 * The failure rate is the total number of failures within a population, divided by the total time expended by that population, during a particular measurement interval.
 * The hazard function or hazard rate is the failure rate calculated instantaneously.  
 * The cumulative hazard curve is a basic tool: it is the sum of failure rate estimates, so it is much more stable than the point-wise instantaneous estimates.
 * The hazard curve has a catch: its derivation applies a kernel smoother to the differences of the cumulative hazard curve, and thus it has a free parameter (the smoothing bandwidth).

In [None]:
def avg_failrate(t, chaz, model_ns, output_dir=None):
    """Tabulate average failure rates at whole years and save them as CSV.

    For each of the years 1..6 the sample of ``t`` closest to that year is
    looked up. Two quantities are derived from the cumulative hazard there:

    * ``avg_fail_rate``       -- cumulative hazard divided by the year, i.e.
      the average rate from time zero up to that year;
    * ``delta_avg_fail_rate`` -- increase of the cumulative hazard since the
      previous year divided by the year difference (for year 1 it equals
      ``avg_fail_rate``).

    Processing stops at the first year that has no sample within 0.2 of it;
    that year and all later ones keep the sentinel value -1 in both columns.

    Parameters
    ----------
    t : array-like of float
        Time points, in the same unit as the year marks (years).
    chaz : array-like of float
        Cumulative hazard evaluated at ``t`` (same length).
    model_ns : str
        Model name used in the output file name ``avg_<model_ns>.csv``.
    output_dir : str, optional
        Directory to write into. Defaults to the notebook-level
        ``OUTPUT_DIR`` when omitted.

    Returns
    -------
    None
        The table is written to disk; progress is printed.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    t = np.asarray(t, dtype=float)
    chaz = np.asarray(chaz, dtype=float)

    yrs = [1., 2., 3., 4., 5., 6.]
    # -1 marks "no estimate available for this year".
    avg_frate = np.full(len(yrs), -1.)
    delta_frate = np.full(len(yrs), -1.)
    prev_chaz = None
    for i, yr in enumerate(yrs):
        if t.size == 0:
            # Nothing to look up; leave every entry at the sentinel.
            break
        gap = np.abs(t - yr)
        nearest = int(np.argmin(gap))  # first sample closest to this year
        if gap[nearest] > .2:
            break
        here = chaz[nearest]
        avg_frate[i] = here / yr
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("average failure rate at %s  year is  %s" % (yr, avg_frate[i]))
        if i == 0:
            delta_frate[i] = here / yr
        else:
            dfrate = (here - prev_chaz) / (yrs[i] - yrs[i - 1])
            print("average failure between  %s  year and  %s year is  %s"
                  % (yrs[i], yrs[i - 1], dfrate))
            delta_frate[i] = dfrate
        prev_chaz = here

    table = pd.DataFrame(
        {'yr': yrs, 'avg_fail_rate': avg_frate, 'delta_avg_fail_rate': delta_frate},
        columns=['yr', 'avg_fail_rate', 'delta_avg_fail_rate'])
    table.to_csv(os.path.join(output_dir, "avg_" + model_ns + ".csv"), index=False)
    return None


In [None]:
from lifelines import NelsonAalenFitter
# Fit a Nelson-Aalen cumulative hazard for every selected drive model, then
# write two files per model into OUTPUT_DIR:
#   avg_<model>.csv -- yearly average failure rates (via avg_failrate)
#   naf_<model>.csv -- cumulative hazard plus the smoothed hazard at three
#                      smoothing bandwidths (0.25, 0.5 and 1.0)
# `hz` is left holding the fitter of the last model for the cells below.
# The time columns are divided by 365*24 (hours per year), so the raw values
# are presumably hours -- TODO confirm against whatever writes survival_*.csv.
hours_per_year = 365. * 24.
for model in unique_target_models:
    model_ns = model.replace(" ", "_")
    pf = INPUT_DIR + "/survival_" + model_ns + '.csv'
    print(pf)
    frame = pd.read_csv(pf, header=0)
    for column in ('runtime_max', 'uptime', 'runtime_min'):
        frame[column] = frame[column] / hours_per_year
    hz = NelsonAalenFitter()
    try:
        hz.fit(frame['runtime_max'], frame['failure'], entry=frame['runtime_min'])
        cumulative = hz.cumulative_hazard_
        t = cumulative.index
        chaz = cumulative.iloc[:, 0].values

        # One smoothed-hazard column per bandwidth; the bandwidth is the free
        # parameter of the kernel smoother.
        shaz25, shaz50, shaz100 = [
            hz.smoothed_hazard_(bandwidth).iloc[:, 0].values
            for bandwidth in (.25, .5, 1.0)
        ]

        avg_failrate(t.values, chaz, model_ns)

        table = pd.DataFrame(
            np.column_stack([t, chaz, shaz25, shaz50, shaz100]),
            columns=['time', 'cumulative_hazard', 'smoothed_hazard25',
                     'smoothed_hazard50', 'smoothed_hazard100'])
        table.to_csv(OUTPUT_DIR + "naf_" + model_ns + ".csv", index=False)
    except Exception as err:
        # Best effort: say what actually went wrong and continue with the next
        # model. Catching Exception (not a bare except) keeps Ctrl-C working.
        print("Nelson-Aalen export failed for %s: %r" % (model_ns, err))
        print("Skipping ...")

In [None]:
def get_class_members(klass):
    """Return ``dir(klass)`` followed by the members of each base, recursively.

    Objects without a ``__bases__`` attribute (e.g. plain instances) simply
    yield their own ``dir()`` listing. Names are not de-duplicated, so
    inherited members appear once for every class in which they are visible.
    """
    members = list(dir(klass))
    for parent in getattr(klass, '__bases__', ()):
        members.extend(get_class_members(parent))
    return members

#get_class_members(hz)

In [None]:
# Plot the cumulative and the smoothed hazard of `hz`, i.e. of the fitter left
# over from the last iteration of the Nelson-Aalen loop.
# NOTE(review): if that last fit failed, `hz` is unfitted and this cell will
# raise -- verify before relying on the figures.
chaz = hz.cumulative_hazard_
t = chaz.index
chaz = chaz.iloc[:, 0].values
plt.plot(t, chaz)  # cumulative hazard rate
plt.show()

smoothing_bandwidth_time = .2
shaz = hz.smoothed_hazard_(smoothing_bandwidth_time)
t = shaz.index
shaz = shaz.iloc[:, 0].values
plt.plot(t, shaz)  # smoothed hazard rate

# Mark the mean smoothed hazard with a dashed line and label it as a percentage.
mean_shaz = np.mean(shaz)
plt.axhline(mean_shaz, ls='--', lw=1.0, color='black')
# Format with a fixed two decimals; str(100 * round(x, 4)) can expose binary
# float artefacts such as "2.5700000000000003%".
shaz_str = "%.2f%%" % (100 * mean_shaz)
plt.annotate(shaz_str, color='black', xy=(0.01, mean_shaz), xytext=(10, 4), textcoords='offset points')
plt.show()
print("The failure rate for the drive is %s" % shaz_str)
