# HD_Survival_Analysis
Generates survival fits with the lifelines package and saves those fits as data files.

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import lifelines as lil
from lifelines import KaplanMeierFitter
import os
import sys
from bokeh.plotting import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
from bokeh.models import Range1d
# Configure bokeh so that later show() calls render inline in the notebook.
output_notebook()

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration
# ---------------------------------------------------------------------------

# Flag that is not referenced by any of the cells visible in this notebook.
TEST = True

# Locations, all relative to the working directory.
DATA_DIR = "data/"
SUMMARY_DIR = "summary_data/"   # holds one "<folder>.csv" summary per DATA_FOLDERS entry
INPUT_DIR = "survival_data"     # per-model "survival_<model>.csv" inputs
OUTPUT_DIR = "web/data"         # destination of the exported fit CSVs

# Reporting periods whose summary tables are scanned for candidate models.
DATA_FOLDERS = [
    "2014",
    "2015",
    "data_Q1_2016",
    "data_Q2_2016",
    "data_Q3_2016",
    "data_Q4_2016",
]

# Selection thresholds compared (with >=) against the summary table columns.
PERCENT_TOTAL_REQ = 0.05    # lower limit on 'percent_total'
FAILURE_RATE_REQ = 5        # lower limit on 'failure_rate' ...
MIN_NUMBER_REQ = 100        # ... which must coincide with this 'drive_count'

In [None]:
# Build the list of drive models to analyse.
#
# For every summary table a model is selected when it either
#   * reaches PERCENT_TOTAL_REQ in the 'percent_total' column, or
#   * reaches FAILURE_RATE_REQ in 'failure_rate' while also reaching
#     MIN_NUMBER_REQ in 'drive_count'.
target_models = []
for df in DATA_FOLDERS:
    summary_path = SUMMARY_DIR + df + '.csv'
    print(summary_path)
    # Only the first 200 data rows of each summary file are considered.
    summary_dats = pd.read_csv(summary_path, header=0, nrows=200)
    summary_dats = summary_dats.sort_values(by="percent_total", ascending=False)
    clipped1 = summary_dats[summary_dats['percent_total'] >= PERCENT_TOTAL_REQ]
    clipped2 = summary_dats[(summary_dats['failure_rate'] >= FAILURE_RATE_REQ)
                            & (summary_dats['drive_count'] >= MIN_NUMBER_REQ)]
    # extend() rather than a list comprehension evaluated only for its
    # append() side effects (which also built a throwaway list of None).
    target_models.extend(clipped1['model'])
    target_models.extend(clipped2['model'])

# A model may qualify in several periods or under both criteria; np.unique
# drops the duplicates and returns the names in sorted order.
unique_target_models = np.unique(target_models)
print(unique_target_models)

In [None]:
# Fit a Kaplan-Meier survival curve for every selected drive model and write
# each fitted curve to OUTPUT_DIR as "kmfit_alt<model>.csv" (columns: time,
# surv).  Confidence bounds are not exported.
#
# Names left in the notebook namespace:
#   survival_data -- one DataFrame per model, runtime columns rescaled
#   km_data       -- fitted KaplanMeierFitter objects (successful fits only)
#   km_models     -- model names aligned index-for-index with km_data, so a
#                    skipped model cannot shift labels onto the wrong fitter

# Divisor applied to the runtime columns; presumably they are recorded in
# hours, which makes the rescaled values years -- TODO confirm the input unit.
HOURS_PER_YEAR = 365. * 24.

survival_data = []
km_data = []
km_models = []
for model in unique_target_models:
    model_ns = model.replace(" ", "_")
    pf = INPUT_DIR + "/survival_" + model_ns + '.csv'
    print(pf)
    model_df = pd.read_csv(pf, header=0)
    for col in ('runtime_max', 'uptime', 'runtime_min'):
        model_df[col] = model_df[col] / HOURS_PER_YEAR
    survival_data.append(model_df)

    km = lil.KaplanMeierFitter()
    # Only the fit itself is guarded: a failure while building or writing the
    # CSV is not a statistical problem and should surface as-is instead of
    # being reported with the message below.
    try:
        km.fit(durations=model_df['runtime_max'],
               event_observed=model_df['failure'],
               entry=model_df['runtime_min'])
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C still interrupts the loop;
        # the real error is printed next to the original hint.
        print("Kaplan-Meier fit failed for %s: %s" % (model, err))
        print("Probably too few early truncation times and too many events. Try BreslowFlemingHarringtonFitter?")
        print("Skipping ...")
        continue

    km_data.append(km)
    km_models.append(model)

    surv_fn = km.survival_function_
    km_curve = pd.DataFrame(
        {'time': surv_fn.index.values, 'surv': surv_fn.iloc[:, 0].values},
        columns=['time', 'surv'],
    )
    km_curve.to_csv(OUTPUT_DIR + "/kmfit_alt" + model_ns + ".csv", index=False)

In [None]:
def km_bokeh_plot(kms, models):
    """Draw Kaplan-Meier survival curves with confidence bands using bokeh.

    For each fitter one line (the survival estimate) and one translucent
    patch (the confidence band) are added to a single figure, which is then
    displayed with ``show``.

    Args:
        kms: Sequence of fitted estimators exposing ``survival_function_``
            and ``confidence_interval_`` DataFrames indexed by time.
        models: Sequence of model names; ``models[n]`` labels ``kms[n]`` in
            the legend and the hover tooltip, so it must be in the same
            order as ``kms`` and at least as long.

    Returns:
        None. The figure is shown as a side effect.
    """
    surv_plt1 = figure(title="Survival Analysis", tools=['hover,box_zoom,wheel_zoom,save,reset'])
    hover = surv_plt1.select(dict(type=HoverTool))
    hover.tooltips = [
        ("Model ", "@model"),
        ("Time ", "@timeline"),
        ("survival fraction ", "@km_surv"),
        ("upper 95% bound ", "@surv_upper"),
        ("lower 95% bound ", "@surv_lower")
        ]
    # Several curves: inspect the point under the cursor; a single curve:
    # follow the cursor's x position along that curve.
    hover.mode = 'mouse' if len(kms) > 1 else 'vline'
    colors = ['#1a1334', '#03c383', '#fbbf45', '#ed0345', '#26294a', '#aad962', '#01545a', '#ef6a32', '#017351',
              '#a12a5e', '#710162', '#110141']
    for n, km in enumerate(kms):
        # Wrap around the palette instead of raising IndexError when more
        # curves than colours are requested.
        color = colors[n % len(colors)]
        model_name = models[n]

        s = km.survival_function_
        ci = km.confidence_interval_
        time = s.index
        surv = s.iloc[:, 0].values
        # Do not rely on column position: current lifelines releases list
        # the lower bound first, which made positional access label the
        # bounds backwards in the tooltips.  The row-wise min/max is correct
        # whichever order the two columns come in.
        surv_lower = ci.min(axis=1).values
        surv_upper = ci.max(axis=1).values

        # Closed polygon: along the upper bound, then back along the lower.
        band_x = np.append(time, time[::-1])
        band_y = np.append(surv_upper, surv_lower[::-1])
        source = ColumnDataSource(
            data=dict(
                timeline=list(time),
                km_surv=list(surv),
                model=[model_name] * len(time),
                surv_lower=list(surv_lower),
                surv_upper=list(surv_upper)
            )
        )
        surv_plt1.patch(band_x, band_y, color=color, fill_alpha=0.2)
        surv_plt1.line('timeline', 'km_surv', line_width=2, alpha=.8, source=source, legend=model_name, color=color)
    surv_plt1.xaxis.axis_label = 'Time (Years)'
    surv_plt1.yaxis.axis_label = 'Kaplan-Meier Estimation (survival fraction)'
    surv_plt1.grid.grid_line_alpha = 0
    surv_plt1.ygrid.band_fill_color = "grey"
    surv_plt1.ygrid.band_fill_alpha = 0.2
    surv_plt1.x_range.range_padding = 0
    surv_plt1.legend.location = "bottom_left"
    surv_plt1.plot_height = 500
    surv_plt1.plot_width = 700
    surv_plt1.min_border_left = 80
    surv_plt1.outline_line_width = 1
    surv_plt1.outline_line_alpha = 0.3
    surv_plt1.y_range = Range1d(0.0, 1.02)
    show(surv_plt1)
         
# Show the first six fitted curves; to plot every fit, call
# km_bokeh_plot(km_data, unique_target_models) instead.
# NOTE(review): km_data only receives fitters whose fit succeeded, while
# unique_target_models lists every model.  If any model was skipped, the two
# sequences no longer line up index-for-index and curves are labelled with
# the wrong model name -- confirm before trusting the legend.
km_bokeh_plot(km_data[:6], unique_target_models[:6])

## Hazard rates with Nelson-Aalen
 * The failure rate is the total number of failures within a population, divided by the total time expended by that population, during a particular measurement interval.
 * The hazard function or hazard rate is the failure rate calculated instantaneously.  
 * The cumulative hazard curve is a basic tool: it is the sum of failure rate estimates, so it is much more stable than the point-wise instantaneous estimates.
 * The hazard curve has a catch: it is derived by applying a kernel smoother to the differences of the cumulative hazard curve, and thus it has a free parameter (the smoothing bandwidth).

In [None]:
# Nelson-Aalen analysis of one drive model: plot the cumulative hazard, then
# the kernel-smoothed hazard with its mean drawn as a dashed reference line.
#
# NOTE(review): `survival`, `model_map` and `plt` are not defined in any cell
# visible in this notebook.  Presumably `plt` is matplotlib.pyplot and the
# other two come from a cell that is no longer present -- confirm before
# running this cell.
from lifelines import NelsonAalenFitter
naf = NelsonAalenFitter()
seagate = survival.loc[survival['model'] == model_map['ST4000DM000']]
naf.fit(seagate['runtime_max'], seagate['failure'], entry=seagate['runtime_min'], label='Seagate ST4000DM000')
fig, axes = plt.subplots(nrows=1, ncols=1, squeeze=False, sharex=True, sharey=True)
naf.plot(ax=axes[0,0],title="Cumulative Hazard Rate")
plt.show()

# The smoothed hazard depends on a free parameter, the kernel bandwidth; the
# same value is used for the mean below and for the plotted curve.
haz = naf.smoothed_hazard_
smoothing_bandwidth_time=1.2
q = haz(smoothing_bandwidth_time)
mean_haz = np.mean( q[q.columns[0]])

ax = naf.plot_hazard(bandwidth=smoothing_bandwidth_time, title = "Hazard Rate")
ax.axhline(mean_haz , ls='--', lw=1.0, color='black')
# Fixed-precision formatting: str(100 * np.round(x, 4)) can expose binary
# floating-point noise (0.07 would print as "7.000000000000001%").
haz_str = "%.2f%%" % (100 * mean_haz)
ax.annotate(haz_str, color='black', xy=(0.01,mean_haz), xytext=(10,4), textcoords='offset points')
plt.show()
print("The failure rate for these seagate drives is %s" % haz_str)