In [1]:
import pandas as pd
import numpy as np
from make_df import create_df, make_inputs, make_inputs_survival #Custom module here
from naive import naive_fit, get_true_labels #Custom module here
from survival_func import survival_fit #Custom module here


This is a demo of our Hard EM algorithm for survival-analysis datasets
with a latent cured subpopulation. We apply it to a melanoma dataset and output not only the probability of being cured but also the weights of the overall population survival function.

STEP 1: ADD CENSORING INDICATOR COLUMN TO DATASET

In [2]:
# Load the melanoma dataset; the path is relative to the notebook's working directory.
mel = pd.read_csv('melanoma.csv')

In [3]:
# Preview the first rows to see which columns and status codes the raw data contains.
mel.head()

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,year,thickness,ulcer
0,1,10,3,1,76,1972,6.76,1
1,2,30,3,1,56,1968,0.65,0
2,3,35,2,1,41,1977,1.34,0
3,4,99,3,0,71,1968,2.9,0
4,5,185,1,1,52,1965,12.08,1


In the status column, 1 means the patient died from the disease, 2 means the patient was alive at the end of the study, and 3 means the patient died from unrelated
causes. We use this to build a censoring indicator as follows:
    
    1 means not censored (label 1)
    2 means censored (label 0)
    3 means censored (label 0)

In [4]:
# Collapse the three-valued status code into a binary event indicator in a single step:
#   1 (died from the disease)      -> stays 1 (event observed, not censored)
#   2 (alive at the end of study)  -> 0 (censored)
#   3 (died from unrelated causes) -> 0 (censored)
# Doing the recode in one statement means the indicator can never be left half-recoded,
# and `replace` leaves values that are already 0/1 untouched, so re-running the cell is harmless.
censoring_indicator = mel['status'].replace({2: 0, 3: 0})

# Overwrite the raw status codes with the binary indicator used by the rest of the notebook.
mel['status'] = censoring_indicator

In [8]:
# Sanity check: the status column should now contain only 0 (censored) and 1 (event observed).
mel.head(10)

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,year,thickness,ulcer
0,1,10,0,1,76,1972,6.76,1
1,2,30,0,1,56,1968,0.65,0
2,3,35,0,1,41,1977,1.34,0
3,4,99,0,0,71,1968,2.9,0
4,5,185,1,1,52,1965,12.08,1
5,6,204,1,1,28,1971,4.84,1
6,7,210,1,1,77,1972,5.16,1
7,8,232,0,0,60,1974,3.22,1
8,9,232,1,1,49,1968,12.88,1
9,10,279,1,0,68,1971,7.41,1


STEP 2: PREPARE DATA TO BE FED INTO CUSTOM ALGORITHM TO GET WEIGHTS FOR OVERALL SURVIVAL FUNCTION

In [9]:
def _pack_inputs(group):
    """Convert one status group into ``[time, predicted_prob_cured, covariates]`` rows.

    ``group`` must contain the columns ``time``, ``predicted_prob_cured`` and
    ``status``; every remaining column is emitted, in column order, as the
    covariate list of the row.
    """
    time_and_prob = group[['time', 'predicted_prob_cured']].values.tolist()
    covariate_rows = group.drop(
        columns=['time', 'predicted_prob_cured', 'status']
    ).values.tolist()
    return [[pair[0], pair[1], covs] for pair, covs in zip(time_and_prob, covariate_rows)]


def prepare_data(mel):
    """Build the censored / non-censored input lists for the survival fit.

    Parameters
    ----------
    mel : pandas.DataFrame
        Must contain the columns ``time``, ``status`` (0 = censored,
        1 = not censored), ``sex``, ``age``, ``thickness`` and ``ulcer``.
        Rows whose status is neither 0 nor 1 are dropped. The frame is
        not modified.

    Returns
    -------
    (censored_inputs, noncensored_inputs) : tuple of lists
        Each element is ``[time, predicted_prob_cured, covariates]`` where
        ``covariates`` is the list of standardized ``sex``, ``age``,
        ``thickness``, ``ulcer`` followed by ``predicted_cure_label``.
    """
    covariates = ['sex', 'age', 'thickness', 'ulcer']  # Omit time, year, and status
    mel_covariates = mel[covariates]
    # Standardize each covariate column (zero mean, unit sample standard deviation).
    mel_covariates = (mel_covariates - mel_covariates.mean()) / mel_covariates.std()
    # Take the labels from the frame that was passed in rather than from a
    # notebook-level global, so the function works on any frame and on a fresh kernel.
    mel_covariates = mel_covariates.assign(status=mel['status'])

    # Extract censored inputs (status label 0) and noncensored inputs
    # (status label 1) using our own function.
    columns = ['status']
    censored_inputs = make_inputs(mel_covariates, 0, columns)
    noncensored_inputs = make_inputs(mel_covariates, 1, columns)

    fit = naive_fit(censored_inputs, noncensored_inputs, 'use_HardEM')
    y_pred = fit['pred']    # predicted label of the corresponding row
    y_scores = fit['prob']  # This is the probability of *not* being cured

    # Attach the follow-up time on the full frame before filtering; assigning a
    # column onto a filtered slice is what triggers pandas' SettingWithCopyWarning.
    with_time = mel_covariates.assign(time=mel['time'])
    final = pd.concat([
        with_time[with_time['status'] == 0],
        with_time[with_time['status'] == 1],
    ])

    # NOTE(review): this assumes naive_fit returns its predictions ordered
    # censored rows first, then non-censored rows, matching the concat order
    # above — confirm against naive.naive_fit.
    final['predicted_cure_label'] = y_pred
    final['predicted_prob_cured'] = 1 - y_scores

    # NOTE(review): predicted_cure_label is kept inside the covariate vector,
    # exactly as before; confirm that survival_fit is meant to receive it.
    censored_inputs = _pack_inputs(final[final['status'] == 0])
    noncensored_inputs = _pack_inputs(final[final['status'] == 1])

    return censored_inputs, noncensored_inputs
        

    

In [10]:
# Build the per-observation inputs ([time, predicted cure probability, covariates]) for each status group.
censored_inputs, noncensored_inputs = prepare_data(mel)

# Run the survival fit; as the last expression of the cell, its return value is displayed below.
# NOTE(review): 0.5 and 100 are passed positionally and their meaning is not visible in this
# notebook — confirm against survival_func.survival_fit and consider naming them.
survival_fit(censored_inputs, noncensored_inputs, 0.5, 100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return f_raw(*args, **kwargs)
  defvjp(anp.log,    lambda ans, x : lambda g: g / x)
  lambda ans, x, y : unbroadcast_f(y, lambda g: - g * x / y**2))
  lambda ans, x, y : unbroadcast_f(y, lambda g: x * g))
  defvjp(anp.multiply,    lambda ans, x, y : unbroadcast_f(x, lambda g: y * g),
  defvjp(anp.exp,    lambda ans, x : lambda g: ans * g)
  return f_raw(*args, **kwargs)
  defvjp(anp.log,    lambda ans, x : lambda g: g / x)
  def _add(self, x, y):        return x + y


array([ 1.84262720e+03,  1.63409897e+00,  3.14889243e-02, -1.14954340e-02,
        8.87315696e-02,  5.44767435e-02,  4.27932189e-02])