In [1]:
import pandas as pd
import numpy as np
from make_df import create_df, make_inputs, make_inputs_survival #Custom module here
from naive import naive_fit, get_true_labels #Custom module here
import matplotlib.pyplot as plt

This is a demo of our Hard EM algorithm for survival analysis datasets
with a latent cured subpopulation. We will apply this to a melanoma data set.

STEP 1: DATA PREPARATION

In [2]:
mel = pd.read_csv('melanoma.csv')

In [3]:
mel.head()

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,year,thickness,ulcer
0,1,10,3,1,76,1972,6.76,1
1,2,30,3,1,56,1968,0.65,0
2,3,35,2,1,41,1977,1.34,0
3,4,99,3,0,71,1968,2.9,0
4,5,185,1,1,52,1965,12.08,1


For the status column 1 means died from the disease, 2 means alive at then end of study, and 3 means died from unrelated
causes. Use this to make a censoring column as follows:
    
    1 means not censored (label 1)
    2 means censored (label 0)
    3 means cenosred (label 0)

In [4]:
censoring_indicator = mel['status'] 

In [5]:
censoring_indicator = censoring_indicator.replace(3, 0)

In [6]:
censoring_indicator = censoring_indicator.replace(2, 0)

In [7]:
mel['status'] = censoring_indicator

In [8]:
mel.head(10) #Sanity check

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,year,thickness,ulcer
0,1,10,0,1,76,1972,6.76,1
1,2,30,0,1,56,1968,0.65,0
2,3,35,0,1,41,1977,1.34,0
3,4,99,0,0,71,1968,2.9,0
4,5,185,1,1,52,1965,12.08,1
5,6,204,1,1,28,1971,4.84,1
6,7,210,1,1,77,1972,5.16,1
7,8,232,0,0,60,1974,3.22,1
8,9,232,1,1,49,1968,12.88,1
9,10,279,1,0,68,1971,7.41,1


In [9]:
censored_time = mel[mel.status==0].time

In [10]:
noncensored_time = mel[mel.status==1].time

In [11]:
covariates = ['sex', 'age', 'thickness', 'ulcer'] #Omit time, year, and status

In [12]:
mel_covariates = mel[covariates] 

In [13]:
mel_covariates

Unnamed: 0,sex,age,thickness,ulcer
0,1,76,6.76,1
1,1,56,0.65,0
2,1,41,1.34,0
3,0,71,2.90,0
4,1,52,12.08,1
...,...,...,...,...
200,1,29,7.06,1
201,0,40,6.12,0
202,0,42,0.48,0
203,0,50,2.26,0


In [14]:
#Now standardize the dataframe column wise 



mel_covariates=(mel_covariates-mel_covariates.mean())/mel_covariates.std()

In [15]:
mel_covariates['status'] = censoring_indicator 
#add back the censoring indicator column

In [16]:
mel_covariates.head() #Sanity check

Unnamed: 0,sex,age,thickness,ulcer,status
0,1.259824,1.411768,1.297595,1.127628,0
1,1.259824,0.212131,-0.766989,-0.882491,0
2,1.259824,-0.687597,-0.533837,-0.882491,0
3,-0.78989,1.111859,-0.006709,-0.882491,0
4,1.259824,-0.027796,3.095237,1.127628,1


In [17]:
noncensored_proportion = len(mel_covariates[mel_covariates.status==1].index)/len(mel_covariates.index)

noncensored_proportion 
#This is the fraction of our dataset which is *not* censored.

0.2780487804878049

In [18]:
#Extract censored inputs (status label 0) and noncensored inputs 
#(status label 1) using our own function 

columns = ['status']

censored_inputs = make_inputs(mel_covariates, 0, columns) #extract censored

noncensored_inputs = make_inputs(mel_covariates, 1, columns) 
#extract noncensored

STEP 2: FEED INTO CUSTOM ALGORITHM

Now that the data has been prepared we can fit it into my algorithm.

In [19]:
fit = naive_fit(censored_inputs, noncensored_inputs, 'use_HardEM')  

In [20]:
y_pred = fit['pred'] #predicted label of the corresponding row 
        
y_scores = fit['prob'] #This is the probability of *not* being cured
    

    

In [21]:
censored_mel = mel_covariates[mel_covariates['status']==0]

noncensored_mel = mel_covariates[mel_covariates['status']==1]

In [22]:
censored_mel['time'] = censored_time

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [23]:
noncensored_mel['time'] = noncensored_time

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [24]:
censored_mel.head(10)

Unnamed: 0,sex,age,thickness,ulcer,status,time
0,1.259824,1.411768,1.297595,1.127628,0,10
1,1.259824,0.212131,-0.766989,-0.882491,0,30
2,1.259824,-0.687597,-0.533837,-0.882491,0,35
3,-0.78989,1.111859,-0.006709,-0.882491,0,99
7,-0.78989,0.452058,0.10142,1.127628,0,232
11,-0.78989,0.691986,-0.932562,1.127628,0,355
15,1.259824,1.17184,3.257431,1.127628,0,493
26,-0.78989,2.011586,1.899062,1.127628,0,826
43,1.259824,0.691986,-0.550732,-0.882491,0,1427
45,1.259824,1.231822,-0.550732,-0.882491,0,1499


In [25]:
final = pd.concat([censored_mel, noncensored_mel]) 

final['predicted_cure_label'] = y_pred

final['predicted_prob_cured'] = 1-y_scores

In [26]:


final_cov = final[[col for col in final.columns if col not in ['time', 'predicted_prob_cured']]]

In [27]:
final_cov

Unnamed: 0,sex,age,thickness,ulcer,status,predicted_cure_label
0,1.259824,1.411768,1.297595,1.127628,0,1.0
1,1.259824,0.212131,-0.766989,-0.882491,0,0.0
2,1.259824,-0.687597,-0.533837,-0.882491,0,0.0
3,-0.789890,1.111859,-0.006709,-0.882491,0,0.0
7,-0.789890,0.452058,0.101420,1.127628,0,1.0
...,...,...,...,...,...,...
136,-0.789890,-0.627615,-0.222966,1.127628,1,0.0
142,1.259824,-1.107470,0.209549,1.127628,1,1.0
148,1.259824,1.531731,-0.331095,-0.882491,1,0.0
153,-0.789890,0.092167,0.209549,1.127628,1,1.0


In [28]:
cens_cov = final_cov[final_cov.status==0]

noncens_cov = final_cov[final_cov.status==1]

In [29]:
final_time_prob_cens = (final[final.status==0])[['time', 'predicted_prob_cured']]

final_time_prob_noncens = (final[final.status==1])[['time', 'predicted_prob_cured']]

In [30]:
final_time_prob_noncens

Unnamed: 0,time,predicted_prob_cured
4,185,0.183918
5,204,0.375279
6,210,0.222993
8,232,0.179996
9,279,0.356839
10,295,0.477431
12,386,0.430464
13,426,0.265554
14,469,0.652471
16,529,0.298478


In [31]:
cens_cov = cens_cov.drop(columns=['status'])

cens_cov

Unnamed: 0,sex,age,thickness,ulcer,predicted_cure_label
0,1.259824,1.411768,1.297595,1.127628,1.0
1,1.259824,0.212131,-0.766989,-0.882491,0.0
2,1.259824,-0.687597,-0.533837,-0.882491,0.0
3,-0.789890,1.111859,-0.006709,-0.882491,0.0
7,-0.789890,0.452058,0.101420,1.127628,1.0
...,...,...,...,...,...
200,1.259824,-1.407379,1.398966,1.127628,1.0
201,-0.789890,-0.747579,1.081338,-0.882491,0.0
202,-0.789890,-0.627615,-0.824433,-0.882491,0.0
203,-0.789890,-0.147760,-0.222966,-0.882491,0.0


In [32]:
times = final_time_prob_cens.values.tolist()

In [33]:
covs = cens_cov.values.tolist()

In [34]:
z = list(zip(times, covs))

In [35]:
censored_inputs = [[arr[0][0], arr[0][1], arr[1]] for arr in z]

In [36]:
noncens_cov = noncens_cov.drop(columns=['status'])

ntimes = final_time_prob_noncens.values.tolist()

noncens_covs = noncens_cov.values.tolist()

zn = list(zip(ntimes, noncens_covs))

noncensored_inputs = [[arr[0][0], arr[0][1], arr[1]] for arr in zn]



In [88]:
from survival_func import survival_fit_weights, survival_fit

In [86]:
result = None
while result is None:
    try:
        # connect
        result = survival_fit(censored_inputs, noncensored_inputs, 0.5, 100)
    except:
        pass

    

In [93]:
survival_fit_weights(censored_inputs, noncensored_inputs, 0.5, 100)

array([ 1.98948668e+03,  1.56424644e+00,  1.78665659e-02, -1.33429890e-02,
        8.40011350e-02,  4.53737835e-02,  2.58526918e-02])

In [55]:
%load_ext autoreload



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
%autoreload 2