In [1]:
import pandas as pd
import numpy as np
from make_df import create_df, make_inputs
from naive import naive_fit, get_true_labels
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


In this notebook we test our custom 'Hard EM' algorithm against the following simpler alternatives for populating the missing cure labels:

(I) Guessing that a censored individual is cured with a probability of 50%.

(II) Creating one cluster from the noncensord individuals and two clusters from the cesnored individuals, then assigning cure
labels to the censored individuals by comparing which of the two censored clusters is closer to the noncensored one, giving it the label '1' (not cured), and the furhter one the label '0'.

This is achieved by creating test data sets in which we know exactly who is cured and who is censored, then removing the cure labels for the censored individuals, and feeding the censored and noncensored inputs into our algorithm. The latter is achieved
using the 'naive_fit' function with the 'use_HardEM' option.

In [10]:
#To avoid a dataset with only one label we append the following extra columns.

extra0 = pd.DataFrame([[0.1,0.1,0.1,0,1,0]],columns = ['x1', 'x2', 'x3', 'cure_label', 'int', 'censoring_indicator'])

extra1 = pd.DataFrame([[0.1,0.1,0.1,1,1,1]],columns = ['x1', 'x2', 'x3', 'cure_label', 'int', 'censoring_indicator'])

In [11]:
extra0 

Unnamed: 0,x1,x2,x3,cure_label,int,censoring_indicator
0,0.1,0.1,0.1,0,1,0


In [12]:
extra1

Unnamed: 0,x1,x2,x3,cure_label,int,censoring_indicator
0,0.1,0.1,0.1,1,1,1


In [13]:
extra = pd.concat([extra0, extra1])

In [16]:


r_weights = np.random.uniform(-0.5,0.5,(10,4)) #This will be used for the model parameter weights which we use to compute
                                                #the cure lables for our test sets via sigmoid function.
scores = []
covariates = ['x1', 'x2', 'x3'] #We choose three covariates for our datasets for illustration.
dist = [[0, 1], [0, 1], [0, 1]] #We are drawing covariates from a normal distribution with mean 0 and std 1.
cols = ['censoring_indicator', 'cure_label']

for test_model_weights in r_weights:
    
    foo = create_df(covariates, dist, 150, test_model_weights, 0.5) #We have 150 rows in our table and 0.5 is the probability
                                                                    #of being censored given one is not cured we use to 
                                                                    #assign who is censored and who isn't.
    foo = pd.concat([foo, extra])
    
    censored_inputs = make_inputs(foo, 0, cols)

    noncensored_inputs = make_inputs(foo, 1, cols)
    
    fit = naive_fit(censored_inputs, noncensored_inputs, 'use_HardEM')    
    
    y_pred = fit['pred']
    
    y_true = get_true_labels(foo, ['censoring_indicator', 'cure_label'])
    
    y_scores = fit['prob']
    
    hard_acc = accuracy_score(y_true, y_pred) #'hard' is the score our algorithm produces. In this case, accuracy.
    
    hard_auc = roc_auc_score(y_true, y_scores)
    
    fit_naive = naive_fit(censored_inputs, noncensored_inputs, 'use_clustering') #See the opening description.
    
    y_pred_naive = fit_naive['pred']
    
    y_true_naive = get_true_labels(foo, ['censoring_indicator', 'cure_label'])
    
    y_scores_naive = fit_naive['prob']
    
    naive_acc = accuracy_score(y_true_naive, y_pred_naive) #The accuracy of the result produced by naive way of populating
                                                           #the missing lables.
    naive_auc = roc_auc_score(y_true_naive, y_scores_naive)
    

    scores.append([hard_auc, naive_auc, hard_acc, naive_acc ])
    
new_df = pd.DataFrame(columns=['HardEM_auc', 'NaiveEM_auc', 'HardEM_acc', 'NaiveEM_acc'], data=scores)

In [17]:
#A dataframe showing the results of the experiment
new_df

Unnamed: 0,HardEM_auc,NaiveEM_auc,HardEM_acc,NaiveEM_acc
0,0.996623,0.828312,0.782895,0.736842
1,0.986255,0.792749,0.723684,0.776316
2,0.92602,0.991969,0.796053,0.960526
3,0.978946,0.737579,0.888158,0.684211
4,0.853729,0.839373,0.480263,0.730263
5,0.984857,0.751703,0.894737,0.651316
6,0.96307,0.815348,0.756579,0.684211
7,0.997486,0.646697,0.756579,0.592105
8,0.97474,0.752344,0.828947,0.684211
9,0.980064,0.997827,0.703947,0.960526


In [18]:
#How many times our accuracy was higher.

len(new_df[new_df['HardEM_acc']-new_df['NaiveEM_acc']>0].index)

6

In [19]:
#How many times our AUC score was higher.

len(new_df[new_df['HardEM_auc']-new_df['NaiveEM_auc']>0].index)

8

In [40]:
r_weights = np.random.uniform(-0.5,0.5,(10,4))
scores = []
covariates = ['x1', 'x2', 'x3']
dist = [[0, 1], [0, 1], [0, 1]]
cols = ['censoring_indicator', 'cure_label']

for test_model_weights in r_weights: 
    
    for p in probs:
    
        foo = create_df(covariates, dist, 150, test_model_weights, p)
    
        foo = pd.concat([foo, extra])
    
        censored_inputs = make_inputs(foo, 0, cols)

        noncensored_inputs = make_inputs(foo, 1, cols)]
    
        fit = naive_fit(censored_inputs, noncensored_inputs, 'use_HardEM')    
    
        y_pred = fit['pred']
    
        y_true = get_true_labels(foo, ['censoring_indicator', 'cure_label'])
    
        y_scores = fit['prob']
    
        hard_acc = accuracy_score(y_true, y_pred)
    
        hard_auc = roc_auc_score(y_true, y_scores)
    
        fit_naive = naive_fit(censored_inputs, noncensored_inputs, 'fifty_fifty')
    
        y_pred_naive = fit_naive['pred']
    
        y_true_naive = get_true_labels(foo, ['censoring_indicator', 'cure_label'])
    
        y_scores_naive = fit_naive['prob']
    
        naive_acc = accuracy_score(y_true_naive, y_pred_naive)
    
        naive_auc = roc_auc_score(y_true_naive, y_scores_naive)
    

        scores.append([hard_auc, naive_auc, hard_acc, naive_acc ])
    
new_df = pd.DataFrame(columns=['HardEM_auc', 'NaiveEM_auc', 'HardEM_acc', 'NaiveEM_acc'], data=scores)

In [41]:
new_df

Unnamed: 0,HardEM_auc,NaiveEM_auc,HardEM_acc,NaiveEM_acc
0,0.991635,0.972019,0.927632,0.500000
1,0.994697,0.998348,0.967105,0.657895
2,0.991859,0.945641,0.907895,0.651316
3,0.998512,0.981005,0.848684,0.605263
4,0.923519,0.982051,0.828947,0.480263
...,...,...,...,...
95,0.659994,0.777506,0.730263,0.828947
96,0.505040,0.481423,0.789474,0.802632
97,0.575974,0.806140,0.315789,0.809211
98,0.688743,0.928716,0.269737,0.888158


In [42]:
len(new_df[new_df['HardEM_auc']-new_df['NaiveEM_auc']>0].index)

62

In [43]:
len(new_df[new_df['HardEM_acc']-new_df['NaiveEM_acc']>0].index) #How many times our accuracy was higher.

70

In [44]:
how_auc = new_df[new_df['HardEM_auc']-new_df['NaiveEM_auc']>0] #How many times our AUC score was higher.

In [47]:
(how_auc['HardEM_auc']-how_auc['NaiveEM_auc']).mean() #Mean of outperformance for AUC score.

0.1151441516857973

In [48]:
(how_auc['HardEM_acc']-how_auc['NaiveEM_acc']).mean() #Mean of outperformance for accuracy.

0.14526740237691008