In [10]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
import random


# get all data (train and test only)

In [11]:
def to_categorical(y, num_classes=None):
    """One-hot encode integer class labels.

    Parameters
    ----------
    y : array-like
        Class labels; cast to int and flattened to one dimension.
    num_classes : int, optional
        Number of columns in the result. When falsy (None or 0) it is
        taken to be ``max(y) + 1``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(labels), num_classes)`` holding a 1 in
        column ``labels[i]`` of row ``i`` and 0 everywhere else.
    """
    labels = np.asarray(y, dtype='int').reshape(-1)
    width = num_classes if num_classes else labels.max() + 1
    one_hot = np.zeros((labels.size, width))
    # Fancy indexing: row i, column labels[i].
    one_hot[np.arange(labels.size), labels] = 1
    return one_hot

def AUC(test_labels,test_prediction):
    """Compute the ROC AUC of class columns 0 and 1 separately.

    Modelled on the scikit-learn ROC example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py

    Parameters
    ----------
    test_labels : 2-D array
        Actual labels, one column per class (column ``i`` is the truth
        vector handed to ``roc_curve`` for class ``i``).
    test_prediction : 2-D array
        Predicted scores with the same column layout.

    Returns
    -------
    tuple
        ``(auc_of_column_0, auc_of_column_1)``, each rounded to 3 decimals.
    """
    def _column_auc(col):
        # ( actual labels, predicted probabilities ) for a single class column
        false_pos, true_pos, _ = roc_curve(test_labels[:, col], test_prediction[:, col])
        return auc(false_pos, true_pos)

    # Exactly two classes are evaluated, in column order.
    area_first, area_second = [_column_auc(col) for col in range(2)]
    return round(area_first, 3), round(area_second, 3)

def manageDataFrames():
    """Load the master patient table and return ``(train, test)`` frames.

    Reads ``master_170228.csv`` and ``rt_chemoRadio.npy`` from the current
    working directory and prints the size of each intermediate frame.

    Steps:
      1. keep rows with all required fields present and no ``patch_failed`` flag;
      2. keep histology groups 1-4 only;
      3. train = rows whose ``dataset`` is lung1 / lung2;
      4. test  = rows whose ``dataset`` is nsclc_rt and whose ``patient`` id is
         listed in ``rt_chemoRadio.npy``.

    Returns
    -------
    (dataFrameTrain, dataFrameTest)
        ``dataFrameTrain`` has a fresh 0..n-1 index; ``dataFrameTest`` keeps
        the index it had before the treatment filter.

    NOTE(review): a later cell in this notebook defines another
    ``manageDataFrames`` that returns three frames; once that cell has run it
    replaces this definition.
    """
    trainList = ["lung1","lung2"]  # , , , ,  ,"oncopanel" , "moffitt","moffittSpore"  ,"oncomap" , ,"lung3"
    testList = ["nsclc_rt"]

    # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
    # read_csv is the supported replacement. from_csv additionally tried to
    # date-parse the index, which is irrelevant here because the index is
    # discarded by reset_index right below.
    dataFrame = pd.read_csv('master_170228.csv', index_col = 0)
    dataFrame = dataFrame [
        ( pd.notnull( dataFrame["pathToData"] ) ) &
        ( pd.notnull( dataFrame["pathToMask"] ) ) &
        ( pd.notnull( dataFrame["stackMin"] ) ) &
        ( pd.isnull( dataFrame["patch_failed"] ) ) &
        ( pd.notnull( dataFrame["surv1yr"] ) )  &
        ( pd.notnull( dataFrame["surv2yr"] ) )  &
        ( pd.notnull( dataFrame["histology_grouped"] ) )  &
        ( pd.notnull( dataFrame["stage"] ) )  &
        ( pd.notnull( dataFrame["age"] ) )
        ]

    dataFrame = dataFrame.reset_index(drop=True)

    ###### FIX ALL

    #1# clean histology - remove smallcell and other
    # histToInclude - only NSCLC
    histToInclude = [1.0,2.0,3.0,4.0]
    # not included - SCLC and other and no data [ 0,5,6,7,8,9 ]
    dataFrame = dataFrame [ dataFrame.histology_grouped.isin(histToInclude) ]
    dataFrame = dataFrame.reset_index(drop=True)
    print ("all patients: " , dataFrame.shape)

    #2# use all stages for now.

    ###### GET TRAINING / TESTING

    dataFrameTrain = dataFrame [ dataFrame["dataset"].isin(trainList) ]
    dataFrameTrain = dataFrameTrain.reset_index(drop=True)
    print (" final - train patients: " , dataFrameTrain.shape)

    dataFrameTest = dataFrame [ dataFrame["dataset"].isin(testList) ]
    dataFrameTest = dataFrameTest.reset_index(drop=True)
    print (" before - test patients : " , dataFrameTest.shape)

    ###### FIX TESTING

    #3# type of treatment - use only radio or chemoRadio - use .npy file
    # The .npy file holds the patient ids to keep; they are compared as strings.
    chemoRadio = np.load("rt_chemoRadio.npy").astype(str)
    dataFrameTest = dataFrameTest [ dataFrameTest["patient"].isin(chemoRadio) ]

    #4# (rt only) use all causes of death

    print ("final test patients: " , dataFrameTest.shape)

    return dataFrameTrain,dataFrameTest

In [12]:
def _shuffled_mask(n_rows, n_true, seed):
    """Return a boolean numpy mask of length ``n_rows`` with ``n_true`` True entries.

    The mask is built as all True entries followed by the False ones and then
    shuffled with the stdlib ``random`` module right after
    ``random.seed(seed)``, so identical arguments always give the same mask.
    Note that this re-seeds the global ``random`` generator.

    Raises
    ------
    ValueError
        If more True entries are requested than there are rows.
    """
    if n_true > n_rows:
        raise ValueError(
            "cannot pick %d rows out of %d for the validation split" % (n_true, n_rows))
    mask = [True] * n_true + [False] * (n_rows - n_true)
    random.seed(seed)
    random.shuffle(mask)
    # dtype=bool keeps `~mask` valid even when the mask is empty.
    return np.array(mask, dtype=bool)


def manageDataFrames():
    """Load the master patient table and return ``(train, validate, test)`` frames.

    Reads ``master_170228.csv`` and ``rt_chemoRadio.npy`` from the current
    working directory and prints the size of each intermediate frame.

    Steps:
      1. keep rows with all required fields present and no ``patch_failed`` flag;
      2. keep histology groups 1-4 and stages 1-3;
      3. train = rows whose ``dataset`` is lung1 / lung2;
      4. the nsclc_rt rows whose ``patient`` id is listed in
         ``rt_chemoRadio.npy`` are split into validation and test: about 30%
         of them (rounded up to an even count) go to validation, taken half
         from ``surv2yr == 0`` and half from ``surv2yr == 1`` with a fixed
         seed; the remaining rows form the test frame.

    Returns
    -------
    (dataFrameTrain, dataFrameValidate, dataFrameTest)
        Three DataFrames, each with a fresh 0..n-1 index. Validation and test
        list the ``surv2yr == 0`` rows first, then the ``surv2yr == 1`` rows.

    Raises
    ------
    ValueError
        If either survival class has fewer rows than half of the validation size.
    """
    trainList = ["lung1","lung2"]  # , , , ,  ,"oncopanel" , "moffitt","moffittSpore"  ,"oncomap" , ,"lung3"
    testList = ["nsclc_rt"] # split to val and test

    # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
    # read_csv is the supported replacement. from_csv additionally tried to
    # date-parse the index, which is irrelevant here because the index is
    # discarded by reset_index right below.
    dataFrame = pd.read_csv('master_170228.csv', index_col = 0)
    dataFrame = dataFrame [
        ( pd.notnull( dataFrame["pathToData"] ) ) &
        ( pd.notnull( dataFrame["pathToMask"] ) ) &
        ( pd.notnull( dataFrame["stackMin"] ) ) &
        ( pd.isnull( dataFrame["patch_failed"] ) ) &
        ( pd.notnull( dataFrame["surv1yr"] ) )  &
        ( pd.notnull( dataFrame["surv2yr"] ) )  &
        ( pd.notnull( dataFrame["histology_grouped"] ) )  &
        ( pd.notnull( dataFrame["stage"] ) )  &
        ( pd.notnull( dataFrame["age"] ) )
        ]

    dataFrame = dataFrame.reset_index(drop=True)

    ###### FIX ALL

    #1# clean histology - remove smallcell and other
    # histToInclude - only NSCLC
    histToInclude = [1.0,2.0,3.0,4.0]
    # not included - SCLC and other and no data [ 0,5,6,7,8,9 ]
    dataFrame = dataFrame [ dataFrame.histology_grouped.isin(histToInclude) ]
    dataFrame = dataFrame.reset_index(drop=True)

    #2# keep only stages 1-3
    stageToInclude = [1.0,2.0,3.0]
    dataFrame = dataFrame [ dataFrame.stage.isin(stageToInclude) ]
    dataFrame = dataFrame.reset_index(drop=True)
    print ("all patients: " , dataFrame.shape)

    ###### GET TRAINING / TESTING

    dataFrameTrain = dataFrame [ dataFrame["dataset"].isin(trainList) ]
    dataFrameTrain = dataFrameTrain.reset_index(drop=True)
    print (" final - train patients: " , dataFrameTrain.shape)

    dataFrameTest = dataFrame [ dataFrame["dataset"].isin(testList) ]
    dataFrameTest = dataFrameTest.reset_index(drop=True)
    print (" before - test patients : " , dataFrameTest.shape)

    ###### FIX val/TESTING

    #3# type of treatment - use only radio or chemoRadio - use .npy file
    # The .npy file holds the patient ids to keep; they are compared as strings.
    chemoRadio = np.load("rt_chemoRadio.npy").astype(str)
    dataFrameTest = dataFrameTest [ dataFrameTest["patient"].isin(chemoRadio) ]

    #4# (rt only) use all causes of death

    # FOR VALIDATION AND TESTING.
    dataFrameTest = dataFrameTest.reset_index(drop=True)
    print ("validate and test patients " , dataFrameTest.shape)

    ## FOR TRAINING (currently disabled): balance 0's and 1's by down-sampling
    # zero = dataFrameTrain [  (dataFrameTrain['surv2yr']== 0.0)  ]
    # print ('zeros ' , zero.shape)
    # one = dataFrameTrain [  (dataFrameTrain['surv2yr']== 1.0)  ]
    # print ('ones ' , one.shape)
    # zero = zero.sample(one.shape[0],random_state=1) # RANDOM
    # dataFrameTrain = pd.concat([zero, one]).reset_index(drop=True)
    # print ('final train size:' , dataFrameTrain.shape)

    # Validation size: 30% of the rows, bumped to the next even number so it
    # can be filled with the same number of 0's and 1's.
    thirty = int(dataFrameTest.shape[0]*0.3)
    if thirty % 2 != 0:
        thirty = thirty + 1
    half = int(thirty/2.0)

    # get 0's and 1's.
    zero = dataFrameTest [  (dataFrameTest['surv2yr']== 0.0)  ]
    one = dataFrameTest [  (dataFrameTest['surv2yr']== 1.0)  ]

    # True -> validation, False -> test. Both masks are drawn with seed 41.
    zero_msk = _shuffled_mask(zero.shape[0], half, 41)
    one_msk = _shuffled_mask(one.shape[0], half, 41)

    # VALIDATE
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    dataFrameValidate = pd.concat([zero[zero_msk], one[one_msk]])
    dataFrameValidate = dataFrameValidate.reset_index(drop=True)
    print ('final validate size:' , dataFrameValidate.shape)

    # TEST
    dataFrameTest = pd.concat([zero[~zero_msk], one[~one_msk]])
    dataFrameTest = dataFrameTest.reset_index(drop=True)
    print ('final test size:' , dataFrameTest.shape)

    return dataFrameTrain,dataFrameValidate,dataFrameTest

In [13]:
# Build the train / validation / test frames. The three-value unpacking matches
# the manageDataFrames definition that also returns a validation split (the
# later of the two definitions in this notebook). `val` is not referenced again
# in the cells visible here.
dataFrameTrain,val,dataFrameTest = manageDataFrames()

('all patients: ', (902, 32))
(' final - train patients: ', (221, 32))
(' before - test patients : ', (450, 32))
('validate and test patients ', (293, 32))
('final validate size:', (88, 32))
('final test size:', (205, 32))


In [5]:
# Display the stage value of every row in the test frame.
# NOTE(review): the recorded output below contains stages 0.0 and 4.0, which the
# stage 1-3 filter in the three-way manageDataFrames would have removed, and this
# cell's execution count (In [5]) is lower than that of the cell building the
# frames (In [13]). The output looks stale - re-run from a fresh kernel to confirm.
dataFrameTest.stage.tolist()

[3.0,
 3.0,
 3.0,
 2.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 4.0,
 2.0,
 4.0,
 2.0,
 3.0,
 3.0,
 3.0,
 4.0,
 2.0,
 3.0,
 3.0,
 4.0,
 0.0,
 3.0,
 4.0,
 3.0,
 3.0,
 4.0,
 4.0,
 4.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 1.0,
 2.0,
 3.0,
 3.0,
 3.0,
 3.0,
 1.0,
 1.0,
 4.0,
 1.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 2.0,
 4.0,
 3.0,
 3.0,
 3.0,
 3.0,
 1.0,
 1.0,
 3.0,
 2.0,
 3.0,
 3.0,
 1.0,
 1.0,
 1.0,
 2.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 4.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 3.0,
 3.0,
 3.0,
 4.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 4.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 1.0,
 1.0,
 1.0,
 3.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0,
 3.0

## Voxel count (used as the volume feature)

In [4]:
# Add a numeric 'volume' column to both frames, derived from the
# string-valued voxelCountList column.
def _first_voxel_count(entry):
    """Return the first comma-separated value of `entry` as an int.

    The first and last character of `entry` are dropped before splitting
    (presumably the enclosing brackets - TODO confirm against the CSV).
    """
    return int(entry[1:-1].split(",")[0])

# train first, then test
for frame in (dataFrameTrain, dataFrameTest):
    voxelList = frame.voxelCountList.tolist()
    frame['volume'] = [_first_voxel_count(entry) for entry in voxelList]

In [5]:
# Choose the frame that the voxel-count ROC check in the next cell is computed on.
dataset = dataFrameTest

voxelList = dataset.voxelCountList.tolist()
# Same parsing as for the 'volume' column: drop the first and last character of
# each entry and keep the first comma-separated value as an int.
voxelListClean = np.array( [ int(x[1:-1].split(",")[0]) if "," in x[1:-1] else int(x[1:-1]) for x in voxelList ] )
y =  np.array( dataset.surv2yr.tolist() )
# Alternative label coding (disabled): maps 0 -> 2 and everything else -> 1.
# y = np.array( [ 2 if x==0 else 1 for x in dataset.surv2yr.tolist() ] )
print voxelListClean.shape , y.shape
# Label coding in use:                      0 = did not survive, 1 = survived.
# Label coding with the disabled line above: 1 = survived, 2 = did not survive.

(240L,) (240L,)


In [11]:
# ROC AUC obtained when the parsed voxel count itself is used as the score.
# pos_label=0 tells roc_curve to treat label 0 as the positive class.
fpr, tpr, thresholds = metrics.roc_curve(y, voxelListClean, pos_label=0)
metrics.auc(fpr, tpr)

0.53036620109790844

In [37]:
# prepare training
xTrainRF = dataFrameTrain[['stage','age','histology_grouped','volume']]
yTrainRF = np.array( dataFrameTrain.surv2yr.tolist() , 'int64' )
print xTrainRF.shape , yTrainRF.shape

# prepare testing
xTestRF = dataFrameTest[['stage','age','histology_grouped','volume']]
yTestRF = np.array( dataFrameTest.surv2yr.tolist() , 'int64' )
print xTestRF.shape , yTestRF.shape

clf = RandomForestClassifier(n_jobs= 200, criterion = 'entropy' )
clf.fit(xTrainRF, yTrainRF)

preds = clf.predict_proba( xTestRF )

yTestRFCat = to_categorical(yTestRF, 2)
auc1,auc2 = AUC(yTestRFCat,preds)
print auc1,auc2 


(315, 4) (315L,)
(240, 4) (240L,)
0.526 0.526
