In [126]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc


# get all data (train and test only)

In [164]:
def to_categorical(y, num_classes=None):
    """One-hot encode integer class labels.

    Parameters
    ----------
    y : array-like of int
        Class labels; any shape, flattened before encoding.
    num_classes : int, optional
        Width of the encoding. When omitted (or falsy) it is taken as
        ``max(y) + 1``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(y), num_classes)`` holding a single 1
        per row, in the column given by that row's label.
    """
    labels = np.array(y, dtype='int').ravel()
    width = num_classes if num_classes else np.max(labels) + 1
    # Row k of the identity matrix is the one-hot vector for class k, so
    # indexing it with the label vector yields one encoded row per sample.
    return np.eye(width)[labels]

def AUC(test_labels,test_prediction):
    """Return the ROC AUC of class 0 and of class 1, each rounded to 3 decimals.

    Both arguments are indexed as ``[:, i]`` for ``i`` in ``{0, 1}``:
    ``test_labels`` supplies the actual per-class labels and
    ``test_prediction`` the predicted per-class scores.

    Based on
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py
    """
    def _column_auc(column):
        # ( actual labels, predicted probabilities )
        false_pos, true_pos, _ = roc_curve(test_labels[:, column], test_prediction[:, column])
        return auc(false_pos, true_pos)

    per_class = [_column_auc(column) for column in range(2)]
    return round(per_class[0],3) , round(per_class[1],3)

def manageDataFrames():
    """Load the master patient table and split it into train and test cohorts.

    Reads ``master_170228.csv`` and keeps rows where every required field is
    present and ``patch_failed`` is empty, then keeps histology groups 1-4
    only. Rows are split on the ``dataset`` column; the test cohort is
    further restricted to the patients listed in ``rt_chemoRadio.npy``.
    No validation split is produced here. Cohort sizes are printed along
    the way.

    Returns
    -------
    (dataFrameTrain, dataFrameTest) : tuple of pandas.DataFrame
        Both frames carry a fresh ``0..n-1`` index.
    """
    # Cohorts currently left out of training: oncopanel, moffitt,
    # moffittSpore, oncomap, lung3.
    trainList = ["lung1","lung2"]
    testList = ["nsclc_rt"] # split to val and test

    # DataFrame.from_csv no longer exists in current pandas; read_csv is the
    # replacement. The file's index column is discarded by the reset_index
    # below, so from_csv's old date-parsing of the index is irrelevant here.
    dataFrame = pd.read_csv('master_170228.csv', index_col = 0)

    # Rows must have all of these populated ...
    requiredColumns = [
        "pathToData", "pathToMask", "stackMin",
        "surv1yr", "surv2yr",
        "histology_grouped", "stage", "age",
    ]
    hasRequired = dataFrame[requiredColumns].notnull().all(axis=1)
    # ... and must not carry a patch_failed entry.
    patchOk = dataFrame["patch_failed"].isnull()
    dataFrame = dataFrame[hasRequired & patchOk].reset_index(drop=True)

    ###### FIX ALL

    #1# clean histology - remove smallcell and other
    # histToInclude - only NSCLC
    histToInclude = [1.0,2.0,3.0,4.0]
    # not included - SCLC and other and no data [ 0,5,6,7,8,9 ]
    dataFrame = dataFrame [ dataFrame.histology_grouped.isin(histToInclude) ]
    dataFrame = dataFrame.reset_index(drop=True)
    print ("all patients: " , dataFrame.shape)

    #2# use all stages for now.

    ###### GET TRAINING / TESTING

    dataFrameTrain = dataFrame [ dataFrame["dataset"].isin(trainList) ]
    dataFrameTrain = dataFrameTrain.reset_index(drop=True)
    print (" final - train patients: " , dataFrameTrain.shape)

    dataFrameTest = dataFrame [ dataFrame["dataset"].isin(testList) ]
    dataFrameTest = dataFrameTest.reset_index(drop=True)
    print (" before - test patients : " , dataFrameTest.shape)

    ###### FIX TESTING

    #3# type of treatment - use only radio or chemoRadio - use .npy file

    chemoRadio = np.load("rt_chemoRadio.npy").astype(str)
    dataFrameTest = dataFrameTest [ dataFrameTest["patient"].isin(chemoRadio) ]
    # Re-index like every other filter above, so the returned frame is an
    # independent object with a contiguous index rather than a slice; this
    # also keeps later column assignment from raising SettingWithCopyWarning.
    dataFrameTest = dataFrameTest.reset_index(drop=True)

    #4# (rt only) use all causes of death

    print ("final test patients: " , dataFrameTest.shape)

    return dataFrameTrain,dataFrameTest

In [165]:
# Build the train and test cohorts; manageDataFrames() prints the cohort
# sizes at each filtering step (shown in the output below).
dataFrameTrain,dataFrameTest = manageDataFrames()

('all patients: ', (1070, 32))
(' final - train patients: ', (315, 32))
(' before - test patients : ', (485, 32))
('final test patients: ', (240, 32))


## voxel count i.e. volume

In [166]:
# add columns for both
def firstVoxelCount(voxelCountText):
    """Return the first integer of a delimited, comma-separated string.

    The first and last characters are dropped (e.g. the brackets of
    "[1234, 56]") and the text before the first comma is converted to int.
    split() returns the whole string when there is no comma, so a
    single-entry value such as "[1234]" needs no special case.
    """
    return int(voxelCountText[1:-1].split(",")[0])

# assign() returns a new frame, so the column is never written into a
# frame that might still be a slice of the master table.
# train
voxelList = dataFrameTrain.voxelCountList.tolist()
dataFrameTrain = dataFrameTrain.assign(volume=[ firstVoxelCount(x) for x in voxelList ])
# test
voxelList = dataFrameTest.voxelCountList.tolist()
dataFrameTest = dataFrameTest.assign(volume=[ firstVoxelCount(x) for x in voxelList ])

In [167]:
# which to work on
dataset = dataFrameTest

# First voxel count of each patient: drop the enclosing first/last character
# of the stored string and keep the text before the first comma. split()
# returns the whole string when no comma is present, so one expression
# covers both single- and multi-entry values.
voxelList = dataset.voxelCountList.tolist()
voxelListClean = np.array( [ int(x[1:-1].split(",")[0]) for x in voxelList ] )
# surv2yr encoding: 0 didnt survive, 1 survived
y =  np.array( dataset.surv2yr.tolist() )
# Formatted explicitly so the line prints the same text on Python 2 and 3.
print("{} {}".format(voxelListClean.shape, y.shape))

(240L,) (240L,)


In [168]:
# ROC curve when the parsed voxel count alone is used as the score for the
# surv2yr labels, with label 1 treated as the positive class.
fpr, tpr, thresholds = metrics.roc_curve(y, voxelListClean, pos_label=1)
# Last expression of the cell, so the resulting AUC is shown as its output.
metrics.auc(fpr, tpr)

0.46963379890209145

In [193]:
# Features shared by the training and testing matrices.
featureColumns = ['stage','age','histology_grouped','volume']

# prepare training
xTrainRF = dataFrameTrain[featureColumns]
yTrainRF = np.array( dataFrameTrain.surv2yr.tolist() , 'int64' )
# Formatted explicitly so the lines print the same text on Python 2 and 3.
print("{} {}".format(xTrainRF.shape, yTrainRF.shape))

# prepare testing
xTestRF = dataFrameTest[featureColumns]
yTestRF = np.array( dataFrameTest.surv2yr.tolist() , 'int64' )
print("{} {}".format(xTestRF.shape, yTestRF.shape))

# NOTE(review): the n_jobs value was missing in the saved cell (a syntax
# error); -1 (use all cores) is assumed here and does not affect the fitted
# model. random_state pins the forest so reruns give the same AUC; outputs
# recorded before the seed was added may differ from a fresh run.
clf = RandomForestClassifier(n_jobs=-1, criterion='entropy', random_state=0)
clf.fit(xTrainRF, yTrainRF)

# One probability column per class, in the order of clf.classes_.
preds = clf.predict_proba( xTestRF )

# One-hot encode the labels so column i lines up with preds[:, i].
yTestRFCat = to_categorical(yTestRF, 2)
auc1,auc2 = AUC(yTestRFCat,preds)
print("{} {}".format(auc1, auc2))


(315, 4) (315L,)
(240, 4) (240L,)
0.572 0.572
