We're going to first try training a CNN on the individual images.
We will be using binary cross entropy across the 17 regions.

In [1]:
import HelperFuncs as hfuncs
import numpy as np
from sklearn.model_selection import train_test_split
from keras.utils.data_utils import Sequence
import h5py
import os

# Sizes shared by the sequencer and the model-building cells below.
BATCH_SIZE = 1
FINAL_WIDTH = 400
FINAL_HEIGHT = 600
CHANNELS = 1
ZONES = 17
ANGLES = 4 #Just using 4 angles here

# The sequencer downloads each batch into one of these scratch folders,
# so every folder has to exist before the first download.
for d in ('temp/train_scan/','temp/test_scan/','temp/val_scan/'):
    if os.path.isdir(d):
        continue
    print("Created directory: {}".format(d))
    os.makedirs(d)
        
class LegScanSequencer(Sequence):
    """Keras ``Sequence`` that streams leg-scan batches out of an S3 bucket.

    Item ``idx`` downloads the ``BATCH_SIZE`` files
    ``<mode>/batch_<i>.hdf5`` into ``temp/<mode>/``, reads them, and deletes
    the local copies again.

    Returns from ``__getitem__``:
        X: array of shape (BATCH_SIZE, ANGLES, FINAL_WIDTH, FINAL_HEIGHT//2, CHANNELS)
        y: array of shape (BATCH_SIZE, 1) holding the maximum label over the
           selected zone indices.
    """
    # Not referenced anywhere in this class; kept so existing attribute
    # access on the class keeps working.
    idx_dict={}

    def __init__(self,num_batches,bucket_name,mode="train_scan"):
        """
        num_batches: value reported by ``len()``.
        bucket_name: name of the S3 bucket holding the batch files.
        mode: key prefix inside the bucket and sub-folder of ``temp/``.
        """
        self.num_batches = num_batches
        self.bucket_name = bucket_name
        self.mode = mode
        self.key_id, self.secret_key = hfuncs.GetAWSCredentials()
        # ANGLES evenly spaced indices out of 64, e.g. [0,16,32,48] for ANGLES=4
        self.angles = np.arange(0,64,64//ANGLES)

    def __len__(self):
        return self.num_batches

    def on_epoch_end(self):
        pass

    def __getitem__(self,idx):
        # A fresh client is built on every call, exactly as before.
        client = hfuncs.GetAWSClient(self.key_id,self.secret_key)
        bucket = client.Bucket(self.bucket_name)

        #Initialize vectors
        X_train = np.zeros((BATCH_SIZE,ANGLES,FINAL_WIDTH,FINAL_HEIGHT//2,CHANNELS))
        y_train = np.zeros((BATCH_SIZE,1))

        # NOTE(review): indices 13 and 15 were also listed in a separate, unused
        # "left leg" list, while the label-count cell only uses [12,14].
        # Confirm which zones this target is meant to cover.
        leg_zones = [12,14,13,15]

        for j,i in enumerate(range(idx*BATCH_SIZE,(idx+1)*BATCH_SIZE)):
            #Download batch at index
            path = "temp/{}/batch_{}.hdf5".format(self.mode,i)
            key = "{}/batch_{}.hdf5".format(self.mode,i)
            bucket.download_file(Key=key,Filename=path)

            # Opening the file happens inside the try so the downloaded copy is
            # removed even when it cannot be opened or read.
            try:
                with h5py.File(path,"r") as f:
                    # ``[()]`` reads the full dataset; it replaces ``.value``,
                    # which no longer exists in h5py 3.
                    X_train[j,:,:,:,:] = f['/image'][()][self.angles,:,:FINAL_HEIGHT//2,:]
                    y_train[j,:] = np.amax(f['/labels'][()][leg_zones])
            finally:
                os.remove(path)
        return X_train, y_train



Using TensorFlow backend.


In [2]:
from twilio.rest import Client
import configparser
from keras.callbacks import Callback

class SMSNotifier(Callback):
    """Keras callback that sends a Twilio SMS on every even-numbered epoch.

    Credentials come from the ``DEFAULT`` section of ``twilio.conf``
    (``AccountID`` and ``AuthToken``). Optional ``ToNumber`` / ``FromNumber``
    entries in the same section override the built-in phone numbers.
    """
    # NOTE(review): phone numbers committed in a notebook are easy to leak.
    # Prefer setting ToNumber / FromNumber in twilio.conf; these remain only
    # as fallbacks so existing setups keep working.
    DEFAULT_TO = "+16178884129"
    DEFAULT_FROM = "+18572142288"

    def on_epoch_end(self,epoch,logs=None):
        """Send "Layer <epoch> complete." when ``epoch`` is even; otherwise do nothing.

        ``logs`` is accepted for the Keras callback interface but not used:
        the message text is the same whether or not logs are supplied.
        """
        #Execute every other epoch
        if epoch % 2 != 0:
            return

        #Get config credentials
        config = configparser.ConfigParser()
        config.read('twilio.conf')
        defaults = config['DEFAULT']
        account_sid = defaults['AccountID']
        auth_token = defaults['AuthToken']

        #Get client
        client = Client(account_sid, auth_token)

        #Send message
        client.messages.create(
            to=defaults.get('ToNumber', fallback=self.DEFAULT_TO),
            from_=defaults.get('FromNumber', fallback=self.DEFAULT_FROM),
            body="Layer {} complete.".format(epoch))

In [3]:
#Build pre-trained V2 model
import numpy as np
from keras.layers import Input,Flatten,Dense,Concatenate,Dropout,concatenate,GlobalMaxPool2D
from keras.models import Model
from datetime import datetime
from keras.callbacks import TensorBoard,EarlyStopping,ModelCheckpoint,ReduceLROnPlateau
from keras.optimizers import Adam
from keras.metrics import binary_accuracy
from keras.layers.wrappers import TimeDistributed
from keras.layers.recurrent import LSTM
from keras.losses import binary_crossentropy
from keras.applications.inception_v3 import InceptionV3,preprocess_input
from keras.layers.core import Lambda
import tensorflow as tf

def ToRGB(x):
    """Rescale ``x`` linearly so its values lie in [0, 255], rounded down.

    The minimum and maximum are reduced over every axis of ``x`` (no axis is
    given), so one scale is shared by the whole tensor rather than computed
    per image.

    A tensor whose values are all equal has ``max - min == 0``; the divisor
    is replaced by 1 in that case so the result is all zeros instead of the
    outcome of a division by zero.
    """
    max_v = tf.reduce_max(x)
    min_v = tf.reduce_min(x)
    max_rgb = tf.constant(255,dtype=x.dtype)
    span = tf.subtract(max_v,min_v)
    # Guard against a zero span (constant input).
    safe_span = tf.where(tf.equal(span,tf.zeros_like(span)),tf.ones_like(span),span)
    x = tf.floordiv(tf.multiply(tf.subtract(x,min_v),max_rgb),safe_span)
    return x
def ToGreyScale(x):
    """Floor-divide ``x`` by 3 and repeat it three times along the last axis.

    For a single-channel, channels-last input this yields three identical
    channels, each holding a third of the original value.
    """
    third = tf.floordiv(x,tf.constant(3,dtype=x.dtype))
    #assume channel_last: tile only the final axis, leave the others alone
    rank = len(third.get_shape())
    repeats = [1]*(rank-1) + [3]
    return tf.tile(third,repeats)
def ToNewShape(x):
    """Swap the two axes that precede the last one, then flip along axis -3.

    Accepts rank-4 and rank-5 tensors; any other rank raises ``ValueError``.
    """
    # Axis permutation per supported rank: only the two axes before the
    # final one trade places.
    perm_by_rank = {
        5: [0,1,3,2,4],
        4: [0,2,1,3],
    }
    rank = len(x.shape)
    if rank not in perm_by_rank:
        raise ValueError("Unexpected number of dims!")
    swapped = tf.transpose(x,perm_by_rank[rank])
    return tf.reverse(swapped,[-3])

def getSingleLegModel(layer_idx):
    """Build a feature extractor that applies a truncated InceptionV3 to every angle of a scan.

    layer_idx: index N of the InceptionV3 ``mixedN`` layer whose output is used.

    Returns a Keras ``Model`` taking scans of shape
    (ANGLES, FINAL_WIDTH, FINAL_HEIGHT//2, CHANNELS) and producing, for each
    angle, the flattened output of that layer. All InceptionV3 layers are
    frozen. The LSTM / dense head that once followed is not attached.
    """
    frame_shape = (FINAL_WIDTH,FINAL_HEIGHT//2,CHANNELS)

    #Single image input
    frame_in = Input(shape=frame_shape)

    #Preprocessing chain, each step wrapped in its own Lambda layer
    # NOTE(review): ToGreyScale floor-divides by 3 after ToRGB scaled to 0..255,
    # so preprocess_input receives values no larger than 85 - confirm intended.
    frame_pp = frame_in
    for step in (ToRGB,ToGreyScale,preprocess_input,ToNewShape):
        frame_pp = Lambda(step)(frame_pp)

    #Load ImageNet-pretrained InceptionV3 without its classifier and freeze it
    backbone = InceptionV3(include_top=False,
                           weights='imagenet',
                           input_tensor=None,
                           input_shape=(FINAL_HEIGHT//2,FINAL_WIDTH,3),
                           pooling='max')
    for layer in backbone.layers:
        layer.trainable=False

    #Cut the network at the requested mixed block
    cut_output = backbone.get_layer('mixed{}'.format(layer_idx)).output
    truncated = Model(backbone.input,cut_output)

    #Per-image model: preprocess -> truncated network -> flatten
    features = truncated(frame_pp)
    features = Flatten()(features)
    per_frame_model = Model(frame_in,features)

    #Apply the per-image model to every angle of a scan
    scan_in = Input(shape=(ANGLES,)+frame_shape)
    per_angle_features = TimeDistributed(per_frame_model)(scan_in)

    return Model(scan_in,per_angle_features)
        



In [157]:
#Test how many positive samples
import pickle
labels = hfuncs.GetLabelsDict(r'stage1_labels.csv')
filename = "data_separated.pickle"
# NOTE: pickle.load can execute code embedded in the file - only open trusted pickles.
with open(filename,"rb") as f:
   save = pickle.load(f)
K_test, K_val, K_train = save['K_test'], save['K_val'], save['K_train']

# s counts training keys that have a label entry; pos counts those whose
# label is 1 in zone index 12 or 14.
s = 0
pos = 0
for k in K_train:
    k_clean = k.replace("DHSData/","").replace(".a3daps","")
    if k_clean not in labels.keys():
        continue
    label = np.array(labels[k_clean])
    val = np.amax(label[[12,14]])
    s += 1
    if val == 1:
        pos += 1
print("total={},pos={}".format(s,pos))

total=687,pos=126


In [None]:
#Use the model as a feature extractor and use traditional ML to determine whether the features have any predictive power
import h5py
from keras import backend as K
# Learning phase 0: the network is only used for prediction in this cell.
K.set_learning_phase(0)

#Bucket with clean data
UPLOAD_BUCKET = 'cleandhsdata' #bucket where clean data was stored
TEMP_DIR = 'temp' #Directory for file upload/downloads
key_id, secret_key = hfuncs.GetAWSCredentials()
client = hfuncs.GetAWSClient(key_id,secret_key)
bucket = client.Bucket(UPLOAD_BUCKET)

# Batch counts are derived by counting bucket keys containing "<mode>/".
# One is subtracted because the train/test/val root directories have their
# own keys.

#Initialize train sequencer
mode ="train_scan"
num_batches_train = (sum(1 for obj in bucket.objects.all() if "{}/".format(mode) in obj.key)-1)//BATCH_SIZE
train_seq = LegScanSequencer(num_batches_train,UPLOAD_BUCKET,mode=mode)

#Initialize validation sequencer
mode = "val_scan"
num_batches_val = (sum(1 for obj in bucket.objects.all() if "{}/".format(mode) in obj.key)-1)//BATCH_SIZE
val_seq = LegScanSequencer(num_batches_val,UPLOAD_BUCKET,mode=mode)

#Create notifier
notify = SMSNotifier()

#Create function that creates data set for given layer
def CreateFeatureDataSet(layer_idx,dir_name = 'featureextraction',max_batches=800):
    """Extract features for the training scans at one network depth and upload them.

    layer_idx: passed to ``getSingleLegModel`` to choose the cut layer; also
        used in the uploaded file name.
    dir_name: key prefix in the bucket for the uploaded file.
    max_batches: upper bound on how many batches are processed.

    Writes ``<mode>_layer<layer_idx>.hdf5`` with datasets ``features`` and
    ``labels`` to ``TEMP_DIR``, uploads it to ``<dir_name>/`` in the bucket,
    removes the local file, and finally asks the notifier to send a message
    (failures to notify are reported but not raised).
    """
    #Get model and output size (angles x features per angle, flattened per scan)
    model = getSingleLegModel(layer_idx)
    output_size = model.output_shape[1]*model.output_shape[2]

    #Only the training set is processed; validation is currently switched off
    modes = ['train']
    num_batches = [num_batches_train]
    generators = [train_seq]

    for mode,num_b,gen in zip(modes,num_batches,generators):
        n_batches = min(num_b,max_batches)

        #One row per scan, so BATCH_SIZE rows per batch
        X_d = np.zeros((n_batches*BATCH_SIZE,output_size))
        y_d = np.zeros(n_batches*BATCH_SIZE)

        #Transform every batch and store it in the dataset arrays
        for i in range(n_batches):
            print("Storing {} in {} set...".format(i,mode))
            X, y = gen[i]
            rows = slice(i*BATCH_SIZE,(i+1)*BATCH_SIZE)
            X_d[rows,:] = model.predict(X).reshape(len(X),-1)
            y_d[rows] = y[:,0]

        #Local file name and bucket key
        key_suffix = "{}_layer{}.hdf5".format(mode,layer_idx)
        filename = os.path.join(TEMP_DIR,key_suffix)
        key = "{}/{}".format(dir_name,key_suffix)

        #Write locally, upload, and always clean up the local copy - including
        #when the write itself fails part-way
        try:
            with h5py.File(filename,"w") as f:
                f.create_dataset('features',data=X_d)
                f.create_dataset('labels',data=y_d)
            bucket.upload_file(Filename=filename,Key=key)
            print("Completed {} upload for layer {}".format(mode,layer_idx))
        finally:
            if os.path.exists(filename):
                os.remove(filename)

        #Delete arrays to save memory
        del X_d,y_d

    #Send notification that job was completed. Exception (not a bare except)
    #so that Ctrl-C / kernel interrupt is not swallowed here.
    try:
        notify.on_epoch_end(layer_idx)
    except Exception:
        print("Couldn't send notification!")
        
#Build one feature set per layer; a failure on one layer does not stop the rest.
#Exception (rather than a bare except) lets a keyboard/kernel interrupt abort
#the run, and the reason for each failure is printed instead of discarded.
for l in range(8,11):
    try:
        CreateFeatureDataSet(l)
    except Exception as err:
        print("Failed to create feature set {}".format(l))
        print("  reason: {!r}".format(err))
#Stop the EC2 instance once every layer has been attempted.
#NOTE(review): the instance id is hardcoded; this line shuts down that machine.
os.system("aws ec2 stop-instances --instance-ids i-03cdf5bc4dae76bb5")
    

Storing 0 in train set...
Storing 1 in train set...
Storing 2 in train set...
Storing 3 in train set...
Storing 4 in train set...
Storing 5 in train set...
Storing 6 in train set...
Storing 7 in train set...
Storing 8 in train set...
Storing 9 in train set...
Storing 10 in train set...
Storing 11 in train set...
Storing 12 in train set...
Storing 13 in train set...
Storing 14 in train set...
Storing 15 in train set...
Storing 16 in train set...
Storing 17 in train set...
Storing 18 in train set...
Storing 19 in train set...
Storing 20 in train set...
Storing 21 in train set...
Storing 22 in train set...
Storing 23 in train set...
Storing 24 in train set...
Storing 25 in train set...
Storing 26 in train set...
Storing 27 in train set...
Storing 28 in train set...
Storing 29 in train set...
Storing 30 in train set...
Storing 31 in train set...
Storing 32 in train set...
Storing 33 in train set...
Storing 34 in train set...
Storing 35 in train set...
Storing 36 in train set...
Storing 37 

Storing 297 in train set...
Storing 298 in train set...
Storing 299 in train set...
Storing 300 in train set...
Storing 301 in train set...
Storing 302 in train set...
Storing 303 in train set...
Storing 304 in train set...
Storing 305 in train set...
Storing 306 in train set...
Storing 307 in train set...
Storing 308 in train set...
Storing 309 in train set...
Storing 310 in train set...
Storing 311 in train set...
Storing 312 in train set...
Storing 313 in train set...
Storing 314 in train set...
Storing 315 in train set...
Storing 316 in train set...
Storing 317 in train set...
Storing 318 in train set...
Storing 319 in train set...
Storing 320 in train set...
Storing 321 in train set...
Storing 322 in train set...
Storing 323 in train set...
Storing 324 in train set...
Storing 325 in train set...
Storing 326 in train set...
Storing 327 in train set...
Storing 328 in train set...
Storing 329 in train set...
Storing 330 in train set...
Storing 331 in train set...
Storing 332 in train

Storing 590 in train set...
Storing 591 in train set...
Storing 592 in train set...
Storing 593 in train set...
Storing 594 in train set...
Storing 595 in train set...
Storing 596 in train set...
Storing 597 in train set...
Storing 598 in train set...
Storing 599 in train set...
Storing 600 in train set...
Storing 601 in train set...
Storing 602 in train set...
Storing 603 in train set...
Storing 604 in train set...
Storing 605 in train set...
Storing 606 in train set...
Storing 607 in train set...
Storing 608 in train set...
Storing 609 in train set...
Storing 610 in train set...
Storing 611 in train set...
Storing 612 in train set...
Storing 613 in train set...
Storing 614 in train set...
Storing 615 in train set...
Storing 616 in train set...
Storing 617 in train set...
Storing 618 in train set...
Storing 619 in train set...
Storing 620 in train set...
Storing 621 in train set...
Storing 622 in train set...
Storing 623 in train set...
Storing 624 in train set...
Storing 625 in train

Storing 85 in train set...
Storing 86 in train set...
Storing 87 in train set...
Storing 88 in train set...
Storing 89 in train set...
Storing 90 in train set...
Storing 91 in train set...
Storing 92 in train set...
Storing 93 in train set...
Storing 94 in train set...
Storing 95 in train set...
Storing 96 in train set...
Storing 97 in train set...
Storing 98 in train set...
Storing 99 in train set...
Storing 100 in train set...
Storing 101 in train set...
Storing 102 in train set...
Storing 103 in train set...
Storing 104 in train set...
Storing 105 in train set...
Storing 106 in train set...
Storing 107 in train set...
Storing 108 in train set...
Storing 109 in train set...
Storing 110 in train set...
Storing 111 in train set...
Storing 112 in train set...
Storing 113 in train set...
Storing 114 in train set...
Storing 115 in train set...
Storing 116 in train set...
Storing 117 in train set...
Storing 118 in train set...
Storing 119 in train set...
Storing 120 in train set...
Storing

Storing 379 in train set...
Storing 380 in train set...
Storing 381 in train set...
Storing 382 in train set...
Storing 383 in train set...
Storing 384 in train set...
Storing 385 in train set...
Storing 386 in train set...
Storing 387 in train set...
Storing 388 in train set...
Storing 389 in train set...
Storing 390 in train set...
Storing 391 in train set...
Storing 392 in train set...
Storing 393 in train set...
Storing 394 in train set...
Storing 395 in train set...
Storing 396 in train set...
Storing 397 in train set...
Storing 398 in train set...
Storing 399 in train set...
Storing 400 in train set...
Storing 401 in train set...
Storing 402 in train set...
Storing 403 in train set...
Storing 404 in train set...
Storing 405 in train set...
Storing 406 in train set...
Storing 407 in train set...
Storing 408 in train set...
Storing 409 in train set...
Storing 410 in train set...
Storing 411 in train set...
Storing 412 in train set...
Storing 413 in train set...
Storing 414 in train

Storing 672 in train set...
Storing 673 in train set...
Storing 674 in train set...
Storing 675 in train set...
Storing 676 in train set...
Storing 677 in train set...
Storing 678 in train set...
Storing 679 in train set...
Storing 680 in train set...
Storing 681 in train set...
Storing 682 in train set...
Storing 683 in train set...
Storing 684 in train set...
Storing 685 in train set...
Storing 686 in train set...
Storing 687 in train set...
Storing 688 in train set...
Storing 689 in train set...
Storing 690 in train set...
Storing 691 in train set...
Storing 692 in train set...
Storing 693 in train set...
Storing 694 in train set...
Storing 695 in train set...
Storing 696 in train set...
Storing 697 in train set...
Storing 698 in train set...
Storing 699 in train set...
Storing 700 in train set...
Storing 701 in train set...
Storing 702 in train set...
Storing 703 in train set...
Storing 704 in train set...
Storing 705 in train set...
Storing 706 in train set...
Storing 707 in train

Storing 168 in train set...
Storing 169 in train set...
Storing 170 in train set...
Storing 171 in train set...
Storing 172 in train set...
Storing 173 in train set...
Storing 174 in train set...
Storing 175 in train set...
Storing 176 in train set...
Storing 177 in train set...
Storing 178 in train set...
Storing 179 in train set...
Storing 180 in train set...
Storing 181 in train set...
Storing 182 in train set...
Storing 183 in train set...
Storing 184 in train set...
Storing 185 in train set...
Storing 186 in train set...
Storing 187 in train set...
Storing 188 in train set...
Storing 189 in train set...
Storing 190 in train set...
Storing 191 in train set...
Storing 192 in train set...
Storing 193 in train set...
Storing 194 in train set...
Storing 195 in train set...
Storing 196 in train set...
Storing 197 in train set...
Storing 198 in train set...
Storing 199 in train set...
Storing 200 in train set...
Storing 201 in train set...
Storing 202 in train set...
Storing 203 in train

Storing 461 in train set...
Storing 462 in train set...
Storing 463 in train set...
Storing 464 in train set...
Storing 465 in train set...
Storing 466 in train set...
Storing 467 in train set...
Storing 468 in train set...
Storing 469 in train set...
Storing 470 in train set...
Storing 471 in train set...
Storing 472 in train set...
Storing 473 in train set...
Storing 474 in train set...
Storing 475 in train set...
Storing 476 in train set...
Storing 477 in train set...
Storing 478 in train set...
Storing 479 in train set...
Storing 480 in train set...
Storing 481 in train set...
Storing 482 in train set...
Storing 483 in train set...
Storing 484 in train set...
Storing 485 in train set...
Storing 486 in train set...
Storing 487 in train set...
Storing 488 in train set...
Storing 489 in train set...
Storing 490 in train set...
Storing 491 in train set...
Storing 492 in train set...
Storing 493 in train set...
Storing 494 in train set...
Storing 495 in train set...
Storing 496 in train

In [5]:
#For every layer in the data set, we're going to train a gradient boosting classifier 
#using hyperparameters we selected from before
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss,make_scorer
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
import gc

%matplotlib inline

def getScores(layer_idx,dir_name = 'featureextraction',n_train=700,n_val=100):
    """Fit a gradient boosting classifier on one layer's features and score it.

    layer_idx: which ``train_layer<layer_idx>.hdf5`` file to download.
    dir_name: key prefix of that file in the bucket.
    n_train: number of leading rows used for fitting.
    n_val: number of rows directly after them used for evaluation.

    Returns ``(test_score, train_score)``. Both are negated log losses
    (the scorer is built with ``greater_is_better=False``), from a single
    hold-out split - no k-fold cross-validation is performed.

    The downloaded file is removed from ``TEMP_DIR`` before returning, also
    when fitting or scoring raises.
    """
    #Scorer
    log_loss_scorer = make_scorer(log_loss,greater_is_better=False,needs_proba=True)

    #Classifier
    clf = GradientBoostingClassifier(min_samples_split=3,max_depth=5,learning_rate=0.05,n_estimators=100)

    #Download dataset
    print("Downloading dataset for layer {}...".format(layer_idx))
    mode = 'train'
    key_suffix = "{}_layer{}.hdf5".format(mode,layer_idx)
    filename = os.path.join(TEMP_DIR,key_suffix)
    key = "{}/{}".format(dir_name,key_suffix)

    bucket.download_file(Key=key,Filename=filename)

    try:
        with h5py.File(filename,"r") as f:
            print("Running cross validation...")
            gc.collect()
            features = f['/features']
            targets = f['/labels']

            #Slice the dataset objects directly so only the requested rows are
            #read from disk (the old ``.value`` loaded everything first and no
            #longer exists in h5py 3)
            X_train = features[:n_train]
            y_train = targets[:n_train]
            clf.fit(X_train,y_train)

            #Train score
            train_score = log_loss_scorer(clf,X_train,y_train)
            del X_train,y_train #Save memory

            #Val score
            X_test = features[n_train:n_train+n_val]
            y_test = targets[n_train:n_train+n_val]
            test_score = log_loss_scorer(clf,X_test,y_test)
            del X_test, y_test

        return test_score,train_score
    finally:
        os.remove(filename)

#num_layers = 8
#mean_train_ary = np.zeros(num_layers)
#std_train_ary = np.zeros(num_layers)
#mean_test_ary = np.zeros(num_layers)
#std_test_ary = np.zeros(num_layers)

#for l in range(1):
#    gc.collect()
#    print("Training layer {}".format(l))
#    mean_test,std_test,mean_train,std_train = getScores(l)
#    mean_train_ary[l] = mean_train
#    std_train_ary[l] = std_train
#    mean_test_ary[l] = mean_test
#    std_test_ary[l] = std_test

#layers = np.arange(0,8)
#plt.figure()
#plt.errorbar(x=layers,y=mean_test_ary,yerr=std_test_ary)


In [6]:
#For every layer in the data set, we're going to train a gradient boosting classifier 
#using hyperparameters we selected from before
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss,make_scorer
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
import gc

%matplotlib inline

#Evaluation settings
#NOTE(review): the extraction loop in this notebook covers layers 8-10; confirm
#that a feature file for layer 1 exists in the bucket.
layer_idx = 1
dir_name = 'featureextraction'
#Scorer (negated log loss, because greater_is_better=False)
log_loss_scorer = make_scorer(log_loss,greater_is_better=False,needs_proba=True)

#Classifier
clf = GradientBoostingClassifier(min_samples_split=3,max_depth=5,learning_rate=0.05,n_estimators=100)

#Download dataset unless a local copy is already present
print("Downloading dataset for layer {}...".format(layer_idx))
mode = 'train'
key_suffix = "{}_layer{}.hdf5".format(mode,layer_idx)
filename = os.path.join(TEMP_DIR,key_suffix)
key = "{}/{}".format(dir_name,key_suffix)

if not os.path.exists(filename):
    bucket.download_file(Key=key,Filename=filename)

train_score_ary = []
test_score_ary = []

def _read_rows(dset,start,stop,dtype):
    """Copy rows [start, stop) of an h5py dataset into a new array of ``dtype``."""
    out = np.zeros((stop-start,)+tuple(dset.shape[1:]),dtype=dtype)
    #read_direct fills ``out`` in place and returns None, so its result must
    #not be assigned; the destination selection is relative to ``out``
    dset.read_direct(out,np.s_[start:stop],np.s_[:stop-start])
    return out

#Open downloaded file and load data
with h5py.File(filename,"r") as f:
    print("Running cross validation...")
    gc.collect()
    feats = f['/features']
    labs = f['/labels']

    #Rows 0..699 train, 700..799 validate, clamped to what the file contains
    train_stop = min(700,feats.shape[0])
    test_stop = min(800,feats.shape[0])
    if test_stop <= train_stop:
        raise ValueError("Dataset has {} rows; none left for validation".format(feats.shape[0]))

    #float32 halves the memory of the feature matrix; scikit-learn's
    #GradientBoostingClassifier converts its input to float32 internally
    X_train = _read_rows(feats,0,train_stop,np.float32)
    y_train = _read_rows(labs,0,train_stop,np.float64)

    clf.fit(X_train,y_train)

    #Train Score
    train_score = log_loss_scorer(clf,X_train,y_train)
    del X_train,y_train #Save memory

    #Val Score
    X_test = _read_rows(feats,train_stop,test_stop,np.float32)
    y_test = _read_rows(labs,train_stop,test_stop,np.float64)
    test_score = log_loss_scorer(clf,X_test,y_test)
    del X_test, y_test

    #Append to lists
    train_score_ary.append(train_score)
    test_score_ary.append(test_score)

#Scores from the single hold-out split above (no k-fold is performed)
mean_train = np.mean(train_score_ary)
mean_test = np.mean(test_score_ary)


Downloading dataset for layer 1...
Running cross validation...


MemoryError: 