We're going to first try training a CNN on the individual images.
We will be using binary cross entropy across the 17 regions.

In [1]:
import HelperFuncs as hfuncs
import numpy as np
from sklearn.model_selection import train_test_split
from keras.utils.data_utils import Sequence
import h5py
import os

# Shared configuration read by the sequencer and the model-building cells.
BATCH_SIZE = 1        # scans per batch returned by the sequencer
FINAL_WIDTH = 400     # first spatial dimension of a stored image
FINAL_HEIGHT = 600    # second spatial dimension of a stored image
CHANNELS = 1
ZONES = 17
ANGLES = 4            # Just using 4 angles here

# The sequencer downloads batches into these folders, so make sure each exists.
for scan_dir in ('temp/train_scan/', 'temp/test_scan/', 'temp/val_scan/'):
    if os.path.isdir(scan_dir):
        continue
    print("Created directory: {}".format(scan_dir))
    os.makedirs(scan_dir)
        
class LegScanSequencer(Sequence):
    """Keras ``Sequence`` that streams one-scan HDF5 files from an S3 bucket.

    Each item is built by downloading ``<mode>/batch_<i>.hdf5`` into
    ``temp/<mode>/``, reading its ``/image`` and ``/labels`` datasets, and
    deleting the local copy again.

    Parameters
    ----------
    num_batches : int
        Value reported by ``len()``; the number of batches Keras will request.
    bucket_name : str
        Name of the bucket the batch files are downloaded from.
    mode : str
        Key prefix inside the bucket and sub-folder of ``temp/`` used for
        the temporary download (default ``"train_scan"``).
    """
    # Class-level dict; nothing in this class reads or writes it.
    idx_dict={}

    def __init__(self,num_batches,bucket_name,mode="train_scan"):
        self.num_batches = num_batches
        self.bucket_name = bucket_name
        self.mode = mode
        self.key_id, self.secret_key = hfuncs.GetAWSCredentials()
        # Evenly spaced indices along the first axis of '/image'
        # (0, 16, 32, 48 when ANGLES == 4).
        # NOTE(review): presumably each scan stores 64 angles -- confirm.
        self.angles = np.arange(0,64,64//ANGLES)

    def __len__(self):
        """Return the number of batches in one epoch."""
        return self.num_batches

    def on_epoch_end(self):
        """No per-epoch work is needed."""
        pass

    def __getitem__(self,idx):
        """Download and assemble batch ``idx``.

        Returns
        -------
        (X, y) : tuple of numpy.ndarray
            ``X`` has shape (BATCH_SIZE, ANGLES, FINAL_WIDTH, FINAL_HEIGHT//2,
            CHANNELS); ``y`` has shape (BATCH_SIZE, 1) and holds the maximum
            of the selected label entries for each scan.
        """
        # A client is created on every call rather than once in __init__.
        client = hfuncs.GetAWSClient(self.key_id,self.secret_key)
        bucket = client.Bucket(self.bucket_name)

        #Initialize vectors
        X_train = np.zeros((BATCH_SIZE,ANGLES,FINAL_WIDTH,FINAL_HEIGHT//2,CHANNELS))
        y_train = np.zeros((BATCH_SIZE,1))

        # Label entries whose maximum becomes the binary target.
        # NOTE(review): 13 and 15 were listed as the *left* leg in the
        # original code, and the positive-count cell only uses [12,14];
        # confirm which set of zones is intended here.
        r_leg = [12,14,13,15]

        for j,i in enumerate(range(idx*BATCH_SIZE,(idx+1)*BATCH_SIZE)):
            path = "temp/{}/batch_{}.hdf5".format(self.mode,i)
            key = "{}/batch_{}.hdf5".format(self.mode,i)
            try:
                #Download batch at index
                bucket.download_file(Key=key,Filename=path)
                with h5py.File(path,"r") as f:
                    # ``dataset[()]`` replaces the deprecated ``.value``;
                    # the array is loaded first so numpy handles the
                    # angle index list.
                    image = f['/image'][()]
                    X_train[j,:,:,:,:] = image[self.angles,:,:FINAL_HEIGHT//2,:]
                    y_train[j,:] = np.amax(f['/labels'][()][r_leg])
            finally:
                # Remove the temporary copy even when reading it failed.
                if os.path.exists(path):
                    os.remove(path)
        return X_train, y_train



Using TensorFlow backend.


In [2]:
from twilio.rest import Client
import configparser
from keras.callbacks import Callback

class SMSNotifier(Callback):
    """Keras callback that sends a text message through Twilio."""

    def on_epoch_end(self,epoch,logs=None):
        """Send "Layer <epoch> complete." when ``epoch`` is even.

        ``logs`` is accepted to match the callback signature; the message
        text does not depend on it. Credentials are read from the
        ``DEFAULT`` section of ``twilio.conf`` on every call.
        """
        # Odd-numbered epochs are skipped entirely.
        if epoch % 2 != 0:
            return

        #Get config credentials
        settings = configparser.ConfigParser()
        settings.read('twilio.conf')
        account_sid = settings['DEFAULT']['AccountID']
        auth_token = settings['DEFAULT']['AuthToken']

        #Build the client and send the message
        sms_client = Client(account_sid, auth_token)
        body_text = "Layer {} complete.".format(epoch)
        sms_client.messages.create(
            to="+16178884129",
            from_="+18572142288",
            body=body_text)

In [3]:
#Build pre-trained V2 model
import numpy as np
from keras.layers import Input,Flatten,Dense,Concatenate,Dropout,concatenate,GlobalMaxPool2D
from keras.models import Model
from datetime import datetime
from keras.callbacks import TensorBoard,EarlyStopping,ModelCheckpoint,ReduceLROnPlateau
from keras.optimizers import Adam
from keras.metrics import binary_accuracy
from keras.layers.wrappers import TimeDistributed
from keras.layers.recurrent import LSTM
from keras.losses import binary_crossentropy
from keras.applications.inception_v3 import InceptionV3,preprocess_input
from keras.layers.core import Lambda
import tensorflow as tf

def ToRGB(x):
    """Rescale ``x`` with its own min/max onto the 0..255 range.

    Computes ``floordiv((x - min(x)) * 255, max(x) - min(x))`` where the
    min and max are taken over the whole tensor.
    """
    upper = tf.reduce_max(x)
    lower = tf.reduce_min(x)
    full_scale = tf.constant(255,dtype=x.dtype)
    shifted = tf.subtract(x,lower)
    scaled = tf.multiply(shifted,full_scale)
    value_range = tf.subtract(upper,lower)
    return tf.floordiv(scaled,value_range)
def ToGreyScale(x):
    """Floor-divide ``x`` by 3 and repeat it three times along the last axis.

    The last axis is assumed to be the channel axis (channels-last).
    """
    divisor = tf.constant(3,dtype=x.dtype)
    thirds = tf.floordiv(x,divisor)
    # One multiple per leading dimension (left untouched), then 3 for channels.
    leading_dims = thirds.get_shape()[:-1]
    tile_multiples = [1 for _ in leading_dims] + [3]
    return tf.tile(thirds,tile_multiples)
def ToNewShape(x):
    """Swap the two axes before the last one, then reverse along axis -3.

    Accepts rank-4 or rank-5 tensors; any other rank raises ``ValueError``.
    """
    rank = len(x.shape)
    if rank not in (4,5):
        raise ValueError("Unexpected number of dims!")
    # Rank 5 carries one extra leading axis that stays in place.
    axis_order = [0,1,3,2,4] if rank == 5 else [0,2,1,3]
    swapped = tf.transpose(x,axis_order)
    return tf.reverse(swapped,[-3])

def getSingleLegModel(layer_idx):
    """Build a frozen InceptionV3 feature extractor applied to every angle.

    Parameters
    ----------
    layer_idx : int
        Index of the InceptionV3 ``mixed<layer_idx>`` layer whose output is
        used as the feature map.

    Returns
    -------
    keras.models.Model
        Maps an input of shape (ANGLES, FINAL_WIDTH, FINAL_HEIGHT//2,
        CHANNELS) to the flattened ``mixed<layer_idx>`` activations of each
        angle. The LSTM/Dense head is currently disabled, so the output is
        the raw per-angle feature sequence.
    """
    frame_shape = (FINAL_WIDTH,FINAL_HEIGHT//2,CHANNELS)

    #Single image input, pushed through the preprocessing Lambdas in order
    input_img = Input(shape=frame_shape)
    preprocessed = input_img
    for step in (ToRGB,ToGreyScale,preprocess_input,ToNewShape):
        preprocessed = Lambda(step)(preprocessed)

    #Load InceptionV3 with ImageNet weights and freeze every layer
    incep = InceptionV3(include_top=False,
                        weights='imagenet',
                        input_tensor=None,
                        input_shape=(FINAL_HEIGHT//2,FINAL_WIDTH,3),
                        pooling='max')
    for layer in incep.layers:
        layer.trainable = False

    #Cut the network at the requested mixed block
    cut_output = incep.get_layer('mixed{}'.format(layer_idx)).output
    truncated_net = Model(incep.input,cut_output)

    #Per-image model: preprocessing -> truncated network -> flat vector
    feature_map = truncated_net(preprocessed)
    flat_features = Flatten()(feature_map)
    per_image_model = Model(input_img,flat_features)

    #Apply the per-image model to each of the ANGLES frames of a scan
    input_scan = Input(shape=(ANGLES,) + frame_shape)
    per_angle_features = TimeDistributed(per_image_model)(input_scan)

    # Local references go out of scope on return, so no explicit del is needed.
    return Model(input_scan,per_angle_features)
        



In [157]:
#Count how many training scans have a positive label in zones 12/14
import pickle
labels = hfuncs.GetLabelsDict(r'stage1_labels.csv')
filename = "data_separated.pickle"
with open(filename,"rb") as f:
    save = pickle.load(f)
K_test = save['K_test']
K_val = save['K_val']
K_train = save['K_train']

s = 0    # training keys that have an entry in the labels dict
pos = 0  # of those, how many have a 1 in label index 12 or 14
for scan_key in K_train:
    scan_id = scan_key.replace("DHSData/","").replace(".a3daps","")
    if scan_id not in labels.keys():
        continue
    zone_labels = np.array(labels[scan_id])
    s += 1
    if np.amax(zone_labels[[12,14]]) == 1:
        pos += 1
print("total={},pos={}".format(s,pos))

total=687,pos=126


In [4]:
#Use the model as a feature extractor and use traditional ML to determine
#whether the features have any predictive power
import h5py
from keras import backend as K
K.set_learning_phase(0)

#Bucket with clean data
UPLOAD_BUCKET = 'cleandhsdata' #bucket where clean data was stored
TEMP_DIR = 'temp' #Directory for file upload/downloads
key_id, secret_key = hfuncs.GetAWSCredentials()
client = hfuncs.GetAWSClient(key_id,secret_key)
bucket = client.Bucket(UPLOAD_BUCKET)

def _count_batches(mode_name):
    """Count the bucket keys containing '<mode_name>/' and convert to batches.

    One is subtracted because the train/test/val root directories have
    their own keys.
    """
    marker = "{}/".format(mode_name)
    matching_keys = sum(1 if marker in obj.key else 0 for obj in bucket.objects.all())
    return (matching_keys-1)//BATCH_SIZE

#Initialize train sequencer
mode = "train_scan"
num_batches_train = _count_batches(mode)
train_seq = LegScanSequencer(num_batches_train,UPLOAD_BUCKET,mode=mode)

#Initialize validation sequencer
mode = "val_scan"
num_batches_val = _count_batches(mode)
val_seq = LegScanSequencer(num_batches_val,UPLOAD_BUCKET,mode=mode)

#Create notifier
notify = SMSNotifier()

#Create function that creates data set for given layer
def CreateFeatureDataSet(layer_idx,dir_name = 'featureextraction',max_batches=800):
    """Extract features for one InceptionV3 layer and upload them to S3.

    Every batch of the training sequencer is run through
    ``getSingleLegModel(layer_idx)``; the flattened outputs and their labels
    are written to ``<TEMP_DIR>/train_layer<layer_idx>.hdf5`` (datasets
    ``features`` and ``labels``), uploaded to the module-level ``bucket``
    under ``<dir_name>/``, and the local file is deleted.

    Parameters
    ----------
    layer_idx : int
        Index of the ``mixed`` layer used as the feature output.
    dir_name : str
        Key prefix in the bucket for the uploaded file.
    max_batches : int
        Upper bound on the number of batches processed.

    Notes
    -----
    Each batch is flattened into a single feature row and only ``y[0,0]``
    is stored, so this only works while ``BATCH_SIZE`` is 1.
    """
    #Get model and output size
    model = getSingleLegModel(layer_idx)
    output_size = model.output_shape[1]*model.output_shape[2]

    #Variables to iterate over; only the training split is enabled for now.
    #To include validation add 'val', num_batches_val and val_seq below.
    modes = ['train']
    num_batches = [num_batches_train]
    generators = [train_seq]

    for mode,num_b,gen in zip(modes,num_batches,generators):
        n_rows = min(num_b,max_batches)

        #Initialize dataset arrays
        X_d = np.zeros((n_rows,output_size))
        y_d = np.zeros(n_rows)

        #Transform every batch and store it in the dataset arrays
        for i in range(n_rows):
            print("Storing {} in {} set...".format(i,mode))
            X, y = gen[i]
            X_d[i,:] = model.predict(X).flatten()
            y_d[i] = y[0,0]

        #Store data set in s3
        key_suffix = "{}_layer{}.hdf5".format(mode,layer_idx)
        filename = os.path.join(TEMP_DIR,key_suffix)
        key = "{}/{}".format(dir_name,key_suffix)

        #Save in local hdf5 file
        with h5py.File(filename,"w") as f:
            f.create_dataset('features',data=X_d)
            f.create_dataset('labels',data=y_d)

        #Upload file to bucket, then delete the local copy either way
        try:
            bucket.upload_file(Filename=filename,Key=key)
            print("Completed {} upload for layer {}".format(mode,layer_idx))
        finally:
            os.remove(filename)

        #Delete arrays to save memory
        del X_d,y_d

    #Send notification that the job was completed. This is best effort, so a
    #failure is reported but not raised; KeyboardInterrupt still propagates.
    try:
        notify.on_epoch_end(layer_idx)
    except Exception as err:
        print("Couldn't send notification! ({})".format(err))
        
#for l in range(11):
 #   try:
 #       CreateFeatureDataSet(l)
 #   except:
   #     print("Failed to create feature set {}".format(l))

In [5]:
#For every layer in the data set, we're going to train a gradient boosting classifier
#using hyperparameters we selected from before
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss,make_scorer
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
import gc

%matplotlib inline

def getScores(layer_idx,dir_name = 'featureextraction',n_train=700,n_test=100):
    """Fit a gradient boosting classifier on one layer's features and score it.

    Downloads ``<dir_name>/train_layer<layer_idx>.hdf5`` from the module-level
    ``bucket`` into ``TEMP_DIR``, fits on the first ``n_train`` rows,
    evaluates on the following ``n_test`` rows, and always deletes the
    downloaded file afterwards.

    Parameters
    ----------
    layer_idx : int
        Layer whose feature file is evaluated.
    dir_name : str
        Key prefix of the feature file in the bucket.
    n_train : int
        Number of leading rows used for fitting (default 700).
    n_test : int
        Number of rows after the training rows used for evaluation
        (default 100).

    Returns
    -------
    (mean_test, mean_train) : tuple of float
        Scores from a single hold-out split (no K-fold). The scorer is built
        with ``greater_is_better=False``, so both values are negated log
        losses: closer to zero is better.
    """
    #Scorer
    log_loss_scorer = make_scorer(log_loss,greater_is_better=False,needs_proba=True)

    #Classifier
    clf = GradientBoostingClassifier(min_samples_split=3,max_depth=5,learning_rate=0.05,n_estimators=100)

    #Download dataset
    print("Downloading dataset for layer {}...".format(layer_idx))
    mode = 'train'
    key_suffix = "{}_layer{}.hdf5".format(mode,layer_idx)
    filename = os.path.join(TEMP_DIR,key_suffix)
    key = "{}/{}".format(dir_name,key_suffix)

    bucket.download_file(Key=key,Filename=filename)

    try:
        # Lists are kept so a K-fold loop can append one score per fold later.
        train_score_ary = []
        test_score_ary = []

        #Open downloaded file and load data
        with h5py.File(filename,"r") as f:
            print("Running cross validation...")
            gc.collect()
            features = f['/features']
            targets = f['/labels']

            #Fit on train data. Slicing the dataset reads only the requested
            #rows instead of loading the whole file into memory first.
            X_train = features[:n_train]
            y_train = targets[:n_train]
            clf.fit(X_train,y_train)

            #Train Score
            train_score = log_loss_scorer(clf,X_train,y_train)
            del X_train,y_train #Save memory

            #Val Score
            X_test = features[n_train:n_train+n_test]
            y_test = targets[n_train:n_train+n_test]
            test_score = log_loss_scorer(clf,X_test,y_test)
            del X_test, y_test

            #Append to lists
            train_score_ary.append(train_score)
            test_score_ary.append(test_score)

        #Average over the collected scores (a single split at present)
        mean_train = np.mean(train_score_ary)
        mean_test = np.mean(test_score_ary)

        return mean_test,mean_train
    finally:
        os.remove(filename)

#num_layers = 8
#mean_train_ary = np.zeros(num_layers)
#std_train_ary = np.zeros(num_layers)
#mean_test_ary = np.zeros(num_layers)
#std_test_ary = np.zeros(num_layers)

#for l in range(1):
#    gc.collect()
#    print("Training layer {}".format(l))
#    mean_test,std_test,mean_train,std_train = getScores(l)
#    mean_train_ary[l] = mean_train
#    std_train_ary[l] = std_train
#    mean_test_ary[l] = mean_test
#    std_test_ary[l] = std_test

#layers = np.arange(0,8)
#plt.figure()
#plt.errorbar(x=layers,y=mean_test_ary,yerr=std_test_ary)


In [6]:
#For every layer in the data set, we're going to train a gradient boosting classifier
#using hyperparameters we selected from before
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss,make_scorer
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
import gc

%matplotlib inline

#Score one layer's features with a single train/validation split
layer_idx = 1
dir_name = 'featureextraction'
N_TRAIN = 700  #rows used to fit the classifier
N_TEST = 100   #rows after the training rows used for evaluation

#Scorer (greater_is_better=False, so scores are negated log losses)
log_loss_scorer = make_scorer(log_loss,greater_is_better=False,needs_proba=True)

#Classifier
clf = GradientBoostingClassifier(min_samples_split=3,max_depth=5,learning_rate=0.05,n_estimators=100)

#Download dataset unless a local copy is already present
print("Downloading dataset for layer {}...".format(layer_idx))
mode = 'train'
key_suffix = "{}_layer{}.hdf5".format(mode,layer_idx)
filename = os.path.join(TEMP_DIR,key_suffix)
key = "{}/{}".format(dir_name,key_suffix)

if not os.path.exists(filename):
    bucket.download_file(Key=key,Filename=filename)

train_score_ary = []
test_score_ary = []

#Open downloaded file and load data
with h5py.File(filename,"r") as f:
    print("Running cross validation...")
    gc.collect()
    features = f['/features']
    targets = f['/labels']

    #Fit on train data. The feature buffers are float32: HDF5 converts while
    #reading, and scikit-learn's tree ensembles convert X to float32 anyway,
    #so a float64 buffer would only add memory. This cell previously ended
    #in a MemoryError.
    X_train = np.zeros((N_TRAIN,features.shape[1]),dtype=np.float32)
    features.read_direct(X_train,np.s_[:N_TRAIN],np.s_[:N_TRAIN])

    y_train = np.zeros(N_TRAIN)
    targets.read_direct(y_train,np.s_[:N_TRAIN],np.s_[:N_TRAIN])

    clf.fit(X_train,y_train)

    #Train Score
    train_score = log_loss_scorer(clf,X_train,y_train)
    del X_train,y_train #Save memory

    #Val Score. read_direct fills the buffer in place and returns None, so
    #its result must not be assigned; the destination selection is relative
    #to the N_TEST-row buffer, not to the source dataset.
    X_test = np.zeros((N_TEST,features.shape[1]),dtype=np.float32)
    features.read_direct(X_test,np.s_[N_TRAIN:N_TRAIN+N_TEST],np.s_[:N_TEST])
    y_test = np.zeros(N_TEST)
    targets.read_direct(y_test,np.s_[N_TRAIN:N_TRAIN+N_TEST],np.s_[:N_TEST])

    test_score = log_loss_scorer(clf,X_test,y_test)
    del X_test, y_test

    #Append to lists
    train_score_ary.append(train_score)
    test_score_ary.append(test_score)

#Average over the collected scores (a single hold-out split at present)
mean_train = np.mean(train_score_ary)
mean_test = np.mean(test_score_ary)


Downloading dataset for layer 1...
Running cross validation...


MemoryError: 