# Code for Ice-Cube 3D CNN

- Oct 29, 2018: This code just makes plots for previously trained CNNs

In [1]:
import sys
import os

import matplotlib.pyplot as plt
import numpy as np
import glob
import pickle
import time

In [2]:
%matplotlib widget
# %matplotlib inline

Useful blog for keras conv3D: http://learnandshare645.blogspot.com/2016/06/3d-cnn-in-keras-action-recognition.html

In [3]:
# keras modules
import keras
from keras import layers, models, optimizers, callbacks  # or tensorflow.keras as keras
import tensorflow as tf
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve, auc, roc_auc_score
from keras.models import load_model



Using TensorFlow backend.


### Steps:
- Data processing
    - Read raw data
    - Process data
    - Save processed data
    - Read processed data
- Model
    - Define model
    - Train model
    - Validate model
    - Plot accuracy and loss
    - Save model
- Test
    - Read model
    - Read training data
    - Get weights
    - Test model
    - Plot ROC curve


## Modules

### Extracting data

In [None]:
# Load data from files
def f_load_data(data_dir,f1,f2,f3):
    ''' Read the x-data, y-data and weights arrays from three .npy files.
    arguments:
        data_dir : string prepended verbatim to every file name (no separator is
                   inserted, so it must already end with the path separator)
        f1,f2,f3 : file names, without the '.npy' extension, of the x-data,
                   y-data and weights arrays, in that order
    returns : inpx,inpy,wts as arrays
    '''

    # Load in the fixed order x, y, weights; the unpacking requires exactly three arrays.
    inpx,inpy,wts=[np.load(data_dir+stem+'.npy') for stem in (f1,f2,f3)]

    # Quick visual check that the number of samples and labels agree.
    print(inpx.shape,inpy.shape)

    return inpx,inpy,wts


### Format data


In [5]:
#### Shuffle and split data ####

def f_shuffle_data(inpx,inpy,wts,seed=243):
    ''' Shuffle x-data, y-data and weights with one common random permutation.

    arguments:
        inpx,inpy,wts : arrays indexed by sample along their first axis
        seed          : value used to seed the global NumPy random generator
                        (default 243, the value that used to be hard-coded)
    returns : inpx,inpy,wts re-ordered along the first axis (new arrays)
    raises  : ValueError if the three arrays do not hold the same number of samples
    '''
    num=inpx.shape[0]
    # Fancy indexing with a shorter permutation would silently drop the extra rows
    # of a longer array, so mismatched inputs are rejected up front.
    if inpy.shape[0]!=num or wts.shape[0]!=num:
        raise ValueError("f_shuffle_data needs equal-length arrays, got %s, %s, %s"%(num,inpy.shape[0],wts.shape[0]))

    # Setting seed. This deliberately seeds the *global* generator, as before,
    # so the permutation is reproducible from run to run.
    np.random.seed(seed=seed)

    ## Get shuffled array of indices and apply it to all three arrays,
    ## which keeps each sample aligned with its label and weight.
    shuffle_arr=np.arange(num)
    np.random.shuffle(shuffle_arr)

    return inpx[shuffle_arr],inpy[shuffle_arr],wts[shuffle_arr]

def f_drop_data(inpx,inpy,wts,data_size):
    ''' Keep only the first data_size samples of each array (for quick training).

    The leading slice of every array is copied, so the returned arrays do not
    share memory with the inputs.
    returns : (inpx,inpy,wts), each truncated to data_size rows
    '''
    full_size=inpy.shape[0]
    assert(data_size<=full_size),"data_size: %s in f_drop_data is more than full data size: %s"%(data_size,full_size)

    # Same leading slice for all three arrays, taken in the order x, y, weights.
    head=slice(None,data_size)
    return tuple(arr[head].copy() for arr in (inpx,inpy,wts))
        
       

def f_split_data(inpx,inpy,wts,test_fraction):
    '''
    Split the arrays into a leading training part and a trailing test part.
    The validation data is later taken out of the training part.

    The last int(test_fraction*N) rows form the test set, the rows before them
    the training set. The pieces are plain slices of the inputs; nothing is
    copied or deleted here.
    returns : train_x,train_y,train_wts,test_x,test_y,test_wts
    '''

    n_total=inpx.shape[0]
    # Index of the first test row.
    cut=n_total-int(test_fraction*n_total)

    sources=(inpx,inpy,wts)
    train_part=tuple(arr[:cut] for arr in sources)
    test_part=tuple(arr[cut:] for arr in sources)

    return train_part+test_part


def f_format_data(inpx,inpy,wts,shuffle_flag=True,drop_data=True,data_size=1000,test_fraction=0.25):
    ''' Prepare arrays for a train/test run: optional shuffle, optional truncation, then split.

    arguments:
        shuffle_flag  : if true, shuffle the three arrays with f_shuffle_data
        drop_data     : if true, keep only the first data_size samples (f_drop_data)
        data_size     : number of samples kept when drop_data is true
        test_fraction : fraction of the samples put into the test set (f_split_data)
    returns : train_x,train_y,train_wts,test_x,test_y,test_wts

    Note: a later cell of this notebook defines another f_format_data(inpy,wts,test_fraction),
    which replaces this one once that cell has been run.
    '''
    arrays=(inpx,inpy,wts)

    # Shuffle data
    if shuffle_flag:
        arrays=f_shuffle_data(*arrays)

    # Drop data
    if drop_data:
        arrays=f_drop_data(arrays[0],arrays[1],arrays[2],data_size)

    inpx,inpy,wts=arrays

#     print(inpy[inpy==0.0].shape,inpy[inpy>0.0].shape,inpy.shape)

#     # Plot data
#     plt.figure()
#     plt.plot(inpy[:],linestyle='',marker='*',markersize=1)
#     plt.title("Plot of y data after shuffle")
#     plt.show()

    # Split data into train-test.
    pieces=f_split_data(inpx,inpy,wts,test_fraction)
    train_x,train_y,train_wts,test_x,test_y,test_wts=pieces

    print('Data sizes: train_x{0},train_y{1},test_x{2},test_y{3}'.format(train_x.shape,train_y.shape,test_x.shape,test_y.shape))

    return train_x,train_y,train_wts,test_x,test_y,test_wts


### Model details

In [6]:
### Defining all the models tried in the study

def f_define_model(inpx,name):
    '''
    Function that defines the model and compiles it.

    arguments:
        inpx : input array; only inpx.shape[1:] is used, as the shape of one sample
        name : string selecting the architecture, one of '1','2','3','4','5'
    returns : compiled keras model (Adam with lr=0.001, binary cross-entropy loss,
              accuracy metric)
    raises  : ValueError if name is not a known architecture

    Every architecture is a stack of [Conv3D -> MaxPooling3D (-> Dropout)] blocks,
    then Flatten, one Dense hidden layer (-> Dropout) and a single sigmoid output.
    '''

    # One entry per architecture tried in the study. A dropout of None means
    # that the corresponding Dropout layer is not added at all.
    model_specs={
        '1':dict(conv_sizes=[10,10,10],kernel_size=(3, 3, 3), pool_size=(2, 2, 2),conv_dropout=None,dense_size=10, dense_dropout=None),
        '2':dict(conv_sizes=[10,10,10],kernel_size=(3, 3, 3), pool_size=(2, 2, 2),conv_dropout=0.5, dense_size=64, dense_dropout=0.5),
        '3':dict(conv_sizes=[6,6,6],   kernel_size=(3, 3, 3), pool_size=(2, 2, 2),conv_dropout=0.5, dense_size=64, dense_dropout=0.5),
        '4':dict(conv_sizes=[6,6,6],   kernel_size=(3, 3, 3), pool_size=(2, 2, 2),conv_dropout=0.5, dense_size=120,dense_dropout=0.5),
        '5':dict(conv_sizes=[6,6],     kernel_size=(2, 4, 15),pool_size=(3, 3, 3),conv_dropout=0.5, dense_size=120,dense_dropout=0.5),
    }

    # Fail early and clearly: previously an unknown name left `outputs` undefined
    # and the function crashed later with an unrelated-looking error.
    if name not in model_specs:
        raise ValueError("Unknown model name %r in f_define_model; expected one of %s"%(name,sorted(model_specs)))
    spec=model_specs[name]
    print("model %s"%name)

    inputs = layers.Input(shape=inpx.shape[1:])
    h = inputs

    # Convolutional layers
    conv_args = dict(kernel_size=spec['kernel_size'], activation='relu', padding='same')
    for conv_size in spec['conv_sizes']:
        h = layers.Conv3D(conv_size, **conv_args)(h)
        h = layers.MaxPooling3D(pool_size=spec['pool_size'])(h)
        if spec['conv_dropout'] is not None:
            h = layers.Dropout(spec['conv_dropout'])(h)
    h = layers.Flatten()(h)

    # Fully connected  layers
    h = layers.Dense(spec['dense_size'], activation='relu')(h)
    if spec['dense_dropout'] is not None:
        h = layers.Dropout(spec['dense_dropout'])(h)

    # Output layer
    outputs = layers.Dense(1, activation='sigmoid')(h)

    ############################################
    ####### Compile model ######################
    ############################################

    model = models.Model(inputs, outputs)
    model.compile(optimizer=optimizers.Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])
#     model.summary()

    return model


### Train and perform fit

In [27]:

def f_train_model(model,inpx,inpy,batch_size=32,epochs=5,cv_fraction=0.33):
    '''
    Train model. Returns just history.history

    arguments:
        model       : compiled keras model; its fit() and count_params() are called
        inpx,inpy   : training inputs and labels
        batch_size  : passed to model.fit as batch_size (default 32)
        epochs      : passed to model.fit as epochs (default 5)
        cv_fraction : fraction of data for cross validation, passed to model.fit
                      as validation_split (default 0.33)
    returns : the `history` attribute of the object returned by model.fit

    The defaults are the values that used to be hard-coded, so existing
    three-argument calls train exactly as before.
    '''

    history=model.fit(x=inpx, y=inpy,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
#                     callbacks = [callbacks.ModelCheckpoint('./rpv_weights.h5')],
                    validation_split=cv_fraction,
                    shuffle=True
                )

    print("Number of parameters",model.count_params())

    return history.history

def f_plot_learning(history):
    '''
    Plot the learning curves stored in a keras history dictionary.

    arguments:
        history : dictionary of per-epoch lists with keys 'loss' and 'val_loss'
                  plus either 'acc'/'val_acc' or 'accuracy'/'val_accuracy'
    Two figures are drawn: training and validation accuracy, then training and
    validation loss. A missing key raises KeyError.
    '''
    # Histories saved by this notebook use 'acc'; newer keras releases record the
    # same metric as 'accuracy'. Prefer the old key and fall back to the new one.
    acc_key='acc' if 'acc' in history else 'accuracy'
    val_acc_key='val_acc' if 'val_acc' in history else 'val_accuracy'

    # (title, y-axis label, training key, validation key) for each figure.
    panels=[('Model accuracy','Accuracy',acc_key,val_acc_key),
            ('Model loss','Loss','loss','val_loss')]

    for title,ylabel,train_key,val_key in panels:
        plt.figure()
        plt.plot(history[train_key],label='Train')
        plt.plot(history[val_key],label='Validation')
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('Epoch')
        plt.legend(loc='upper left')
        plt.show()

def f_plot_roc_curve(fpr,tpr):
    '''
    Draw the ROC curve and print the area under it.

    tpr is scatter-plotted against fpr on a logarithmic x axis; afterwards the
    AUC computed from the same two arrays is printed.
    '''
    roc_fig=plt.figure()
    roc_axes=roc_fig.gca()
    # Points rather than a line (a line plot was tried before: plt.plot(fpr,tpr)).
    roc_axes.scatter(fpr,tpr)
    roc_axes.set_xscale('log')
    plt.show()

    # AUC
    print("AUC: ",auc(fpr, tpr))
    

def f_test_model(xdata,ydata,wts,model,model_name,model_save_dir,test_status=False):
    '''
    Get predictions for the test data and make the ROC plot.

    arguments:
        xdata,ydata,wts : test inputs, true labels and sample weights
        model           : model whose predict() is called when test_status is false
        model_name      : used to build the name of the prediction file
        model_save_dir  : prefix of the prediction file (prepended verbatim)
        test_status     : false -> predict now and store the predictions in the file;
                          true  -> read the predictions stored by an earlier test
    '''

    test_file_name=model_save_dir+'y-predict_model-'+str(model_name)+'.pred'

#     model.evaluate(xdata,ydata,sample_weights=wts,verbose=1)
    if test_status:
        # Load y_predictions from file.
        print("Using test prediction from previous test",test_file_name)
        y_pred=np.loadtxt(test_file_name)
    else:
        # Predict values and store to file.
        y_pred=model.predict(xdata,verbose=1)
        np.savetxt(test_file_name,y_pred)

#     print(y_pred)
    roc_arrays=roc_curve(ydata,y_pred,sample_weight=wts)
    fpr,tpr,threshold=roc_arrays
    print(fpr.shape,tpr.shape,threshold.shape)
    f_plot_roc_curve(fpr,tpr)


def f_perform_fit(train_x,train_y,train_wts,test_x,test_y,test_wts,model_dict,train_status=False,test_status=False):
    '''
    Compile, train, save and test the model named in model_dict.
    Steps:
    - Compile and train (skipped when train_status is true)
    - Save model and history (skipped when train_status is true)
    - Read model and history back from file
    - Print the summary and plot the learning curves
    - Test (f_test_model; test_status is passed through)

    Note: Cross-validation data is built into the training. So, train_{x/y} contains the training and cval data.
    train_wts is accepted but not used here.
    returns : model_dict, with its 'model' and 'history' entries filled in
    '''

    model_save_dir='/global/project/projectdirs/dasrepo/vpa/ice_cube/data_for_cnn/saved_models/'
    model_name=model_dict['name'] # string for the model
    model_path=model_save_dir+'model_{0}.h5'.format(model_name)
    history_path=model_save_dir+'history_{0}.pickle'.format(model_name)

    if train_status:
        print("Using trained model")
    else:
        # Not trained before: compile, train, then save model and history.
        fresh_model=f_define_model(train_x,model_name)
        fresh_history=f_train_model(fresh_model,train_x,train_y)

        fresh_model.save(model_path)
        with open(history_path, 'wb') as f:
            pickle.dump(fresh_history, f)

    ########################
    ### Read model and history (always from file, also right after training)

    ### Check if files exist
    assert os.path.exists(model_path),"Model not saved"
    assert os.path.exists(history_path),"History not saved"

    model=load_model(model_path)
    with open(history_path,'rb') as f:
        history= pickle.load(f)

    ########################
    model.summary()
    # Plot tested model
    f_plot_learning(history)

    ########################
    # Test model
    f_test_model(test_x,test_y,test_wts,model,model_dict['name'],model_save_dir,test_status)

    model_dict['model']=model
    model_dict['history']=history

    return model_dict



## Execution starts

In [15]:
def f_get_ydata_and_wts(data_dir,f1,f2):
    ''' Load extracted data from files. Just extracting ydata and weights
    arguments:
        data_dir : string prepended verbatim to both file names
        f1       : file name (without '.npy') of the y-data array
        f2       : file name (without '.npy') of the weights array
    returns : inpy,weights as arrays
    '''

    inpy=np.load(data_dir+f1+'.npy')
    # Weights come from the second file (they used to be read from f1 by mistake,
    # which returned the labels twice).
    wts=np.load(data_dir+f2+'.npy')

    return(inpy,wts)
    

def f_plot_fit(inpy,wts,model_dict):
    '''
    Plot fit results of a previously trained and tested model.

    arguments:
        inpy       : true labels of the test data
        wts        : sample weights of the test data
        model_dict : dictionary whose 'name' entry selects the saved model
    returns : model_dict, with its 'model' and 'history' entries filled in

    The saved model, its training history and the stored test predictions are
    read from model_save_dir; nothing is trained or predicted here.
    '''

    model_save_dir='/global/project/projectdirs/dasrepo/vpa/ice_cube/data_for_cnn/saved_models/'
    model_name=model_dict['name'] # string for the model
    fname_model,fname_history='model_{0}.h5'.format(model_name),'history_{0}.pickle'.format(model_name)

    ########################
    ### Read model and history

    ### Check if files exist
    assert os.path.exists(model_save_dir+fname_model),"Model not saved"
    assert os.path.exists(model_save_dir+fname_history),"History not saved"

    model=load_model(model_save_dir+fname_model)
    # The history pickle is written by this notebook; do not point this at untrusted files.
    with open(model_save_dir+fname_history,'rb') as f:
        history= pickle.load(f)

    ########################
    model.summary()
    # Plot tested model
    f_plot_learning(history)

    ########################
    # Get test predictions

    # test_file_name already contains model_save_dir, so it is checked as is
    # (prepending the directory a second time made this check always fail).
    test_file_name=model_save_dir+'y-predict_model-'+str(model_name)+'.pred'
    assert os.path.exists(test_file_name),"y-preditions not saved"
    print("Using test prediction from previous test",test_file_name)
    y_pred=np.loadtxt(test_file_name)
    # Compare against the labels passed in, not against notebook globals.
    assert(inpy.shape==y_pred.shape),"Data %s and prediction arrays %s are not of the same size"%(inpy.shape,y_pred.shape)

    fpr,tpr,threshold=roc_curve(inpy,y_pred,sample_weight=wts)
    print(fpr.shape,tpr.shape,threshold.shape)
    f_plot_roc_curve(fpr,tpr)

    model_dict['model'],model_dict['history']=model,history

    return model_dict


def f_format_data(inpy,wts,test_fraction):
    ''' Return only the test part of the y-data and weights.

    The last int(test_fraction*N) entries of each array are returned, N being
    the length of inpy. No shuffling is done here.
    Note: this definition replaces the earlier f_format_data(inpx,inpy,wts,...)
    of this notebook once this cell has been run.
    returns : test_y,test_wts
    '''
    total=inpy.shape[0]
    # Index of the first test entry.
    first_test=total-int(test_fraction*total)

    return inpy[first_test:],wts[first_test:]

In [10]:
if __name__=='__main__':
    ###Extract data : Only extract y-data and weights for tests.
    # The x-data are not loaded in this cell; only labels and weights are read.
    data_dir='/global/project/projectdirs/dasrepo/vpa/ice_cube/data_for_cnn/extracted_data_v/data/'
    # File names (without '.npy') of the label array and of the weight array.
    f1,f2='processed_input_regular_y','processed_input_regular_wts'
    inpy,wts=f_get_ydata_and_wts(data_dir,f1,f2)
    # Keep the trailing 25% of the entries as test data (3-argument f_format_data).
    # NOTE(review): no shuffle is applied here, while the training-time f_format_data
    # shuffles by default -- confirm the stored predictions match this ordering.
    test_y,test_wts=f_format_data(inpy,wts,test_fraction=0.25)


In [13]:
# Sanity check: lengths of the full label/weight arrays and of their test slices.
print(inpy.shape,wts.shape,test_y.shape,test_wts.shape)

(136066,) (136066,) (34016,) (34016,)


### Models

In [2]:
# Directory of the saved models (not referenced again within this cell).
model_loc='/global/project/projectdirs/dasrepo/vpa/ice_cube/data_for_cnn/saved_models/'
# Meant to collect one dict per model; stays empty while the append below is commented out.
dict_list=[]
# for i in range(1,6):
# Currently restricted to model '1'; the commented line above covers models '1' to '5'.
for i in range(1,2):
    # Skeleton entry: 'model' and 'history' would be filled in by f_plot_fit.
    model_dict={'name':str(i),'description':None,'model':None,'history':None}
    print(i,model_dict)
#     model_dict=f_plot_fit(test_y,test_wts,model_dict)
#     dict_list.append(model_dict)

1 {'name': '1', 'description': None, 'model': None, 'history': None}


### Comparing models

In [33]:
### Comparing different models:

# Print every entry of each model dictionary, one model after the other.
# NOTE(review): model_dict1 ... model_dict5 are not assigned in any visible cell of
# this notebook -- confirm where they come from before running on a fresh kernel.
for num,md in enumerate([model_dict1,model_dict2,model_dict3,model_dict4,model_dict5]):
    # `hist` is the whole model dictionary, not only its 'history' entry.
    hist=md
#     print(md)
    print('Model %s'%(num+1))
    for key in hist.keys():
        print(key,hist[key])
        

Model 1
('model', <keras.engine.training.Model object at 0x2b6b2716ea50>)
('history', {'acc': [0.8359440129893263, 0.8894885408008836, 0.904977110848721, 0.912026677198011, 0.9179646936665], 'loss': [0.4940939987963115, 0.31283381096304425, 0.2737800265921407, 0.24922132658353088, 0.22983019208651087], 'val_acc': [0.8675950945784723, 0.8965762983674083, 0.9038809870255546, 0.9076224129263061, 0.916500875970532], 'val_loss': [0.3586907645015842, 0.287272262804921, 0.2866968094362769, 0.24678741439057064, 0.23340108330942097]})
('name', '1')
('description', 'simplest')
Model 2
('model', <keras.engine.training.Model object at 0x2b6b27902750>)
('history', {'acc': [0.7970105158495139, 0.8128354759946004, 0.822751670982107, 0.832258347597707, 0.8416187676448132], 'loss': [1.4162641630844435, 0.47639839410129786, 0.44579309258209165, 0.4209624629782049, 0.40285478480902426], 'val_acc': [0.8058615672452775, 0.8069899337862402, 0.8100187071330347, 0.8137898268883573, 0.8116221753754553], 'val_l

In [15]:
# model_dict1

### Re-plot

In [16]:
# # Re-plot
# m=model_dict1
# f_plot_learning(m['history'])
# f_test_model(test_x,test_y,m['model'])

## -----------------------------------------------

### Questions:
- Why are fpr and tpr different for 2 different models?


#### Notes:
- model.fit 
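    - batch_size= number of samples used for each gradient update (a subset of the full training set). 
    - epochs= number of passes over the full training data
    - callbacks= list of callback objects invoked during training (e.g. ModelCheckpoint to save weights)
    
- layers.Input needs a shape of (x,y,z,1) in channels_last mode.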

#### Roc curve notes:
- We know the true y-value of each event, depending on whether it is signal or background (0 or 1).
- The 3D-CNN gives us a prediction for y, as a float between 0 and 1.
- We must use a cut (threshold) to determine what constitutes 0 / 1, e.g. 0.5.
- For a given threshold, this gives us a false positive rate and a true positive rate (fpr and tpr).
- The ROC curve plots tpr against fpr as the threshold is varied.
- AUC gives the area under this curve.

In [17]:
# Plotting weights
# NOTE(review): train_wts is not assigned in any visible cell of this notebook -- confirm
# it is available before running this cell on a fresh kernel.
print(train_wts.shape,test_wts.shape)

# One figure per data set: train (+ cross-validation) data first, then test data.
for weights,title in ((train_wts,"train + cv data weigts "),(test_wts,"test data weights")):
    plt.figure()
    plt.plot(weights)
    plt.title(title)
    plt.show()



((102050,), (34016,))


FigureCanvasNbAgg()

FigureCanvasNbAgg()

## To do
- pick the best model
- test on reserve data set
- running with multiple cores on a batch node.
- using multiple nodes
- using GPU nodes
- Test a host of models using ipyparallel
- make changes to incorporate regular data in training and reserved data in testing
- way to store tested values for easy plotting

## Notes:
- Code to 
    - show model summary and plots.
    - just view existing plots in files.