### This notebook implements a custom `GridSearchCV` on the ELG dataset, including resampling of the training and test sets at each CV split.

In [1]:
#import keras
#from keras.models import Sequential
#from keras.layers import Dense, Dropout, Activation
#from keras.optimizers import SGD
from imblearn.over_sampling import SMOTE
import numpy as np
#import jplus
import class_tools as tools 
from sklearn.preprocessing import StandardScaler 
import os
import tensorflow as tf
# The following removes deprecation messages from tensorflow
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
#tf.debugging.set_log_device_placement(True)






Using TensorFlow backend.


In [2]:
#from keras import metrics
from sklearn import metrics


In [3]:
# Load the training database.

#DataDir = './data/'
DataDir = '.'
import pickle
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files from a trusted source.
# The context manager closes the file handle as soon as loading is done.
# encoding='latin1' is presumably needed because the file was pickled under
# Python 2 -- TODO confirm.
with open('%s/training_full.data'%DataDir,mode='rb') as fin:
    dset = pickle.load(fin,encoding='latin1')
print (dset.keys())
ngal = len(dset['class'])
print (ngal)
# Keep a copy of the original data points. dict.copy() is shallow: the values
# stored under each key are shared with `dset`, so in-place changes to them
# would show up in both.
dset_original = dset.copy()


dict_keys(['rSDSS', 'iSDSS', 'obj', 'gSDSS', 'J0395', 'zSDSS', 'J0378', 'J0430', 'uSDSS', 'dm_j0660', 'J0660', 'J0410', 'J0515', 'J0861', 'class'])
751


In [4]:
def default_pars():
    """Return the default hyper-parameters of the neural network.

    Returns
    -------
    dict
        Mapping with the keys 'optimizer', 'activation', 'dropout', 'nfeat',
        'epochs', 'batch_size' and 'neurons'. 'neurons' is a list holding the
        width of each hidden layer. A fresh dict is built on every call, so
        callers can modify the result without affecting later calls.
    """
    nf = 20
    return {
        'optimizer': 'adam',
        'activation': 'sigmoid',
        'dropout': 0.0,
        'nfeat': nf,
        'epochs': 10,
        'batch_size': 32,
        # Integer division: layer widths must be ints, and nf / 2 would give
        # the float 10.0 on Python 3.
        'neurons': [nf // 2, nf // 2],
    }

def collect_pars(pars):
    """Build the full parameter set for one network configuration.

    Every key known to ``default_pars`` appears in the result: the value is
    taken from ``pars`` when the key is present there, and from the defaults
    otherwise. Keys of ``pars`` that are not known defaults are ignored.

    Parameters
    ----------
    pars : dict
        User-supplied parameter overrides.

    Returns
    -------
    dict
        Parameter name -> value to use.
    """
    defaults = default_pars()
    return {
        name: (pars[name] if name in pars else fallback)
        for name, fallback in defaults.items()
    }
#activation, dropout, nfeat, epochs, batch_size,neurons


In [5]:
from sklearn.metrics import f1_score
def gridsearchcv_elgs(pars, kfold, ith, index, importance, inames, verbose=False):
    """Train and score one network configuration on one cross-validation split.

    The global data set ``dset`` is split with ``tools.CV_split``, both parts
    are resampled with ``tools.resample_errors``, a dense network is trained
    on the (standardised) training part and evaluated on the test part.

    Parameters
    ----------
    pars : dict
        Hyper-parameters of this grid point; keys that are missing are filled
        in by ``collect_pars``.
    kfold : int
        Number of cross-validation splits.
    ith : int
        Split selector handed to ``tools.CV_split`` (presumably the index of
        the held-out fold -- confirm against ``class_tools``).
    index : sequence of int
        Feature indices; the first ``nfeat`` entries select the input columns
        (presumably ordered by importance -- confirm against the caller).
    importance, inames
        Not used by this function; kept so existing callers keep working.
    verbose : bool, optional
        If True, print the sizes of the train and test parts before resampling.

    Returns
    -------
    float
        F1 score of class 2 on the resampled test part.
    """
    n_classes = 4      # must match the width of the softmax output layer
    target_class = 2   # class whose F1 score is reported

    pv = collect_pars(pars)
    nr = 1000
    nrt = int(nr/kfold)

    train, test = tools.CV_split(dset, kfold, ith)
    if verbose:
        # The values must be applied with '%', otherwise the placeholders are
        # printed literally.
        print('Original trainset contains %d, test contains %d'
              % (len(train['class']), len(test['class'])))
    train_rs = tools.resample_errors(train, nr=nr, balance_set=True)
    test_rs = tools.resample_errors(test, nr=nrt, balance_set=True)

    feat_train, _, _ = tools.get_features(train_rs)
    feat_test, _, _ = tools.get_features(test_rs)
    y_train = [tools.class_to_int(x) for x in train_rs['class']]
    y_test = [tools.class_to_int(x) for x in test_rs['class']]

    # Free the intermediate sets before the model is built.
    del train, test, train_rs, test_rs

    feats_id = index[0:pv['nfeat']]
    x_train = [np.array(x)[feats_id] for x in feat_train]
    x_test = [np.array(x)[feats_id] for x in feat_test]

    # Use a fixed one-hot width so the targets always match the output layer,
    # even when a class happens to be absent from one of the splits.
    y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=n_classes)
    y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=n_classes)

    # Standardise with statistics of the training part only.
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train_sc = scaler.transform(x_train)
    x_test_sc = scaler.transform(x_test)

    layers = []
    for i, width in enumerate(pv['neurons']):
        dense_kwargs = {'activation': pv['activation']}
        if i == 0:
            # Only the first layer needs the input size; take it from the
            # data actually fed to the network.
            dense_kwargs['input_dim'] = x_train_sc.shape[1]
        layers.append(tf.keras.layers.Dense(int(width), **dense_kwargs))
        layers.append(tf.keras.layers.Dropout(pv['dropout']))
    layers.append(tf.keras.layers.Dense(n_classes, activation='softmax'))

    model = tf.keras.Sequential(layers)
    model.compile(loss='categorical_crossentropy',
                  optimizer=pv['optimizer'])

    model.fit(x_train_sc, y_train_cat,
              shuffle=True,
              verbose=0,
              epochs=pv['epochs'],
              batch_size=pv['batch_size'],
              validation_data=(x_test_sc, y_test_cat)
              )

    pred_test = np.argmax(model.predict(x_test_sc), axis=1)

    # Ask for the target class explicitly: indexing the per-class array
    # returned without `labels` picks the wrong class (or fails) whenever a
    # lower-numbered class is missing from both truth and prediction.
    score_test = f1_score(np.asarray(y_test), pred_test,
                          labels=[target_class], average=None)
    return score_test[0]

In [6]:
#define grid
import itertools

# Hyper-parameter grid: every combination of the listed values is one grid point.
grid = {'activation':['relu','sigmoid'],
        'optimizer':['adam','sgd'],
        'dropout':[0.1],#,0.2],
        'nfeat':[20],
        'epochs':[10,20,50],
        'batch_size':[64],
        'neurons':[
            [10,10],
            [20,20]
        ]
}

# define other parameters

kfold = 5 #k-fold CV
score_func = metrics.f1_score

params = list(grid.keys())
nparams = len(params)

# Total number of grid points = product of the number of values per parameter.
ngridpoints = 1
for key in params:
    ngridpoints *= len(grid[key])

print ('There are %d grid points to evaluate.'%ngridpoints)
scores = np.zeros(ngridpoints)

# Each grid point is a tuple holding, for every parameter (in the order of
# `params`), the position of the chosen value inside grid[parameter].
# Unpacking a generator works for any number of parameters.
prod = itertools.product(*(range(len(grid[par])) for par in params))

# Materialise the grid points so they can be indexed when the grid is split
# into chunks.
prodlist = list(prod)

There are 24 grid points to evaluate.


In [7]:
import multiprocessing as mp


# Number of worker processes, and a module-level counter initialised to zero.
nproc, ik = 2, 0

# Grid points per process; true division, so this may be fractional.
npc = len(prodlist) / nproc

if npc < 1:
    # Fewer grid points than processes: one point per process.
    nproc, npc = ngridpoints, 1
    print ('Changed nproc=%d and npc=%d'%(nproc, npc))

print (npc)
def run_conf(ip):
    """Evaluate one contiguous chunk of the hyper-parameter grid.

    The grid points in ``prodlist`` are divided into ``nproc`` chunks of about
    ``npc`` points each; this call handles chunk number ``ip``. The last chunk
    runs up to the end of ``prodlist``. Every grid point is scored with
    ``kfold`` calls to ``gridsearchcv_elgs`` and the mean score is stored.

    Parameters
    ----------
    ip : int
        Index of the chunk to process, ``0 <= ip < nproc``.

    Returns
    -------
    dict
        ``'score'`` -> list with the mean CV score of each grid point in the
        chunk; every parameter name in ``params`` -> list of the values used,
        aligned with ``'score'``.
    """
    core_config = tf.compat.v1.ConfigProto()
    core_config.gpu_options.allow_growth = True
    session = tf.compat.v1.Session(config=core_config)
    try:
        results = {'score': []}
        for par in params:
            results[par] = []

        i0 = int(npc * ip)
        # The last process scans grid points until the end.
        i1 = int(npc * (ip + 1)) if ip != nproc - 1 else len(prodlist)

        if i1 > i0:
            # The feature ranking is computed on the full data set and does
            # not depend on the grid point, so do it once per chunk.
            feat_train, _, fname_train = tools.get_features(dset)
            y_train = [tools.class_to_int(x) for x in dset['class']]
            index, importance, inames = tools.feature_importance(
                feat_train, y_train, fname_train)

        for iz in range(i0, i1):
            z = prodlist[iz]

            # Translate the tuple of value positions into actual values.
            # A separate loop variable is used so the chunk index `ip` is
            # not overwritten.
            parval = {}
            for jp, par in enumerate(params):
                parval[par] = grid[par][z[jp]]
                results[par].append(parval[par])

            scores_cv = [
                gridsearchcv_elgs(parval, kfold, kf, index, importance, inames)
                for kf in range(kfold)  # one evaluation per CV split
            ]
            results['score'].append(np.mean(scores_cv))
            print(z,'->','%.3f'%results['score'][-1])
        return results
    finally:
        # Close the session even if an evaluation raised.
        session.close()

12.0


In [None]:
RunInParallel = False

if RunInParallel:
#    mp.set_start_method('spawn', force=True)
    # The context manager shuts the pool down even if a worker raises; all
    # results are fetched before the block is left.
    with mp.Pool(processes=nproc) as pool:
        res = [pool.apply_async(run_conf, args=(x,)) for x in range(nproc)]
        sc = [p.get() for p in res]
else:
    # Serial run: process every chunk in turn so the whole grid is covered
    # and the results are kept.
    sc = [run_conf(x) for x in range(nproc)]

# Flatten the per-chunk score lists into one list over all grid points.
scores = [x for chunk in sc for x in chunk['score']]

if scores:
    id_best = int(np.argmax(scores))
    # `id_best` indexes the flattened list, so the per-chunk parameter lists
    # must be flattened in the same order before looking up the winner.
    best_pars = {par: [v for chunk in sc for v in chunk[par]][id_best]
                 for par in params}
    print (' Best configuration found ',best_pars)
    print ('Scores:',scores[id_best])

(0, 0, 0, 0, 0, 0, 0) -> 0.729


In [None]:
# Print the installed TensorFlow version.
# tensorflow is already imported in the first cell; repeating the import here
# lets this cell run on its own.
import tensorflow as tf
print (tf.__version__)


In [None]:
# Check whether TensorFlow can see a CUDA GPU.
# NOTE(review): tf.test.is_gpu_available is deprecated in recent TF 2.x
# releases in favour of tf.config.list_physical_devices('GPU') -- confirm
# against the installed version before switching.
tf.test.is_gpu_available(cuda_only=True)

In [None]:
# Turn on logging of device placement decisions.
# NOTE(review): this is the last cell, so it presumably only affects
# operations created afterwards -- move it to the top to log the grid search.
tf.debugging.set_log_device_placement(True)