### This notebook implements a custom `GridSearchCV` on the ELG dataset, including resampling of the training set at each CV split

In [1]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from imblearn.over_sampling import SMOTE
import numpy as np
#import jplus
import class_tools as tools 
from sklearn.preprocessing import StandardScaler 


Using TensorFlow backend.
  utils.DeprecatedIn23,


In [2]:
#from keras import metrics
from sklearn import metrics


In [3]:
#load databases

#DataDir = './data/'
DataDir = '.'
import pickle
dset = pickle.load(open('%s/training_full.data'%DataDir))
print dset.keys()
ngal = len(dset['class'])
print ngal
dset_original = dset.copy() # keeping a copy of original datapoints for 


['rSDSS', 'iSDSS', 'obj', 'gSDSS', 'J0395', 'zSDSS', 'J0378', 'J0430', 'uSDSS', 'dm_j0660', 'J0660', 'J0410', 'J0515', 'J0861', 'class']
751


In [4]:
def collect_pars(pars):
    # collecting parameters
    if 'activation' in pars:
        activation = pars['activation']
    else:
        activation = 'sigmoid'
        
    if 'dropout' in pars:
        dropout = pars['dropout']
    else:
        dropout = 0.0
    
    if 'nfeat' in pars:
        nfeat = pars['nfeat']
    else:
        nfeat = 20
    
    if 'epochs' in pars:
        epochs = pars['epochs']
    else:
        epochs = 10

    return activation, dropout, nfeat, epochs


In [5]:
from sklearn.metrics import f1_score
def gridsearchcv_elgs(pars, kfold, ith, index, importance, inames):

    activation, dropout, nfeat, epochs = collect_pars(pars)    
    train, test = tools.CV_split(dset, kfold,ith)
    train_rs = tools.resample_errors(train, nr=2500,balance_set=True)
    feat_train, err_train, fname_train = tools.get_features(train_rs)
    feat_test, err_test, fname_test    = tools.get_features(test)
    y_train = [tools.class_to_int(x) for x in train_rs['class']]
    y_test = [tools.class_to_int(x) for x in test['class']]
    
    feats_id = index[0:nfeat]

    x_train = [np.array(x)[feats_id] for x in feat_train]
    x_test = [np.array(x)[feats_id] for x in feat_test]

    y_train_cat = keras.utils.to_categorical(y_train, num_classes=len(np.unique(y_train)))
    y_test_cat = keras.utils.to_categorical(y_test, num_classes=len(np.unique(y_test)))


    Scaledata = True
    if Scaledata:
        scaler = StandardScaler()
        scaler.fit(x_train)
        x_train_sc = scaler.transform(x_train)
        x_test_sc = scaler.transform(x_test)
        #x_test  = scaler.transform(x_test)

    model = Sequential()
    model.add(Dense(nfeat/2, activation=activation, input_dim=nfeat))
    model.add(Dropout(dropout))
    model.add(Dense(nfeat/2, activation=activation, input_dim=nfeat))
    model.add(Dense(4, activation='softmax'))

    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam')

    model.fit(x_train_sc, y_train_cat,
              shuffle=True,
              verbose=0,
              epochs=epochs,
              batch_size=200,
              validation_data=(x_test_sc, y_test_cat)
              )

    pred_te = model.predict(x_test_sc)
    pred_test = [np.argmax(x) for x in pred_te]
    y_new_test = [np.argmax(x) for x in y_test_cat]

    score_test = f1_score(y_new_test, pred_test,average=None)
    return score_test[2]

In [6]:
#define grid
import itertools

grid = {'activation':['relu','sigmoid'],
        'dropout':[0.0,0.2],
        'nfeat':[10,20],
        'epochs':[10,30]
}

# define other parameters

kfold = 5 #k-fold CV
score_func = metrics.f1_score


ngridpoints = 1
for key in grid.keys():
    ngridpoints *= len(grid[key])
    
print 'There are %d grid points to evaluate.'%ngridpoints
params = grid.keys()
nparams = len(grid.keys())
scores = np.zeros(ngridpoints)
    
if nparams == 3:
    prod = itertools.product(range(len(grid[params[0]])), range(len(grid[params[1]])), 
                             range(len(grid[params[2]])))
if nparams == 4:
    prod = itertools.product(range(len(grid[params[0]])), range(len(grid[params[1]])), 
                             range(len(grid[params[2]])), range(len(grid[params[3]])))
if nparams == 5:
    prod = itertools.product(range(len(grid[params[0]])), range(len(grid[params[1]])), 
                             range(len(grid[params[2]])), range(len(grid[params[3]])),
                             range(len(grid[params[4]])))
if nparams == 6:
    prod = itertools.product(range(len(grid[params[0]])), range(len(grid[params[1]])), 
                             range(len(grid[params[2]])), range(len(grid[params[3]])),
                             range(len(grid[params[4]])), range(len(grid[params[5]])))

    

prodlist = list(prod)

#ll = list(prod)
#print
#print len(ll)
#print ll[0]

# TODO: Now build the function that iterates over each grid parameter and evaluates the resulting NN.

There are 16 grid points to evaluate.


In [None]:
import multiprocessing as mp


nproc = 4
npc = len(prodlist)/nproc
ik = 0

def run_conf(ip):
    ik = 0
    results = {}
    for par in params:
        results[par] = []

    i0 = npc* ip
    i1 = npc* (ip+1) if ip != 3 else len(prodlist)-1 # last proc scans grid points until the end
    print i0, i1
    for iz in range(i0,i1):
        z = prodlist[iz]
        print z
        if ik == 0: # first thing to do is to define the original features pool set
            feat_train, err_train, fname_train = tools.get_features(dset)
            y_train = [tools.class_to_int(x) for x in dset['class']]
            index, importance, inames = tools.feature_importance(feat_train, y_train, fname_train)

        parval = {}
        for ip in range(nparams):
            par = params[ip]
            parval[par] = grid[par][z[ip]]
            results[par].append(parval[par])

        scores_cv = []
        for kf in range(kfold):  # create a CV split
            scores_cv.append(gridsearchcv_elgs(parval, kfold, kf, index, importance, inames))
        results['score'] = np.mean(scores_cv)
        ik += 1
    return results

pool = mp.Pool(processes=nproc)
res = [pool.apply_async(run_conf, args=(x,)) for x in range(nproc)]

In [11]:
sc = [p.get() for p in res]
scores = [x['score'] for x in sc]
id_best = np.argmin(scores)
best_pars = sc[id_best]
print best_pars

#res = run_conf(0)


print scores


    
    

{'epochs': [10, 10, 10, 10], 'activation': ['sigmoid', 'sigmoid', 'sigmoid', 'sigmoid'], 'score': 0.34926009139209285, 'dropout': [0.0, 0.0, 0.2, 0.2], 'nfeat': [10, 20, 10, 20]}
[0.40310770837157656, 0.34926009139209285, 0.4381112037132738, 0.3537178279799984]


In [8]:
print sc

[{'epochs': [10, 10, 10, 10], 'activation': ['relu', 'relu', 'relu', 'relu'], 'score': 0.40310770837157656, 'dropout': [0.0, 0.0, 0.2, 0.2], 'nfeat': [10, 20, 10, 20]}, {'epochs': [10, 10, 10, 10], 'activation': ['sigmoid', 'sigmoid', 'sigmoid', 'sigmoid'], 'score': 0.34926009139209285, 'dropout': [0.0, 0.0, 0.2, 0.2], 'nfeat': [10, 20, 10, 20]}, {'epochs': [30, 30, 30, 30], 'activation': ['relu', 'relu', 'relu', 'relu'], 'score': 0.4381112037132738, 'dropout': [0.0, 0.0, 0.2, 0.2], 'nfeat': [10, 20, 10, 20]}, {'epochs': [30, 30, 30], 'activation': ['sigmoid', 'sigmoid', 'sigmoid'], 'score': 0.3537178279799984, 'dropout': [0.0, 0.0, 0.2], 'nfeat': [10, 20, 10]}]
