In [29]:
from CustomTextCNN import CustomTextCNN
from utils import *
from sklearn.model_selection import KFold

# Read the raw sentences together with their labels.
x_text, y = load_data_and_labels()

# Fit the vocabulary. The processor is sized by the largest space-split
# token count found in any single document.
max_document_length = max(len(sentence.split(" ")) for sentence in x_text)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))


# Encode the two pathway placeholder tokens and keep their first two ids
# for reference.
encoded_placeholders = list(vocab_processor.transform(['pathwayA pathwayB']))[0]
encodedPathwayA, encodedPathwayB = encoded_placeholders[:2]
print("encodedPathwayA = %s" % encodedPathwayA, "encodedPathwayB = %s" % encodedPathwayB)

# Per-sample auxiliary features; they are indexed by fold alongside x and y.
word_distancesA = load_word_distancesA()
word_distancesB = load_word_distancesB()

pos_embedding = load_pos_embedding()


def perform_cross_validation(experiment="default",
                             embedding_size=128, filter_sizes=None,
                             num_filters=128, batch_size=64,
                             l2_reg_lambda=0.0, num_epochs=20,
                             include_word_embedding=True,
                             include_position_embedding=True,
                             include_pos_embedding=True,
                             n_splits=4, random_state=5,
                             evaluate_every=300):
    """Train one CustomTextCNN per fold of a shuffled K-fold split.

    The function reads the notebook-level globals ``x``, ``y``,
    ``word_distancesA``, ``word_distancesB``, ``pos_embedding`` and
    ``vocab_processor``; the setup cell must have been run first.

    Args:
        experiment: Prefix for each fold's results directory, which is named
            ``"<experiment>_fold_<k>"``.
        embedding_size: Forwarded to ``CustomTextCNN``.
        filter_sizes: Iterable of convolution filter sizes. ``None`` (the
            default) means ``[3, 4, 5]``. A fresh list is handed to each
            model so no list object is shared between calls or folds.
        num_filters: Forwarded to ``CustomTextCNN``.
        batch_size: Forwarded to ``CustomTextCNN``.
        l2_reg_lambda: Forwarded to ``CustomTextCNN``.
        num_epochs: Forwarded to ``CustomTextCNN``.
        include_word_embedding: Forwarded as ``word_embedding``.
        include_position_embedding: Forwarded as ``position_embedding``.
        include_pos_embedding: Forwarded as ``pos_embedding``.
        n_splits: Number of folds for ``KFold``.
        random_state: Seed for the shuffled ``KFold`` split, so the same
            folds are produced on every call.
        evaluate_every: Forwarded to ``CustomTextCNN``.

    Returns:
        None. Each fold's model is trained via ``train_network`` and then
        discarded.
    """
    # Resolve the default here rather than in the signature so that a mutable
    # list is never shared between calls.
    filter_sizes = [3, 4, 5] if filter_sizes is None else list(filter_sizes)

    # Creating folds
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    for k, (train_index, dev_index) in enumerate(kf.split(x, y)):
        x_train, x_dev = x[train_index], x[dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        # Auxiliary inputs are sliced with the same indices as x and y.
        train_word_distancesA = word_distancesA[train_index]
        train_word_distancesB = word_distancesB[train_index]

        dev_word_distancesA = word_distancesA[dev_index]
        dev_word_distancesB = word_distancesB[dev_index]

        train_pos_embedding = pos_embedding[train_index]
        dev_pos_embedding = pos_embedding[dev_index]

        print("Starting Fold: %s =>" % k, "Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

        model = CustomTextCNN(sequence_length=x_train.shape[1],
                              vocab_processor=vocab_processor,
                              num_epochs=num_epochs,
                              embedding_size=embedding_size,
                              # Copy per fold so a model cannot alter the
                              # sizes seen by later folds.
                              filter_sizes=list(filter_sizes),
                              num_filters=num_filters,
                              batch_size=batch_size,
                              l2_reg_lambda=l2_reg_lambda,
                              evaluate_every=evaluate_every,
                              results_dir=experiment+'_fold_%s'%k,
                              word_embedding=include_word_embedding,
                              position_embedding=include_position_embedding,
                              pos_embedding=include_pos_embedding)

        model.train_network(x_train, y_train, x_dev, y_dev,
                            train_word_distancesA, train_word_distancesB,
                            dev_word_distancesA, dev_word_distancesB,
                            train_pos_embedding, dev_pos_embedding)

Vocabulary Size: 33447
encodedPathwayA = 8 encodedPathwayB = 53


In [26]:
# Sweep over subsets of the convolution filter sizes; one full
# cross-validation run per subset.
print("Varying filter combinations")
filter_sets = [[3,4], [4,5], [3,5], [3], [4], [5]]
for sizes in filter_sets:
    experiment = "filter_sizes_%s" % '_'.join(map(str, sizes))
    print("Starting Experiment - %s \n\n\n" % experiment)
    perform_cross_validation(experiment=experiment, filter_sizes=sizes)

Varying filter combinations
Starting Experiment - filter_sizes_3_4 



Starting Fold: 0 => Train/Dev split: 31795/10599


 #################### 


SEQUENCE LENGTH 273
BATCH SIZE 64
EMBEDDING SIZE 128
FILTER SIZES [3, 4]
NUMBER OF FILTERS 128
L2 REG LAMBDA 0.0
EPOCHS 20



RESULT DIR filter_sizes_3_4_fold_0
VOCAB SIZE 33447
DROPOUT PROBABILITY 0.5



WORD EMBEDDING True
POSITION EMBEDDING True
POS EMBEDDING True


 #################### 


INFO:tensorflow:Summary name word_embedding/W:0/grad/hist is illegal; using word_embedding/W_0/grad/hist instead.
INFO:tensorflow:Summary name word_embedding/W:0/grad/sparsity is illegal; using word_embedding/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/hist is illegal; using conv-maxpool-3/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/sparsity is illegal; using conv-maxpool-3/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/hist is illegal; using conv-maxpo

KeyboardInterrupt: 

In [27]:
# Sweep over the number of filters; one full cross-validation run per value.
print("Varying number of filter")
filter_numbers = [32, 64, 128, 256]
for filter_count in filter_numbers:
    experiment = "num_filter_%s" % filter_count
    print("Starting Experiment - %s \n\n\n" % experiment)
    perform_cross_validation(experiment=experiment, num_filters=filter_count)

Varying number of filter
Starting Experiment - num_filter_32 



Starting Fold: 0 => Train/Dev split: 31795/10599


 #################### 


SEQUENCE LENGTH 273
BATCH SIZE 64
EMBEDDING SIZE 128
FILTER SIZES [3, 4, 5]
NUMBER OF FILTERS 32
L2 REG LAMBDA 0.0
EPOCHS 20



RESULT DIR num_filter_32_fold_0
VOCAB SIZE 33447
DROPOUT PROBABILITY 0.5



WORD EMBEDDING True
POSITION EMBEDDING True
POS EMBEDDING True


 #################### 


INFO:tensorflow:Summary name word_embedding/W:0/grad/hist is illegal; using word_embedding/W_0/grad/hist instead.
INFO:tensorflow:Summary name word_embedding/W:0/grad/sparsity is illegal; using word_embedding/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/hist is illegal; using conv-maxpool-3/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/sparsity is illegal; using conv-maxpool-3/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/hist is illegal; using conv-maxpool-3/b_

KeyboardInterrupt: 

In [28]:
# Sweep over the training batch size; one full cross-validation run per value.
print("Varying batch size")
batch_sizes = [32, 64, 128, 256]
for size in batch_sizes:
    experiment = "batch_size_%s" % size
    print("Starting Experiment - %s \n\n\n" % experiment)
    perform_cross_validation(experiment=experiment, batch_size=size)

Varying batch size
Starting Experiment - batch_size_32 



Starting Fold: 0 => Train/Dev split: 31795/10599


 #################### 


SEQUENCE LENGTH 273
BATCH SIZE 32
EMBEDDING SIZE 128
FILTER SIZES [3, 4, 5]
NUMBER OF FILTERS 128
L2 REG LAMBDA 0.0
EPOCHS 20



RESULT DIR batch_size_32_fold_0
VOCAB SIZE 33447
DROPOUT PROBABILITY 0.5



WORD EMBEDDING True
POSITION EMBEDDING True
POS EMBEDDING True


 #################### 


INFO:tensorflow:Summary name word_embedding/W:0/grad/hist is illegal; using word_embedding/W_0/grad/hist instead.
INFO:tensorflow:Summary name word_embedding/W:0/grad/sparsity is illegal; using word_embedding/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/hist is illegal; using conv-maxpool-3/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/sparsity is illegal; using conv-maxpool-3/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/hist is illegal; using conv-maxpool-3/b_0/gra

KeyboardInterrupt: 