In [1]:
import numpy as np
import pandas as pd
import re
import itertools
from collections import Counter
from tensorflow.contrib import learn
import pickle

In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Every character other than letters, digits, parentheses, comma,
    exclamation mark, question mark, apostrophe and backtick is replaced by a
    space. The English clitics 's, 've, n't, 're, 'd and 'll are split off the
    preceding word, and commas, exclamation marks, parentheses and question
    marks are padded with spaces so that they become separate tokens. Runs of
    whitespace are then collapsed to a single space.

    Args:
        string: Raw text to normalise.

    Returns:
        The cleaned text, stripped of surrounding whitespace and lower-cased.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Detach clitics so they become their own tokens.
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # The replacement argument is plain text, not a pattern: escaping the
    # punctuation there is an invalid string escape and makes re.sub emit a
    # literal backslash in front of the token.
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels(df=None):
    """Return the tokenized sentences and their two-column labels.

    Args:
        df: DataFrame with ``tokenizedSentenceFromPaper`` and ``label``
            columns. When omitted, the notebook-level ``sentence_support_df``
            is used, which matches the original zero-argument behaviour.

    Returns:
        A two-element list ``[x_text, y]``. ``x_text`` is the array of
        tokenized sentences. ``y`` is a NumPy array with one row per
        sentence: ``[0, 1]`` when the label equals 1, ``[1, 0]`` otherwise.
    """
    if df is None:
        df = sentence_support_df
    # Series.as_matrix() was removed in pandas 1.0; .values yields the same
    # ndarray on both old and current pandas versions.
    x_text = df.tokenizedSentenceFromPaper.values
    labels = df.label.values
    y = [[0, 1] if label == 1 else [1, 0] for label in labels]
    return [x_text, np.array(y)]

def compute_pathway_name_terms(pathway):
    """Split a pathway name into the lower-cased words that identify it.

    The generic words "signaling" and "pathway" are removed, hyphens are
    treated as spaces, and fragments of a single character are dropped.

    Args:
        pathway: Pathway name, e.g. "TGF-beta signaling pathway".

    Returns:
        List of lower-cased terms, e.g. ["tgf", "beta"].
    """
    # Lower-case first so capitalised spellings ("Signaling", "Pathway") are
    # stripped too instead of leaking through as generic match terms.
    pathway = pathway.lower().replace('signaling', '').replace('pathway', '').replace('-', ' ')
    return [t for t in pathway.split() if len(t) > 1]

def tokenize_pathway_names(sentence, pathwayA, pathwayB):
    """Replace words that mention either pathway with a placeholder token.

    The sentence is lower-cased and split on whitespace. A word is replaced
    by 'pathwayA' or 'pathwayB' when it contains (as a substring) a gene of
    that pathway from ``pathway_to_genes_dict`` or a term of the pathway's
    name from ``compute_pathway_name_terms``. A word matching both pathways
    becomes 'pathwayB'.

    Args:
        sentence: Text to tokenize.
        pathwayA: Key of the first pathway in ``pathway_to_genes_dict``.
        pathwayB: Key of the second pathway in ``pathway_to_genes_dict``.

    Returns:
        The lower-cased sentence with matching words replaced, joined by
        single spaces.

    Raises:
        KeyError: If either pathway is missing from ``pathway_to_genes_dict``.
    """
    # Empty gene names are skipped: '' is a substring of every word and would
    # otherwise turn the whole sentence into placeholders.
    genesA = [gene.lower() for gene in pathway_to_genes_dict[pathwayA] if gene] + compute_pathway_name_terms(pathwayA)
    genesB = [gene.lower() for gene in pathway_to_genes_dict[pathwayB] if gene] + compute_pathway_name_terms(pathwayB)
    tokenized_sentence = []
    for word in sentence.lower().split():
        # pathwayB is tested first so that it wins when a word matches both
        # pathways, which is the precedence this function has always had.
        if any(gene in word for gene in genesB):
            token = 'pathwayB'
        elif any(gene in word for gene in genesA):
            token = 'pathwayA'
        else:
            token = word
        tokenized_sentence.append(token)
    return ' '.join(tokenized_sentence)

In [3]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# pathway dictionary that comes from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open("data/pathway_to_genes_dict.p", "rb") as pathway_file:
    pathway_to_genes_dict = pickle.load(pathway_file)
sentence_support_df = pd.read_csv('data/sentence_support_v3.tsv', delimiter='\t')
sentence_support_df.drop_duplicates(inplace=True)
# Replace gene and pathway-name mentions in every sentence with the
# 'pathwayA' / 'pathwayB' placeholders.
sentence_support_df['tokenizedSentenceFromPaper'] = sentence_support_df.apply(
    lambda row: tokenize_pathway_names(row.sentenceFromPaper, row.pathwayA, row.pathwayB),
    axis=1,
)

In [4]:
# Load data
# load_data_and_labels() reads the notebook-level sentence_support_df and
# returns the tokenized sentences together with a two-column label array.
print("Loading data...")
x_text, y = load_data_and_labels()

Loading data...


In [5]:
# Build vocabulary
# The longest sentence, counted in space-separated tokens, is the document
# length handed to the vocabulary processor.
max_document_length = max(len(sentence.split(" ")) for sentence in x_text)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
# Fixed seed so the permutation, and therefore the split, is reproducible.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(0.25 * float(len(y)))
# Slice at a non-negative position. With fewer than four samples
# dev_sample_index is 0, and slicing with [:0] / [0:] would put every sample
# in the dev set and none in the training set.
train_sample_count = len(y) + dev_sample_index
x_train, x_dev = x_shuffled[:train_sample_count], x_shuffled[train_sample_count:]
y_train, y_dev = y_shuffled[:train_sample_count], y_shuffled[train_sample_count:]
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

Vocabulary Size: 33447
Train/Dev split: 31796/10598


In [6]:
from BasicTextCNN import BasicTextCNN

In [8]:
# Build the text CNN. sequence_length is the width of the encoded training
# matrix, so the network input matches the documents fed to it below.
# NOTE(review): num_epochs=1 and evaluate_every=300 look like quick-run
# settings; confirm before using the results.
model = BasicTextCNN(
    sequence_length=x_train.shape[1],
    vocab_processor=vocab_processor,
    num_epochs=1,
    evaluate_every=300,
)
# Train on the training split; the dev split is passed alongside it.
model.train_network(x_train, y_train, x_dev, y_dev)

INFO:tensorflow:Summary name embedding/W:0/grad/hist is illegal; using embedding/W_0/grad/hist instead.
INFO:tensorflow:Summary name embedding/W:0/grad/sparsity is illegal; using embedding/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/hist is illegal; using conv-maxpool-3/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/sparsity is illegal; using conv-maxpool-3/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/hist is illegal; using conv-maxpool-3/b_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/sparsity is illegal; using conv-maxpool-3/b_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-4/W:0/grad/hist is illegal; using conv-maxpool-4/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-4/W:0/grad/sparsity is illegal; using conv-maxpool-4/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-4/b:0/grad/hist is illegal; using 