In [23]:
'''In this notebook we will classify the presence/absence of HPO terms for 183,000 PubMed abstracts using 
Convolutional Neural Networks (CNNs). We need the following files:

articles_hpo.txt : a map of articles to the appropriate HPO terms 
pmid_abstract.txt : a map of pmids and abstracts 

NOTE: pmid_abstract.txt includes articles that do not have an HPO term, so we 
will use the list of articles in articles_hpo.txt 
'''

import tensorflow as tf 
import keras 

import numpy as np 
import re 
import os 
import pandas as pd 
import sys

from keras import backend as K 

#create and set the session for Tensorflow 
##K.set_session(sess)




In [7]:
#Getting the data 
#
# Builds, at notebook level:
#   articles_hpo   : dict  pmid (int) -> tab-joined string of HPO terms
#   pmid_abstracts : dict  pmid (int) -> abstract text
#   pmids          : set of pmids that have BOTH HPO terms and an abstract
#   hpo_terms      : set of every HPO term seen in articles_hpo.txt
#   df             : 0/1 indicator DataFrame, rows = pmids, columns = hpo_terms

#if the text files are somewhere else, change this 
data_path = os.getcwd()+'/data/'

#make an array of pmids and set of HPO terms 
print('Parsing files...')
articles_hpo = {}
pmids = set()
hpo_terms = set()
# `with` closes the handle even if a malformed line raises part-way through.
with open(data_path+'articles_hpo.txt') as articles_hpo_file:
    for line in articles_hpo_file:
        parse = line.rstrip().split('\t')
        pmid_field = parse[0].replace('"','')
        if not pmid_field:
            # blank line: int('') would raise
            continue
        pmid = int(pmid_field)
        # Drop empty fields (e.g. doubled tabs) so '' never becomes an HPO term.
        terms = [term for term in parse[1:] if term]
        articles_hpo[pmid] = '\t'.join(terms)
        pmids.add(pmid)
        hpo_terms.update(terms)

#make a dictionary of abstracts 
pmid_abstracts = {}
with open(data_path+'pmid_abstract.txt') as pmid_abstracts_file:
    for line in pmid_abstracts_file:
        parse = line.rstrip().split('\t')
        if not parse[0]:
            continue
        pmid = int(parse[0])
        # Look up with the same int key we store under. The original looked up
        # the str key, so the lookup always missed and repeated pmids were
        # overwritten instead of appended.
        pmid_abstracts[pmid] = pmid_abstracts.get(pmid,'')+'\t'.join(parse[1:])

#make sure we are only dealing with articles that have abstracts and HPO terms 
# Done BEFORE building the DataFrame so that every row of df has an abstract.
pmids = pmids.intersection(pmid_abstracts)

print('Creating DataFrame...')
#make a matrix of articles and hpo terms using Pandas DataFrame
# Sorted so row/column order is reproducible across kernels (iteration order
# of a set of strings is not).
pmid_order = sorted(pmids)
term_order = sorted(hpo_terms)
row_of = {pmid: row for row, pmid in enumerate(pmid_order)}
col_of = {term: col for col, term in enumerate(term_order)}

# Fill a compact NumPy array and wrap it once; assigning cell-by-cell through
# df.loc is far slower at this size. int8 is enough for a 0/1 indicator.
indicator = np.zeros((len(pmid_order), len(term_order)), dtype=np.int8)
for pmid in pmid_order:
    for hpo_term in articles_hpo[pmid].split('\t'):
        if hpo_term:
            indicator[row_of[pmid], col_of[hpo_term]] = 1
df = pd.DataFrame(indicator, index=pmid_order, columns=term_order)

print('Working with {} articles and {} hpo terms'.format(len(pmids),len(hpo_terms)))

Parsing files...
Creating DataFrame...
Working with 157473 articles and 1307 hpo terms


In [22]:
from keras.models import Sequential,Graph, Model
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense,Reshape,Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.preprocessing import sequence


class cnn_2d_model(object):
    '''
    This is a 2D convolutional model over word vectors.

    One convolution + max-pooling branch is built per entry of 'filter_sizes';
    the flattened branches are concatenated and passed through one hidden
    dense layer and one output layer.

    params is a dictionary with the following keys:
    - 'word_vecs': initial values for word vectors, a 2D array of shape
      (vocab_size, wv_dim)
    - 'max_length': maximum length of the input sentences
    - 'num_classes': number of target classes
    - 'filter_sizes': list of n-gram filter sizes to use
    - 'num_maps': number of feature maps to use in the convolutional layers
    - 'num_hidden': number of units in hidden layer
    - 'window_size' (optional): filter width along the word-vector axis;
      defaults to wv_dim so each filter spans whole word vectors
    - 'output_activation' (optional): activation of the output layer;
      defaults to 'softmax'. Use 'sigmoid' when the targets are independent
      0/1 indicators (multi-label).

    Raises ValueError if 'word_vecs' is not 2D or if 'filter_sizes' is empty
    or contains a size outside 1..max_length.
    '''
    def __init__(self,params):
        print("Initializing CNN model...")
        # The original read self.word_vecs without ever assigning it.
        self.word_vecs = np.asarray(params['word_vecs'])
        if self.word_vecs.ndim != 2:
            raise ValueError("'word_vecs' must be a 2D array of shape (vocab_size, wv_dim)")
        self.vocab_size = self.word_vecs.shape[0]
        self.wv_dim = self.word_vecs.shape[1]
        # Built-in int instead of np.int, which was removed from NumPy.
        self.max_length = int(params['max_length'])
        self.num_classes = int(params['num_classes'])
        self.filter_sizes = [int(size) for size in params['filter_sizes']]
        self.num_maps = int(params['num_maps'])
        self.num_hidden = int(params['num_hidden'])
        self.window_size = int(params.get('window_size', self.wv_dim))
        self.output_activation = params.get('output_activation', 'softmax')

        if not self.filter_sizes:
            raise ValueError("'filter_sizes' must contain at least one n-gram size")
        for size in self.filter_sizes:
            # The pooling height is max_length - size + 1 and must be >= 1.
            if size < 1 or size > self.max_length:
                raise ValueError("filter size {} is outside 1..max_length ({})".format(size, self.max_length))

    def build_model(self, embedding_layer):
        '''
        Build the (uncompiled) Keras functional model.

        embedding_layer: a Keras Embedding layer; its output dimension is
        assumed to equal wv_dim (needed for the reshape below).
        Returns a keras Model mapping integer word-index sequences of length
        max_length to num_classes outputs.
        '''
        # Local import: the notebook's import cell does not bring these in.
        from keras.layers import Input, merge

        input_layer = Input(shape=(self.max_length,), dtype='int32')
        ## The embedding layer is 3D tensor with shape (n_samples,batch_length,n_dim)
        embedded_sequences = embedding_layer(input_layer)
        ## Convolution2D needs a 4D tensor, so add a trailing channel axis
        embedded_sequences = Reshape((self.max_length, self.wv_dim, 1))(embedded_sequences)

        grams = [self.generate_gram(n_gram, embedded_sequences)
                 for n_gram in self.filter_sizes]
        if len(grams) == 1:
            # merge() refuses a single-element list
            merged = grams[0]
        else:
            merged = merge(grams, mode='concat', concat_axis=1)
        dense = Dense(self.num_hidden, activation='relu')(merged)
        dense = Dense(self.num_classes, activation=self.output_activation)(dense)

        model = Model(input=input_layer,output=dense)
        return model 

    def generate_gram(self, n_gram, embedded_sequences):
        '''
        Build one branch: convolution with filters n_gram rows tall and
        window_size columns wide, max-pooling over every row position,
        flatten, then dropout (rate 0.5).

        embedded_sequences: 4D tensor laid out as
        (samples, max_length, wv_dim, 1), i.e. channels last.
        '''
        # dim_ordering is pinned to 'tf' so the channels-last reshape in
        # build_model is interpreted the same way regardless of keras.json.
        gram = Convolution2D(nb_filter=self.num_maps, nb_row=n_gram, nb_col=self.window_size,
                             activation='relu', dim_ordering='tf')(embedded_sequences)
        gram = MaxPooling2D(pool_size=(self.max_length - n_gram + 1, 1), dim_ordering='tf')(gram)
        gram = Flatten()(gram)
        gram = Dropout(0.5)(gram)
        return gram



In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding

print('Thresholding dataframe...')
# Keep only HPO terms that are annotated on more than THRESHOLD articles.
THRESHOLD = 100 
df_threshold = df.loc[:,np.sum(df.values,axis=0)>THRESHOLD]

print('Compiling abstracts...')
# Only rows that actually have an abstract can be used; texts and labels are
# built from the same pmid list so they stay aligned row-for-row.
pmids_with_text = [pmid for pmid in df_threshold.index if pmid in pmid_abstracts]
texts = [pmid_abstracts[pmid] for pmid in pmids_with_text]
# 2D array (n_articles, n_terms): a plain list cannot be indexed with the
# shuffled index array below.
labels = np.asarray(df_threshold.loc[pmids_with_text,:].values, dtype='float32')
    
#Tokenize the words
print('Tokenizing the abstracts...')
max_words = 20000
tokenizer = Tokenizer(max_words)
tokenizer.fit_on_texts(texts) #get the order of words by frequency
texts_as_seq = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found {} words in the corpus'.format(len(word_index)))

max_words_seq = 1000
data = pad_sequences(texts_as_seq,maxlen=max_words_seq)

#train/test split 
validation_split = .2 
RANDOM_SEED = 42
# Private RandomState: reproducible shuffle without touching the global seed.
indices = np.random.RandomState(RANDOM_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(validation_split * data.shape[0])
# Slice from an explicit boundary: data[:-0] would be empty if the
# validation count rounded down to 0.
nb_train_samples = data.shape[0] - nb_validation_samples

x_train = data[:nb_train_samples]
y_train = labels[:nb_train_samples]
x_val = data[nb_train_samples:]
y_val = labels[nb_train_samples:]

# NOTE(review): nothing in the visible notebook loads pre-trained word vectors,
# so `wv_dim` and `embeddings_index` were undefined names here. The values
# below are placeholders: fill `embeddings_index` (word -> 1-D vector of length
# wv_dim) from your embedding file and set wv_dim to match. TODO confirm.
wv_dim = 100
embeddings_index = {}

embedding_matrix = np.zeros((len(word_index) + 1, wv_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# Seed the layer with the matrix only when pre-trained vectors were found;
# otherwise leave Keras' default initialisation in place rather than starting
# every word at the zero vector.
embedding_kwargs = {}
if embeddings_index:
    embedding_kwargs['weights'] = [embedding_matrix]

embedding_layer = Embedding(len(word_index) + 1, 
                            wv_dim, 
                            input_length=max_words_seq, 
                            trainable=True,
                            **embedding_kwargs)



In [None]:
#make the cnn_2d object here
# build_model is an instance method, so the model object has to be created
# first. NOTE(review): filter_sizes / num_maps / num_hidden are placeholder
# hyperparameters -- the notebook never specified any; tune as needed.
cnn_params = {
    'word_vecs': embedding_matrix,
    'max_length': max_words_seq,
    'num_classes': len(labels[0]),
    'filter_sizes': [3, 4, 5],
    'num_maps': 100,
    'num_hidden': 128,
    # each filter spans the full word vector
    'window_size': embedding_matrix.shape[1],
    # targets are independent 0/1 HPO-term indicators (multi-label)
    'output_activation': 'sigmoid',
}
cnn = cnn_2d_model(cnn_params)
model = cnn.build_model(embedding_layer)