In [1]:
import numpy as np
import pandas as pd
from IPython.display import display

In [2]:
# load training variants
train = pd.read_csv('training_variants')
# load training text
train_txt_ = pd.read_csv('training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])
# merge text & variants
train = pd.merge(train, train_txt_, how='left', on='ID').fillna('')
# clean up
del train_txt_
# print train data info
display(train.info())

# load test variants from stage 1
testold_var_ = pd.read_csv('test_variants')
# load test text from stage 1
testold_txt_ = pd.read_csv('test_text', sep='\|\|', engine='python', header=None, skiprows=1, names=["ID","Text"])
# merge text & variants
testold_ = pd.merge(testold_var_, testold_txt_, how='left', on='ID').fillna('')
# clean up
del testold_var_
del testold_txt_

# load stage1 solutions
stage1sol_ = pd.read_csv('stage1_solution_filtered.csv')
# get class
stage1sol_['Class'] = pd.to_numeric(stage1sol_.drop('ID', axis=1).idxmax(axis=1).str[5:]).fillna(0).astype(np.int64)
# drop records from testold_ if they are not in stage1sol_
testold_ = testold_[testold_.index.isin(stage1sol_['ID'])]
# merge class to testold_ from stage1sol_
newtraindata_ = testold_.merge(stage1sol_[['ID', 'Class']], on='ID', how='left')
# reindex columns
newtraindata_ = newtraindata_.reindex_axis(['ID','Gene','Variation','Class','Text'], axis=1)
# clean up
del stage1sol_
del testold_

# append new train data
train = train.append(newtraindata_)
# clean up
del newtraindata_

# print train data info
display(train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3321 entries, 0 to 3320
Data columns (total 5 columns):
ID           3321 non-null int64
Gene         3321 non-null object
Variation    3321 non-null object
Class        3321 non-null int64
Text         3321 non-null object
dtypes: int64(2), object(3)
memory usage: 155.7+ KB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3689 entries, 0 to 367
Data columns (total 5 columns):
ID           3689 non-null int64
Gene         3689 non-null object
Variation    3689 non-null object
Class        3689 non-null int64
Text         3689 non-null object
dtypes: int64(2), object(3)
memory usage: 172.9+ KB


None

In [4]:
print('Indexing word vectors.')
import os
from gensim.models import KeyedVectors
word2vec = None
# make sure you load this on your local env and uncomment the line
# word2vec = KeyedVectors.load_word2vec_format('PubMed-and-PMC-w2v.bin', binary=True)
if (word2vec == None):
    print("word2vec not loaded!")
else:
    print("Found {} word vectors of word2vec".format(len(word2vec.vocab)))

Indexing word vectors.
word2vec not loaded!


In [5]:
import nltk

# Create a function called "chunks" with two arguments, l and n:
def chunks(l, n):
    # For item i in a range that is a length of l,
    for i in range(0, len(l), n):
        # Create an index range for l of n items:
        yield l[i:i+n]

print('Expand records to sentences.')
# increase maxnumberofsentecs on local env to 400
maxnumberofsentences = 400
# increase splitbysenteces on local env to 10
splitbysentences = 10
# temp dict for new train set
tmpdf_ = {'Text': [], 'Class': [], 'ID': [], 'Gene': [], 'Variation': []}
for index, row in train.iterrows():
    # get sentences nltk
    sent_tokenize_list = nltk.sent_tokenize(row['Text'])
    # truncate sentences to last maxnumberofsentences (most important informations are at the end of text)
    if (len(sent_tokenize_list) > maxnumberofsentences):
        sent_tokenize_list = sent_tokenize_list[len(sent_tokenize_list)-maxnumberofsentences:]
    # split sentences to batch
    sent_chunk = list(chunks(sent_tokenize_list, splitbysentences))
    for chunk in sent_chunk:
        # join sentences in text
        tmpdf_['Text'].append(" ".join(chunk))
        # assign class
        tmpdf_['Class'].append(row['Class'])
        # assign ID
        tmpdf_['ID'].append(row['ID'])
        # assign Gene
        tmpdf_['Gene'].append(row['Gene'])
        # assign Variation
        tmpdf_['Variation'].append(row['Variation'])
# create new train set from temp dict
origtrainlen = len(train)
train = pd.DataFrame(tmpdf_)
# clean up
del tmpdf_
# display head
display(train.head())
# display 
print('expanded from {} to {}'.format(origtrainlen,len(train)))

Expand records to sentences.


Unnamed: 0,Class,Gene,ID,Text,Variation
0,1,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,1,FAM58A,0,"Altogether, our results reveal an additional r...",Truncating Mutations
2,1,FAM58A,0,We show that a recombinant CDK10/cyclin M hete...,Truncating Mutations
3,1,FAM58A,0,An interaction phenotype was also observed bet...,Truncating Mutations
4,1,FAM58A,0,Black boxes indicate internal deletions. The r...,Truncating Mutations


expanded from 3689 to 108046


In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# max top words, increase on local env to 100000
num_words = 100000
# max sequence length, increase on local env to 500
sequencelength = 500
# init tokenizer
tokenizer = Tokenizer(num_words=num_words)
# fit tokenizer
tokenizer.fit_on_texts(train['Text'])
# get sequences
X = tokenizer.texts_to_sequences(train['Text'])
# unique words in text
word_index = tokenizer.word_index
print("Found {} unique tokens.".format(len(word_index)))
# pad sequences
X = pad_sequences(X, maxlen=sequencelength)

embedding_matrix = None
if (word2vec != None):
    # out of vocabulary words > use this to do text analysis
    oov_words = []
    # prepare embedding matrix
    embedding_matrix = np.zeros((num_words+1, 200)) #200 = word2vec dim
    for word, i in word_index.items():
        if i >= num_words:
            continue
        if word in word2vec.vocab:
            # embedd from word2vec
            embedding_matrix[i] = word2vec.word_vec(word)
        else:
            # add to out of vocabulary
            oov_words.append(word)
    print('Preparing embedding matrix done. out-of-vocabulary rate (OOV): {} ({})'.format(len(oov_words)/float(len(word_index)),len(oov_words)))


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 149629 unique tokens.


In [7]:
import keras
from sklearn.utils import class_weight

embed_dim = 200 #same as word2vec dim

model_filename = 'model'

# prepare Y values
Y = train['Class'].values-1
# get weights for unevenly distributed dataset 
class_weight = class_weight.compute_class_weight('balanced', np.unique(Y), Y)
# one hot
Y = keras.utils.to_categorical(Y)
# batch size increase on local env
batch_size = 30
# epochs increase on local env
epochs = 5
# Model saving callback
ckpt_callback = keras.callbacks.ModelCheckpoint(model_filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

# input layer
input1 = keras.layers.Input(shape=(sequencelength,))
# embedding layer
if (embedding_matrix == None):
    # word2vec was not loaded. use fallback method
    embedding = keras.layers.Embedding(num_words+1, embed_dim, trainable=True)(input1)
else:
    # word2vec was loaded, load weights and set to untrainable
    embedding = keras.layers.Embedding(num_words+1, embed_dim, weights=[embedding_matrix], trainable=False)(input1)
 
# conv layers
convs = []
filter_sizes = [2,3,4]
for fsz in filter_sizes:
    l_conv = keras.layers.Conv1D(filters=100,kernel_size=fsz,activation='relu')(embedding)
    l_pool = keras.layers.MaxPooling1D(sequencelength-100+1)(l_conv)
    l_pool = keras.layers.Flatten()(l_pool)
    convs.append(l_pool)
# merge conv layers
l_merge = keras.layers.concatenate(convs, axis=1)
# drop out regulation
l_out = keras.layers.Dropout(0.5)(l_merge)
# output layer
output = keras.layers.Dense(units=9, activation='softmax')(l_out)
# model
model = keras.models.Model(input1, output)
# compile model
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['categorical_crossentropy'])
# train model
model.fit(X, Y, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=1, class_weight=class_weight, callbacks=[ckpt_callback])

Train on 86436 samples, validate on 21610 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 1.41048, saving model to model
Epoch 2/5

Epoch 00002: val_loss did not improve from 1.41048
Epoch 3/5

Epoch 00003: val_loss did not improve from 1.41048
Epoch 4/5

Epoch 00004: val_loss did not improve from 1.41048
Epoch 5/5

Epoch 00005: val_loss did not improve from 1.41048


<keras.callbacks.History at 0x225b6b619e8>

In [8]:
# load test dataset
test = pd.read_csv('stage2_test_variants.csv')
# load test text dataset
test_txt_ = pd.read_csv('stage2_test_text.csv', sep='\|\|', engine='python', header=None, skiprows=1, names=["ID","Text"])
# merge text & variants
test = pd.merge(test, test_txt_, how='left', on='ID')
# clean up
del test_txt_

In [9]:
print('Expand records to sentences.')
# temp dict for new train set
tmpdf_ = {'Text': [], 'ID': [], 'Gene': [], 'Variation': []}
for index, row in test.iterrows():
    # get sentences nltk
    sent_tokenize_list = nltk.sent_tokenize(row['Text'])
    # truncate sentences to last maxnumberofsentences (most important informations are at the end of text)
    if (len(sent_tokenize_list) > maxnumberofsentences):
        sent_tokenize_list = sent_tokenize_list[len(sent_tokenize_list)-maxnumberofsentences:]
    # split sentences to batch
    sent_chunk = list(chunks(sent_tokenize_list, splitbysentences))
    for chunk in sent_chunk:
        # join sentences in text
        tmpdf_['Text'].append(" ".join(chunk))
        # assign ID
        tmpdf_['ID'].append(row['ID'])
        # assign Gene
        tmpdf_['Gene'].append(row['Gene'])
        # assign Variation
        tmpdf_['Variation'].append(row['Variation'])
# create new train set from temp dict
origtestlen = len(test)
test = pd.DataFrame(tmpdf_)
# clean up
del tmpdf_
# display head
display(test.head())
# display 
print('expanded from {} to {}'.format(origtestlen,len(test)))

Expand records to sentences.


Unnamed: 0,Gene,ID,Text,Variation
0,CHEK2,1,The incidence of breast cancer is increasing i...,H371Y
1,CHEK2,1,"These preliminary studies suggest that, on acc...",H371Y
2,CHEK2,1,Peripheral blood samples were collected from t...,H371Y
3,CHEK2,1,"In total, we detected six germline sequence al...",H371Y
4,CHEK2,1,Bioinformatic analysis suggested that the p.H3...,H371Y


expanded from 986 to 29967


In [11]:
#  load best model
model = keras.models.load_model(model_filename)
# get sequences
Xtest = tokenizer.texts_to_sequences(test['Text'])
# pad sequences
Xtest = pad_sequences(Xtest, maxlen=sequencelength)
# predict
probas = model.predict(Xtest, verbose=1)
# prepare data for submission
submission_df = pd.DataFrame(probas, columns=['class'+str(c+1) for c in range(9)])
# insert IDs
submission_df.insert(loc=0, column='ID', value=test['ID'].values)
# average grouped data
submission_df = submission_df.groupby(['ID'], as_index=False).mean()
# save to csv
submission_df.to_csv('submission.csv', index=False)
# debug
print("\n----------------------\n")
print("Done")


----------------------

Done
