In [None]:
import re

import gensim
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D, Embedding, LeakyReLU, Merge, concatenate, Input, BatchNormalization
from keras.models import Model, Sequential
from keras.optimizers import Adam, RMSprop
from keras.preprocessing import text, sequence
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
def _read_split(variants_path, text_path):
    """Read one data split and return ``(variants, texts)`` DataFrames.

    The text file separates its two fields with a literal ``||``. A
    multi-character separator is interpreted by pandas as a regular
    expression, so it is written as a raw string (avoiding invalid escape
    sequences) and the python parser engine is requested explicitly.
    The first line of the text file is skipped and the columns are named
    ``ID`` and ``Text``.
    """
    variants = pd.read_csv(variants_path)
    texts = pd.read_csv(
        text_path,
        sep=r'\|\|',
        engine='python',
        header=None,
        names=['ID', 'Text'],
        skiprows=[0],
    )
    return variants, texts


# Training split: variant metadata joined with the free text on ID.
trainvar, traintex = _read_split('training_variants', 'training_text')
train = pd.merge(trainvar, traintex, on='ID')

# Test split: kept in its own names so the training frames are not overwritten.
testvar, testtex = _read_split('test_variants', 'test_text')
test = pd.merge(testvar, testtex, on='ID')

In [None]:
# Corpus for Word2Vec: every sentence of every test and training document,
# as a list of word tokens.
traintext = test.Text.tolist() + train.Text.tolist()

# Tokens are lower-cased because the embedding matrix is later filled by
# looking up the (lower-case) Keras tokenizer vocabulary in the Word2Vec
# vocabulary; a case-sensitive corpus would leave words that are never
# lowercase in it (e.g. gene symbols) without a pretrained vector.
# Sentence splitting still runs on the original-case text.
# Non-string entries (presumably missing texts read as NaN -- TODO confirm)
# are skipped instead of crashing the tokenizer.
sentences = [
    [token.lower() for token in word_tokenize(sentence)]
    for document in traintext
    if isinstance(document, str)
    for sentence in sent_tokenize(document)
]

In [None]:
# Width of the word vectors; also used later as the column count of the
# embedding weight matrix.
vec_size = 50

# Fit Word2Vec on the tokenised sentences.
vectorizer = gensim.models.Word2Vec(sentences=sentences, size=vec_size)

In [None]:
def _strip_tokens(series, pattern):
    """Return `series` with every match of the compiled `pattern` removed.

    Non-string entries are passed through unchanged.
    """
    return series.map(
        lambda doc: pattern.sub('', doc) if isinstance(doc, str) else doc
    )


# Prefix every document with its gene and variation, separated by spaces so
# they stay distinct tokens (the test set previously fused them together),
# then lower-case everything.
train.Text = (train.Gene + ' ' + train.Variation + ' ' + train.Text).str.lower()
test.Text = (test.Gene + ' ' + test.Variation + ' ' + test.Text).str.lower()

# English stop words plus two corpus-specific ones. list.append() returns None
# and would nest the list, so the lists are concatenated instead.
stop_words = stopwords.words('english') + ['well', 'determine']

# Match a stop word only when it is a whole whitespace-delimited token. The
# lookarounds are zero-width, so the surrounding spaces are kept and the
# neighbouring words are never glued together; consecutive stop words and
# stop words at the start/end of a document are handled as well.
stop_pattern = re.compile(
    r'(?<!\S)(?:' + '|'.join(map(re.escape, stop_words)) + r')(?!\S)'
)

train.Text = _strip_tokens(train.Text, stop_pattern)
test.Text = _strip_tokens(test.Text, stop_pattern)

In [None]:
# Number of token ids each document is padded/truncated to.
maxlength = 10000

# Length of the longest gene symbol across the training and test frames
# (0 when both are empty).
genelength = max(
    (len(symbol) for frame in (train, test) for symbol in frame.Gene),
    default=0,
)
genelength

In [None]:
A = ord('A')
def geneVec(gene, length=None):
    """One-hot encode a gene symbol, one row per character.

    Columns 0-9 encode the digits ``0``-``9`` and columns 10-35 encode the
    letters ``A``-``Z`` (ASCII lower-case letters map to the same columns).
    Any other character (for example ``-``) leaves its row all zero; it
    previously produced a negative column index that silently wrapped around
    onto a letter column.

    Parameters
    ----------
    gene : str
        Gene symbol to encode.
    length : int, optional
        Number of rows in the result. Defaults to the notebook-level
        ``genelength``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(length, 36)`` containing zeros and ones.

    Raises
    ------
    IndexError
        If an encodable character sits at position ``length`` or beyond.
    """
    if length is None:
        length = genelength
    array = np.zeros((length, 36))

    for i, c in enumerate(gene):
        if '0' <= c <= '9':
            column = ord(c) - ord('0')
        elif 'A' <= c <= 'Z':
            column = 10 + ord(c) - A
        elif 'a' <= c <= 'z':
            column = 10 + ord(c) - ord('a')
        else:
            # No column exists for this character: keep the row empty.
            continue
        array[i, column] = 1
    return array

# Upper-case the gene symbols before encoding them.
train['Gene'] = train['Gene'].str.upper()
test['Gene'] = test['Gene'].str.upper()

# Add a `genevec` column holding the encoded array of each row's gene.
train = train.assign(genevec=list(map(geneVec, train['Gene'])))
test = test.assign(genevec=list(map(geneVec, test['Gene'])))

In [None]:
# Targets: class labels shifted down by one, then one-hot encoded into 9 columns.
y = keras.utils.to_categorical(train['Class'] - 1, num_classes=9)

# Word index is learned from the training texts only.
tok = text.Tokenizer()
tok.fit_on_texts(train['Text'])

# Two model inputs: padded token-id sequences and the stacked gene encodings.
train_sequences = tok.texts_to_sequences(train.Text)
train_text_input = sequence.pad_sequences(train_sequences, maxlen=maxlength)
train_gene_input = np.stack(train.genevec.tolist(), axis=0)
X = [train_text_input, train_gene_input]

In [None]:
# Embedding matrix: one row per tokenizer index (plus an extra row 0),
# vec_size columns.
vocab_rows = len(tok.word_index) + 1
weights = np.zeros(shape=(vocab_rows, vec_size))

# Copy the Word2Vec vector of every tokenizer word that Word2Vec knows;
# unknown words keep an all-zero row.
known_words = vectorizer.wv.vocab
for word, index in tok.word_index.items():
    if word in known_words:
        weights[index] = vectorizer.wv[word]

# Scale the whole matrix by its largest absolute entry.
weights = weights / np.amax(np.abs(weights))

In [None]:
# model = Sequential()
# model.add(Embedding(max(tok.word_index.values())+1, 15, input_length=maxlength))
# model.add(Conv1D(128,10))
# model.add(LeakyReLU(0.1))
# model.add(MaxPooling1D())
# model.add(Dropout(0.2))
# model.add(Conv1D(128,10))
# model.add(LeakyReLU(0.1))
# model.add(MaxPooling1D())
# model.add(Flatten())
# model.add(Dense(128))
# model.add(LeakyReLU(0.1))
# model2 = Sequential()
# model2.add(Conv1D(15,3, input_shape=(genelength,36)))
# model2.add(LeakyReLU(0.1))
# model2.add(MaxPooling1D())
# model2.add(Flatten())
# model2.add(Dense(32))
# model2.add(LeakyReLU(0.1))
           
# model = Sequential()
# model.add(Concatenate([model1,model2]))
# model.add(Dense(units=9, activation='softmax'))
# model.summary()

# --- Text branch: pretrained embeddings followed by two conv/pool stages ---
input1 = Input(shape=(maxlength,))
# The embedding dimensions are taken from the pretrained matrix itself. The
# output size used to be hard-coded to 100 while `weights` has vec_size
# columns, which makes Keras reject the initial weights with a shape mismatch.
x = Embedding(weights.shape[0], weights.shape[1],
              input_length=maxlength, weights=[weights])(input1)
x = BatchNormalization()(x)
x = Conv1D(128, 10, activation='relu')(x)
x = MaxPooling1D()(x)
x = Dropout(0.2)(x)
x = Conv1D(128, 10, activation='relu')(x)
x = MaxPooling1D()(x)
x = Flatten()(x)
x = Dense(128)(x)
words = LeakyReLU(0.1)(x)

# --- Gene branch: small convolution over the per-character one-hot rows ---
input2 = Input(shape=(genelength, 36,))
x = Conv1D(15, 2, activation='relu')(input2)
x = MaxPooling1D()(x)
x = Flatten()(x)
x = Dense(32, activation='relu')(x)
x = Dense(genelength)(x)
gene = LeakyReLU(0.1)(x)

# --- Head: concatenate both branches and predict one of the 9 classes ---
x = concatenate([words, gene])
output = Dense(units=9, activation='softmax')(x)

model = Model(inputs=[input1, input2], outputs=output)
model.summary()

In [None]:
# Categorical cross-entropy with a default Adam optimizer, tracking accuracy.
model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# First training run: 3 epochs.
model.fit(X, y, epochs=3, batch_size=128, verbose=1)

In [None]:
# Continue training the same model for 2 more epochs.
model.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=2,
    verbose=1,
)

In [None]:
# Build the two model inputs for the test set, mirroring the training inputs.
test_sequences = tok.texts_to_sequences(test['Text'])
test_text_input = sequence.pad_sequences(test_sequences, maxlen=maxlength)
test_gene_input = np.stack(test.genevec.tolist(), axis=0)
x = [test_text_input, test_gene_input]

In [None]:
# Per-class softmax outputs for every test row.
classes = model.predict(
    x,
    verbose=1,
    batch_size=32,
)

In [None]:
# Hard one-hot labels (1 in the arg-max column of each row). They are not
# written to the submission below, which uses the raw model outputs.
cl = np.zeros_like(classes)
cl[np.arange(len(classes)), classes.argmax(axis=1)] = 1

cols = ['class%d' % i for i in range(1, 10)]
dfsub = pd.DataFrame(classes, columns=cols)
# Use the real IDs of the test rows: the predictions were produced from
# `test` in row order, and a positional range is only correct if the IDs
# happen to be 0..n-1 with none missing after the merge.
dfsub.insert(0, 'ID', test['ID'].values)
dfsub.head(15)

In [None]:
# Write the submission table without the DataFrame index.
dfsub.to_csv(path_or_buf='gene.csv', index=False)