In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# sklearn libraries
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# import keras models
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical
from keras.layers import Bidirectional

%matplotlib inline

In [2]:
def getseq(data): 
    all_data = [rows for rows in data[0]]
    return [all_data[i] for i in range(1,len(all_data)) if i%2!=0]

In [3]:
prom_raw = pd.read_csv('./data/PromoterSequence.txt', header=None)
nonprom_raw = pd.read_csv('./data/NonPromoterSequence.txt', header=None)

In [4]:
prom = getseq(prom_raw)
nonprom = getseq(nonprom_raw)

In [5]:
prom_df = pd.DataFrame(prom)
prom_df['label'] = 1
prom_df.rename({0:'sequence','label':'label'},axis=1, inplace=True)

In [6]:
nonprom_df = pd.DataFrame(nonprom)
nonprom_df['label'] = 0
nonprom_df.rename({0:'sequence','label':'label'},axis=1,inplace=True)

In [7]:
#df = pd.concat([prom_df,nonprom_df], axis=0, ignore_index=True)

In [8]:
sequence_promo=[seq for seq in prom_df['sequence']]
len(sequence_promo[0])

301

In [9]:
#[row for row in df.]
words_promo = []
for row in sequence_promo:
    if len(row)%4==0:
        words_promo.append([(row[i:i+4]) for i in range(0, len(row), 4)])
    else:
        remove = len(row) % 4
        row = row[:-remove]
        words_promo.append([(row[i:i+4]) for i in range(0, len(row), 4)])

In [10]:
len(words_promo)

11300

In [11]:
sequence_nonprom=[seq for seq in nonprom_df['sequence']]

In [12]:
words_nonprom = []
for row in sequence_nonprom:
    if len(row)%4==0:
            words_nonprom.append([(row[i:i+4]) for i in range(0, len(row), 4)])
    else:
        remove = len(row) % 4
        row = row[:-remove]
        words_nonprom.append([(row[i:i+4]) for i in range(0, len(row), 4)])

In [13]:
len(words_nonprom)

11300

In [14]:
for i in range(len(words_promo)):
    words_promo[i] = " ".join(words_promo[i])


In [15]:
for i in range(len(words_nonprom)):
    words_nonprom[i] = " ".join(words_nonprom[i])


In [16]:
df_promo = pd.DataFrame(words_promo)

In [17]:
df_non_promo = pd.DataFrame(words_nonprom)

In [18]:
df_promo.to_csv('promo.csv', index=False, header=False)

In [19]:
df_non_promo.to_csv('non_promo.csv', index=False, header=False)

In [20]:
df_promo['label'] = 1
df_non_promo['label'] = 0

In [21]:
df = pd.concat([df_promo,df_non_promo])

df = df.reset_index(drop=True)
df.rename({0:'seq','label':'label'},axis=1, inplace=True)
df

Unnamed: 0,seq,label
0,TTAA TTTG TCCT TATT TGAT TAAG AAGA ATAA ATCT T...,1
1,ATAG CTCA AATT GCTT TATT AGTA TTAG AATC AGCT G...,1
2,AAGC TTCC CTTT AATG TGCT CCTT GTGA ATAC AGCA T...,1
3,TATG TAGA ATCT GTAC AAGT ATCT GTGT TTGG ACAA T...,1
4,ACAT ATTA CTGC ATAC AGGT CTCA AATT ATAA AATG A...,1
...,...,...
22595,TGGT AAAA AATT GTAC ACCT AACT AGTG CCTT CATG T...,0
22596,AGTG CAAC TGGA GCCG TGCC GTGA CCCA CAGA GATC G...,0
22597,GCAT GGAT TTCA TATT ATCT TAAT CGAC TTGC TTTT A...,0
22598,GTGA CCAG GTTT TGCT CTAA TGCG AAGT ACGG ATTG G...,0


In [22]:
X = df['seq']
y = df['label']

y.value_counts(normalize=True)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)


In [23]:
X_train.index

Int64Index([22259, 11097,  5902, 11671, 15595,  8088,  7152, 15893,  1808,
            20579,
            ...
             9943, 13254, 21312, 17475,  5848,  9974, 10275,  3889,  3713,
             8682],
           dtype='int64', length=18080)

In [24]:
vocab_size=300
embedding_dim=64
max_length=400
trunc_type='post' 
padding_type='post' 
oov_tok='<OOV>' 



In [25]:
A=[len(X_train[i]) for i in X_train.index]
max(A)

374

In [26]:
tokenizer=Tokenizer(num_words=vocab_size,oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

In [27]:
print(tokenizer.word_counts)

OrderedDict([('cctt', 3658), ('ggac', 2568), ('agcg', 4465), ('attg', 7585), ('tacg', 2708), ('gctt', 5416), ('tggc', 6002), ('gatg', 4116), ('actt', 6639), ('aaaa', 22944), ('aatg', 7596), ('aatt', 14440), ('cttt', 7928), ('tagt', 4370), ('tatc', 5306), ('aata', 12719), ('tatt', 11851), ('ttgc', 6793), ('tgtt', 8631), ('atgt', 6179), ('ggta', 3044), ('cctc', 2635), ('ctgc', 5767), ('taaa', 14096), ('gtgt', 5432), ('attt', 16786), ('catt', 7626), ('ggca', 4746), ('atat', 11858), ('acta', 4509), ('tttg', 10585), ('gtcg', 3472), ('cgaa', 6262), ('attc', 6996), ('gcgc', 4030), ('tgaa', 7934), ('gtaa', 5542), ('acaa', 8736), ('atta', 9456), ('gagt', 4063), ('gata', 5574), ('gtag', 2670), ('tcgc', 4911), ('cagg', 2998), ('tatg', 4881), ('atac', 5272), ('caac', 5963), ('aaag', 8942), ('gaat', 6873), ('ttag', 4729), ('ttac', 4948), ('ccat', 4567), ('ttat', 10504), ('gcat', 4771), ('aagt', 7546), ('tttt', 20477), ('tata', 9537), ('tttc', 10085), ('gccg', 4067), ('aatc', 7120), ('cggt', 3067), 

In [28]:
train_seq=tokenizer.texts_to_sequences(X_train)

In [29]:
train_padded=pad_sequences(train_seq,maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [30]:
test_seq=tokenizer.texts_to_sequences(X_test)
test_padded=pad_sequences(test_seq,maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [31]:
le=LabelEncoder()
y_train=le.fit_transform(y_train)
y_test=le.transform(y_test)

In [32]:
y_train=to_categorical(y_train,2)
y_test=to_categorical(y_test,2)


In [33]:
model=Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(Bidirectional(LSTM(64,return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(2,activation='softmax'))

In [34]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc']) #binary_crossentropy

In [35]:
history=model.fit(train_padded,y_train,validation_data=(test_padded,y_test),batch_size=100, epochs=7)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [36]:
testmain_df = pd.read_csv('data/test.csv')


In [37]:
sequence=[seq for seq in testmain_df['Sequence']]

words_test_promo = []
for row in sequence:
    if len(row)%4==0:
        words_test_promo.append([(row[i:i+4]) for i in range(0, len(row), 4)])
    else:
        remove = len(row) % 4
        row = row[:-remove]
        words_test_promo.append([(row[i:i+4]) for i in range(0, len(row), 4)])

for i in range(len(words_test_promo)):
    words_test_promo[i] = " ".join(words_test_promo[i])

testmain_df_plus=pd.DataFrame(words_test_promo)
testmain_df_plus = testmain_df_plus.reset_index(drop=True)
testmain_df_plus.rename({0:'seq'},axis=1, inplace=True)
testmain_df_plus['seq']

0      AATC TCTG TTGT TGTT ATGC AAAA AAGG AATA ATGT A...
1      TTCT CTAA GAAA GTCG ATGC TAAG CGGA TGCT AAAC G...
2      ACAT TTTA CGGT CTTG CGTG TGTA TTTA TTGT TCGT A...
3      CACT GAAA AAAA AAGA AAGG CTTA TTTA CTAT TAAC A...
4      GGGA GTGG CAAC ATGG GCTC ACAA GTCT AGAT CGAC T...
                             ...                        
173    TAAA TTGC ATGT AAAA TCAT TAAT AACG ATTA TATT C...
174    GGAT GTGC TTAG TGTA ATTT GCTT ATAA AAAC TACT T...
175    ATTC AGAA ATTA CATG TTTC TGAA AACT CTTT CCGA T...
176    ACGG TTTT AAGT GCCC AAAC TTAG GGTG TAGC GCCC T...
177    CTCA TCGC TCTG TTTG CCGA GGCT GGTT TTGT AGTT G...
Name: seq, Length: 178, dtype: object

In [38]:
#tokenizer.fit_on_texts(testmain_df_plus['seq'])
testmain_seq=tokenizer.texts_to_sequences(testmain_df_plus['seq'])
testmain_padded=pad_sequences(testmain_seq,maxlen=max_length, padding=padding_type, truncating=trunc_type)
testmain_padded

array([[ 41, 126,  28, ...,   0,   0,   0],
       [ 53, 113,  15, ...,   0,   0,   0],
       [ 56,   8, 217, ...,   0,   0,   0],
       ...,
       [ 42,  37,  21, ...,   0,   0,   0],
       [231,   3,  35, ...,   0,   0,   0],
       [197, 107, 126, ...,   0,   0,   0]])

In [39]:
y_pred=model.predict(testmain_padded)
len(y_pred)

178

In [40]:
y_pred

array([[0.86497116, 0.13502881],
       [0.1641024 , 0.8358976 ],
       [0.28488517, 0.71511483],
       [0.979091  , 0.02090893],
       [0.949178  , 0.05082207],
       [0.9081046 , 0.09189541],
       [0.10382894, 0.89617103],
       [0.43159777, 0.5684023 ],
       [0.8232943 , 0.17670576],
       [0.11077201, 0.889228  ],
       [0.03790273, 0.9620973 ],
       [0.7384129 , 0.26158708],
       [0.7693344 , 0.2306656 ],
       [0.66319096, 0.33680907],
       [0.84207296, 0.15792698],
       [0.09285783, 0.9071422 ],
       [0.9774383 , 0.02256173],
       [0.5506794 , 0.4493206 ],
       [0.20775907, 0.7922409 ],
       [0.65255153, 0.34744847],
       [0.890158  , 0.10984198],
       [0.23941226, 0.76058775],
       [0.1387364 , 0.86126363],
       [0.7168374 , 0.28316256],
       [0.5007619 , 0.499238  ],
       [0.17120457, 0.82879543],
       [0.15753421, 0.8424658 ],
       [0.11153081, 0.88846916],
       [0.07080887, 0.9291911 ],
       [0.29491842, 0.7050816 ],
       [0.

In [41]:
y_pred_plus=[0 if y_pred[i][0]>=0.5 else 1 for i in range(0,len(y_pred))]
y_pred_plus 

[0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0]

In [42]:
testmain_df['predictions']=y_pred_plus

In [43]:
testmain_df.to_csv('data/test_RNN.csv',index=False)