In [2]:
import tensorflow as tf
from tensorflow import keras
from keras import layers, Sequential
from utils.utils import extract_text_from_pdf
from utils.utils import preprocess_text
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
text = extract_text_from_pdf("data/Research_Proposal_Bikas_chaudhary_tharu_2541647.pdf")
text

'1 \n \n \n  \n \n \n \nResearch Proposal \nOn \nAdvancing Drug Discovery Through Artificial Intelligence \n \nResearch Methods \n7CS112 \n \nStudent Name: Bikas Chaudhary tharu \nStudent ID: 2541647 \nProgramme: MREs in Artificial Intelligent \n \n \n \n \n \n \n \n \n \n \n \n\n \n 2 \n \nTable of Contents \n1. Introduction .......................................................................................................... 3 \n2. Literature Review ................................................................................................... 4 \n2.1 Theoretical foundations and AI Architecture .............................................................. 4 \n2.2 Emerging trends in ai-driven drug target discovery ..................................................... 4 \n2.3 Limitations and Critical Debates ............................................................................... 5 \n2.4 Research Gap ......................................................................

In [4]:
text = preprocess_text(text)
text

'research proposal on advancing drug discovery through artificial intelligence research methods cs student name bikas chaudhary tharu student id programme mres in artificial intelligent table of contents introduction literature review theoretical foundations and ai architecture emerging trends in aidriven drug target discovery limitations and critical debates research gap research questions and ethical considerations research questions ethical considerations draft methodology research design data collection and preprocessing computational development and benchmarking feasibility and limitations references introduction discovering the drug is one of the most challenging complex expensive and time consuming processes in pharmaceutical research traditionally laboratory process for drug interaction dtis takes long time to develop cost lots of money and have low success rate in this pipeline finding the right therapeutic targets and how drugs bind with protein compound interaction is the ma

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
tokenizer.word_index

{'and': 1,
 'the': 2,
 'of': 3,
 'in': 4,
 'to': 5,
 'will': 6,
 'for': 7,
 'model': 8,
 'research': 9,
 'ai': 10,
 'data': 11,
 'this': 12,
 'drug': 13,
 'is': 14,
 'discovery': 15,
 'with': 16,
 'which': 17,
 'learning': 18,
 'are': 19,
 'protein': 20,
 'on': 21,
 'artificial': 22,
 'that': 23,
 'by': 24,
 'be': 25,
 'computational': 26,
 'biological': 27,
 'drugtarget': 28,
 'models': 29,
 'doi': 30,
 'prediction': 31,
 'experimental': 32,
 'validation': 33,
 'intelligence': 34,
 'dataset': 35,
 'molecular': 36,
 'using': 37,
 'benchmarking': 38,
 'from': 39,
 'datasets': 40,
 'vol': 41,
 'these': 42,
 'interactions': 43,
 'multimodal': 44,
 'no': 45,
 'interaction': 46,
 'networks': 47,
 'structure': 48,
 'explainable': 49,
 'or': 50,
 'study': 51,
 'target': 52,
 'deep': 53,
 'xai': 54,
 'testing': 55,
 'through': 56,
 'ethical': 57,
 'design': 58,
 'targets': 59,
 'graph': 60,
 'neural': 61,
 'use': 62,
 'knowledge': 63,
 'studies': 64,
 'evaluation': 65,
 'phase': 66,
 'methods'

In [6]:
input_sequences = []
for word in text.split('.'):
    sentence = word.strip()
    if len(sentence) == 0:
        continue
    
    text_seq = tokenizer.texts_to_sequences([word])[0]
    if len(text_seq) < 2:
        continue

    for i in range(1, len(text_seq)):
        n_gram_squence = text_seq[:i+1]
        input_sequences.append(n_gram_squence)

input_sequences

[[9, 222],
 [9, 222, 21],
 [9, 222, 21, 223],
 [9, 222, 21, 223, 13],
 [9, 222, 21, 223, 13, 15],
 [9, 222, 21, 223, 13, 15, 56],
 [9, 222, 21, 223, 13, 15, 56, 22],
 [9, 222, 21, 223, 13, 15, 56, 22, 34],
 [9, 222, 21, 223, 13, 15, 56, 22, 34, 9],
 [9, 222, 21, 223, 13, 15, 56, 22, 34, 9, 67],
 [9, 222, 21, 223, 13, 15, 56, 22, 34, 9, 67, 383],
 [9, 222, 21, 223, 13, 15, 56, 22, 34, 9, 67, 383, 224],
 [9, 222, 21, 223, 13, 15, 56, 22, 34, 9, 67, 383, 224, 384],
 [9, 222, 21, 223, 13, 15, 56, 22, 34, 9, 67, 383, 224, 384, 385],
 [9, 222, 21, 223, 13, 15, 56, 22, 34, 9, 67, 383, 224, 384, 385, 386],
 [9, 222, 21, 223, 13, 15, 56, 22, 34, 9, 67, 383, 224, 384, 385, 386, 387],
 [9,
  222,
  21,
  223,
  13,
  15,
  56,
  22,
  34,
  9,
  67,
  383,
  224,
  384,
  385,
  386,
  387,
  224],
 [9,
  222,
  21,
  223,
  13,
  15,
  56,
  22,
  34,
  9,
  67,
  383,
  224,
  384,
  385,
  386,
  387,
  224,
  388],
 [9,
  222,
  21,
  223,
  13,
  15,
  56,
  22,
  34,
  9,
  67,
  383,
  224

In [7]:
max_len =max([len(x) for x in input_sequences])

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
input_sequences_padded = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

In [9]:
X= input_sequences_padded[:,:-1]
y= input_sequences_padded[:,-1]

In [10]:
X.shape, y.shape

((2871, 2871), (2871,))

In [11]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y, num_classes=len(tokenizer.word_index)+1)

In [12]:
y.shape

(2871, 990)

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [14]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
model.fit(X, y, epochs=50, verbose=1)

Epoch 1/50
[1m58/90[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m1:59[0m 4s/step - accuracy: 0.0458 - loss: 6.6898

In [None]:
import numpy as np
text = "Can hybrid physics-informed graph neural networks help"
for i in range(10):
    token_text = tokenizer.texts_to_sequences([text])[0]
    padded_token_text = pad_sequences([token_text], maxlen=max_len-1, padding='pre')
    pos = np.argmax(model.predict(padded_token_text), axis=-1)[0]

    for word, index in tokenizer.word_index.items():
        if index == pos:
            text += " " + word
            break
print(text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Can hybrid physics-informed graph neural networks help to reduce falsepositive rates in virtual screening compared to the
