In [53]:
## importing requried libraries
import os
import glob
import fitz  # PyMuPDF for PDF processing
import numpy as np
import faiss

from PyPDF2 import PdfReader

# from langchain.llms import Groq
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceEmbeddings,OllamaEmbeddings
from langchain.vectorstores.cassandra import Cassandra
import cassio

from dotenv import load_dotenv
load_dotenv()

True

In [54]:
## calling api keys
GROQ_API_KEY = os.environ['GROQ_API_KEY']

In [55]:
# Initialize LLM
groq_llm = ChatGroq(api_key=GROQ_API_KEY)

In [56]:
# Extract text from PDFs
data_folder = "data/"

### Load the documents from the data folder
pdf_files = glob.glob(os.path.join(data_folder, "*.pdf"))
documents = []

In [57]:
pdf_files

['data\\paper1.pdf',
 'data\\paper2.pdf',
 'data\\paper3.pdf',
 'data\\paper4.pdf',
 'data\\paper5.pdf',
 'data\\paper6.pdf']

In [58]:
import pymupdf

In [59]:
for file in pdf_files:
        with pymupdf.open(file) as doc:
            text = "\n".join([page.get_text("text") for page in doc])
            documents.append(text)

In [60]:
# Chunking the text
chunk_size=500
for i in range(0, len(text), chunk_size):
    documents.append(text[i:i+chunk_size])


In [61]:
len(documents)

163

In [62]:
documents

['Expert Opinion on Drug Discovery\nISSN: (Print) (Online) Journal homepage: www.tandfonline.com/journals/iedc20\nInduﬆrializing AI-powered drug discovery: lessons\nlearned from the Patrimony computing platform\nMickaël Guedj, Jack Swindle, Antoine Hamon, Sandra Hubert, Emiko\nDesvaux, Jessica Laplume, Laura Xuereb, Céline Lefebvre, Yannick Haudry,\nChristine Gabarroca, Audrey Aussy, Laurence Laigle, Isabelle Dupin-Roger &\nPhilippe Moingeon\nTo cite this article: Mickaël Guedj, Jack Swindle, Antoine Hamon, Sandra Hubert, Emiko\nDesvaux, Jessica Laplume, Laura Xuereb, Céline Lefebvre, Yannick Haudry, Christine Gabarroca,\nAudrey Aussy, Laurence Laigle, Isabelle Dupin-Roger & Philippe Moingeon (2022) Industrializing\nAI-powered drug discovery: lessons learned from the Patrimony computing platform, Expert\nOpinion on Drug Discovery, 17:8, 815-824, DOI: 10.1080/17460441.2022.2095368\nTo link to this article:  https://doi.org/10.1080/17460441.2022.2095368\n© 2022 The Author(s). Published b

In [63]:
# Initialize HuggingFace embeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embeddings = np.array([embedding_model.embed_query(doc) for doc in documents])

In [64]:
# Store in FAISS index
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)

In [65]:
d

384

In [66]:
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001D5235F0FF0> >

In [67]:

test_query = "What are the recent advancements in AI-powered drug discovery?"


query_embedding = embedding_model.embed_query(test_query)
D, I = index.search(np.array([query_embedding]), k=3)  # Retrieve top 3 similar docs
similar_docs = [documents[i] for i in I[0]]

In [68]:
#query_embedding
similar_docs

['Expert Opinion on Drug Discovery\nISSN: (Print) (Online) Journal homepage: www.tandfonline.com/journals/iedc20\nInduﬆrializing AI-powered drug discovery: lessons\nlearned from the Patrimony computing platform\nMickaël Guedj, Jack Swindle, Antoine Hamon, Sandra Hubert, Emiko\nDesvaux, Jessica Laplume, Laura Xuereb, Céline Lefebvre, Yannick Haudry,\nChristine Gabarroca, Audrey Aussy, Laurence Laigle, Isabelle Dupin-Roger &\nPhilippe Moingeon\nTo cite this article: Mickaël Guedj, Jack Swindle, Antoine Hamon, Sandra Hubert, Emiko\nDesvaux, Jessica Laplume, Laura Xuereb, Céline Lefebvre, Yannick Haudry, Christine Gabarroca,\nAudrey Aussy, Laurence Laigle, Isabelle Dupin-Roger & Philippe Moingeon (2022) Industrializing\nAI-powered drug discovery: lessons learned from the Patrimony computing platform, Expert\nOpinion on Drug Discovery, 17:8, 815-824, DOI: 10.1080/17460441.2022.2095368\nTo link to this article:  https://doi.org/10.1080/17460441.2022.2095368\n© 2022 The Author(s). Published b

In [69]:
# Construct a prompt for the LLM
prompt = f"Based on the following research papers, answer the query: {test_query}\n\n"
for i, doc in enumerate(similar_docs):
    prompt += f"Research Paper {i+1}: {doc[:500]}...\n\n"
prompt += "Provide a concise and informative response."

# Get response from Groq LLM
response = groq_llm.predict(prompt)

  response = groq_llm.predict(prompt)


In [70]:
response

'Recent advancements in AI-powered drug discovery include the industrialization of AI platforms, such as the Patrimony computing platform, which offers lessons learned in this field (Guedj et al., 2021). Additionally, there has been a comprehensive study on AI-powered drug discovery, highlighting its potential to accelerate pharmaceutical research (Kolluri, 2021). Furthermore, AI has shown promise in therapeutic target discovery, with its ability to analyze large datasets and intricate biological networks (Pun et al., 2021). Overall, AI is making significant strides in various aspects of drug discovery, including speeding up the process and improving target identification.\n\nReferences:\nGuedj, M., Swindle, J., Hamon, A., Hubert, S., Desvaux, E., Laplume, J., ... & Moingeon, P. (2021). Industrializing AI-powered drug discovery: lessons learned from the Patrimony computing platform. Expert Opinion on Drug Discovery, 16(4), 381-391.\n\nKolluri, V. (2021). A comprehensive study on AI-pow