### Using Gemini Pro and a FAISS Vector Store to Store Research Paper Embeddings

In [69]:
# Importing the necessary libraries

from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain_google_genai import ChatGoogleGenerativeAI

In [70]:
# To access all the API keys
from dotenv import load_dotenv
load_dotenv()

True

In [71]:
import os

In [72]:
# Reading the research paper
def read_data(directory):
    """Load the PDFs under ``directory`` with ``PyPDFDirectoryLoader``.

    Args:
        directory: Folder path handed to ``PyPDFDirectoryLoader``.

    Returns:
        The documents produced by the loader's ``load()`` call.
    """
    return PyPDFDirectoryLoader(directory).load()

In [77]:
# Load the PDFs placed in the local data/ folder
data = read_data("data/")

In [78]:
# The paper is loaded as one Document per PDF page (page text plus metadata).
# NOTE: evaluating the bare list prints every page; data[0] is enough for a quick look.
data

[Document(page_content='2024-5-7\nAdvancing Multimodal Medical Capabilities of\nGemini\nGoogle Research and Google DeepMind†\nMany clinical tasks require an understanding of specialized data, such as medical images and genomics,\nwhich is not typically found in general-purpose large multimodal models. Building upon Gemini’s\nmultimodal models, we develop several models within the new Med-Gemini family that inherit core\ncapabilities of Gemini and are optimized for medical use via fine-tuning with 2D and 3D radiology,\nhistopathology, ophthalmology, dermatology and genomic data. Med-Gemini-2D sets a new standard for\nAI-based chest X-ray (CXR) report generation based on expert evaluation, exceeding previous best results\nacross two separate datasets by an absolute margin of 1% and 12%, where 57% and 96% of AI reports\non normal cases, and 43% and 65% on abnormal cases, are evaluated as “equivalent or better” than\nthe original radiologists’ reports. We demonstrate the first ever large m

In [80]:
len(data)  # one Document per page; the PDF has 62 pages

62

In [81]:
# Dividing the documents into chunks since models have restrictions on the no of tokens

def chunk_data(docs, chunk_size=550, chunk_overlap=60):
    """Split ``docs`` into chunks using ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split.
        chunk_size: ``chunk_size`` forwarded to the splitter (default 550).
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter (default 60).

    Returns:
        The result of the splitter's ``split_documents(docs)`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

In [82]:
# Split the loaded pages into chunks using the default chunk_size / chunk_overlap
documents = chunk_data(docs=data)
len(documents)

414

In [83]:
# Set up the GoogleGenerativeAIEmbeddings client used to turn text into vectors.
# The API key is read from the GOOGLE_API_KEY environment variable.

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=os.environ['GOOGLE_API_KEY'])
embeddings

GoogleGenerativeAIEmbeddings(model='models/embedding-001', task_type=None, google_api_key=SecretStr('**********'), credentials=None, client_options=None, transport=None, request_options=None)

In [85]:
# Sanity check: embed a sample string and inspect the length of the returned vector
vectors = embeddings.embed_query("Atmik Shetty!!!")
len(vectors)  

768

In [86]:
# Build an in-memory FAISS vector store from the chunked documents
from langchain_community.vectorstores import FAISS
db = FAISS.from_documents(documents, embeddings)
# Number of vectors held by the index; expected to equal len(documents)
print(db.index.ntotal)

414


In [87]:
# Persist the chunk-level index that the rest of the notebook queries.
# Reusing `db` avoids embedding the paper a second time and keeps the saved
# index identical to the in-memory one; the previous version indexed the raw,
# un-chunked `data` pages instead of `documents`.
vector_store = db
vector_store.save_local("faiss_index")

In [91]:
# Retrieve the chunks closest to the query together with their scores.
# NOTE(review): the score appears to be a distance (lower = closer) — confirm
# against the FAISS wrapper documentation.
# The top hit shown below is a reference-list fragment rather than the abstract,
# so retrieval quality for this query is poor.
query = "Abstract of this research paper"
ans = db.similarity_search_with_score(query)
ans[0]

(Document(page_content='Bethesda, MD, USA , 2019.\n41', metadata={'source': 'data\\med.pdf', 'page': 40}),
 0.19530368)

In [93]:
# Same search, but with the query embedded explicitly first.
# Despite its name, `score` holds the list of (Document, score) results.
embedding_vector = embeddings.embed_query(query)
score = db.similarity_search_with_score_by_vector(embedding_vector)
score[0]

(Document(page_content='Bethesda, MD, USA , 2019.\n41', metadata={'source': 'data\\med.pdf', 'page': 40}),
 0.19530368)

In [97]:
# Set up the Gemini chat model and a question-answering chain over supplied documents.
# NOTE(review): chain_type="stuff" presumably places all supplied documents into a
# single prompt — confirm against the LangChain docs.
model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.2)
chain = load_qa_chain(model, chain_type="stuff")

In [98]:
def retrieve_answers(query):
    """Answer ``query`` from the documents most relevant to it.

    Uses the notebook-level ``db`` vector store to fetch matching documents
    and the notebook-level ``chain`` to generate an answer from them.

    Args:
        query: The question to answer.

    Returns:
        The value returned by ``chain.run`` for the retrieved documents.
    """
    retriever = db.as_retriever()
    relevant_docs = retriever.invoke(query)
    print(relevant_docs)
    # The chain needs the retrieved documents, not the retriever object itself.
    response = chain.run(input_documents=relevant_docs, question=query)
    return response

In [None]:
# End-to-end check: retrieve matching chunks and have the model answer from them
my_query = "What Large langauge model is used in this research paper?"
answer = retrieve_answers(my_query)
print(answer)