In [None]:
# libraries to install
# !pip install langchain pypdf tiktoken
# !pip install sentence-transformers
# !pip install bitsandbytes
# !pip install accelerate
# !pip install chromadb
# !pip install openai
# !pip install PyPDF2
# !pip install faiss-cpu

In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import glob
from langchain_community.vectorstores import Chroma
# from langchain_community.document_loaders.csv_loader import CSVLoader
# from openai import OpenAI
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAI

In [33]:
import os
# Never hard-code the key in the notebook: reuse a key that is already exported
# in the environment and only prompt (without echoing) when it is missing.
# Unconditionally assigning "" would wipe out a valid exported key.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Embedding model handed to the vector stores, and the LLM handed to the QA chain.
embedding = OpenAIEmbeddings()
Model_llm = OpenAI()

## DataLoader

When loading the PDF files, the text of every page is extracted with the PyPDF2 library and accumulated in the `document` variable for later processing. The only cleaning applied during loading is the removal of extraneous content, namely the repeated bioRxiv license/DOI banner, which is stripped by `preprocess`. A heavier helper, `preprocess_text`, which lowercases the text and removes punctuation and English stop words, is defined and demonstrated below, but its call in the loading loop is left commented out.

Through this preprocessing endeavor, we ensure that solely pertinent information is preserved, a crucial element for conducting precise and effective data analysis. By excising superfluous elements, the data assumes a more focused demeanor, conducive to yielding valuable insights. This systematic and methodical approach to data management serves to optimize the efficiency and efficacy of subsequent analytical procedures.

Moreover, the segmentation of data into chunks is imperative due to the limited token size accessible to an LLM model. This strategy not only aids in proper data management but also allows for overlapping, further enhancing data coherence and interpretability.





In [4]:
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Lowercase text, strip punctuation and remove English stop words.

    Args:
        text (str): Raw input text.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Lowercase first, then delete every character listed in string.punctuation.
    text = text.lower().translate(str.maketrans('', '', string.punctuation))

    # The stop-word set is cached, so repeated calls (e.g. once per PDF page)
    # do not reload the list and rebuild the set each time.
    stop_words = _english_stop_words()
    tokens = [token for token in word_tokenize(text) if token not in stop_words]

    return ' '.join(tokens)

# Example usage
raw_text = "This is an example sentence with some punctuation! It needs preprocessing."
processed_text = preprocess_text(raw_text)

print("Raw Text: ", raw_text)
print("Processed Text: ", processed_text)

Raw Text:  This is an example sentence with some punctuation! It needs preprocessing.
Processed Text:  example sentence punctuation needs preprocessing


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
import re
def preprocess(text):
  """Remove the bioRxiv license/DOI banner from extracted page text.

  Everything from the license sentence up to and including the
  "doi: bioRxiv preprint" marker is deleted, even when the banner spans
  several lines. All other text is returned untouched.
  """
  # DOTALL lets the lazy `.*?` cross newlines inside the banner.
  banner_re = re.compile(
      r'CC-BY-NC-ND 4\.0 International license.*?https://doi\.org/10\.1101/2024\.01\.10\.575113doi: bioRxiv preprint',
      re.DOTALL,
  )
  return banner_re.sub("", text)

In [25]:
# Extract the text of every PDF under ./data into one string (`document`).
import glob
import os

from PyPDF2 import PdfReader

# Build the pattern with os.path.join so it also matches on POSIX systems;
# a hard-coded backslash pattern only finds files on Windows.
pdf_path = os.path.join('.', 'data', '*.pdf')

page_texts = []
# glob returns files in arbitrary order; a case-insensitive sort keeps the
# load order (and therefore the resulting chunks) stable between runs.
for pdf_file in sorted(glob.glob(pdf_path), key=str.lower):
  print(pdf_file)
  with open(pdf_file, 'rb') as file:
      for page in PdfReader(file).pages:
          # Strip the repeated license banner from each page.
          # preprocess_text(...) can be applied here as well if stop-word and
          # punctuation removal is wanted.
          page_texts.append(preprocess(page.extract_text()))

# One join instead of repeated string concatenation inside the loop.
document = "".join(page_texts)



.\data\data1.pdf
.\data\data2.pdf
.\data\data3.pdf
.\data\data4.pdf
.\data\data5.pdf
.\data\data6.pdf
.\data\Into_Life_Sciences.pdf


In [26]:
# Cut the combined text into chunks of at most 2000 characters, with a
# 100-character overlap between neighbouring chunks.
spliter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
)
chunked_doc = spliter.split_text(document)

In [27]:
len(chunked_doc)

218

## Retrieval 
For the retrieval component, we employed two distinct vector storage databases: Chroma DB and FAISS Retriever. These databases serve as efficient tools for storing and retrieving high-dimensional vectors, facilitating the process of finding relevant documents or information based on similarity scores. Chroma DB is utilized for its ability to handle large-scale data and provide fast retrieval times, while FAISS Retriever is chosen for its speed and efficiency in indexing and searching through vector spaces. Together, these databases enhance the retrieval process by offering a robust and scalable solution for accessing relevant information quickly and accurately.




In this segment, we utilize Chroma DB for the purpose of retrieving pertinent information in response to the specified query. Chroma DB, with its advanced vector storage capabilities, efficiently handles the task of finding and returning relevant documents or data points that closely match the query's criteria. This process not only streamlines the retrieval process but also ensures that the most relevant information is quickly accessible, thereby enhancing the overall efficiency and effectiveness of the information retrieval system.


In [34]:
# Benchmark question reused by the retrieval and generation cells.
query = "How did the study manipulate the training data to investigate the impact of system diversity on model performance?"

# Embed every chunk into a Chroma vector store.
chroma_db = Chroma.from_texts(texts=chunked_doc, embedding=embedding)

# Retriever view of the store that returns the 2 most similar chunks.
chroma_retriever = chroma_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

# Direct similarity search on the store, keeping the scores for inspection.
chroma_score = chroma_db.similarity_search_with_score(query=query)


In this part, we employ FAISS Vector DB as the retriever. FAISS, known for its exceptional speed and efficiency in handling high-dimensional vectors, serves as a powerful tool for quickly and accurately retrieving relevant information based on the query. This choice ensures that the retrieval process is both fast and precise, making it an ideal solution for accessing the most pertinent data points in a timely manner.

In [37]:
# Embed every chunk into a FAISS index.
faiss_db = FAISS.from_texts(texts=chunked_doc, embedding=embedding)

# Retriever view of the index that returns the 2 most similar chunks.
faiss_retriever = faiss_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

# Direct similarity search on the index, keeping the scores for inspection.
faiss_score = faiss_db.similarity_search_with_score(query=query)

## Prompt
The prompt's significance in optimizing outcomes with the Large Language Model (LLM) cannot be overstated. Its effective utilization not only elevates the model's output but also mitigates the risk of hallucination. Hallucination occurs when the model generates unsupported information, potentially yielding erroneous responses. Through meticulous prompt construction, we steer the model's responses within the provided context, ensuring grounded outputs. This strategic approach minimizes the likelihood of hallucinations, fostering greater reliability in the model's responses. Thus, the careful crafting of prompts serves as a cornerstone in enhancing the LLM's performance and reinforcing the credibility of its generated content.

In [36]:
# Prompt text for the RAG chain. It carries two placeholders, {question} and
# {context}, and tells the model to answer "I don't know" when the context
# does not contain the answer.
# NOTE(review): presumably the chain fills {context} with the retrieved chunks
# and {question} with the user query — confirm against the chain configuration.
prompt_RAG = """You have extensive expertise in the field of life sciences,
      demonstrating a robust understanding of the subject matter. Address the questions according to the context provided below If the
      question cannot be answered using the information provided answer with "I don't know"

      Question:
      {question}

      Context:
      {context}

      Helpful Response :
       """
# Wrap the raw text in a PromptTemplate so it can be handed to the chain.
prompt = PromptTemplate.from_template(prompt_RAG)

## Generator 
Within the generator component, we embrace the ChatGPT model for its cost-effectiveness and superior performance over alternatives. Renowned for its capacity to yield top-notch results while minimizing hallucination, ChatGPT emerges as the optimal choice for generating precise and relevant responses. This strategic preference guarantees output quality and reliability, effectively meeting the demand for accurate and contextually appropriate information. By selecting ChatGPT, we ensure that the generated content not only maintains high standards but also aligns seamlessly with the context, fulfilling the essential requirement for dependable information dissemination.


In [39]:
## Generator
# Build a "stuff" RetrievalQA chain over the Chroma retriever, using the custom
# prompt, and ask it to return the source documents alongside the answer.
rag = RetrievalQA.from_chain_type(
    llm=Model_llm,
    chain_type="stuff",
    retriever=chroma_retriever,
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

# Calling the chain object directly (`rag({...})`) is deprecated and emits a
# deprecation warning; `invoke` is the supported entry point and takes the
# same input dict.
result = rag.invoke({"query": query})

  warn_deprecated(


## RAG cycle
In this integration, we have consolidated the essential components of the RAG (Retrieval-Augmented Generation) cycle to ensure seamless execution of the code. These components include:

1. **DataLoader**: Responsible for loading and preparing the data, ensuring it is in a format suitable for the retrieval and generation processes. This may involve data cleaning, preprocessing, and formatting tasks to optimize data quality.

2. **Retriever**: Utilizes databases like Chroma DB or FAISS Retriever to efficiently find relevant documents or information based on the query, facilitating the retrieval of pertinent data. It employs advanced search algorithms to retrieve the most relevant results.

3. **Generator**: Employs a model, such as ChatGPT, to generate responses based on the retrieved information. This component is crucial for producing high-quality, contextually relevant outputs with minimal hallucination. It utilizes state-of-the-art language models to generate responses that align with the context and intent of the query.

By integrating these components, we create a comprehensive system that streamlines the process of retrieving and generating responses to queries, enhancing the overall efficiency and effectiveness of the RAG cycle. This integrated approach ensures that the system delivers accurate and relevant information while minimizing the risk of generating incorrect or misleading responses.


In [41]:
#Rag cycle
class RAG_cycle:
    """Bundle the pieces needed for one retrieval-augmented generation run.

    Args:
        Model: Embedding model handle; stored on the instance but not used by
            the methods of this class.
        retriever: Retriever handed to the RetrievalQA chain.
        docs: Chunked source texts; stored on the instance but not used by
            the methods of this class.
        query (str): Question that `model_chain` sends through the chain.
        llms: Language model handed to the RetrievalQA chain.
    """

    def __init__(self, Model, retriever, docs, query,llms):
        self.Model = Model
        self.retriever = retriever
        self.docs = docs
        self.query = query
        self.llms = llms

    def prompt_temp(self):
        """Return the PromptTemplate with {question} and {context} placeholders."""
        prompt_RAG = """You have extensive expertise in the field of life sciences,
      demonstrating a robust understanding of the subject matter. Address the questions according to the context provided below If the
      question cannot be answered using the information provided answer with "I don't know"

      Question:
      {question}

      Context:
      {context}

      Helpful Response :
       """
        return PromptTemplate.from_template(prompt_RAG)

    def model_chain(self,prompt):
        """Build a "stuff" RetrievalQA chain and run it on `self.query`.

        Args:
            prompt: Prompt template forwarded to the chain via `chain_type_kwargs`.

        Returns:
            Whatever the chain returns for the query; source documents are
            requested through `return_source_documents=True`.
        """
        rag = RetrievalQA.from_chain_type(
            llm=self.llms,
            chain_type="stuff",
            retriever=self.retriever,
            chain_type_kwargs={"prompt": prompt},
            return_source_documents=True,
        )
        # Calling the chain object directly is deprecated; `invoke` is the
        # supported entry point and takes the same input dict.
        return rag.invoke({"query": self.query})

In [43]:
# Run the RAG cycle with the Chroma retriever.
obj = RAG_cycle(embedding,chroma_retriever,chunked_doc,query,Model_llm)
# Build the prompt template, then send the query through the QA chain.
prompt = obj.prompt_temp()
# `generator` holds what model_chain returns for this retriever.
# NOTE(review): the FAISS cell below rebinds the same name, so inspect or copy
# this result before running it.
generator= obj.model_chain(prompt)

In [44]:
# Run the same RAG cycle, this time with the FAISS retriever.
obj = RAG_cycle(embedding,faiss_retriever,chunked_doc,query,Model_llm)
# Build the prompt template, then send the query through the QA chain.
prompt = obj.prompt_temp()
# Rebinds `generator` (and `obj`/`prompt`) from the Chroma run above.
generator= obj.model_chain(prompt)

## Benchmarking
In the benchmarking phase, we focus on evaluating the performance of two critical components of our system: the retrieval mechanism and the generator.

1. **Retrieval Benchmarking**: For the retrieval component, we utilize the `langchain` metric `load_evaluator`, which is designed to assess the effectiveness of retrieval systems. This evaluator is particularly useful for comparing different retrieval strategies and their ability to accurately fetch relevant documents or information based on a given query. In the context of our system, we are also considering the use of a critic, as suggested by Claude, to further refine the evaluation process. A critic can provide a more nuanced assessment by evaluating not just the retrieval's ability to fetch relevant documents but also its capacity to exclude irrelevant ones, thereby offering a more comprehensive view of the retrieval system's performance.

2. **Generator Benchmarking**: For the generator component, we employ BERTScore as our evaluation metric. BERTScore assesses the quality of generated text by comparing it with a reference text (here, the ground-truth answer) using contextual token embeddings produced by a BERT model. It reports precision, recall, and F1 score, each obtained by matching every token of one text to its most similar token in the other. Similarity is measured with cosine similarity, which quantifies the cosine of the angle between two embedding vectors. Because the comparison happens in embedding space rather than on exact word overlap, the metric captures the semantic similarity between the generated text and the target text, making it a suitable choice for benchmarking the generator's performance.

By benchmarking both the retrieval and generator components using these metrics, we aim to gain insights into the strengths and weaknesses of our system. This comprehensive evaluation will help us identify areas for improvement and ensure that our system delivers high-quality, relevant, and accurate responses.


In [58]:
import pandas as pd
# Load the synthetic question/answer benchmark set.
df = pd.read_csv("synthetic_data_2.csv", encoding='unicode_escape')

# Discard the three unnamed extra columns. Using drop(columns=...) with
# assignment instead of inplace=True keeps the cell a plain
# "read -> transform -> bind" sequence; the former mid-cell `df.head()` was
# removed because a non-final expression displays nothing.
df = df.drop(columns=["Unnamed: 5", "Unnamed: 6", "Unnamed: 7"])


To evaluate the model effectively, we generate **synthetic data** that encompasses a wide variety of questions. This approach ensures that the model is tested in a realistic and comprehensive environment, covering a broad spectrum of potential queries. By creating synthetic data that mimics real-world questions, we can assess the model's ability to handle a diverse range of scenarios. This method allows us to evaluate the model's performance across different domains, contexts, and question types, providing a more accurate and reliable assessment of its capabilities. Additionally, it helps in identifying any biases or limitations in the model's understanding, enabling us to make targeted improvements to enhance its overall effectiveness.


In [63]:
df.head()

Unnamed: 0,question,contexts,ground_truth,prediction,retrieval context
0,What is the primary motivation for incorporati...,"[In this section, we present an overview of ou...",The primary motivation for incorporating condi...,The primary motivation for incorporating cond...,and CBAM contributes to notable colorization o...
1,How does the Pix2Pix conditional GAN model co...,"[In this section, we present an overview of ou...",The Pix2Pix conditional GAN model contributes ...,The Pix2Pix conditional GAN model contributes...,To make the model more robust to pay more atte...
2,How does generative AI address the challenges ...,[Generative AI in Life Science Domain Generati...,Generative AI can create synthetic datasets th...,Generative AI can address the challenges asso...,"of critical thinking skills, scientific inquir..."
3,"In drug discovery and design, how do generativ...",[Generative AI in Life Science Domain Generati...,Generative AI models in drug discovery generat...,Generative AI models significantly contribute...,"f critical thinking skills, scientific inquiry..."
4,How did the study manipulate the training data...,[We showed above that idpSAM performs better t...,To assess the influence of system diversity on...,The study manipulated the training data by va...,of training data translates into better perfor...


In [59]:
# Normalise the reference column name to `ground_truth`, the name that
# calculate_bert_score reads.
# NOTE(review): the df.head() output shown above already lists `ground_truth`,
# so this looks like a no-op for the current CSV — confirm.
df.rename(columns={"ground_truths": "ground_truth"}, inplace=True)

In [60]:
def string_to_sequence(input_string):
    """Wrap the given value in a one-element list.

    The value is kept whole; it is not split into characters.
    """
    return [input_string]

df['contexts'] = df['contexts'].apply(string_to_sequence)
# df['ground_truth'] = df['ground_truth'].apply(string_to_sequence)

In [64]:
from transformers import BertTokenizer, BertModel
from bert_score import BERTScorer

def calculate_bert_score(df):
    """Score the predictions against the references with BERTScore.

    Args:
        df (DataFrame): Must contain a 'ground_truth' column (references) and
            a 'prediction' column (candidates).

    Returns:
        tuple: Mean precision, mean recall and mean F1 over all rows, in that order.
    """
    # Coerce every cell to str before scoring.
    truth_texts = list(map(str, df['ground_truth'].tolist()))
    answer_texts = list(map(str, df['prediction'].tolist()))

    bert_scorer = BERTScorer(model_type='bert-base-uncased')

    # score() is called with the candidates first and the references second.
    precision, recall, f1 = bert_scorer.score(answer_texts, truth_texts)

    return precision.mean(), recall.mean(), f1.mean()

bert_precision, bert_recall, bert_f1 = calculate_bert_score(df)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


The provided BERT score values (precision: 0.4535, recall: 0.4330, F1 score: 0.4338) offer insights into the quality of the generated text by the model:

- Precision (0.4535): On average, each token of the generated answer has a cosine similarity of about 0.45 with its best-matching token in the reference answer. This is a similarity score rather than a percentage of correct answers; a higher precision means that less of the generated text is unsupported by the reference, i.e. the output is more relevant and selective.

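- Recall (0.4330): On average, each token of the reference answer has a cosine similarity of about 0.43 with its best-matching token in the generated answer, so the model captures only a portion of the relevant information in the target text. Improved recall would mean a better ability to include all pertinent details.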

- F1 Score (0.4338): This metric, balancing precision and recall, stands at 0.4338, implying a moderate accuracy with a slight preference towards precision. This suggests effectiveness in generating relevant text, yet there's room for improvement to achieve a better balance between precision and recall for more accurate responses.

Overall, the model exhibits moderate performance, slightly favoring precision. Enhancing recall would ensure a more comprehensive inclusion of relevant information, while balancing precision and recall could improve overall accuracy and completeness of generated content.


In [65]:
print(bert_precision, bert_recall, bert_f1)

tensor(0.4535) tensor(0.4330) tensor(0.4338)


In [70]:
from langchain.evaluation import load_evaluator
from langchain_anthropic import ChatAnthropic
# Never hard-code the key in the notebook: reuse a key that is already exported
# in the environment and only prompt (without echoing) when it is missing.
# Unconditionally assigning "" would wipe out a valid exported key.
if not os.environ.get('ANTHROPIC_API_KEY'):
    from getpass import getpass
    os.environ['ANTHROPIC_API_KEY'] = getpass("Anthropic API key: ")

# Claude is the LLM behind the "labeled_criteria" evaluator, which grades each
# prediction for correctness against a supplied reference.
model = ChatAnthropic(model='claude-3-opus-20240229')
evaluator = load_evaluator("labeled_criteria",llm=model, criteria="correctness")
def evaluate_strings(row):
    """Grade one benchmark row with the module-level `evaluator`.

    Args:
        row: Mapping-like row providing 'question', 'prediction' and
            'retrieval context'.

    Returns:
        The "score" entry of the evaluator's result.
    """
    grading_inputs = {
        "input": row['question'],
        "prediction": row['prediction'],
        # The retrieved context, not the ground-truth answer, is the reference.
        "reference": row['retrieval context'],
    }
    verdict = evaluator.evaluate_strings(**grading_inputs)
    return verdict["score"]

# Apply the function to each row of the DataFrame
df['eval_result'] = df.apply(evaluate_strings, axis=1)

In [71]:
df.head()

Unnamed: 0,question,contexts,ground_truth,prediction,retrieval context,eval_result
0,What is the primary motivation for incorporati...,"[In this section, we present an overview of ou...",The primary motivation for incorporating condi...,The primary motivation for incorporating cond...,and CBAM contributes to notable colorization o...,1
1,How does the Pix2Pix conditional GAN model co...,"[In this section, we present an overview of ou...",The Pix2Pix conditional GAN model contributes ...,The Pix2Pix conditional GAN model contributes...,To make the model more robust to pay more atte...,1
2,How does generative AI address the challenges ...,[Generative AI in Life Science Domain Generati...,Generative AI can create synthetic datasets th...,Generative AI can address the challenges asso...,"of critical thinking skills, scientific inquir...",1
3,"In drug discovery and design, how do generativ...",[Generative AI in Life Science Domain Generati...,Generative AI models in drug discovery generat...,Generative AI models significantly contribute...,"f critical thinking skills, scientific inquiry...",1
4,How did the study manipulate the training data...,[We showed above that idpSAM performs better t...,To assess the influence of system diversity on...,The study manipulated the training data by va...,of training data translates into better perfor...,1
