In [9]:
from pypdf import PdfReader

def load_pdf(file_path):
    """
    Extract the text of every page of a PDF and return it as one string.

    Parameters:
    - file_path (str): Location of the PDF file to read.

    Returns:
    - str: The text of all pages, joined in page order with no separator
      inserted between pages.
    """
    pdf = PdfReader(file_path)

    # Pull the text out of each page in turn and glue the pieces together.
    return "".join(page.extract_text() for page in pdf.pages)

In [10]:
# Read the whole of "try.pdf" into a single string (path is relative to the
# notebook's working directory).
pdf_text = load_pdf(file_path="try.pdf")


In [22]:
import re
def split_text(text: str) -> list:
    r"""
    Split a document into paragraph-sized chunks.

    A paragraph boundary is the literal sequence newline, space, newline
    ("\n \n"). Chunks are returned exactly as they appear in the input (no
    stripping); only chunks that carry no content at all are discarded.

    Parameters:
    - text (str): The input text to be split.

    Returns:
    - List[str]: The chunks in document order, excluding chunks that are
      empty or consist solely of whitespace.
    """
    chunks = re.split('\n \n', text)

    # Consecutive separators leave behind fragments such as "" or " ".
    # They hold no text, so keeping them would only add meaningless entries
    # to whatever consumes the chunk list.
    return [chunk for chunk in chunks if chunk.strip()]

# Break the extracted PDF text into paragraph chunks for embedding/storage.
chunked_text = split_text(text=pdf_text)

In [23]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os

class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Chroma embedding function backed by the Gemini embedding API.

    Instances are callable: given a batch of documents they return the
    embeddings produced by the "models/embedding-001" model, requested with
    the "retrieval_document" task type.

    The API key is read from the GEMINI_API_KEY environment variable on every
    call.
    """
    def __call__(self, input: Documents) -> Embeddings:
        """
        Embed the given documents.

        Parameters:
        - input (Documents): A collection of documents to be embedded.

        Returns:
        - Embeddings: The "embedding" entry of the Gemini API response.

        Raises:
        - ValueError: If GEMINI_API_KEY is unset or empty.
        """
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")

        genai.configure(api_key=api_key)
        response = genai.embed_content(
            model="models/embedding-001",
            content=input,
            task_type="retrieval_document",
            title="Custom query",
        )
        return response["embedding"]

In [24]:
import chromadb
from typing import List
def create_chroma_db(documents:List, path:str, name:str):
    """
    Build a new persistent Chroma collection and fill it with the documents.

    The collection is created with GeminiEmbeddingFunction as its embedding
    function. Documents are added one at a time, and each one's id is its
    position in `documents` rendered as a string ("0", "1", ...).

    Parameters:
    - documents: An iterable of documents to be added to the collection.
    - path (str): Where the Chroma database is persisted.
    - name (str): The name of the collection to create.

    Returns:
    - Tuple[chromadb.Collection, str]: The newly created collection together
      with the name it was created under.
    """
    client = chromadb.PersistentClient(path=path)
    collection = client.create_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
    )

    # One add() call per document, keyed by its index in the input.
    for position, document in enumerate(documents):
        collection.add(documents=document, ids=str(position))

    return collection, name

In [27]:
# Create the "rag_experiment" collection and load every paragraph chunk into it.
# NOTE(review): `path` is where Chroma persists its data; "new.pdf" reads like
# a file name rather than a storage location - confirm this is intended.
db,name =create_chroma_db(documents=chunked_text, 
                          path="new.pdf", # replace with your own storage path
                          name="rag_experiment")

In [28]:
def load_chroma_collection(path, name):
    """
    Open a collection that already exists in a persistent Chroma database.

    The collection is fetched with GeminiEmbeddingFunction attached as its
    embedding function.

    Parameters:
    - path (str): Where the Chroma database is persisted.
    - name (str): The name of the collection to open.

    Returns:
    - chromadb.Collection: The requested collection.
    """
    return chromadb.PersistentClient(path=path).get_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
    )

In [30]:
# Re-open the persisted "rag_experiment" collection; this rebinds `db`.
db=load_chroma_collection(path="new.pdf", name="rag_experiment")


In [33]:
def get_relevant_passage(query, db, n_results):
    """
    Look up the stored documents that the collection returns for a query.

    Parameters:
    - query: The query text; it is sent to the collection as a one-item batch.
    - db: The collection to search (must provide a `query` method).
    - n_results: How many results to request.

    Returns:
    - The documents returned for `query`, i.e. the first (and only) entry of
      the 'documents' field of the query response.
    """
    results = db.query(query_texts=[query], n_results=n_results)

    # A single query was submitted, so its documents sit at index 0.
    return results['documents'][0]

# Example usage: ask the collection for 3 passages matching the query.
relevant_text = get_relevant_passage(query="how to make corba",db=db,n_results=3)

In [34]:
# Show the retrieved passages (a list of strings) to sanity-check retrieval.
print(relevant_text)

[' \uf0a1 Corba Component Model \nThe component model specifies how interfaces should be defined and the elements that should \nbe included in an int erface definition ', 'Elements of components model ', ' \n6.2. Components \nComponents provide a service wit hout regard to where the compon ent is executing or its \nprogramming language \n\uf0a1 A component is an independent executable entity that can be mad e up of one or more \nexecutable objects; \n\uf0a1 The component interface is publis hed and all interactions are t hrough the published \ninterface; \nA software component is a software element that  conforms to a component model and can be \nindependently deployed and composed without modification according to a composition \nstandard. -  Councill and Heinmann:  A softw a\nexplicit c\nsubject t o']


In [87]:
def make_rag_prompt(query, full_marks, ideal_answer, relevant_text, students_answer):
    """
    Generates a grading prompt for a teacher checking an engineering exam paper.

    Parameters:
    - query: The exam question.
    - full_marks: The full marks allocated for the question.
    - ideal_answer: The ideal or model answer for the question. May be None
      or empty, in which case the prompt states 'Not provided' instead of
      presenting a missing value as if it were an answer.
    - relevant_text (str): The relevant reference text to be used in grading.
    - students_answer: The answer provided by the student.

    Returns:
    - A formatted prompt that can be used to grade the student's answer.
    """
    def _flatten(value):
        # Remove quote characters and fold newlines into spaces so the text
        # stays on one line inside the single-quoted prompt fields. This
        # strips characters; it does not escape them.
        return value.replace("'", "").replace('"', "").replace("\n", " ")

    reference_text = _flatten(relevant_text)

    # Without this, None would be interpolated as the literal text 'None'
    # (and "" as an empty quoted answer), which the model can mistake for a
    # real ideal answer to compare against.
    ideal_answer_text = _flatten(ideal_answer) if ideal_answer else "Not provided"

    # Format the grading prompt with all the necessary information
    prompt = f"""
    You are a teacher checking Bachelor's in Engineering exam papers. You will be given a question, its full marks, the ideal answer, 
    the relevant reference text, and the answer given by the student. Your task is to grade the student's answer strictly, keeping in mind 
    the full marks allocated for the question. If the ideal answer and relevant text lacks important information, use your own judgment and intuition 
    to evaluate the answer based on the provided reference text.

    Be sure to evaluate the completeness, accuracy, and clarity of the student's response while being fair and consistent with the marks.

    QUESTION: '{query}'
    Full Marks: {full_marks}
    Ideal Answer: '{ideal_answer_text}'
    Relevant Reference Text: '{reference_text}'
    
    Student's Answer: '{students_answer}'

    GRADE:
    """

    return prompt


In [88]:
import google.generativeai as genai
def generate_answers(prompt):
    """
    Send a prompt to the 'gemini-pro' model and return the text of its reply.

    The API key is read from the GEMINI_API_KEY environment variable on every
    call.

    Parameters:
    - prompt: The prompt to submit to the model.

    Returns:
    - The `text` of the generated response.

    Raises:
    - ValueError: If GEMINI_API_KEY is unset or empty.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")

    genai.configure(api_key=api_key)
    response = genai.GenerativeModel('gemini-pro').generate_content(prompt)
    return response.text

In [91]:
def the_final_function(db, query, full_marks, students_answer, ideal_answer=None, n_results=3):
    """
    Grades a student's answer using reference text retrieved from the Chroma database.

    Retrieves the passages for the question, builds the grading prompt with
    make_rag_prompt, and returns the model's evaluation from generate_answers.

    Parameters:
    - db: The Chroma database for retrieving relevant text.
    - query: The exam question.
    - full_marks: The full marks allocated for the question.
    - students_answer: The answer provided by the student.
    - ideal_answer: The ideal answer, if available (defaults to None); it is
      passed to make_rag_prompt unchanged.
    - n_results: The number of relevant text chunks to retrieve (default is 3).

    Returns:
    - The generated grade or evaluation based on the prompt, or the fixed
      message "No relevant information found for grading." when retrieval
      returns nothing.
    """
    # Retrieve the top N relevant text chunks for the query
    relevant_text_chunks = get_relevant_passage(query, db, n_results=n_results)

    # Without reference text there is nothing to grade against, so skip the
    # model call entirely.
    if not relevant_text_chunks:
        return "No relevant information found for grading."

    # Combine the retrieved text chunks into one passage
    relevant_text = " ".join(relevant_text_chunks)

    # Build the grading prompt from the question, marks, answers and reference text
    prompt = make_rag_prompt(
        query,
        full_marks,
        ideal_answer=ideal_answer,
        relevant_text=relevant_text,
        students_answer=students_answer,
    )

    # Ask the model for the grade / evaluation
    return generate_answers(prompt)


In [None]:
# Example: grade a student's answer to a 5-mark question. No ideal answer is
# supplied here (empty string), so grading rests on the retrieved reference text.
the_final_function(db,"what is corba?",full_marks=5,ideal_answer='',students_answer="CORBA is the Common Object Request Broker Architecture, a specification that defines how distributed objects can communicate with each other in a heterogeneous network environment."  )

CORBA is the Common Object Request Broker Architecture, a specification that defines how distributed objects can communicate with each other in a heterogeneous network environment.


"5 out of 5\n\nThe student's answer is complete, accurate, and clear. It matches the ideal answer perfectly. The student has demonstrated a good understanding of the concept of CORBA."