In [3]:
from pypdf import PdfReader


def load_pdf(file_path):
    """
    Reads the text content from a PDF file and returns it as a list of strings,
    where each string corresponds to the text of a single page.

    Parameters:
    - file_path (str): The file path to the PDF file.

    Returns:
    - List[str]: A list containing the text content of each page in the PDF.
    """
    reader = PdfReader(file_path)

    # Extract text from each page and store it in a list
    text_by_page = [page.extract_text() for page in reader.pages]

    return text_by_page

In [4]:
import re
from typing import List
from itertools import chain

def split_text(text_pages: List[str]) -> List[str]:
    """
    Splits the text of each page into a list of non-empty substrings based on paragraphs ("\n\n").
    Returns a flat list of strings (one for each paragraph across all pages).

    Parameters:
    - text_pages (List[str]): A list where each element is the text of a single page.

    Returns:
    - List[str]: A flat list containing non-empty substrings (paragraphs) from all pages.
    """
    # Flatten the list of lists into a single list
    return list(chain.from_iterable(
        [para for para in re.split('\n\n', page_text) if para.strip()] 
        for page_text in text_pages
    ))

pdf_text = load_pdf(file_path="eco.pdf")

# Get the flattened list of paragraphs
chunked_text = split_text(text_pages=pdf_text)

print(len(chunked_text))


114


In [5]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os

class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Custom embedding function using the Gemini AI API for document retrieval.

    This class extends the EmbeddingFunction class and implements the __call__ method
    to generate embeddings for a given set of documents using the Gemini AI API.

    Parameters:
    - input (Documents): A collection of documents to be embedded.

    Returns:
    - Embeddings: Embeddings generated for the input documents.
    """
    def __call__(self, input: Documents) -> Embeddings:
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        embd    = genai.embed_content(model=model,
                                   content=input,
                                   task_type="retrieval_document",
                                   title=title)["embedding"]
        print(embd)
        return embd

In [6]:
import chromadb
from typing import List
def create_chroma_db(documents:List, path:str, name:str):
    """
    Creates a Chroma database using the provided documents, path, and collection name.

    Parameters:
    - documents: An iterable of documents to be added to the Chroma database.
    - path (str): The path where the Chroma database will be stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - Tuple[chromadb.Collection, str]: A tuple containing the created Chroma Collection and its name.
    """
    
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        print(i)
        db.add(documents=d, ids=[str(i)])

    return db, name

In [106]:
import os

# Set the environment variable
os.environ["gemini_api_key"] = ""

pdf_text = load_pdf(file_path="eco.pdf")


chunked_text = split_text(text_pages=pdf_text)

db,name =create_chroma_db(documents=chunked_text,  path="Economic.pdf", #replace with your path
                          name="rag_experiment")

0
[[0.03475225, -0.041092176, -0.06708627, 0.008069091, 0.107633, 0.033061095, 0.06721708, 0.0036304765, -0.014235088, 0.083290726, 0.012606201, 0.029297054, -0.032860983, 0.0063524605, -0.007607358, -0.028581727, 0.03031133, -0.015407769, 0.024703054, -0.09157979, -0.0185828, 0.034944963, -0.004757188, 0.0010763227, 0.02688079, 0.0047610826, -0.03354409, -0.07071218, -0.03569696, 0.026199173, -0.07210127, 0.010317157, -0.051475957, 0.009786544, 0.0043305606, 0.03501536, 0.004381824, 0.02171203, 0.022955494, 0.065032445, -0.052415073, 0.006156888, 0.001991089, -0.04101181, -0.01658624, -0.0392706, 0.006973509, 0.030842163, 0.051466532, -0.06103296, 0.008873823, 0.031696953, 0.070525706, 0.0156014, -0.01628042, -0.045790784, 0.030046385, -0.055154257, -0.034052845, 0.028051645, 0.0067170192, -0.0073248064, -0.007345621, 0.030215368, 0.0009017215, -0.10732259, -0.07420497, 0.03510307, 0.06165107, 0.0073126955, -0.004967598, -0.03783778, 0.048266627, -0.029582644, -0.036898702, -0.0916057

In [1]:
def load_chroma_collection(path, name):
    """
    Loads an existing Chroma collection from the specified path with the given name.

    Parameters:
    - path (str): The path where the Chroma database is stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - chromadb.Collection: The loaded Chroma Collection.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    return db

[0.061793126, -0.026895914, -0.053126033, -0.031423815, 0.0009166275, -0.0022777421, 0.022070201, -0.018893186, 0.02978871, 0.02701744, -0.018954588, 0.019026665, -0.059850425, -0.004122646, -0.019843316, -0.03185287, -0.0049529113, -0.028315166, -0.014075361, -0.011312237, 0.027588522, 0.011682636, -0.05070751, -0.01471039, 0.030218303, -0.028374307, -0.009508844, -0.021729477, -0.020495428, 0.016852077, -0.018496567, 0.0013552041, -0.016671715, 0.009485285, 0.026152847, -0.06680192, 0.010716877, 0.0077826334, -0.020552522, 0.0007314454, 0.003967336, -0.059546135, -0.051566582, 0.012487974, 0.035073392, -0.0003822633, -0.012866104, -0.0019021706, -0.0069007627, -0.059979115, 0.029266499, 0.020707613, 0.06977824, -0.018300544, -0.040920842, -0.03989515, 0.032942392, -0.033683795, -0.026755376, 0.008335367, -0.016114943, 0.0127893705, 0.03487377, -0.01172662, -0.0088305725, -0.08846688, -0.022089414, 0.06523572, 0.04772638, 0.013917293, -0.01571729, -0.040032417, 0.013135482, -0.0350970

In [7]:
db=load_chroma_collection(path="Economic.pdf", name="rag_experiment")


In [116]:
x=db.get(ids=["1"])
print(x)

{'ids': ['1'], 'embeddings': None, 'metadatas': [None], 'documents': ['Manual of Engineering Economy – Nanda Shakya  \n \n \n \n \nManual of Engineering Economy – Nanda Shakya   Page 2 \n 1.0   \nENGINEERING     \nECONOMICS  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n Economics is defined as the study of allocation of scarce resources \namong unlimited ends (or wants).  \nOur wants are unlimited or at least increasing ever and to satisfy all \nthese wants, we need unlimited supply of productive resources which \ncould provide necessary goods and services to the community. \nHowever, resources are scarce i.e. limited in supply and obtained at \nsome cost. In other words, resources are scarce in relation to its needs \nTherefore, scarce resou rces should be used wisely judiciously and \nmore effectively at optimum level, minimizing the cost and maximizing \nprofit and benefit without compromising the quality of product or \nservice.  \nAll engineering decisions involve numbe

In [11]:
def get_relevant_passage(query, db, n_results=3):
  passage = db.query(query_texts=query, n_results=n_results)['documents']
  passage = list(chain.from_iterable(passage))

  return passage
import os

# Set the environment variable
#Example usage
relevant_text = get_relevant_passage(query="Derive the formula for geometirc graient series",db=db,n_results=10)


[[0.057350665, -0.024703616, -0.031601585, 0.031819142, 0.06156529, -0.008111587, 0.03908952, -0.021887206, 0.024869284, 0.07725628, -0.009747691, 0.0043075318, -0.0711767, -0.023337644, 0.019899582, -0.012090507, 0.061745167, -0.016518375, 0.02249984, -0.039825182, 0.030336652, 0.00437925, -0.0015720305, -0.019114586, 0.0074127694, -0.044184383, 0.025864463, -0.029647358, -0.047900822, 0.031128626, -0.027386736, -0.018280702, -0.0444184, 0.0014921402, 0.007929833, -0.044191092, -0.0045455666, -0.009217904, -0.031867035, 0.027608527, 0.025553439, -0.033468798, -0.0010393847, 0.034995243, 0.021039188, -0.02757494, -0.00023045103, 0.057442065, 0.0072641745, -0.05857685, 0.024124198, -0.022302652, 0.0760271, -0.050894238, 0.0070804753, -0.08840143, 0.023568744, -0.04961609, 0.03266204, 0.014609749, 0.017822737, 0.0071914783, 0.0027192011, 0.015403913, 0.0087768305, -0.07748225, -0.029011512, 0.018659282, 0.027910095, -0.023998218, -0.0020428558, -0.037835322, 0.07725188, -0.03940227, 0.00

In [12]:

print(relevant_text)


['Manual of Engineering Economy – Nanda Shakya  \n \n \n \n \nManual of Engineering Economy – Nanda Shakya   Page 18 \n 2.5.1.5  \nGEOMETRIC  \nGRADIENT  \nSERIES  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n If an amount A1 is invested or paid at the end of interest period 1 \nchanges (increases or decreases) by a constant percentage (g%)  at the \nend of each periods for N interest periods at interest rate i% per \nperiod,  \nif i ≠ g   \nIts present worth P would be    \nP= 𝐀𝟏\n( 𝟏 + 𝐢 )+ 𝐀𝟏( 𝟏 + 𝐠 )𝟏\n( 𝟏 + 𝐢 )𝟐+𝐀𝟏( 𝟏 + 𝐠)𝟐\n( 𝟏 + 𝐢 )𝟑+𝐀𝟏( 𝟏 + 𝐠 )𝟑\n( 𝟏 + 𝐢 )𝟒+⋯  𝐀𝟏( 𝟏 + 𝐠 )𝐍−𝟐\n( 𝟏 + 𝐢 )𝐍−𝟏+\n𝐀𝟏( 𝟏 + 𝐠 )𝐍−𝟏\n( 𝟏 + 𝐢 )𝐍….(𝟏)       \n            \nMultiplying both side by    (𝟏 +𝐠 )\n(𝟏 + 𝐢) \nP (𝟏 +𝐠 )\n(𝟏 + 𝐢) =𝐀𝟏( 𝟏 + 𝐠 )𝟏\n( 𝟏 + 𝐢 )𝟐+𝐀𝟏( 𝟏 + 𝐠)𝟐\n( 𝟏 + 𝐢 )𝟑+𝐀𝟏( 𝟏 + 𝐠 )𝟑\n( 𝟏 + 𝐢 )𝟒+⋯                +\n𝐀𝟏( 𝟏 + 𝐠 )𝐍−𝟏\n( 𝟏 + 𝐢 )𝐍+𝐀𝟏( 𝟏 + 𝐠 )𝐍\n( 𝟏 + 𝐢 )𝐍+𝟏….(𝟐)  \nSubtracting (2) by (1)  \n𝐏−𝐏 (𝟏 +𝐠 )\n(𝟏 + 𝐢) =𝐀𝟏\n( 𝟏 + 𝐢 )−𝐀𝟏( 𝟏 + 𝐠 )𝐍\n( 𝟏 + 𝐢 )𝐍+𝟏               

In [146]:
def make_rag_prompt(query, full_marks, ideal_answer, relevant_text, students_answer):
    """
    Generates a grading prompt for a teacher checking an engineering exam paper.
    
    Parameters:
    - query: The exam question.
    - full_marks: The full marks allocated for the question.
    - ideal_answer: The ideal or model answer for the question.
    - relevant_text: The relevant reference text to be used in grading.
    - students_answer: The answer provided by the student.

    Returns:
    - A formatted prompt that can be used to grade the student's answer.
    """
    # Escape special characters to ensure proper formatting
    escaped_relevant_text = relevant_text.replace("'", "").replace('"', "").replace("\n", " ")
    if ideal_answer:
        ideal_answer = ideal_answer.replace("'", "").replace('"', "").replace("\n", " ")

    # Format the grading prompt with all the necessary information
    prompt = f"""
    You are a teacher checking Bachelor's in Engineering exam papers. You will be given a question, its full marks, the ideal answer, 
    the relevant reference text, and the answer given by the student. Your task is to grade the student's answer strictly, keeping in mind 
    the full marks allocated for the question. If the ideal answer and relevant text lacks important information or is not present, use your own judgment and intuition 
    to evaluate the answer.

    Be sure to evaluate the completeness, accuracy, and clarity of the student's response while being fair and consistent with the marks.

    QUESTION: '{query}'
    Full Marks: {full_marks}
    Ideal Answer: '{ideal_answer}'
    Relevant Reference Text: '{escaped_relevant_text}'
    
    Student's Answer: '{students_answer}'

    GRADE:
    """

    return prompt


In [147]:
import google.generativeai as genai
def generate_answers(prompt):
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel('gemini-pro')
    answer = model.generate_content(prompt)
    return answer.text

In [148]:
def the_final_function(db, query, full_marks, students_answer, ideal_answer=None, n_results=3):
    """
    Generates an answer based on a student's response, relevant reference text, and an ideal answer.
    
    Parameters:
    - db: The Chroma database for retrieving relevant text.
    - query: The exam question.
    - full_marks: The full marks allocated for the question.
    - students_answer: The answer provided by the student.
    - ideal_answer: The ideal answer, if available (defaults to None).
    - n_results: The number of relevant text chunks to retrieve (default is 3).
    
    Returns:
    - The generated grade or evaluation based on the prompt.
    """
    
    # Retrieve the top N relevant text chunks for the query
    relevant_text_chunks = get_relevant_passage(query, db, n_results=n_results)
    
    # If no relevant text is found, return a default message
    # if not relevant_text_chunks:
    #      "No relevant information found for grading."

    # Combine the retrieved text chunks into one passage
    relevant_text=""
    if relevant_text_chunks:
        relevant_text = " ".join(relevant_text_chunks)
    
    
    print(ideal_answer)
    print(relevant_text)
    
    # Generate the grading prompt using the ideal answer (if available) and student answer
    prompt = make_rag_prompt(query, full_marks, ideal_answer=ideal_answer,  relevant_text=relevant_text, students_answer=students_answer)
    
    # Generate the answer or evaluation from the model based on the prompt
    answer = generate_answers(prompt)
    
    return answer


In [150]:
the_final_function(db,"write the formula for interest?",ideal_answer="",full_marks=5,students_answer="""i=pxtxr/100""")

[[0.06552106, -0.085833326, -0.030842869, 0.0031778654, 0.032490566, 0.027705185, 0.08586573, -0.013249637, 0.014884033, 0.050556574, -0.011750648, 0.0013063372, -0.004395374, -0.0075439825, -0.016061168, 0.009568496, 0.010611396, -0.0015417684, -0.0049430337, -0.021431426, 0.04570701, -0.0028140196, -0.042008024, 0.0041392166, 0.06816833, -0.04635754, -0.012766386, -0.07873744, -0.049195692, 0.008897233, -0.030973578, 0.029301595, -0.02767131, -0.0015219199, 0.022192078, -0.03719065, -0.0051138196, 0.039926756, -0.032875914, 0.0393713, -0.0166, -0.012154261, -0.053922698, 0.011934627, 0.017676905, -0.012878251, -0.015896138, 0.02854291, -0.0036236327, -0.07821797, 0.025946504, 0.008421257, 0.093319744, -0.026310602, -0.00998766, -0.05943065, -0.0036578353, -0.050932534, 0.024344768, 0.011495029, -0.006501479, -0.01736309, -0.003823142, 0.0003873376, 0.009627019, -0.034530707, 0.010372694, 0.07207104, 0.025227543, 0.00043940046, -0.0018550659, -0.014942476, 0.053744383, 0.009418298, 0.

"2 out of 5\n\nThe student's answer, 'i=pxtxr/100', is a formula for calculating interest, where 'i' represents interest, 'p' represents principal, 't' represents time, and 'r' represents the rate of interest. This formula is generally used for simple interest calculations. \n\nWhile the formula provided by the student is technically correct for calculating simple interest, it does not represent the most general form of the interest formula, which should account for both simple and compound interest. The ideal answer, which was not provided in the given context, should have included both the simple interest formula and the compound interest formula, as well as an explanation of the differences between the two. \n\nTherefore, the student's answer deserves partial credit for providing the simple interest formula, but loses marks for not providing the compound interest formula and for not explaining the differences between the two."

<class 'str'>
