In [27]:
# MTEB: Massive Text Embedding Benchmark
# https://huggingface.co/spaces/mteb/leaderboard

# open source llm https://huggingface.co/google/gemma-7b-it
# Create a write token on Hugging Face to access the model
# gemma-7b-it 
# gemma-2b-it


In [28]:

from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model used to vectorise the query.
embedding_model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')
# embeddings = embedding_model.encode(sentences)
# Embed the user query; these notebook-level names are reused by later cells.
input_query = "What is Metabolism of Proteins"
query_embedding = embedding_model.encode(input_query)
# Keep a plain-list copy of the embedding; it is passed as the SQL parameter in fetch_data.
query_embedding_list = query_embedding.tolist()

# Sanity check: print the embedding length and its first five components.
print("query_embedding len", len(query_embedding_list))
print(query_embedding_list[:5])

query_embedding len 768
[0.05997343361377716, -0.013056949712336063, 0.023020777851343155, -0.02001880295574665, -0.007817837409675121]


In [29]:
# Show the difference between the two representations: the .tolist() copy is a
# plain Python list, while the value returned by encode() is a numpy array.
print(type(query_embedding_list))
print(type(query_embedding))

<class 'list'>
<class 'numpy.ndarray'>


In [30]:
import psycopg2
import numpy as np
import os
from dotenv import load_dotenv

def connect_to_db():
    """Open a PostgreSQL connection configured from environment variables.

    Reads DB_NAME, DB_USER, DB_PASS, DB_HOST and DB_PORT (after loading a
    ``.env`` file via ``load_dotenv``) and passes them to ``psycopg2.connect``.

    Returns:
        The psycopg2 connection on success, or None if psycopg2 raised an
        error while connecting (the error is printed, not re-raised).
    """
    # Pull settings from the .env file into the process environment first.
    load_dotenv()

    # psycopg2 keyword -> environment variable holding its value.
    env_names = {
        "dbname": "DB_NAME",
        "user": "DB_USER",
        "password": "DB_PASS",
        "host": "DB_HOST",
        "port": "DB_PORT",
    }
    connect_kwargs = {kwarg: os.getenv(env) for kwarg, env in env_names.items()}

    try:
        connection = psycopg2.connect(**connect_kwargs)
    except psycopg2.Error as e:
        print(f"Error connecting to PostgreSQL database: {e}")
        return None
    else:
        print("Connected to PostgreSQL database successfully!")
        return connection


In [31]:

# CREATE OR REPLACE FUNCTION get_embedding(	
# 	rows_limit int,
#     embedding_input vector(768)
# )
# RETURNS TABLE (id bigint, chunk varchar(400), embedding vector(768))
# LANGUAGE plpgsql
# AS $$
# BEGIN    
# 	RETURN QUERY SELECT ni.id,ni.chunk, ni.embedding FROM nutritionitems ni
# 	ORDER BY ni.embedding <=> embedding_input LIMIT rows_limit;
# END;
# $$;

# select get_embedding(2::int,'[5.99734336e-02,-1.30569497e-02]'::vector);


In [59]:
def fetch_data(conn, embedding=None, rows_limit=5):
    """Return the text chunks closest to a query embedding.

    Runs ``SELECT * from get_embedding(rows_limit, embedding::vector)`` and
    collects the second column of every returned row (the ``chunk`` column of
    the SQL function's result table).

    Args:
        conn: Open psycopg2 connection, or None (which is what
            ``connect_to_db`` returns when the connection attempt failed).
        embedding: Query embedding as a list of floats, or any object that
            exposes ``tolist()`` (for example a numpy array). When omitted,
            the notebook-level ``query_embedding_list`` is used, which keeps
            the original ``fetch_data(conn)`` call working.
        rows_limit: Maximum number of rows to request. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        list: Chunk values in the order the database returned them. An empty
        list is returned when there is no connection or the query fails, so
        callers always receive a list.
    """
    # Guard first: a failed connect_to_db() yields None, and calling
    # None.cursor() would raise an AttributeError the handler below never catches.
    if conn is None:
        print("Error fetching data: no database connection.")
        return []

    if embedding is None:
        # Backward-compatible default: the embedding computed earlier in the notebook.
        embedding = query_embedding_list
    if hasattr(embedding, "tolist"):
        # Array-like objects are converted to a plain list before being bound
        # as a query parameter.
        embedding = embedding.tolist()

    try:
        with conn.cursor() as cur:
            select_sql = """SELECT * from get_embedding(%s,%s::vector);"""
            cur.execute(select_sql, (rows_limit, embedding))
            # Each row is (id, chunk, embedding); only the chunk text is needed.
            return [row[1] for row in cur.fetchall()]
    except psycopg2.Error as e:
        print(f"Error fetching data: {e}")
        return []

# Retrieve the chunks most similar to the query, always releasing the connection.
conn = connect_to_db()
retrieved_chunks = []
if conn:
    try:
        # fetch_data may return None on a database error; normalise to a list
        # so the prompt-building step always receives an iterable.
        retrieved_chunks = fetch_data(conn) or []
    finally:
        # Close even if fetch_data raises, so the connection is never leaked.
        conn.close()
        print("Connection closed.")
print("retrieved_chunks",retrieved_chunks)

Connected to PostgreSQL database successfully!
Connection closed.
retrieved_chunks ['protein metabolism were proposed and  challenged, leading to the more or less contemporary  view which was established through the seminal work  of Rudolf Schoenheimer, conducted at Columbia', 'New York, 1969. Munro HN, ed. Mammalian Protein Metabolism, vol. IV. Academic  Press, New York, 1970. Waterlow JC, Garlick PJ, Millward DJ. Protein Turnover in  Mammalian Tissues and in the Whole', 'London, 1985. Lehninger AL, Nelson DL, Cox MM. Principles of Biochemistry, 2nd  edn. Worth, New York, 1993. Munro HN, ed. Mammalian Protein Metabolism, vol. III. Academic  Press, New York, 1969.', 'Nutrition and Metabolism of Proteins  53 either their speciﬁ c chemical properties or speciﬁ c  metabolic interrelationships. Examples of the former  are the facility of methionine to donate a methyl', 'more  critical for students to understand the physiology of  human protein metabolism at its various levels of  biologica

In [56]:
def prompt_formatter(query:str,
                    context_items: list[str])-> str:
    """Build the LLM prompt from a user query and retrieved context chunks.

    Args:
        query: The user's question; inserted after ``User query:``.
        context_items: Retrieved text chunks. Each one becomes a ``- `` bullet
            in the context section. ``None`` is treated as an empty list and
            ``None`` entries are skipped, so a failed or partially empty
            retrieval does not raise a TypeError here.

    Returns:
        str: The fully formatted prompt, ending with ``Answer:``.
    """
    # Chunks are joined as text, so drop missing entries and coerce the rest to str.
    chunks = [str(item) for item in (context_items or []) if item is not None]
    context = "- " + "\n- ".join(chunks)

    # NOTE: the leading indentation inside this template is part of the string
    # and is kept verbatim so the generated prompt text is unchanged.
    base_prompt = """Based on the following context items, please answer the query.
    Give yourself room to think by extracting relevant passages from the context before answering the query
    Don't return the thinking, only return answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    \nExample 1:
    Query: What are fat-soluble vitamins?
    Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and vitamin K. These vitamins dissolve in fat and are absorbed and stored in the body's fatty tissues
    \nExample 2:
    Query: What are the causes of type 2 diabetes?
    Answer: Insulin Resistance – Body’s cells don’t respond properly to insulin, leading to elevated blood glucose. Pancreatic Dysfunction – Over time, the pancreas can’t produce enough insulin to compensate for resistance.
    Now use the following context items to answer user query:
    {context}
    \nRelevant passages: <extract relevant passages from the context here>
    User query: {query}
    Answer:"""
    return base_prompt.format(context=context, query=query)

In [60]:
# Combine the original query with the retrieved chunks into the final prompt
# text, then print it for inspection.
formatted_prompt = prompt_formatter(input_query,retrieved_chunks)
print("formatted_prompt",formatted_prompt)

formatted_prompt Based on the following context items, please answer the query.
    Give yourself room to think by extracting relevant passages from the context before answering the query
    Don't return the thinking, only return answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    
Example 1:
    Query: What are fat-soluble vitamins?
    Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and vitamin K. These vitamins dissolve in fat and are absorbed and stored in the body's fatty tissues
    
Example 2:
    Query: What are the causes of type 2 diabetes?
    Answer: Insulin Resistance – Body’s cells don’t respond properly to insulin, leading to elevated blood glucose. Pancreatic Dysfunction – Over time, the pancreas can’t produce enough insulin to compensate for resistance.
    Now use the following context items to answer user query:
    - protein metabolism were proposed 