In [37]:
# MTEB: Massive Text Embedding Benchmark (leaderboard used to compare embedding models)
# https://huggingface.co/spaces/mteb/leaderboard

# Open-source LLM: https://huggingface.co/google/gemma-7b-it
# Create a write access token in your Hugging Face account before downloading the model.
# Candidate models:
#   gemma-7b-it
#   gemma-2b-it


In [63]:

from sentence_transformers import SentenceTransformer

# Sentence-transformers checkpoint used to embed the query.
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
embedding_model = SentenceTransformer(model_name_or_path=EMBEDDING_MODEL_NAME)

# Question to answer. Alternative example: "What is Metabolism of Proteins?"
input_query = "Describe the process of digestion and absorption of nutrients in the human body."

# Encode the query, then convert the result to a plain Python list so it can be
# handed to the database driver as a query parameter later in the notebook.
query_embedding = embedding_model.encode(input_query)
query_embedding_list = query_embedding.tolist()

print("query_embedding len", len(query_embedding_list))


query_embedding len 768


In [64]:
import psycopg2
import numpy as np
import os
from dotenv import load_dotenv

def connect_to_db():
    """Open a PostgreSQL connection using settings read from the environment.

    Calls ``load_dotenv()`` and then reads ``DB_NAME``, ``DB_USER``,
    ``DB_PASS``, ``DB_HOST`` and ``DB_PORT`` with ``os.getenv``.

    Returns:
        The psycopg2 connection on success, or ``None`` when psycopg2 reports
        an error (the error is printed rather than raised).
    """
    load_dotenv()

    # psycopg2.connect() keyword -> environment variable that supplies its value.
    env_names = {
        "dbname": "DB_NAME",
        "user": "DB_USER",
        "password": "DB_PASS",
        "host": "DB_HOST",
        "port": "DB_PORT",
    }
    connect_kwargs = {
        keyword: os.getenv(env_name) for keyword, env_name in env_names.items()
    }

    try:
        connection = psycopg2.connect(**connect_kwargs)
    except psycopg2.Error as e:
        print(f"Error connecting to PostgreSQL database: {e}")
        return None

    print("Connected to PostgreSQL database successfully!")
    return connection


In [52]:

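# Reference copy of the SQL that defines the get_embedding() database function
# queried by fetch_data (commented out: it is not executed from this notebook).
# CREATE OR REPLACE FUNCTION get_embedding(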
# 	rows_limit int,
#     embedding_input vector(768)
# )
# RETURNS TABLE (id bigint, chunk varchar(2500), embedding vector(768), cosine_similarity DOUBLE PRECISION)
# LANGUAGE plpgsql
# AS $$
# BEGIN    
# 	RETURN QUERY SELECT ni.id,ni.chunk, ni.embedding, 1 - (ni.embedding <=> embedding_input) AS cosine_similarity FROM nutritionitems ni
# 	ORDER BY cosine_similarity DESC LIMIT rows_limit;
# END;
# $$;

# Example call (vector shortened for readability; the parameter is declared as vector(768)):
# select get_embedding(2::int,'[5.99734336e-02,-1.30569497e-02]'::vector);


In [65]:
def fetch_data(conn, query_embedding=None, rows_limit=5):
    """Return the stored chunks most similar to a query embedding.

    Runs ``SELECT * from get_embedding(...)`` and keeps the id, chunk text and
    cosine similarity of every returned row; the embedding column is dropped.

    Args:
        conn: Open database connection (as returned by ``connect_to_db``), or
            ``None`` if connecting failed.
        query_embedding: Query vector as a list of floats. When omitted, the
            notebook-level ``query_embedding_list`` is used.
        rows_limit: Maximum number of rows requested from ``get_embedding``.

    Returns:
        A list of dicts with the keys ``"id"``, ``"chunk"`` and
        ``"cosine_similarity"``. The list is empty when ``conn`` is ``None``
        or when psycopg2 reports an error (the error is printed).
    """
    if conn is None:
        print("Error fetching data: no database connection.")
        return []

    if query_embedding is None:
        # Fall back to the embedding computed in an earlier notebook cell.
        query_embedding = query_embedding_list

    select_sql = """SELECT * from get_embedding(%s,%s::vector);"""
    cur = None
    try:
        cur = conn.cursor()
        cur.execute(select_sql, (rows_limit, query_embedding))
        # Row layout: (id, chunk, embedding, cosine_similarity).
        return [
            {"id": row[0], "chunk": row[1], "cosine_similarity": row[3]}
            for row in cur.fetchall()
        ]
    except psycopg2.Error as e:
        print(f"Error fetching data: {e}")
        # Always hand back a list so callers can iterate the result safely.
        return []
    finally:
        # Release the cursor on the error path as well as on success.
        if cur is not None:
            cur.close()

# Retrieve the best-matching chunks, releasing the connection even if the
# query raises.
conn = connect_to_db()
retrieved_chunks_dict = []
if conn is not None:
    try:
        # Treat a missing result (None) as "no chunks" so the loop below is safe.
        retrieved_chunks_dict = fetch_data(conn) or []
    finally:
        conn.close()
        print("Connection closed.")

# Dump the retrieved chunks to a text file for inspection.
retrieved_chunks_file = "Retrieved_chunks_test.txt"

if os.path.exists(retrieved_chunks_file):
    os.remove(retrieved_chunks_file)

# Plain chunk texts, used later as the context of the prompt.
retrieved_chunks = []
with open(retrieved_chunks_file, 'w', encoding='utf-8') as file:
    for item in retrieved_chunks_dict:
        retrieved_chunks.append(item["chunk"])
        # Single quotes inside the f-string: reusing the outer double quotes
        # is only accepted from Python 3.12 onwards.
        file.write(f"{item['id']} | {item['cosine_similarity']} | {item['chunk']} \n\n")


Connected to PostgreSQL database successfully!
Connection closed.


In [66]:
def prompt_formatter(query:str,
                    context_items: list[dict])-> str:
    context = "- "+"\n- ".join([item for item in context_items])            
    
    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query
Don't return the thinking, only return answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and vitamin K. These vitamins dissolve in fat and are absorbed and stored in the body's fatty tissues
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Insulin Resistance – Body’s cells don’t respond properly to insulin, leading to elevated blood glucose. Pancreatic Dysfunction – Over time, the pancreas can’t produce enough insulin to compensate for resistance.
\nNow use the following context items to answer user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""
    base_prompt = base_prompt.format(context=context, query=query)
    return base_prompt

In [67]:
# Assemble the LLM prompt from the user's question and the retrieved chunk texts.
formatted_prompt = prompt_formatter(query=input_query, context_items=retrieved_chunks)
# Uncomment to inspect the assembled prompt:
# print("formatted_prompt", formatted_prompt)


In [59]:
# hugging face token stored in env variable HF_TOKEN
from huggingface_hub import login
import os

# Look up the Hugging Face access token in the environment; the lookup yields
# None when HF_TOKEN is not defined.
hf_token_val = os.environ.get('HF_TOKEN')

# Authenticate this session with the token read above.
login(token=hf_token_val)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [68]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")

# Define a chat history.
# The raw prompt is rebuilt here instead of being read from `formatted_prompt`:
# this cell overwrites `formatted_prompt` below, so using it as the input would
# wrap the chat template around an already-templated prompt on every re-run.
messages = [
    {"role": "user", "content": prompt_formatter(input_query, retrieved_chunks)}
]

# Apply the chat template, which automatically handles BOS/EOS and formatting.
# tokenize=False asks for the templated text rather than token ids.
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Save the final prompt to a text file for inspection.
prompt_file = "prompt_test.txt"

if os.path.exists(prompt_file):
    os.remove(prompt_file)

with open(prompt_file, 'w', encoding='utf-8') as file:
    file.write(formatted_prompt)

In [69]:

# Load the Gemma checkpoint with bfloat16 weights.
MODEL_ID = "google/gemma-2b-it"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)

# The prompt text has already been through the chat template, so the tokenizer
# is told not to add special tokens a second time.
inputs = tokenizer.encode(formatted_prompt, add_special_tokens=False, return_tensors="pt")
prompt_ids = inputs.to(model.device)

# Generate at most 250 new tokens after the prompt.
outputs = model.generate(input_ids=prompt_ids, max_new_tokens=250)

# Decode the first returned sequence; the printed text starts with the prompt itself.
generated_text = tokenizer.decode(outputs[0])
print(generated_text)


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.12it/s]


<bos><start_of_turn>user
Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query
Don't return the thinking, only return answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.

Example 1:
Query: What are fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and vitamin K. These vitamins dissolve in fat and are absorbed and stored in the body's fatty tissues

Example 2:
Query: What are the causes of type 2 diabetes?
Answer: Insulin Resistance – Body’s cells don’t respond properly to insulin, leading to elevated blood glucose. Pancreatic Dysfunction – Over time, the pancreas can’t produce enough insulin to compensate for resistance.

Now use the following context items to answer user query:
- right, not only essential for human development but also as an outcome o