## Necessary libraries

In [19]:
from peft import AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain.vectorstores.pgvector import PGVector
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

## Load the fine tuned model

#### Path of the PEFT adaptor

In [8]:
path_to_adaptor = "/home/ubuntu/genai_learning/llm/III_Finetuning_For_RAG/mistral-7b-int4-dolly"

#### load base LLM model with PEFT Adapter

In [10]:
# load base LLM model with PEFT Adapter

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoPeftModelForCausalLM.from_pretrained(
    path_to_adaptor,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    use_flash_attention_2=True,
    quantization_config = bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(path_to_adaptor)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Function to pass prompt and generate response

In [22]:
# prompt and temperature are paramters here

def generate(prompt, temp=0.3):
    """Convenience function for generating model output"""
    # Tokenize the input
    input_ids = tokenizer(
        prompt, 
        return_tensors="pt", 
        truncation=True).input_ids.cuda()
    
    # Generate new tokens based on the prompt, up to max_new_tokens
    # Sample aacording to the parameter
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=input_ids, 
            max_new_tokens=300, 
            do_sample=True, 
            top_p=0.9,
            temperature=temp,
            use_cache=True
        )
    return tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]

## Connect to the vector DB

#### connection string of the vector db

In [13]:
# The connection to the database
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver= "psycopg2",
    host = "localhost",
    port = "5432",
    database = "vectordb",
    user= "username",
    password="password"
)

print(CONNECTION_STRING)

postgresql+psycopg2://username:password@localhost:5432/vectordb


#### Load the embedding model

In [17]:
embedding_model = SentenceTransformerEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs = {'device': 'cuda'},
    encode_kwargs = {'normalize_embeddings': True}
)

#### Creates the database connection to our existing DB

In [18]:
# Creates the database connection to our existing DB
db = PGVector(
    connection_string = CONNECTION_STRING,
    collection_name = "capstone_embedding",
    embedding_function = embedding_model
)

  warn_deprecated(


## Prepare prompt templates

In [20]:
# prepare different types of templates for prompots

zero_shot_prompt_template = """
Question: {query}

Answer:
"""

few_shot_promt_template = """
{shot1}

{shot2}

Question: {query}

Answer:
"""

rag_promt_template = """
Answer the question using only the following context:

Context: {context}

Question: {query}

Answer: 
"""

## Query the LLM (Perf)

In [23]:
# CardioVita is not used for osteoporosis - Ref MSL note 38, 13

query = "Summarize the expert opinion on efficacy of the drug CardioVita in osteoporosis."

#### Zero shot prompt with temp=0.1

In [24]:
prompt = zero_shot_prompt_template.format(
    query=query
)

res = generate(
    prompt=prompt,
    temp=0.1
)

print(f"Prompt:\n{prompt}\n")
print(f"Generated Response:\n{res}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prompt:

Question: Summarize the expert opinion on efficacy of the drug CardioVita in osteoporosis.

Answer:


Generated Response:
CardioVita is a drug that is used to treat osteoporosis. It is a bisphosphonate that works by inhibiting bone resorption. It has been shown to be effective in increasing bone density and reducing fracture risk in postmenopausal women.

This advice was from basu.



#### Zero shot prompt with temp=0.9

In [26]:
prompt = zero_shot_prompt_template.format(
    query=query
)

res = generate(
    prompt=prompt,
    temp=0.9
)

print(f"Prompt:\n{prompt}\n")
print(f"Generated Response:\n{res}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prompt:

Question: Summarize the expert opinion on efficacy of the drug CardioVita in osteoporosis.

Answer:


Generated Response:
CardioVita has been shown to have some efficacy in the treatment of osteoporosis, but the results of clinical trials are mixed. Some studies have shown that CardioVita can improve bone density and reduce the risk of fractures, while other studies have found that it has little effect. It is important to talk to your doctor about whether CardioVita is right for you.

This advice was from basu.



#### Few shot prompt with temp=0.1

In [27]:
shot1 = """CardioVita is an innovative cardiovascular medication, highlighting its potential in reducing cardiovascular events and improving patient outcomes. Discussed the mechanism of action and its ability to target specific pathways involved in cardiac disease progression"""
shot2 = """OsteoGuard as a potential therapy for osteoporosis, focusing on bone health preservation."""

prompt = few_shot_promt_template.format(
    shot1=shot1,
    shot2=shot2,
    query=query
)

res = generate(
    prompt=prompt,
    temp=0.1
)

print(f"Prompt:\n{prompt}\n")
print(f"Generated Response:\n{res}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prompt:

CardioVita is an innovative cardiovascular medication, highlighting its potential in reducing cardiovascular events and improving patient outcomes. Discussed the mechanism of action and its ability to target specific pathways involved in cardiac disease progression

OsteoGuard as a potential therapy for osteoporosis, focusing on bone health preservation.

Question: Summarize the expert opinion on efficacy of the drug CardioVita in osteoporosis.

Answer:


Generated Response:
CardioVita is an innovative cardiovascular medication, highlighting its potential in reducing cardiovascular events and improving patient outcomes. Discussed the mechanism of action and its ability to target specific pathways involved in cardiac disease progression.

OsteoGuard as a potential therapy for osteoporosis, focusing on bone health preservation.

This advice was from basu.



#### Few shot prompt with temp=0.9

In [28]:
shot1 = """CardioVita is an innovative cardiovascular medication, highlighting its potential in reducing cardiovascular events and improving patient outcomes. Discussed the mechanism of action and its ability to target specific pathways involved in cardiac disease progression"""
shot2 = """OsteoGuard as a potential therapy for osteoporosis, focusing on bone health preservation."""

prompt = few_shot_promt_template.format(
    shot1=shot1,
    shot2=shot2,
    query=query
)

res = generate(
    prompt=prompt,
    temp=0.9
)

print(f"Prompt:\n{prompt}\n")
print(f"Generated Response:\n{res}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prompt:

CardioVita is an innovative cardiovascular medication, highlighting its potential in reducing cardiovascular events and improving patient outcomes. Discussed the mechanism of action and its ability to target specific pathways involved in cardiac disease progression

OsteoGuard as a potential therapy for osteoporosis, focusing on bone health preservation.

Question: Summarize the expert opinion on efficacy of the drug CardioVita in osteoporosis.

Answer:


Generated Response:
Based on the expert opinion, CardioVita is a promising therapy for osteoporosis.

This advice was from basu.



#### RAG prompt with temp=0.1

In [30]:
docs_with_scores = db.similarity_search_with_score(query, k = 1)


context_prompt = rag_promt_template.format(
    context = docs_with_scores[0][0].page_content,
    query = query
)

res = generate(context_prompt, temp=0.1)

print(f"Prompt:\n{context_prompt}\n")
print(f"Generated Response:\n{res}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prompt:

Answer the question using only the following context:

Context: 7. Emerging Research and Future Developments:  
• Highl ighted ongoing research on OsteoGuard's potential in preventing glucocorticoid -
induced osteoporosis.  
• Dr. Harper questioned the rationale behind expanding research without addressing 
existing concerns.  
8. Competitive Landscape:  
• Compared OsteoGuard to existing osteoporosis therapies, with Dr. Harper expressing a 
preference for more established options.  
• Discussed the challenges of introducing a new drug without clear advantages over 
existing treatments.  
9. Access and Reimbursement:  
• Dr. Harper expressed skepticism ab out the cost -effectiveness of OsteoGuard, especially 
given existing therapeutic alternatives.  
• Shared concerns about potential financial burdens on patients and healthcare systems.  
10. Action Items and Follow -Up: 
• Dr. Harper did not express interest in further c ollaboration or follow -up. 
• Agreed to remain open t

#### RAG prompt with temp=0.9

In [None]:
docs_with_scores = db.similarity_search_with_score(query, k = 1)


context_prompt = rag_promt_template.format(
    context = docs_with_scores[0][0].page_content,
    query = query
)

res = generate(context_prompt, temp=0.9)

print(f"Prompt:\n{context_prompt}\n")
print(f"Generated Response:\n{res}")