In [1]:
from typing import List, Dict
from langchain.vectorstores.pgvector import PGVector

from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

from peft import AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [3]:
# THE FIRST TIME YOU RUN THIS, IT MIGHT TAKE A WHILE
model_path_or_id = "mistralai/Mistral-7B-v0.1"
# lora_path=False
# lora_path = "./mistral-7b-int4-dolly/checkpoint-82" # BAD
# lora_path = "./mistral-7b-int4-dolly/checkpoint-80" # BAD
# lora_path = "./mistral-7b-int4-dolly_SMALL/checkpoint-16" # 400 samples BAD
# lora_path = "./mistral-7b-int4-dolly_SMALL/" # 400 samples BAD
# lora_path = "./mistral-7b-int4-dolly_SMALL_V2/" # 1000 samples BAD
# lora_path = "./mistral-7b-int4-dolly_SMALL_400_3epochs/" # BAD - non-blank context only
# lora_path = "./mistral-7b-int4-dolly_FULL_3279_1epochs/" # BAD - non-blank context only
# lora_path = "./mistral-7b-int4-dolly_FULL_3279_1epochs/checkpoint-62" # BAD - non-blank context only
# lora_path = "./mistral-7b-int4-dolly_FULL_3279_1epochs_r8_alpha8/" # BAD
# lora_path = "./mistral-7b-int4-dolly_FULL_3279_1epochs_r16_alpha32_lr1e-4" # BAD
# lora_path = "./mistral-7b-int4-dolly_summarization" # BAD
# lora_path= "./mistral-7b-int4-dolly_summarization_r16_a16_ep3_LR1e3_datacampv1/checkpoint-22" # BAD
# lora_path="./mistral-7b-int4-dolly_summarization_r8_a16_ep1_LR1e3_datacampv2_redproj" # BAD
# lora_path="./mistral-7b-int4-dolly_summarization_r8_a16_ep1_LR1e3_blankcontextallowed"
# lora_path="./mistral-7b-int4-dolly_summarization_r8_a16_ep1_LR1e3_qkvo_projonly" # has only 1000 now and unk-token padded
# lora_path="./mistral-7b-int4-dolly_summarization_r8_a16_ep1_LR1e3_qkvo_fixedtokens" # best
# lora_path="./mistral-7b-int4-dolly_summarization_r8_a16_ep1_LR1e3_qkvo_fixedtokens_state_sourcev1/checkpoint-82"
# lora_path="./mistral-7b-int4-dolly_summarization_r8_a16_ep1_LR1e3_qkvo_fixedtokens_state_sourcev2"
lora_path="./mistral-7b-int4-dolly_summarization_r8_a16_ep1_LR1e3_allproj_fixedtokens"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False, # AP: For nested quantization
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

if lora_path:
    # load base LLM model with PEFT Adapter
    model = AutoPeftModelForCausalLM.from_pretrained(
        lora_path,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        use_flash_attention_2=True,
        # attn_implementation="flash_attention_2",
        quantization_config = bnb_config
    )
    # tokenizer = AutoTokenizer.from_pretrained(lora_path)
    # tokenizer = AutoTokenizer.from_pretrained(lora_path, padding_side='left')
    tokenizer = AutoTokenizer.from_pretrained(lora_path, padding_side='right') # AP: ADDED
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_path_or_id,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        # use_flash_attention_2=True,
        attn_implementation="flash_attention_2",
        quantization_config = bnb_config
    )
    # tokenizer = AutoTokenizer.from_pretrained(model_path_or_id)
    # tokenizer = AutoTokenizer.from_pretrained(model_path_or_id, padding_side='left')
    tokenizer = AutoTokenizer.from_pretrained(model_path_or_id, padding_side='right')  # AP: ADDED

def generate(prompt, max_new_tokens = 100, temperature = 0.7):
    """Convenience function for generating model output"""
    # Tokenize the input
    input_ids = tokenizer(
        prompt, 
        return_tensors="pt", 
        truncation=True).input_ids.cuda()
    
    # Generate new tokens based on the prompt, up to max_new_tokens
    # Sample aacording to the parameter
    with torch.inference_mode(mode=True): # AP: added mode=True
        outputs = model.generate(
            input_ids=input_ids, 
            max_new_tokens=max_new_tokens, 
            do_sample=True,
            top_p=0.9,
            temperature=temperature,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id ## ADDED - AP
        )
    return tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# The connection to the database
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver= "psycopg2",
    host = "localhost",
    port = "5432",
    database = "postgres",
    user= "username",
    password="password"
)

# The embedding function that will be used to store into the database
embedding_function = SentenceTransformerEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs = {'device': 'cuda'},
    encode_kwargs = {'normalize_embeddings': True}
)

# Creates the database connection to our existing DB
db = PGVector(
    connection_string = CONNECTION_STRING,
    collection_name = "embeddings",
    embedding_function = embedding_function
)

### Print sources

In [9]:
question = "Who can you think of works at the clinic?"

docs_with_scores = db.similarity_search_with_score(question, k = 3)

In [24]:
def concat_RAG_sources_and_pages(docs_input):
    """
    Concatenate and list sources+pages for multiple documents in RAG
    """
    
    concat_source_lst = []
    for tup in docs_input:
        doc_source = re.search('(MSL .{1,})', tup[0].metadata["source"])[0]
        doc_page = tup[0].metadata["page"]
        
        source_page = f"{doc_source}, page = {doc_page}"
        concat_source_lst.append(source_page)
        
    concat_source_lst.sort()        
    concat_source_info = "; ".join(concat_source_lst)
    
    return concat_source_info

In [12]:
import re
docs_with_scores[0][0].metadata

{'source': '../../msl-data/MSL Notes_4.pdf', 'page': 0}

In [18]:
for tup in docs_with_scores:
    print(tup[0].metadata["source"])

../../msl-data/MSL Notes_4.pdf
../../msl-data/MSL Notes_1.pdf
../../msl-data/MSL Notes_4.pdf


In [25]:
concat_RAG_sources_and_pages(docs_with_scores)

'MSL Notes_1.pdf, page = 0; MSL Notes_4.pdf, page = 0; MSL Notes_4.pdf, page = 1'

In [10]:
docs_with_scores[0][1]

0.5062807489131533

In [22]:
RAG_PROMPT_TEMPLATE = """### Context:
{context}

### Question:
Using only the context above, {question}

### Response:
"""

empty_context = ""
question = "What is the efficacy of NeuroGlyde?"
# question = "What is the efficacy of Aetherisol?"
# question = "What kind of doctor is Michael Chang?"
# question="What does Gastroguard treat?"
# question = "What clinic does Jonathan Reynolds work at?"

# question = "What is Aetherisol used to treat?"
# question="What is the mechanism of action for Gastroguard?"
# question="What is a drug name for treating gastric medical problems and what does it treat?"


docs_with_scores = db.similarity_search_with_score(question, k = 2)
context_prompt = RAG_PROMPT_TEMPLATE.format(
    context = docs_with_scores[0][0].page_content,
    question = question
)

res = generate(context_prompt, max_new_tokens = 1000, temperature = 0.4)

print(f"Question:\n{question}\n")
print(f"Generated Response:\n{res}")

Question:
What is the efficacy of NeuroGlyde?

Generated Response:
NeuroGlyde has been shown to be effective in reducing annualized relapse rates by 40% in multiple sclerosis patients. It has also been shown to improve quality of life measures.

Answer provided by AP.



### Add source? 10Feb2024

In [None]:
docs_with

In [10]:
docs_with_scores[0][0].metadata["source"]

'../../msl-data/MSL Notes_32.pdf'

In [None]:
docs

In [17]:
import re

re.search("(MSL .{1,})", docs_with_scores[0][0].metadata["source"])[0]

'MSL Notes_32.pdf'

In [13]:
import re

### NEW PROMPT ###
RAG_PROMPT_TEMPLATE = """### Context:
{context}

### Question:
Using only the context above, {question}. Provide the source of response below.

### Response:
"""

empty_context = ""
# question = "What is the efficacy of NeuroGlyde?"
# question = "What is the efficacy of Aetherisol?"
# question = "What kind of doctor is Michael Chang?"
# question="What does GastriGuard do?"
question = "What clinic does Jonathan Reynolds work?"

docs_with_scores = db.similarity_search_with_score(question, k = 1)
context_prompt = RAG_PROMPT_TEMPLATE.format(
    context = docs_with_scores[0][0].page_content,
    question = question,
    source_filename = re.search("(MSL .{1,})", docs_with_scores[0][0].metadata["source"])[0]
)

res = generate(context_prompt, max_new_tokens = 300, temperature = 0.2)

print(f"Question:\n{question}\n")
print(f"Generated Response:\n{res}")

Question:
What clinic does Jonathan Reynolds work?

Generated Response:
RespiraLung Pulmonary Clinic

### Source:
Source is: Medical.pdf

Answer provided by AP.



In [5]:
RAG_PROMPT_TEMPLATE = """### Context:
{context}

### Question:
Using only the context above, {question}

### Response:
"""

# empty_context = ""
question = "What is the efficacy of NeuroGlyde?"
# question = "What is the efficacy of Aetherisol?"
# question = "What kind of doctor is Michael Chang?"

docs_with_scores = db.similarity_search_with_score(question, k = 1)
context_prompt = RAG_PROMPT_TEMPLATE.format(
    context = docs_with_scores[0][0].page_content,
    question = question
)

for temp in range(2, 10+1, 2):
    res = generate(context_prompt, max_new_tokens = 100, temperature = temp/10)
    print(f" temp = {temp/10}")
    print(f"Question:\n{question}\n")
    print(f"Generated Response:\n{res}")
    print()

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


 temp = 0.2
Question:
What is the efficacy of NeuroGlyde?

Generated Response:




A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


 temp = 0.4
Question:
What is the efficacy of NeuroGlyde?

Generated Response:




A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


 temp = 0.6
Question:
What is the efficacy of NeuroGlyde?

Generated Response:




A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


 temp = 0.8
Question:
What is the efficacy of NeuroGlyde?

Generated Response:


 temp = 1.0
Question:
What is the efficacy of NeuroGlyde?

Generated Response:




In [10]:
res

''

In [9]:
docs_with_scores

[(Document(page_content="Subject:  Medical Science Liaison (MSL) Notes - In-Depth Discussion on NeuroGlyde  \nDate:  April 10, 2023  \nProvider:  Dr. James Harper  \nTitle:  Neurologist  \nInstitution:  City Neurology Clinic  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Introduced NeuroGlyde, a novel neuroprotective agent, emphasizing its potential in \nslowing disease progression.  \n• Discussed ongoing clinical trials and positive early -phase results.  \n2. Provider's Current Patient C ases:  \n• Explored Dr. Harper's experience with NeuroGlyde in treating neurodegenerative \ndisorders.  \n• Discussed improvements in cognitive function observed in Alzheimer's patients.  \n3. Efficacy and Clinical Data:  \n• Presented data demonstrating a 40% reduction in annualized relapse rates in multiple \nsclerosis patients.  \n• Highlighted significant improvements in quality of life measures.  \n4. Safety Profile:  \n• Discussed the favorable safety profile of NeuroGlyde, with 

In [5]:
from pprint import pprint
pprint(context_prompt)

('### Context:\n'
 'Subject:  Medical Science Liaison (MSL) Notes - In-Depth Discussion on '
 'NeuroGlyde  \n'
 'Date:  April 10, 2023  \n'
 'Provider:  Dr. James Harper  \n'
 'Title:  Neurologist  \n'
 'Institution:  City Neurology Clinic  \n'
 'Summary of Key Discussion Points:  \n'
 '1. Introduction:  \n'
 '• Introduced NeuroGlyde, a novel neuroprotective agent, emphasizing its '
 'potential in \n'
 'slowing disease progression.  \n'
 '• Discussed ongoing clinical trials and positive early -phase results.  \n'
 "2. Provider's Current Patient C ases:  \n"
 "• Explored Dr. Harper's experience with NeuroGlyde in treating "
 'neurodegenerative \n'
 'disorders.  \n'
 "• Discussed improvements in cognitive function observed in Alzheimer's "
 'patients.  \n'
 '3. Efficacy and Clinical Data:  \n'
 '• Presented data demonstrating a 40% reduction in annualized relapse rates '
 'in multiple \n'
 'sclerosis patients.  \n'
 '• Highlighted significant improvements in quality of life measures.  \n

In [10]:
len(context_prompt)

1825

In [7]:
def generate(prompt, max_new_tokens = 100, temperature = 0.7):
    """Convenience function for generating model output"""
    # Tokenize the input
    input_ids = tokenizer(
        prompt, 
        return_tensors="pt", 
        truncation=True).input_ids.cuda()
    
    # Generate new tokens based on the prompt, up to max_new_tokens
    # Sample aacording to the parameter
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=input_ids, 
            max_new_tokens=max_new_tokens, 
            do_sample=True, 
            top_p=0.9,
            temperature=temperature,
            use_cache=True
        )
    return tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]

In [8]:
res = generate(context_prompt, max_new_tokens = 100, temperature = 0.1)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [8]:
input_ids_check = tokenizer(context_prompt, return_tensors="pt", truncation=True).input_ids.cuda()

In [9]:
with torch.inference_mode():
        outputs = model.generate(
            input_ids=input_ids_check, 
            max_new_tokens=100, 
            do_sample=True, 
            top_p=0.9,
            temperature=0.1,
            use_cache=True
        )
        
tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(context_prompt):]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


''

In [10]:
output_check = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]

In [15]:
input_ids_check

tensor([[    1,   774, 14268, 28747,    13, 22210, 28747, 28705, 12195,  9323,
           393,   515,  2350,   325,  3477, 28758, 28731, 16254,   387,   560,
         28733, 17603,  3433, 18637,   356,  3147,  2138, 28777,   346,   450,
           259,    13,  3465, 28747, 28705,  3999, 28705, 28740, 28734, 28725,
         28705, 28750, 28734, 28750, 28770,   259,    13,  5342, 28747, 28705,
          2985, 28723,  4797, 23649,   259,    13,  7522, 28747, 28705,  3147,
         28718, 22068,   392,   259,    13,  6060,  6529, 28747, 28705,  3805,
          3147, 28718,  1438,  8677, 19950,   294,   259,    13, 17590,   302,
          7388,  3433, 18637, 24304, 28747,   259,    13, 28740, 28723, 23628,
         28747,   259,    13, 28899,  4666,  3399,  1354,  3147,  2138, 28777,
           346,   450, 28725,   264,  7092, 20342,  8716,   310,   495,  8073,
         28725, 10574,  3864,   871,  4628,   297, 28705,    13,  2181, 15675,
          8030,  5097,   296, 28723,   259,    13, 2

In [46]:
with torch.inference_mode():
    outputs_check = model.generate(
            input_ids=input_ids_check, 
            max_new_tokens=100, 
            do_sample=True, 
            top_p=0.9,
            temperature=0.1,
            use_cache=True
        )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [13]:
tokenizer.batch_decode(outputs_check.detach().cpu().numpy(), skip_special_tokens=True)

NameError: name 'outputs_check' is not defined

In [16]:
tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(context_prompt):]

' ### Context:\nSubject:  Medical Science Liaison (MSL) Notes - In -Depth Discussion on NeuroGlyde  \nDate:  April 10, 2023  \nProvider:  Dr. James Harper  \nTitle:  Neurologist  \nInstitution:  City Neurology Clinic  \nSummary of Key Discussion Points:  \n1. Introduction:  \n• Introduced NeuroG'

In [47]:
outputs_check

tensor([[    1,   774, 14268, 28747,    13, 22210, 28747, 28705, 12195,  9323,
           393,   515,  2350,   325,  3477, 28758, 28731, 16254,   387,   560,
         28733, 17603,  3433, 18637,   356,  3147,  2138, 28777,   346,   450,
           259,    13,  3465, 28747, 28705,  3999, 28705, 28740, 28734, 28725,
         28705, 28750, 28734, 28750, 28770,   259,    13,  5342, 28747, 28705,
          2985, 28723,  4797, 23649,   259,    13,  7522, 28747, 28705,  3147,
         28718, 22068,   392,   259,    13,  6060,  6529, 28747, 28705,  3805,
          3147, 28718,  1438,  8677, 19950,   294,   259,    13, 17590,   302,
          7388,  3433, 18637, 24304, 28747,   259,    13, 28740, 28723, 23628,
         28747,   259,    13, 28899,  4666,  3399,  1354,  3147,  2138, 28777,
           346,   450, 28725,   264,  7092, 20342,  8716,   310,   495,  8073,
         28725, 10574,  3864,   871,  4628,   297, 28705,    13,  2181, 15675,
          8030,  5097,   296, 28723,   259,    13, 2