In [12]:
import os
import re
import json

import pandas as pd
import numpy as np

import faiss
from openai import OpenAI
from PyPDF2 import PdfReader
from dotenv import load_dotenv

load_dotenv()

True

# Medical QnA Bot

Designing a bot that takes in a user query and only considers our own data source to ask relevant follow-up questions and guides the user towards relevant sections of the data source.

![title](system.jpg)

In [13]:
##### Data Loading, Processing and Embedding Generation Functionality #####

def read_pdf(file_path):
    reader = PdfReader(file_path)
    text = []
    for page in reader.pages:
        text.append(page.extract_text())
    return "\n".join(text)

def process_text(x):
    return re.sub('[^A-Za-z0-9]+', ' ', x).strip()

def generate_embeddings(text, model):
    embedding = client.embeddings.create(input = [text], model=model).data[0].embedding
    return embedding


def save_embedding_chunks(pdf_text, model, vector_dir, avg_token_len=4, max_tokens=1024):
    if vector_dir not in os.listdir():
        os.mkdir(vector_dir)
    
    # max_tokens = 1024, 1/8 of what text-embedding-3-small can process (8k tokens)    
    chunk_len = avg_token_len * max_tokens
    for i in range(0, len(pdf_text), chunk_len):
        text_chunk = pdf_text[i:i+chunk_len]
        text = process_text(text_chunk)
        embed = generate_embeddings(text, model)
        file_name = f'{vector_dir}/vector_store_{int(i/chunk_len+1)}.csv'
        df = pd.DataFrame({'text': [text], 'embedding': [embed]})
        df.to_csv(file_name, index=False)
        print(f"{int(i/chunk_len+1)}: Saving Embedding Vector at: {file_name}")
        
def load_embedding_vectors(vector_dir):
    embedding_df = pd.DataFrame()
    files = os.listdir(vector_dir)
    for i in range(1, len(files)):
        file_name = f'{vector_dir}/vector_store_{i}.csv'
        df = pd.read_csv(file_name)
        embedding_df = pd.concat([embedding_df, df])
    return embedding_df

def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

##### CSV Based Search and Retreival Functionality ##### 
 
def compute_similarity(embeddings, query_embed, top_k):
    embed_scores = embeddings.apply(
        lambda x: (x['text'], cosine(x['embedding'], query_embed)), axis=1
    )
    embed_scores_df = pd.DataFrame(embed_scores.tolist(), columns=['text', 'score'])
    return embed_scores_df.sort_values(by='score', ascending=False)[:top_k]

def top_k_query_matches(embedding_df, query, model, top_k=3):
    query_embed = generate_embeddings(query, model)
    sim_scores = compute_similarity(embedding_df, query_embed, top_k)
    return '\n'.join(sim_scores['text'].values.tolist())
    
##### FAISS Index Creation and Retreival Functionality #####   

def create_faiss_index(embedding_df, faiss_out, embed_col='embedding'):
    embedding_df[embed_col] = embedding_df[embed_col].apply(lambda x: json.loads(x))
    embeddings = np.vstack(embedding_df[embed_col].values.tolist())
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)
    faiss.write_index(index, faiss_out)
    print(f'Saving Faiss Index Locally at: {faiss_out}')
    return index

def faiss_top_k(df, query, index_path, top_k=5):
    index = faiss.read_index(index_path)
    embed = generate_embeddings(query, model=model)
    search_embed = np.array(embed).reshape(1, -1)
    S, I = index.search(search_embed, k=top_k)
    return '\n'.join(df.iloc[I[0].tolist()]['text'].values.tolist()), S, I

def valid_response(embedding_df, query_response, index_path, threshold=0.5):
    context, S, I = faiss_top_k(embedding_df, query_response, index_path)
    if np.any(S[0] > threshold):
        return True
    return False

def duplicate_response(previous_responses, resp_embed, threshold=0.8):
    for pr_embed in previous_responses:
        cos = round(cosine(pr_embed, resp_embed), 2)
        if cos >= threshold:
            return True
    return False

In [14]:
# initialize openai client
client = OpenAI()

In [15]:
# load pdf file as text/str
file_path = 'data/Clinical practice guideline-Brian-isselbacher-et-al-2022-2022-acc-aha-guideline-for-the-diagnosis-and-management-of-aortic-disease-a-report-of-the.pdf'
pdf_text = read_pdf(file_path)
len(pdf_text)

858137

In [16]:
# generate embeddings
# model = "text-embedding-3-small"
# vector_dir = "embedding_vectors"
# save_embedding_chunks(pdf_text, model, vector_dir)

In [17]:
# load embeddings and save in a single csv
vector_dir = "embedding_vectors"
embeddings = load_embedding_vectors(vector_dir)

embed_file_csv = "embeddings_1k.csv"
embeddings.to_csv(embed_file_csv, index=False)

In [18]:
# create a faiss index locally
index_path = "embeddings.faiss"
embeddings = pd.read_csv(embed_file_csv)
index = create_faiss_index(embeddings, faiss_out=index_path)

Saving Faiss Index Locally at: embeddings.faiss


In [19]:
# load embedding file
embed_file_csv = "embeddings_1k.csv"
embedding_df = pd.read_csv(embed_file_csv)

In [20]:
# embed_file_csv = "embeddings_4k.csv"
# embedding_df = pd.read_csv(embed_file_csv)
# embedding_df['embedding'] = embedding_df['embedding'].apply(lambda x: json.loads(x))

### 1. QA Bot V1

The Embedding model being used is `text-embedding-3-small` from OpenAI. The text generation model is 'GPT-3.5-Turbo'. The bot maintains all previous interactions in the query string. It also adds context by using all previous
queries and fetching `top_k` documents from the vector store and makes conversation based on that. 

## TO-DOs:

##### 1. Use an LLM to articulate user-query that can be used to generate embeddings.
i. Generate embeddings after removing stop words using NLTK.<br>
ii. Use LLM to articulate query and generate embeddings to search. (optional second step)
##### 2. Avoid duplicates:
i. Check similarity with previous questions and if greater than a threshold, get rid of duplicates.<br>
ii. Engineer system prompt to ensure no duplicate questions are asked by the LLM.
##### 3. Previously acquired information that is verified should not be verified again.
##### 4. Post-processing in vector-store (diversity ranking), it drops the most similar snippets (chunks --> mutually exclusive splits remain).
##### 5. Use local LLMs and even fine-tune an embedding model (optional)

In [21]:
model = "text-embedding-3-small"
query = "i have a 65 year old patient with aortic aneurysm, what are the treatment options?"

In [None]:
while(True):
    index_path = "embeddings.faiss"
    context, S, I = faiss_top_k(embedding_df, query, index_path)
    
    system_msg = f"""You are a medical expert. You can only use the information provided in the context 
    that comes from our own data source. When asked a question, you will ask relevant follow-up questions 
    till you are able to find relevant sections of the data/information required by the user. If you cannot
    answer, then respond with I don't know the answer. Once relevant information is provided to the user,ask 
    them if they are satisfied with response, if they answer is yes, then terminate the conversation.
    \n:Context:{context}\n
    """
    
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": query},
    ]
    
    response = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages, temperature=0)
    query_response = response.choices[0].message.content
    validity_check = valid_response(embedding_df, query_response, index_path)
    if not validity_check:
        print("\nSorry I cannot answer based on given docuements")
        break
    else:
        print(f"\n{query_response}")
        user_input = input()
        query = f"{query}, {query_response}, {user_input}"
        


What is the size of the aortic aneurysm in your patient?


 5mm



What is the maximum diameter of the aortic aneurysm in your patient?


 5mm



Based on the information provided, the maximum diameter of the aortic aneurysm in your 65-year-old patient is 5mm. 

Given that the maximum diameter of the aortic aneurysm in your patient is 5mm, it falls below the threshold for elective surgical repair, which is typically considered at 5.5cm. 

For a patient with a 5mm aortic aneurysm, the current recommendation would be surveillance and monitoring of the aneurysm size over time. 

Would you like more information on surveillance and monitoring recommendations for aortic aneurysms with a diameter of 5mm?


 yes



What specific information would you like to know about surveillance and monitoring recommendations for aortic aneurysms with a diameter of 5mm?


 what medication can be given to them



What specific information would you like to know about medication for patients with aortic aneurysms with a diameter of 5mm?


### 2. QA Bot V2

Insted of appending all previous interactions in the query string, there's a query list that's being maintained.
The first query is added to the list, then subsequent `response` + `user_input` are added. This queue is then used to fetch `top_k` matches from vector store. There's also a message list being maintained for the OpenAI model. It has the `system_msg`, `context`, `query` as first 3 objects. Then, for each turn, `query_response`, `new_context` and `new_user_query` is appended to the list. This is then passed onto the model for generating conversation.

In [7]:
history_limit = 10
model = "text-embedding-3-small"
query = "are there any treatment options for aortic aneurysms in 65 year old patients"

In [10]:
queries = [query]
context, S, I = faiss_top_k(embedding_df, ', '.join(queries), index_path)

system_msg = f"""You are a medical expert. You can only use the information provided in the context 
    that comes from our own data source. When asked a question, you will ask relevant follow-up questions 
    till you are able to find relevant sections of the data/information required by the user. If you cannot
    answer, then respond with I don't know the answer. Once relevant information is provided to the user,ask 
    them if they are satisfied with response, if they answer is yes, then terminate the conversation.
    """

messages = [
    {"role": "system", "content": system_msg},
    {"role": "user", "content": context},
    {"role": "user", "content": query},
]

turn = 0

while(True):
    response = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages, temperature=0)
    query_response = response.choices[0].message.content
    validity_check = valid_response(embedding_df, query_response, index_path)
    if not validity_check:
        print("\nSorry I cannot answer based on given docuements")
        break
    else:
        print(f"\n{query_response}")
        user_input = input()
        query = f"{query_response}, {user_input}"
        queries.append(query)
        context = faiss_top_k(embedding_df, ', '.join(queries), index_path)
        messages.append({"role": "assistant", "content": query_response})
        messages.append({"role": "user", "content": context})
        messages.append({"role": "user", "content": user_input})
        if len(messages) > history_limit:
            messages = [messages[0]] + messages[-history_limit:]
    
        turn += 1
    


To provide you with accurate information, I need to ask a few follow-up questions based on the data provided. 

1. Do you have any specific information on the size of the aortic aneurysm in the 65-year-old patient?
2. Are there any symptoms associated with the aortic aneurysm in this patient?
3. Has the patient undergone any previous treatments or surgeries for the aortic aneurysm?
4. Are there any other medical conditions or risk factors present in this patient that may impact treatment options?


 the size is 3cm, the patient also has diabetes, no they didn't have any previous surgeries


TypeError: Object of type ndarray is not JSON serializable