In [1]:
import os
import re
import json

import pandas as pd
import numpy as np

import faiss
import nltk

from openai import OpenAI
from PyPDF2 import PdfReader
from dotenv import load_dotenv
from textwrap import dedent

from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textwrap import dedent

load_dotenv()

True

# Medical QnA Bot

This notebook builds a bot that takes a user query and, using only our own data source, asks relevant follow-up questions and guides the user towards the relevant sections of that data source.

![title](system.jpg)

In [2]:
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Location of the PDF; handed unchanged to ``PdfReader``.

    Returns:
        The result of ``extract_text()`` for each page, in page order,
        joined with newline characters.
    """
    pdf = PdfReader(file_path)
    page_texts = [pg.extract_text() for pg in pdf.pages]
    return "\n".join(page_texts)

def get_stop_words():
    """Return NLTK's English stop words as a list with "no" taken out.

    Because "no" is removed from the list, text cleaned with this list keeps
    the word "no" (presumably so negations survive the cleaning step).

    Returns:
        A new list of stop-word strings.

    Raises:
        ValueError: If "no" is not present in the NLTK list.
    """
    words = [*stopwords.words('english')]
    words.remove("no")
    return words

def process_text(text, stop_words):
    """Strip special characters and stop words from a piece of text.

    Every run of characters other than ASCII letters and digits is replaced by
    a single space, the result is split on whitespace, and tokens that appear
    in ``stop_words`` are dropped. Matching is case-sensitive and the text is
    not lower-cased, so "The" is kept when only "the" is a stop word.

    Args:
        text: Raw input string.
        stop_words: Collection of stop-word strings to remove.

    Returns:
        The remaining tokens joined by single spaces.
    """
    cleaned = re.sub('[^A-Za-z0-9]+', ' ', text).strip()
    # A set gives constant-time membership tests; the original scanned the
    # whole stop-word list once per token.
    excluded = set(stop_words)
    return ' '.join(tok for tok in cleaned.split() if tok not in excluded)

def generate_embeddings(text, model):
    """Request an embedding for ``text`` from the notebook-level ``client``.

    Args:
        text: String to embed; sent as a one-element input list.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data``.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

def save_embedding_chunks(pdf_text_processed, model, embed_file, avg_token_len=4, max_tokens=1024, overlap=512):
    """Split text into overlapping chunks, embed each chunk and save them to CSV.

    The text is cut into windows of ``avg_token_len * max_tokens`` characters.
    Consecutive windows start ``chunk_len - overlap`` characters apart, so they
    share ``overlap`` characters. The windows always reach the end of the text:
    the last one may be shorter than ``chunk_len``, and a text shorter than one
    window produces a single chunk.

    Args:
        pdf_text_processed: The (already cleaned) text to chunk.
        model: Embedding model name passed to ``generate_embeddings``.
        embed_file: Path of the CSV file to write (columns ``text`` and
            ``embedding``).
        avg_token_len: Assumed number of characters per token.
        max_tokens: Chunk size in tokens.
        overlap: Number of characters shared by neighbouring chunks. Note that
            this is measured in characters, not tokens.

    Raises:
        ValueError: If ``overlap`` is not smaller than the chunk length.
    """
    chunk_len = avg_token_len * max_tokens
    step = chunk_len - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than the chunk length ({chunk_len})"
        )

    total = len(pdf_text_processed)
    # A window starting at or after `total - overlap` would lie entirely inside
    # the previous window, so starts stop there. This bound (unlike the former
    # `total - chunk_len + overlap`) guarantees the last window reaches the end
    # of the text and that short texts still yield one chunk.
    stop = max(total - max(overlap, 0), 1) if total else 0

    text_list = []
    embed_list = []
    for start in tqdm(range(0, stop, step)):
        text = pdf_text_processed[start:start + chunk_len]
        text_list.append(text)
        embed_list.append(generate_embeddings(text, model))

    df = pd.DataFrame({'text': text_list, 'embedding': embed_list})
    df.to_csv(embed_file, index=False)
    print(f"Saved Vector Embeddings at: {embed_file}")

def cosine(a, b):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``a`` and ``b`` divided by the product of
    their Euclidean norms. No guard is applied for zero-length vectors.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

def compute_similarity(embeddings, query_embed, top_k=5):
    """Rank stored chunks by cosine similarity to a query embedding.

    Args:
        embeddings: DataFrame with a ``text`` column and an ``embedding``
            column holding one vector per row.
        query_embed: Embedding vector of the query.
        top_k: Number of best-scoring rows to return.

    Returns:
        A new DataFrame with columns ``text``, ``embedding`` and ``score``
        containing the ``top_k`` rows with the highest score, best first. Rows
        keep the index labels they have in ``embeddings``, so the result's
        index can be matched against the input frame. An empty input yields an
        empty result.
    """
    scores = [cosine(chunk_embed, query_embed) for chunk_embed in embeddings['embedding']]
    # Build on a column subset of the input so the original index labels are
    # preserved (the previous implementation renumbered rows from 0).
    scored = embeddings[['text', 'embedding']].assign(score=scores)
    return scored.sort_values(by='score', ascending=False).iloc[:top_k]

def valid_response(embedding_df, query_embed, threshold=0.6, top_k=5):
    """Tell whether any of the top matches scores above ``threshold``.

    Args:
        embedding_df: Frame of stored chunks, passed to ``compute_similarity``.
        query_embed: Embedding vector to compare against the stored chunks.
        threshold: Score that must be strictly exceeded.
        top_k: Number of best matches to inspect.

    Returns:
        True if at least one of the ``top_k`` scores is greater than
        ``threshold``, otherwise False.
    """
    top_matches = compute_similarity(embedding_df, query_embed, top_k=top_k)
    above_threshold = top_matches['score'].values > threshold
    return bool(above_threshold.any())

def duplicate_response(previous_responses, resp_embed, threshold=0.9):
    """Tell whether a response embedding repeats an earlier one.

    Args:
        previous_responses: Iterable of embeddings of earlier responses.
        resp_embed: Embedding of the response being checked.
        threshold: Minimum similarity that counts as a duplicate.

    Returns:
        True if the cosine similarity to any earlier embedding, rounded to two
        decimals, is at least ``threshold``; False otherwise (including when
        there are no earlier responses).
    """
    return any(
        round(cosine(earlier, resp_embed), 2) >= threshold
        for earlier in previous_responses
    )

def is_satisfied(query_response, user_input):
    """Decide whether the user confirmed they are satisfied with the answer.

    The conversation counts as finished when the bot's message asked about
    satisfaction and the user's reply contains the word "yes".

    Args:
        query_response: The bot's latest message.
        user_input: The user's reply to that message.

    Returns:
        True when both conditions hold, otherwise False. Matching is
        case-insensitive.
    """
    # Accept "satisfied with response" as well as the more natural
    # "satisfied with the/this/my/our response", which the exact-phrase check
    # used to miss.
    asked = re.search(
        r"satisfied with (?:the |this |my |our )?response", query_response.lower()
    )
    # Whole-word match so that e.g. "eyes" or "yesterday" is not read as "yes".
    confirmed = re.search(r"\byes\b", user_input.lower())
    return asked is not None and confirmed is not None

##### Vectorized implementation of MMR (maximal marginal relevance) #####
    
def diversity_ranking(query_embed, selected_docs, unselected_docs, lambda_=0.7, top_k=5):
    """Score unselected documents with maximal marginal relevance (MMR).

    For every unselected document the score is
    ``lambda_ * cos(query, doc) - (1 - lambda_) * max_s cos(s, doc)``, where
    ``s`` ranges over the already selected documents. All scores are computed
    in one vectorized pass (the selected set is not updated between picks).

    Args:
        query_embed: Embedding vector of the query.
        selected_docs: Sequence of rows whose element ``[1]`` is the embedding
            of an already selected document. May be empty, in which case no
            diversity penalty is applied.
        unselected_docs: Sequence of rows whose element ``[0]`` is the document
            text and ``[1]`` its embedding. May be empty.
        lambda_: Weight of relevance versus diversity (higher favours
            relevance).
        top_k: Number of documents to return.

    Returns:
        DataFrame with columns ``doc`` and ``score`` holding the ``top_k``
        highest-scoring unselected documents, best first. Empty when
        ``unselected_docs`` is empty.
    """
    docs = [d[0] for d in unselected_docs]
    if not docs:
        # Nothing left to rank (e.g. the corpus has no more than top_k chunks).
        return pd.DataFrame({'doc': [], 'score': []})

    # convert embeddings to np arrays
    usd = np.array([d[1] for d in unselected_docs])
    q = np.array(query_embed).reshape(1, -1)
    usd_norms = np.linalg.norm(usd, axis=1).reshape(1, -1)

    # similarity of query with all unselected docs
    cos_one = np.dot(q, usd.T) / (np.linalg.norm(q, axis=1).reshape(-1, 1) * usd_norms)

    if len(selected_docs) > 0:
        # similarity of selected docs with all unselected docs; keep, for each
        # unselected doc, its similarity to the closest selected doc
        sd = np.array([d[1] for d in selected_docs])
        cos_two = np.dot(sd, usd.T) / (np.linalg.norm(sd, axis=1).reshape(-1, 1) * usd_norms)
        cos_two = np.max(cos_two, axis=0).reshape(1, -1)
    else:
        # No selected docs yet: there is nothing to be redundant with.
        cos_two = np.zeros_like(cos_one)

    # compute mmr scores and create df
    score = cos_one * lambda_ - (1 - lambda_) * cos_two
    score = score.flatten().tolist()
    score_df = pd.DataFrame({'doc': docs, 'score': score})

    # return top_k docs with highest MMR (maximal marginal relevance)
    top_k_idx = score_df['score'].nlargest(top_k).index
    return score_df.loc[top_k_idx]


def create_faiss_index(embedding_df, faiss_out, embed_col='embedding'):
    """Build a flat L2 Faiss index from stored embeddings and write it to disk.

    Args:
        embedding_df: DataFrame whose ``embed_col`` column holds one embedding
            per row, either as a JSON list string (as read back from the CSV)
            or as an already parsed sequence of numbers. The frame is not
            modified, so the function can be called repeatedly on it.
        faiss_out: Path the index is written to.
        embed_col: Name of the embedding column.

    Returns:
        The populated ``faiss.IndexFlatL2`` index.
    """
    # Parse only values that are still JSON strings; parsing into a local list
    # (instead of overwriting the column) keeps the caller's frame untouched.
    vectors = [json.loads(v) if isinstance(v, str) else v for v in embedding_df[embed_col]]
    # Faiss works on contiguous float32 matrices; Python floats would otherwise
    # be stacked as float64.
    embeddings = np.ascontiguousarray(np.vstack(vectors), dtype=np.float32)
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)
    faiss.write_index(index, faiss_out)
    print(f'Saving Faiss Index Locally at: {faiss_out}')
    return index

def faiss_top_k(df, query_embed, index, top_k=5):
    """Look up the ``top_k`` nearest stored rows for a query embedding.

    Args:
        df: DataFrame whose row positions correspond to the vectors added to
            ``index``.
        query_embed: Embedding vector of the query.
        index: Faiss index to search.
        top_k: Number of neighbours to request.

    Returns:
        A tuple ``(docs, S, I)``: ``docs`` is the list of matching rows of
        ``df`` (each row as a list of its values) and ``S`` / ``I`` are the
        score and label arrays exactly as returned by ``index.search``.
        ``docs`` can be shorter than ``top_k`` when the index returns fewer
        real hits.
    """
    # Faiss expects a contiguous float32 matrix with one query per row.
    search_embed = np.ascontiguousarray(
        np.array(query_embed, dtype=np.float32).reshape(1, -1)
    )
    S, I = index.search(search_embed, k=top_k)
    # Faiss pads missing neighbours with label -1; passing that to iloc would
    # silently return the last row of df, so such labels are skipped.
    hits = [int(label) for label in I[0] if label >= 0]
    docs = df.iloc[hits].values.tolist()
    return docs, S, I
    
#selected_docs, S, I = faiss_top_k(embedding_df, query_embed, index)

In [3]:
# initialize openai client
# This notebook-level `client` is the object used by generate_embeddings() and
# by the chat-completion call in the conversation loop.
# NOTE(review): no API key is passed explicitly — presumably the client picks
# its credentials up from the environment populated by load_dotenv() in the
# first cell; confirm.
client = OpenAI()

In [4]:
# load pdf file as text/str
# The location can be overridden with the GUIDELINE_PDF_PATH environment
# variable (e.g. from the .env file loaded in the first cell). By default the
# PDF is looked up relative to the working directory, like the other data files
# this notebook reads, instead of through an absolute path in one user's home.
file_path = os.environ.get(
    "GUIDELINE_PDF_PATH",
    "guideline-for-the-diagnosis-and-management-of-aortic-disease-a-report-of-the.pdf",
)
pdf_text = read_pdf(file_path)
# cell output: number of characters extracted from the PDF
len(pdf_text)

858137

In [5]:
# process pdf_text by removing special characters and stop words
# `stop_words` is kept as a notebook-level variable because later cells reuse
# it to clean the query and the model's responses the same way.
stop_words = get_stop_words()
pdf_text_processed = process_text(pdf_text, stop_words)
# cell output: length of the cleaned text in characters
len(pdf_text_processed)

704659

In [6]:
# One-time step, left commented out: it embeds every chunk of the cleaned PDF
# text and writes the CSV file that the following cell loads.
# # generate embeddings
# model = "text-embedding-3-small"
# embed_file = "overlap_embeddings_1k.csv"
# save_embedding_chunks(pdf_text_processed, model, embed_file)

In [7]:
# load embedding file (building it first when it has not been generated yet)
embed_file = "overlap_embeddings_1k.csv"
if not os.path.exists(embed_file):
    # Fresh checkout: create the chunk embeddings once so the notebook can run
    # top to bottom. This calls the embeddings API for every chunk, which is
    # why it only happens when the cached CSV is missing.
    save_embedding_chunks(pdf_text_processed, "text-embedding-3-small", embed_file)
# The CSV stores each embedding as a JSON-style list string; parse the column
# back into Python lists while reading.
embedding_df = pd.read_csv(embed_file, converters={"embedding": json.loads})

In [8]:
# Optional step, left commented out: the conversation loop below ranks chunks
# with compute_similarity() and diversity_ranking() and does not use this index.
# # create a faiss index locally
# index_path = "overlap_embeddings.faiss"
# embeddings = pd.read_csv(embed_file)
# index = create_faiss_index(embeddings, faiss_out=index_path)

### QnA Bot

The embedding model is OpenAI's `text-embedding-3-small`, and the text-generation model is `gpt-3.5-turbo`.

The bot carries previous interactions forward in the query string: each new query combines the bot's last response with the user's reply.
It embeds that query, fetches the `top_k` documents from the vector store, and continues the conversation using them as context.

In [9]:
# Model used by generate_embeddings() for queries and responses.
# NOTE(review): this should presumably be the same model that produced
# overlap_embeddings_1k.csv, otherwise similarity scores are not comparable.
embedding_model = "text-embedding-3-small"
# Chat-completion model used by the conversation loop.
chat_model = "gpt-3.5-turbo"
# Opening question of the conversation.
query = "How should I treat my 50 year old patient with aortic aneurysm"
# Clean the question the same way the PDF text was cleaned.
query = process_text(query, stop_words)
# Cleaned queries; the conversation loop appends one entry per turn.
queries = [query]

In [None]:
# Initialize user interaction and setup
EXIT_COMMANDS = {"exit", "quit"}  # typing one of these ends the chat at any point
user_satisfied = False
previous_responses = []  # embeddings of the responses shown so far
messages = [{}]  # slot 0 is overwritten with the current system prompt on every turn

print("Type 'exit' or 'quit' to end the conversation.")

while not user_satisfied:
    # Generate query embeddings and perform retrieval and ranking
    query_embed = generate_embeddings(query, embedding_model)
    selected_docs = compute_similarity(embedding_df, query_embed, top_k=5)
    unselected_docs = embedding_df.loc[~embedding_df.index.isin(selected_docs.index)]

    # Document Ranking using Maximum Marginal Relevance to select top_k docs (using higher lambda for relevance)
    # TODO: figure out a mechanism to tweak values of lambda in cases of invalid or duplicate responses.
    # NOTE(review): the context is built only from `ranked_docs`, i.e. from chunks
    # outside the five most similar ones in `selected_docs` — confirm this is intended.
    ranked_docs = diversity_ranking(query_embed, selected_docs.values.tolist(), unselected_docs.values.tolist(), lambda_=0.7, top_k=5)
    context = '\n'.join([d[0] for d in ranked_docs.values.tolist()])

    # System message setup
    system_msg = dedent(f"""
        You are a medical expert. You can only use the information provided in the context that comes from 
        our own data source to response. When asked a question, you will only ask relevant follow-up questions
        till you are able to find relevant sections of the data/information required by the user. If you cannot 
        answer, then respond with I don't know the answer. Once relevant information is provided to the user,
        ask them if they are satisfied with response (yes/no), if they answer is yes, then terminate the conversation. 
        Also, make sure NOT to ask duplicate questions based on previous chat history.
        Context: {context}
        """)

    messages[0] = {"role": "system", "content": system_msg}
    # The user turn carries the bot's previous response together with the user's
    # reply (see how `query` is rebuilt at the end of the loop).
    messages.append({"role": "user", "content": query})

    # Get model response
    response = client.chat.completions.create(model=chat_model, messages=messages, temperature=0)
    query_response = response.choices[0].message.content

    # Process and generate embedding for the response for duplication check
    resp_processed = process_text(query_response, stop_words)
    resp_embed = generate_embeddings(resp_processed, embedding_model)

    # Check for duplicate response
    if duplicate_response(previous_responses, resp_embed):
        # Previously the duplicate was still shown to the user and the
        # instruction below only took effect on the next turn. Now the rejected
        # reply and the instruction are recorded and the model is asked again,
        # once, so a persistent duplicate cannot cause an endless retry loop.
        msg = "It is a duplicate response, generate a new response."
        messages.append({"role": "assistant", "content": query_response})
        messages.append({"role": "user", "content": msg})
        response = client.chat.completions.create(model=chat_model, messages=messages, temperature=0)
        query_response = response.choices[0].message.content
        resp_processed = process_text(query_response, stop_words)
        resp_embed = generate_embeddings(resp_processed, embedding_model)

    # Display and verify the response
    print(f"\n{query_response}")
    previous_responses.append(resp_embed)  # Store the response embedding to check for future duplicates

    user_input = input()
    if user_input.strip().lower() in EXIT_COMMANDS:
        # Explicit way out: without it the loop only ends when is_satisfied()
        # recognises a confirmation, which may never happen.
        break
    user_satisfied = is_satisfied(query_response, user_input)

    # Next turn's query: the bot's response followed by the user's reply.
    query = dedent(f"{query_response}, {user_input}")
    queries.append(process_text(query, stop_words))



Is the patient showing any symptoms or complications related to the thoracic aortic aneurysm (TAA)?


 yes 



What specific symptoms or complications related to the thoracic aortic aneurysm (TAA) is the 50-year-old patient experiencing?


 they have chest pain and shortness of breath



Is the patient currently receiving any treatment for their chest pain and shortness of breath related to the thoracic aortic aneurysm (TAA)?


 not at the moment



Based on the symptoms of chest pain and shortness of breath related to the thoracic aortic aneurysm (TAA) in the 50-year-old patient, it is important to consider immediate medical intervention. Here are some follow-up questions to gather more information:

1. Has the patient been diagnosed with a thoracic aortic aneurysm (TAA) before experiencing these symptoms?
2. Are there any other medical conditions or risk factors that the patient has that could be contributing to the symptoms?
3. Has the patient had any previous imaging or diagnostic tests done for the thoracic aortic aneurysm (TAA)?
4. Is the patient currently on any medications that could be affecting their symptoms?

Please provide this information so that we can determine the appropriate course of action for the patient.


 the patient also has diabetes



Given the patient's symptoms of chest pain and shortness of breath, along with a known diagnosis of thoracic aortic aneurysm (TAA) and diabetes, it is crucial to address these concerns promptly. Here are some recommendations based on the information provided:

1. **Immediate Medical Attention**: Due to the symptoms of chest pain and shortness of breath, it is advisable for the patient to seek urgent medical evaluation at a healthcare facility.

2. **Management of Thoracic Aortic Aneurysm (TAA)**: The patient's TAA needs to be assessed for size and potential risk of complications. Treatment options may include medical therapy, endovascular stenting, or open surgical repair, depending on the size and location of the aneurysm.

3. **Diabetes Management**: Given the coexistence of diabetes, it is important to ensure that blood sugar levels are well-controlled to prevent any complications that could exacerbate the TAA symptoms.

4. **Cardiovascular Evaluation**: The patient may require fur

 yes



What specific information would you like to know more about regarding the treatment options for thoracic aortic aneurysm (TAA) in the 50-year-old patient with chest pain, shortness of breath, and diabetes?


 what medication can be given to this patient



Given the patient's thoracic aortic aneurysm (TAA) along with chest pain, shortness of breath, and diabetes, the management may involve the following medications:

1. **Blood Pressure Control**: Medications to control blood pressure are crucial in managing aortic aneurysms. This may include beta-blockers, calcium channel blockers, or ACE inhibitors to reduce the risk of aortic dissection.

2. **Diabetes Management**: For diabetes control, medications such as metformin, insulin, or other oral hypoglycemic agents may be prescribed to maintain blood sugar levels within the target range.

3. **Pain Management**: Analgesics may be given to manage chest pain associated with the thoracic aortic aneurysm.

4. **Antiplatelet Therapy**: Depending on the patient's overall cardiovascular risk, antiplatelet medications like aspirin may be considered to reduce the risk of clot formation.

5. **Cholesterol Management**: Statins may be prescribed to manage cholesterol levels and reduce the risk of at

 no



What specific information would you like to know more about regarding the medications used in the management of thoracic aortic aneurysm (TAA) in the 50-year-old patient with chest pain, shortness of breath, and diabetes?


 thanks for the information, i don't need anything else



You're welcome! If you have any more questions in the future or need further assistance, feel free to reach out. Take care!


 okay thank



I don't know the answer.


 ok



I don't know the answer.
