**This Notebook demonstrate how we can use OpenAI to match the clinical trail study with a doctor's note. This is notebook just for demo purpose, not intended to show to customer** 

In [5]:
#Install required python libries 

%pip install pytrials jmespath openai langchain redis tiktoken num2words matplotlib plotly scipy scikit-learn pandas

Collecting num2words
  Downloading num2words-0.5.12-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting plotly
  Downloading plotly-5.14.1-py2.py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting docopt>=0.6.2
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l- \ | done
[?25h  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=74e8446f91f57555ea70e36677cd5e1932296685d3fe65a3bb397a495af39d07
  Stored in directory: /home/azureuser/.cache/pip/wheels/7c/d7/8d/2156234738063e3d4a39ba77dc677046100e62766b53807189
Successfully built docopt
Installing collected packages: docopt, num2words, p

In [2]:
from pytrials.client import ClinicalTrials
import jmespath
import re
from openai.embeddings_utils import get_embedding, cosine_similarity
from redis import Redis
from redis.commands.search.query import Query
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.field import VectorField, TagField, TextField
from tenacity import retry, wait_random_exponential, stop_after_attempt
from langchain.llms import AzureOpenAI
from langchain.chains import LLMChain
from langchain.embeddings import OpenAIEmbeddings
from langchain import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
import openai
import uuid
import numpy as np
import pandas as pd
import os

from langchain.callbacks import get_openai_callback
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chat_models import AzureChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)


In [3]:
# setting up api keys

openai.api_type = "azure"
openai.api_base = 'https://<your endpoint>.openai.azure.com/'
openai.api_version = "2023-03-15-preview"
openai.api_key = '<Add your openai Key>'

redis_key = '<add your redis key>'
redis_endpoint = '<add your redis endpoint>'


In [4]:
# Redis configuration
#Ada: 1536 dimensions
#Babbage: 2048 dimensions
#Curie: 4096 dimensions
#Davinci: 12288 dimensions
DIM = 1536
VECT_NUMBER = 3155
index_name = "embeddings-index"
url_prefix = "https://clinicaltrials.gov/ct2/show/"

In [15]:
#setting up the models
doc_engine = 'text-embedding-ada-002'
query_engine = 'text-embedding-ada-002'
qna_engine = 'gpt-4-32k'
encoding_name = "cl100k_base"  # For second-generation embedding models like text-embedding-ada-002, use the cl100k_base encoding.
top_n_results = 3

In [5]:
#normalizaing text
def normalize_text(s, sep_token = " \n "):
    s = re.sub(r'\s+',  ' ', s).strip()
    s = re.sub(r". ,","",s)
    # remove all instances of multiple spaces
    s = s.replace("..",".")
    s = s.replace(". .",".")
    s = s.replace("\n", "")
    s = s.strip()
    
    return s

In [6]:
# retriveing the clincal study details
def get_clinical_study(study_name , num_studies):
    ct = ClinicalTrials()
    desc = ct.get_full_studies(search_expr=study_name, max_studies=num_studies)
    detail_lst = []    

    for i in range(num_studies):
        detail_lst.append([])
        NCTId = jmespath.search('FullStudiesResponse.FullStudies['+str(i)+'].Study.ProtocolSection.IdentificationModule.NCTId',desc)
        source_url = url_prefix + NCTId
        #RecruitmentStatus = jmespath.search('FullStudiesResponse.FullStudies['+str(i)+'].Study.ProtocolSection.StatusModule.OverallStatus',desc)
        #BriefSummary = jmespath.search('FullStudiesResponse.FullStudies['+str(i)+'].Study.ProtocolSection.DescriptionModule.BriefSummary',desc)
        #DetailedDescription = jmespath.search('FullStudiesResponse.FullStudies['+str(i)+'].Study.ProtocolSection.DescriptionModule.DetailedDescription',desc)
        EligibilityCriteria = jmespath.search('FullStudiesResponse.FullStudies['+str(i)+'].Study.ProtocolSection.EligibilityModule.EligibilityCriteria',desc)

        EligibilityCriteria = normalize_text(EligibilityCriteria)

        detail_lst[i].append(source_url)
        detail_lst[i].append(EligibilityCriteria)
        
        '''
        if DetailedDescription is None:
            BriefSummary = normalize_text(BriefSummary)
            detail_lst[i].append(source_url)
            detail_lst[i].append(BriefSummary + EligibilityCriteria)
        else:
            DetailedDescription = normalize_text(DetailedDescription)
            BriefSummary = normalize_text(BriefSummary)
            detail_lst[i].append(source_url)
            detail_lst[i].append(BriefSummary + DetailedDescription + EligibilityCriteria)
        '''
    return detail_lst

In [7]:
#counting the tokens 
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [8]:
#chunking the study EligibilityCriteria and returing the embeddings
def chunk_and_embed(text, source_url):

    full_data = {
       "source_url" : None,
        "text": None,
        "doc_embeddings": None
    }

    # initialize a text splitter (why? GPT-3 has a limited context window, so we need a way to chunk our documents and pass in the relevant content)
    
    text_splitter = RecursiveCharacterTextSplitter(
            chunk_size = 1000,
            chunk_overlap  = 200,
            length_function = len)
    
    chunks = text_splitter.split_text(text)

    #documents = []

    for i, chunk in enumerate(chunks):
        emb = get_embedding(chunk, engine = doc_engine)

        full_data['source_url'] = source_url
        full_data['text'] = text
        full_data['doc_embeddings'] = emb

        #doc = Document(chunk, embedding= emb, doc_id=f"doc_id_{i}", extra_info = source_url)
        #documents.append(doc)
    return full_data

In [9]:
#delete index
def delete_index(index):
    result = redis_conn.execute_command("FT.DROP", index)  
    print('Index Deleted.')

In [10]:
#create index
def create_index(redis_conn: Redis, index_name="embeddings-index", prefix = "embedding",number_of_vectors = VECT_NUMBER, distance_metric:str="COSINE"):
    source_url = TextField(name="source_url")
    text = TextField(name="text")
    embeddings = VectorField("embeddings",
                "HNSW", {
                    "TYPE": "FLOAT32",
                    "DIM": DIM,
                    "DISTANCE_METRIC": distance_metric,
                    "INITIAL_CAP": number_of_vectors,
                })
    # Create index
    redis_conn.ft(index_name).create_index(
        fields = [source_url, text, embeddings],
        definition = IndexDefinition(prefix=[prefix], index_type=IndexType.HASH)
    )

In [11]:
#settin the documents
def set_document(elem):
    index = str(uuid.uuid4())
    redis_conn.hset(
        f"embedding:{index}",
        mapping={
            "source_url": elem['source_url'],
            "text": elem['text'],
            "embeddings": np.array(elem['doc_embeddings']).astype(dtype=np.float32).tobytes()
        }
    )

In [12]:
#creating the index
def add_embeddings(source_url, text):
    embeddings = chunk_and_embed(source_url, text)
    if embeddings:
        # Store embeddings in Redis
        set_document(embeddings)


In [13]:
#Getting the documents
def get_documents(number_of_results: int=VECT_NUMBER):
    base_query = f'*'
    return_fields = ['id', 'source_url', 'text']
    query = Query(base_query)\
        .paging(0, number_of_results)\
        .return_fields(*return_fields)\
        .dialect(2)
    results = redis_conn.ft(index_name).search(query)
    if results.docs:
        return pd.DataFrame(list(map(lambda x: {'id' : x.id, 'source_url': x.source_url, 'text': x.text}, results.docs)))
    else:
        return pd.DataFrame()

In [16]:
#executing the query
def execute_query(np_vector:np.array, return_fields: list=[], search_type: str="KNN", number_of_results: int=top_n_results, vector_field_name: str="embeddings"):
    base_query = f'*=>[{search_type} {number_of_results} @{vector_field_name} $vec_param AS vector_score]'
    query = Query(base_query)\
        .sort_by("vector_score")\
        .paging(0, number_of_results)\
        .return_fields(*return_fields)\
        .dialect(2)
    
    params_dict = {"vec_param": np_vector.astype(dtype=np.float32).tobytes()}

    results = redis_conn.ft(index_name).search(query, params_dict)
    return pd.DataFrame(list(map(lambda x: {'id': x.id, 'source_url': x.source_url, 'text': x.text, 'vector_score': x.vector_score}, results.docs)))


In [17]:
# Semantically search using the computed embeddings on RediSearch
def search_semantic_redis(search_query, pprint=True):
    embedding = get_embedding(search_query, engine = query_engine)
    res = execute_query(np.array(embedding))

    if pprint:
        for r in res:
            print(r[:200])
            print()
    return res.reset_index()

In [18]:
#creating the context for LLM
def create_context(question):
    """
    Find most relevant context for a question via Redisearch
    """
    res = search_semantic_redis(question)
    if len(res) == 0:
        return None, "No vectors matched, try a different context."
    
    res_text = "\nStudy Link: ".join(res['source_url'] + "\n" + "Criteria: " + res["text"] + "\n\n")
    n_tokens = num_tokens_from_string(res_text, encoding_name)

    return (res_text)

In [122]:
#populating the study requirement deatils into vector store
def add_docs(study_name, number_of_studies):
    """
    Adding the base docuents' embeddings into Redisearch
    """
    # Check if Redis index exists
    index_name = "embeddings-index"
    #delete_index(index_name)
    try:
        if redis_conn.ft(index_name).info():
            print("Index exists")
    except:
        print("Index does not exist")
        print("Creating index")
        # Create index 
        create_index(redis_conn)

    studis = get_clinical_study(study_name, number_of_studies)

    for inner_list in studis:
        add_embeddings(inner_list[1],inner_list[0])


In [20]:
# Connect to the Redis server
redis_conn = Redis(host= redis_endpoint, port=10000, password=redis_key, ssl=True)

#Creating the index
#add_docs('COVID-19',10)

#add_docs('Breast Cancer',10)

In [21]:
result = get_documents(20)

result

Unnamed: 0,id,source_url,text
0,embedding:15e4d3b0-0fd2-4358-b6f4-9b374ad10a02,https://clinicaltrials.gov/ct2/show/NCT04372004,Inclusion Criteria: • Male or female over 18 y...
1,embedding:39dfa344-2dc7-4a4d-8cf4-91648ca5df7e,https://clinicaltrials.gov/ct2/show/NCT05384886,Inclusion Criteria: - All patients with a conf...
2,embedding:58f5644f-17b3-411f-8265-8ba7dd1b9934,https://clinicaltrials.gov/ct2/show/NCT04167605,Inclusion Criteria: Adult women (≥ 18 years) c...
3,embedding:bf437457-5f52-4f73-8947-49b8a2174450,https://clinicaltrials.gov/ct2/show/NCT05373459,Inclusion Criteria: All full time staff member...
4,embedding:2d2639d7-01e1-4c3f-aec8-3e703b9b1a8f,https://clinicaltrials.gov/ct2/show/NCT04892888,Inclusion Criteria: The participant is capable...
5,embedding:2defac16-0c5d-4efc-8a37-f1412fa75456,https://clinicaltrials.gov/ct2/show/NCT04516330,Inclusion Criteria: Having breast cancer Havin...
6,embedding:8649ea8b-a74f-4776-b88d-a634193f1707,https://clinicaltrials.gov/ct2/show/NCT03598660,Inclusion Criteria: breast cancer patients bef...
7,embedding:c4b633f9-2b67-4dd1-a727-1422a2cad2c9,https://clinicaltrials.gov/ct2/show/NCT04941144,Inclusion Criteria: Participants who have part...
8,embedding:cd674627-cf4b-4023-b45e-099767984380,https://clinicaltrials.gov/ct2/show/NCT05375799,Inclusion Criteria: Data from adult patients d...
9,embedding:d8ca8955-3e75-45c9-9e3e-9d1270f93b75,https://clinicaltrials.gov/ct2/show/NCT04657510,Inclusion Criteria: Age ≥45 years Admitted to ...


In [83]:
# defining the search query to retrive the related embedding from vector store

search_query = 'A 30 years old pragnent woman  presents with symptoms of COVID-19, including fever, cough, fatigue, and loss of taste and smell. The patient reports that these symptoms began approximatel 7 days ago and have progressively worsened since onset. The patient reports that they recently returned from international travel and had close contact with someone diagnosed with COVID-19.'

#search_query = 'John Smith is a 45-year-old male with no known underlying health conditions, who takes medication for high blood pressure and has no history of respiratory illness. He began experiencing symptoms of COVID-19 on March 22, 2023, including fever, cough, and shortness of breath, and tested positive for the virus on March 25, 2023. His symptoms worsened, leading to hospitalization on March 24, 2023. '

#search_query = 'Sarah Johnson is a 35-year-old female with no history of cancer, no known genetic mutations associated with breast cancer, and no significant family history of the disease. She takes medication for hypothyroidism. During a self-exam, Sarah discovered a lump in her breast. A biopsy confirmed the lump was cancerous, and she was diagnosed with stage II breast cancer. Based on this patient history, Sarah Johnson may be eligible for breast cancer clinical trials that require patients to have a breast cancer diagnosis at stage II or higher, with no known genetic mutations associated with breast cancer or significant family history of the disease.'

In [84]:
content = create_context(search_query)

content

id

source_url

text

vector_score



'https://clinicaltrials.gov/ct2/show/NCT04372004\nCriteria: Inclusion Criteria: • Male or female over 18 years of age at the time of enrollment Current symptoms of COVID-19 ; fever alone or fever and at least one of the following symptoms need to be present Dry cough Sore throat Shortness of breath Chills Muscle pain Headache New loss of taste or smell Chills with repeated shaking Exclusion Criteria: • Unwilling to provide informed consent Unwilling to undergo bi-weekly serological test during the 1-month enrollment\n\n\nStudy Link: https://clinicaltrials.gov/ct2/show/NCT02403505\nCriteria: Conducting an early phase clinical trial to assess COVID-19 Antigen Presentation Therapeutic Biological Product Mix that suggests the potential for clinical benefit of COVID-19 patients. 20 Lighter Than Mild COVID-19 Patients Inclusion Criteria: Lighter Than Mild COVID-19 Patients Positive testing COVID-19 by standard RT-PCR assay COVID-19 infection without symptoms Symptoms of mild illness with COV

In [85]:
#initializing chat instance from langchain
chatmodel = AzureChatOpenAI(
    openai_api_base=openai.api_base,
    openai_api_version=openai.api_version,
    deployment_name=qna_engine,
    openai_api_key=openai.api_key,
    openai_api_type=openai.api_type,
    model_name=qna_engine,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]), 
    verbose=True, 
    temperature=.7,
    streaming=True,
    max_tokens= 500
    )

In [86]:
# Creating the prompt and executing the prompt

prompt_template = """You are an helpful healthcare AI bot that helps docotor to find clinical trials matching from below studies. 
Make sure you always explain your answer for all {top_n_results} studies. If the question cannot be answered using the information provided then answer with "There is no much infomation", instead of making your own answers.
While answering always provide the source study link with concide extactive summary at the end.\n

Studies :\n\nStudy Link: {context}
"""
system_message_prompt = SystemMessagePromptTemplate.from_template(prompt_template)

human_template = "{query}"

human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

In [87]:
question = search_query + "Is this patient eligible for covid clinical trail?"

In [88]:
formatted_messages = chat_prompt.format_prompt(context=content, query=question, top_n_results = top_n_results).to_messages()

#print(formatted_messages[0].content)

llm_chain = LLMChain(llm=chatmodel,prompt=chat_prompt)

In [89]:

response = llm_chain.run(context=content, query=question, top_n_results = top_n_results)

This patient is not eligible for the clinical trials mentioned in the provided studies. The reason is that she is pregnant, which is listed as an exclusion criterion in Study NCT02403505 (https://clinicaltrials.gov/ct2/show/NCT02403505). The study specifically excludes pregnant individuals from participating.