# Custom Chatbot Project

I tried asking ChatGPT 3.5 questions about clinical trials and found that it could not provide the correct information. I therefore chose the clinical trials dataset from https://clinicaltrials.gov/ for this project, to build a custom chatbot that can answer questions of this type.

![alternative text](images/ChatGPT_Novartis.png)
![alternative text](images/ChatGPT_Pfizer.png)
![alternative text](images/ChatGPT_HER2-Positive.png)

In [1]:
#!pip install tiktoken

In [2]:
import ast
import io
import os

import numpy as np
import pandas as pd
import requests
import tiktoken
from openai import OpenAI
from requests.models import PreparedRequest
from scipy import spatial

In [3]:
# Notebook configuration. The three flags gate the expensive steps; leave them
# False to reuse the files already present in `data_dir`.
b_download = False           # re-download the raw CSV/JSON exports
b_merge_data = False         # rebuild the merged table and the "text" column
b_create_embeddings = False  # recompute the embeddings through the OpenAI API

data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

DATA_CSV_FILEPATH = os.path.join(data_dir, "clinical_trials.csv")
EMBEDDINGS_CSV_FILEPATH = os.path.join(data_dir, "clinical_trials_embeddings.csv")

# OpenAI
# Read the key from the OPENAI_API_KEY environment variable when it is set, so
# the secret never has to be saved inside the notebook. The original
# placeholder remains the fallback value.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR API KEY")

# OpenAI Models
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
# Legacy snapshot of gpt-3.5-turbo from June 13th 2023; announced for
# deprecation on June 13, 2024.
# NOTE(review): confirm this snapshot is still served before re-running.
COMPLETION_MODEL_NAME = "gpt-3.5-turbo-0613"

In [4]:
def get_url_data(fmt, category, start_years=(2022, 2023)):
    """Return the studies endpoint and the query parameters for one search.

    Args:
        fmt: Value for the ``format`` query parameter (the notebook uses
            ``'csv'`` and ``'json'``).
        category: Condition search expression, sent as ``query.cond``.
        start_years: Start years to keep. Each year becomes an
            ``AREA[StartDate]<year>`` term and the terms are joined with
            ``OR``. The default reproduces the original 2022/2023 filter.
            An empty sequence omits the filter entirely.

    Returns:
        A ``(base_url, params)`` tuple. ``params`` is a fresh dict on every
        call, so callers may add a ``pageToken`` entry to it.
    """
    base_url = "https://clinicaltrials.gov/api/v2/studies?"
    params = {
        'format': fmt,
        'markupFormat': 'markdown',
        'query.cond': category,
    }

    start_filter = " OR ".join(f"AREA[StartDate]{year}" for year in start_years)
    if start_filter:
        params['filter.advanced'] = start_filter

    return base_url, params

In [5]:
def download_csv_format(category, data_dir):
    """Download all result pages for `category` as CSV and save them as one file.

    Pages are requested until the response no longer carries an
    ``x-next-page-token`` header. The concatenated table is written to
    ``<data_dir>/clinical_trials_csv_fmt.csv`` and returned.

    Args:
        category: Condition search expression passed to `get_url_data`.
        data_dir: Directory that receives the combined CSV file.

    Returns:
        DataFrame holding the rows of every downloaded page.

    Raises:
        RuntimeError: If a page request does not return HTTP 200. Without
            this check an error body would be parsed as CSV rows.
    """
    base_url, params = get_url_data('csv', category)
    pages = []
    # Only the first page is parsed with a header row; later pages are read
    # headerless and given the first page's column names.
    # NOTE(review): assumes follow-up pages are served without a header line -
    # confirm against the API.
    header = 'infer'

    while True:
        response = requests.get(base_url, params=params, timeout=60)
        if response.status_code != 200:
            raise RuntimeError(f'Connection error: {response.status_code}')

        page = pd.read_csv(io.StringIO(response.content.decode('utf-8')), header=header)
        if pages:
            page.columns = pages[0].columns.values
        pages.append(page)
        header = None

        token = response.headers.get('x-next-page-token')
        if token is None:
            break
        params['pageToken'] = token

    df_csv = pd.concat(pages, axis=0, ignore_index=True)
    df_csv.to_csv(os.path.join(data_dir, "clinical_trials_csv_fmt.csv"), index=False)
    print(len(pages), df_csv.shape)
    return df_csv

In [6]:
def fetch_page_json(url: str):
    """Fetch `url` with a browser-like user agent and return the decoded JSON.

    Args:
        url: Fully prepared request URL.

    Returns:
        The parsed JSON body of the response.

    Raises:
        RuntimeError: If the response status is not 200. The original string
            concatenation with the integer status code raised a TypeError
            here instead of the intended error.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
    }
    r = requests.get(url, headers=headers, timeout=60)

    if r.status_code == 200:
        return r.json()
    raise RuntimeError(f'Connection error: {r.status_code}')

In [7]:
def convert_json_list_to_df(study_jsons, data_dir):
    """Flatten downloaded JSON pages into one row per study and save as CSV.

    Args:
        study_jsons: Iterable of page dicts, each with a ``'studies'`` list.
        data_dir: Directory that receives ``clinical_trials_json_fmt.csv``.

    Returns:
        DataFrame with the columns ``nct_id``, ``detailed_description``,
        ``sponsor_name``, ``sponsor_class``, ``collab_name``,
        ``collab_class`` and ``overall_status``. Missing modules or fields
        become empty strings. Multiple collaborators are joined with ``|``.
    """
    columns = [
        "nct_id",
        "detailed_description",
        "sponsor_name",
        "sponsor_class",
        "collab_name",
        "collab_class",
        "overall_status",
    ]
    study_list = []

    for cur_json in study_jsons:
        for study in cur_json['studies']:
            protocol = study.get('protocolSection') or {}
            # Every module is optional here, so one sparse record cannot
            # abort the whole conversion.
            id_module = protocol.get('identificationModule') or {}
            desc_module = protocol.get('descriptionModule') or {}
            status_module = protocol.get('statusModule') or {}
            sponsor_module = protocol.get('sponsorCollaboratorsModule') or {}

            lead_sponsor = sponsor_module.get('leadSponsor') or {}
            collaborators = sponsor_module.get('collaborators') or []

            study_list.append({
                "nct_id": id_module.get("nctId", ""),
                "detailed_description": desc_module.get("detailedDescription", ""),
                "sponsor_name": lead_sponsor.get("name", ""),
                "sponsor_class": lead_sponsor.get("class", ""),
                "collab_name": '|'.join(c.get("name", "") for c in collaborators),
                "collab_class": '|'.join(c.get("class", "") for c in collaborators),
                "overall_status": status_module.get("overallStatus", ""),
            })

    # Passing the column list keeps the CSV header stable for empty input.
    df_json = pd.DataFrame(study_list, columns=columns)
    df_json.to_csv(os.path.join(data_dir, "clinical_trials_json_fmt.csv"), index=False)
    return df_json

In [8]:
def download_json_format(category, data_dir):
    """Download all JSON result pages for `category` and flatten them.

    Pages are fetched until a response no longer contains a
    ``nextPageToken`` key. The collected pages are passed to
    `convert_json_list_to_df`, which also writes the CSV into `data_dir`.
    The page count and the resulting shape are printed.

    Returns:
        The DataFrame produced by `convert_json_list_to_df`.
    """
    endpoint, query = get_url_data('json', category)
    pages = []
    page_token = None

    while True:
        # The first request carries no token; later ones continue from the
        # token returned by the previous page.
        if page_token is not None:
            query['pageToken'] = page_token

        request = PreparedRequest()
        request.prepare_url(endpoint, query)
        payload = fetch_page_json(request.url)
        pages.append(payload)

        if 'nextPageToken' not in payload:
            break
        page_token = payload['nextPageToken']

    df_json = convert_json_list_to_df(pages, data_dir)
    print(len(pages), df_json.shape)
    return df_json

In [9]:
# Optional refresh of the raw data; skipped unless b_download is True.
if b_download:
    category = 'Breast Cancer OR Lung Cancer'
    # Both exports are downloaded; load_data later merges them on the NCT id.
    df_csv = download_csv_format(category, data_dir)
    df_json = download_json_format(category, data_dir)

In [10]:
def load_data(b_merge_data, data_dir):
    """Return the industry-sponsored trials, optionally rebuilding the dataset.

    When `b_merge_data` is true, the CSV and JSON exports in `data_dir` are
    merged on ``nct_id`` and a ``text`` column is composed from sponsor,
    start year, NCT id and title. The merged table is written to
    ``DATA_CSV_FILEPATH``, and the ``text`` column of the industry-sponsored
    rows is written to ``EMBEDDINGS_CSV_FILEPATH``. Otherwise the previously
    saved ``DATA_CSV_FILEPATH`` is loaded.

    Args:
        b_merge_data: Rebuild the merged files when true, else load them.
        data_dir: Directory holding the two raw export CSV files.

    Returns:
        DataFrame restricted to rows whose ``sponsor_class`` is
        ``'INDUSTRY'``, with a fresh 0..n-1 index.
    """
    if b_merge_data:
        df_json = pd.read_csv(os.path.join(data_dir, "clinical_trials_json_fmt.csv"))
        df_csv = pd.read_csv(os.path.join(data_dir, "clinical_trials_csv_fmt.csv"))
        print(df_json.shape, df_csv.shape)

        df_csv = df_csv.rename(columns={'NCT Number': 'nct_id'})
        # These two columns are dropped from the JSON side before merging;
        # the text below takes the sponsor from the CSV 'Sponsor' column.
        df_json = df_json.drop(columns=["sponsor_name", "collab_name"])
        df = pd.merge(df_csv, df_json, on='nct_id', how='left')

        # Vectorised split; a missing start date yields NaN instead of
        # raising inside a per-row lambda.
        df['Start Year'] = df['Start Date'].str.split("-").str[0]

        col_names = ['Sponsor', 'Start Year', 'nct_id', 'Study Title']
        key_names = ['sponsor', 'start year', 'nct_id', 'title']
        labelled = [key + ": " + df[col] for key, col in zip(key_names, col_names)]
        text = labelled[0]
        for part in labelled[1:]:
            text = text + ", " + part
        df['text'] = text

        # Save the full merged table before filtering.
        df.to_csv(DATA_CSV_FILEPATH, index=False)

        # reset_index returns a new frame, so the result must be re-bound
        # (the original discarded it and the index was never reset).
        df = df[df['sponsor_class'] == 'INDUSTRY'].reset_index(drop=True)
        df[['text']].to_csv(EMBEDDINGS_CSV_FILEPATH, index=False)
    else:
        df = pd.read_csv(DATA_CSV_FILEPATH)
        df = df[df['sponsor_class'] == 'INDUSTRY'].reset_index(drop=True)

    return df

In [11]:
# Load the industry-sponsored trials (rebuilt from the raw exports only when b_merge_data is True).
df = load_data(b_merge_data, data_dir)

## Data Wrangling

TODO: In the cells below, load your chosen dataset into a `pandas` dataframe with a column named `"text"`. This column should contain all of your text data, separated into at least 20 rows.

In [12]:
# Single client instance, passed to the embedding and chat-completion helpers below.
openai_client = OpenAI(api_key = OPEN_API_KEY)

In [13]:
def get_embeddings(text, openai_client, model=EMBEDDING_MODEL_NAME):
    """Request embeddings for `text` and return them as a list of vectors.

    Args:
        text: Input forwarded unchanged as the ``input`` argument of the
            embeddings request (the notebook passes a string or a list of
            strings).
        openai_client: Client exposing ``embeddings.create``.
        model: Name of the embedding model to use.

    Returns:
        A list with the ``embedding`` attribute of every entry in the
        response's ``data``.
    """
    result = openai_client.embeddings.create(model=model, input=text)
    return [item.embedding for item in result.data]

In [14]:
def create_embeddings(openai_client, batch_size=2048):
    """Embed every row of the embeddings CSV and write the vectors back.

    Reads ``EMBEDDINGS_CSV_FILEPATH``, embeds its ``text`` column and
    overwrites the file with an added ``embeddings`` column.

    Args:
        openai_client: Client forwarded to `get_embeddings`.
        batch_size: Maximum number of texts per request. Texts are sent in
            consecutive slices so a large dataset does not exceed the API's
            per-request input-array limit. Datasets no larger than
            `batch_size` still go out in a single request, as before.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    em_df = pd.read_csv(EMBEDDINGS_CSV_FILEPATH)
    texts = em_df["text"].tolist()

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        embeddings.extend(get_embeddings(batch, openai_client))

    # One embedding list per row, in the same order as the texts.
    em_df["embeddings"] = embeddings
    em_df.to_csv(EMBEDDINGS_CSV_FILEPATH, index=False)


In [15]:
# Recompute the embeddings only on request; otherwise reuse the saved file.
if b_create_embeddings:
    create_embeddings(openai_client)

em_df = pd.read_csv(EMBEDDINGS_CSV_FILEPATH)
# The CSV stores each vector as its list literal. ast.literal_eval parses
# that literal without executing arbitrary code, unlike eval.
em_df["embeddings"] = em_df["embeddings"].apply(ast.literal_eval).apply(np.array)
em_df

Unnamed: 0,text,embeddings
0,"sponsor: SOTIO Biotech AG, start year: 2022, n...","[-0.022477557882666588, -0.03247052803635597, ..."
1,"sponsor: Adela, Inc, start year: 2022, nct_id:...","[-0.008026139810681343, -0.012029202654957771,..."
2,"sponsor: AstraZeneca, start year: 2022, nct_id...","[-0.011025339365005493, -0.010883902199566364,..."
3,"sponsor: Iksuda Therapeutics Ltd., start year:...","[-0.00710006570443511, -0.016954582184553146, ..."
4,"sponsor: Health Clinics Limited, start year: 2...","[-0.004834556486457586, -0.02596236951649189, ..."
...,...,...
852,"sponsor: Xoft, Inc., start year: 2022, nct_id:...","[-0.0140214953571558, -0.011602317914366722, -..."
853,"sponsor: AstraZeneca, start year: 2023, nct_id...","[-0.02016773261129856, -0.0142084751278162, 0...."
854,"sponsor: AstraZeneca, start year: 2022, nct_id...","[-0.007853938266634941, -0.010458695702254772,..."
855,"sponsor: Tempus AI, start year: 2022, nct_id: ...","[-0.010268607176840305, -0.005001166369765997,..."


In [16]:
# Show the complete context string of the first row (the table display truncates it).
em_df.loc[0].text

'sponsor: SOTIO Biotech AG, start year: 2022, nct_id: NCT05256381, title: A Study of SOT101 in Combination With Pembrolizumab to Evaluate the Efficacy and Safety in Patients With Selected Advanced Solid Tumors'

## Custom Query Completion

TODO: In the cells below, compose a custom query using your chosen dataset and retrieve results from an OpenAI `Completion` model. You may copy and paste any useful code from the course materials.

In [17]:
# Token budget shared by the question and the context passages; read by
# build_prompt_w_context. The fixed instruction text of the system message
# is not counted against it.
token_limit = 1000
# Tokenizer for the completion model, used to count prompt tokens.
tokenizer = tiktoken.encoding_for_model(COMPLETION_MODEL_NAME)

In [18]:
def build_basic_prompt(USER_QUESTION):
    """Wrap the question in a one-message chat prompt without any context.

    Returns:
        A list containing a single ``user`` message whose content is
        `USER_QUESTION`.
    """
    user_message = {'role': 'user', 'content': USER_QUESTION}
    return [user_message]

In [19]:
def build_prompt_w_context(df, USER_QUESTION, tokenizer):
    """Build a chat prompt whose system message carries dataset context.

    Rows of ``df["text"]`` are taken in their current order and added as
    context until the running token count (question plus passages) would
    exceed the module-level ``token_limit``.

    Args:
        df: DataFrame with a ``text`` column; callers sort it by relevance.
        USER_QUESTION: The question, sent as the ``user`` message.
        tokenizer: Object whose ``encode`` method returns a token sequence.

    Returns:
        A two-message list: the ``system`` instructions with the selected
        context, followed by the ``user`` question.
    """
    # The budget is charged with the question first.
    used_tokens = len(tokenizer.encode(USER_QUESTION))

    selected = []
    for passage in df["text"].values:
        used_tokens += len(tokenizer.encode(passage))
        if used_tokens > token_limit:
            # This passage would overshoot the budget; stop collecting.
            break
        selected.append(passage)

    system_content = """
            Answer the question based on the context below. If the question can't be answered based on the context say "I don't know the answer". 
            Context is annotated with "sponsor", "start year", "nct_id" and "title". Context contains facts from year 2022 & 2023.
            Context: {}""".format("\n\n###\n\n".join(selected))

    return [
        {'role': 'system', 'content': system_content},
        {'role': 'user', 'content': USER_QUESTION},
    ]

In [20]:
def openai_query(prompt, openai_client, model=COMPLETION_MODEL_NAME, max_tokens=250, seed=813547):
    """Send a chat prompt to the completion model and return the reply text.

    Args:
        prompt: List of chat messages, passed as ``messages``.
        openai_client: Client exposing ``chat.completions.create``.
        model: Name of the chat completion model.
        max_tokens: Value forwarded as the request's ``max_tokens``.
        seed: Value forwarded as the request's ``seed``.

    Returns:
        The message content of the first choice in the response.
    """
    request_args = {
        'model': model,
        'messages': prompt,
        'seed': seed,
        'max_tokens': max_tokens,
    }
    completion = openai_client.chat.completions.create(**request_args)

    first_choice = completion.choices[0]
    return first_choice.message.content

In [21]:
def custom_query(question, df, openai_client, tokenizer):
    """Answer `question` using the most similar dataset rows as context.

    The question is embedded and every row of `df` is ranked by cosine
    distance to it. The closest rows are packed into the prompt by
    `build_prompt_w_context`, and the model's answer is printed.

    Args:
        question: The user's question.
        df: DataFrame with ``text`` and ``embeddings`` columns. It is
            modified in place: a ``distances`` column is added and the rows
            are re-sorted by ascending distance.
        openai_client: Client used for the embedding and completion calls.
        tokenizer: Tokenizer forwarded to `build_prompt_w_context`.
    """
    question_embedding = np.array(get_embeddings(question, openai_client)[0])

    df["distances"] = [
        spatial.distance.cosine(question_embedding, embedding)
        for embedding in df["embeddings"]
    ]
    df.sort_values(by="distances", ascending=True, inplace=True)

    # Use the `question` argument. The original read the USER_QUESTION global
    # here, so the prompt could differ from the question that was embedded.
    prompt = build_prompt_w_context(df, question, tokenizer)
    answer = openai_query(prompt, openai_client)
    print("Custom Query answer: ")
    print(answer)

In [22]:
def basic_query(question, openai_client):
    """Ask the model `question` without any dataset context and print the reply.

    Args:
        question: The user's question.
        openai_client: Client used for the completion call.
    """
    # Use the `question` argument. The original read the USER_QUESTION global
    # here and ignored the parameter.
    prompt = build_basic_prompt(question)
    answer = openai_query(prompt, openai_client)
    print("Basic Query answer: ")
    print(answer)

## Custom Performance Demonstration

TODO: In the cells below, demonstrate the performance of your custom query using at least 2 questions. For each question, show the answer from a basic `Completion` model query as well as the answer from your custom query.

### Question 1

In [23]:
# Question 1: lung cancer trials started in 2022 and sponsored by Novartis.
USER_QUESTION = "What are 3 clinical trials on lung cancer started in year 2022 sponsored by Novartis"

In [24]:
# Baseline: the question is sent alone, with no dataset context.
basic_query(USER_QUESTION, openai_client)

Basic Query answer: 
I'm sorry, but as an AI language model, I don't have real-time data or the ability to browse the internet. Therefore, I don't have access to information about specific clinical trials that might have started in 2022 or are sponsored by Novartis. To find accurate and up-to-date information on clinical trials sponsored by Novartis, I suggest visiting their official website or checking reputable clinical trial registries such as ClinicalTrials.gov.


In [25]:
# Same question, with the closest dataset rows supplied as context.
custom_query(USER_QUESTION, em_df, openai_client, tokenizer)

Custom Query answer: 
1. Clinical trial: Study of MGY825 in Patients With Advanced Non-small Cell Lung Cancer
   - Sponsor: Novartis Pharmaceuticals
   - Start year: 2022
   - NCT ID: NCT05275868

2. Clinical trial: Study of JDQ443 in Comparison With Docetaxel in Participants With Locally Advanced or Metastatic KRAS G12C Mutant Non-small Cell Lung Cancer
   - Sponsor: Novartis Pharmaceuticals
   - Start year: 2022
   - NCT ID: NCT05132075

3. Clinical trial: Phase II of Neoadjuvant and Adjuvant Capmatinib in NSCLC
   - Sponsor: Novartis Pharmaceuticals
   - Start year: 2022
   - NCT ID: NCT04926831


### Question 2

In [26]:
# Question 2: breast cancer trials started in 2023 and sponsored by Pfizer.
USER_QUESTION = "What are 3 clinical trials on breast cancer started in year 2023 and sponsored by Pfizer?"

In [27]:
# Baseline: the question is sent alone, with no dataset context.
basic_query(USER_QUESTION, openai_client)

Basic Query answer: 
I'm sorry, but as an AI language model, I don't have access to real-time data on ongoing or future clinical trials. Additionally, I cannot browse the internet or access specific company databases. To find accurate and up-to-date information on clinical trials sponsored by Pfizer in 2023, I recommend visiting credible sources such as Pfizer's official website, clinical trial registries, or consulting with medical professionals and researchers specializing in breast cancer. They will be able to provide you with the most accurate and relevant information.


In [28]:
# Same question, with the closest dataset rows supplied as context.
custom_query(USER_QUESTION, em_df, openai_client, tokenizer)

Custom Query answer: 
1. Clinical Trial: Study to Compare Overall Survival in Medicare Patients With Metastatic Breast Cancer Treated With a Medicine Called Palbociclib in Combination With Aromatase Inhibitor and Aromatase Inhibitor by Itself.
   - Sponsor: Pfizer
   - Start Year: 2023
   - Clinical Trial ID: NCT06086340

2. Clinical Trial: A Study to Understand the Use of Palbociclib in Canadian Patients With Breast Cancer That Has Spread to Other Organs.
   - Sponsor: Pfizer
   - Start Year: 2023
   - Clinical Trial ID: NCT06003114

3. Clinical Trial: A Study of ARV-471 (PF-07850327) Plus Palbociclib Versus Letrozole Plus Palbociclib in Participants With Estrogen Receptor Positive, Human Epidermal Growth Factor Negative Advanced Breast Cancer.
   - Sponsor: Pfizer
   - Start Year: 2023
   - Clinical Trial ID: NCT05909397


### Question 3

In [29]:
# Question 3: HER2-positive breast cancer trials started in 2022 (no sponsor constraint).
USER_QUESTION = "What are 3 clinical trials on HER2-Positive breast cancer started in year 2022?"

In [30]:
# Baseline: the question is sent alone, with no dataset context.
basic_query(USER_QUESTION, openai_client)

Basic Query answer: 
I'm sorry, but as an AI language model, I don't have access to real-time data on ongoing or future clinical trials. Clinical trials are constantly being conducted and new ones are being initiated. To find up-to-date information on clinical trials for HER2-positive breast cancer starting in 2022, I recommend checking reputable sources such as clinical trial registries (such as ClinicalTrials.gov), research institutes, or consulting with healthcare professionals or organizations specializing in cancer research. They can provide you with the most current information and guide you to relevant clinical trials.


In [31]:
# Same question, with the closest dataset rows supplied as context.
custom_query(USER_QUESTION, em_df, openai_client, tokenizer)

Custom Query answer: 
1. sponsor: Yuhan Corporation, start year: 2022, nct_id: NCT05523947, title: Clinical Trial of YH32367 in Patients With HER2 Positive Locally Advanced or Metastatic Solid Tumor

2. sponsor: Accutar Biotechnology Inc, start year: 2022, nct_id: NCT05654532, title: Study of AC699 in Patients With Estrogen Receptor Positive/Human Epidermal Growth Factor Receptor 2 Negative (ER+/HER2-) Locally Advanced or Metastatic Breast Cancer

3. sponsor: Kind Pharmaceuticals LLC, start year: 2022, nct_id: NCT05187832, title: A Study of AND019 in Women With ER Positive HER2 Negative Advanced or Metastatic Breast Cancer


### Question 4

In [32]:
# Question 4: non-small cell lung cancer trials started in 2022 (no sponsor constraint).
USER_QUESTION = "What are 3 clinical trials on Non-small Cell Lung Cancer started in year 2022?"

In [33]:
# Baseline: the question is sent alone, with no dataset context.
basic_query(USER_QUESTION, openai_client)

Basic Query answer: 
I'm sorry, but I am unable to browse the internet or access real-time information. Therefore, I do not have access to clinical trial data for the year 2022. It would be best to consult a reliable source like clinical trial registries or medical databases to find the specific information you are looking for.


In [34]:
# Same question, with the closest dataset rows supplied as context.
custom_query(USER_QUESTION, em_df, openai_client, tokenizer)

Custom Query answer: 
1. Trial 1: 
   - Sponsor: Bristol-Myers Squibb
   - Start Year: 2022
   - NCT ID: NCT05599685
   - Title: A Study of Nivolumab, Ipilimumab, and Chemotherapy in Participants With Non-small Cell Lung Cancer

2. Trial 2: 
   - Sponsor: Novartis Pharmaceuticals
   - Start Year: 2022
   - NCT ID: NCT05275868
   - Title: Study of MGY825 in Patients With Advanced Non-small Cell Lung Cancer

3. Trial 3: 
   - Sponsor: AstraZeneca
   - Start Year: 2022
   - NCT ID: NCT05061550
   - Title: Neoadjuvant and Adjuvant Treatment in Resectable Non-small Cell Lung Cancer


### Verifying Results

In [35]:
# Question 1 check: look up the three NCT ids cited in the custom answer and
# print the matching dataset rows for comparison.
print(em_df[em_df.text.str.contains("NCT05275868")]['text'].values)
print(em_df[em_df.text.str.contains("NCT05132075")]['text'].values)
print(em_df[em_df.text.str.contains("NCT04926831")]['text'].values)

['sponsor: Novartis Pharmaceuticals, start year: 2022, nct_id: NCT05275868, title: Study of MGY825 in Patients With Advanced Non-small Cell Lung Cancer']
['sponsor: Novartis Pharmaceuticals, start year: 2022, nct_id: NCT05132075, title: Study of JDQ443 in Comparison With Docetaxel in Participants With Locally Advanced or Metastatic KRAS G12C Mutant Non-small Cell Lung Cancer']
['sponsor: Novartis Pharmaceuticals, start year: 2022, nct_id: NCT04926831, title: Phase II of Neoadjuvant and Adjuvant Capmatinib in NSCLC']


In [36]:
# Question 2 check: look up the three NCT ids cited in the custom answer and
# print the matching dataset rows for comparison.
print(em_df[em_df.text.str.contains("NCT06086340")]['text'].values)
print(em_df[em_df.text.str.contains("NCT06003114")]['text'].values)
print(em_df[em_df.text.str.contains("NCT05909397")]['text'].values)

['sponsor: Pfizer, start year: 2023, nct_id: NCT06086340, title: Study to Compare Overall Survival in Medicare Patients With Metastatic Breast Cancer Treated With a Medicine Called Palbociclib in Combination With Aromatase Inhibitor and Aromatase Inhibitor by Itself.']
['sponsor: Pfizer, start year: 2023, nct_id: NCT06003114, title: A Study to Understand the Use of Palbociclib in Canadian Patients With Breast Cancer That Has Spread to Other Organs']
['sponsor: Pfizer, start year: 2023, nct_id: NCT05909397, title: A Study of ARV-471 (PF-07850327) Plus Palbociclib Versus Letrozole Plus Palbociclib in Participants With Estrogen Receptor Positive, Human Epidermal Growth Factor Negative Advanced Breast Cancer']


In [37]:
# Question 3 check: look up the three NCT ids cited in the custom answer and
# print the matching dataset rows for comparison.
print(em_df[em_df.text.str.contains("NCT05523947")]['text'].values)
print(em_df[em_df.text.str.contains("NCT05654532")]['text'].values)
print(em_df[em_df.text.str.contains("NCT05187832")]['text'].values)

['sponsor: Yuhan Corporation, start year: 2022, nct_id: NCT05523947, title: Clinical Trial of YH32367 in Patients With HER2 Positive Locally Advanced or Metastatic Solid Tumor']
['sponsor: Accutar Biotechnology Inc, start year: 2022, nct_id: NCT05654532, title: Study of AC699 in Patients With Estrogen Receptor Positive/Human Epidermal Growth Factor Receptor 2 Negative (ER+/HER2-) Locally Advanced or Metastatic Breast Cancer']
['sponsor: Kind Pharmaceuticals LLC, start year: 2022, nct_id: NCT05187832, title: A Study of AND019 in Women With ER Positive HER2 Negative Advanced or Metastatic Breast Cancer']


In [38]:
# Question 4 check: look up the three NCT ids cited in the custom answer and
# print the matching dataset rows for comparison.
print(em_df[em_df.text.str.contains("NCT05599685")]['text'].values)
print(em_df[em_df.text.str.contains("NCT05275868")]['text'].values)
print(em_df[em_df.text.str.contains("NCT05061550")]['text'].values)

['sponsor: Bristol-Myers Squibb, start year: 2022, nct_id: NCT05599685, title: A Study of Nivolumab, Ipilimumab, and Chemotherapy in Participants With Non-small Cell Lung Cancer']
['sponsor: Novartis Pharmaceuticals, start year: 2022, nct_id: NCT05275868, title: Study of MGY825 in Patients With Advanced Non-small Cell Lung Cancer']
['sponsor: AstraZeneca, start year: 2022, nct_id: NCT05061550, title: Neoadjuvant and Adjuvant Treatment in Resectable Non-small Cell Lung Cancer']
