## Setup

In [None]:
!pip install llama-index
!pip install langchain
!pip install pinecone-client
!pip install openai
!pip install sentence-transformers
!pip install replicate
!pip install datasets

Collecting llama-index
  Downloading llama_index-0.9.12-py3-none-any.whl (927 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m927.7/927.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting aiostream<0.6.0,>=0.5.2 (from llama-index)
  Downloading aiostream-0.5.2-py3-none-any.whl (39 kB)
Collecting beautifulsoup4<5.0.0,>=4.12.2 (from llama-index)
  Downloading beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from llama-index)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting deprecated>=1.2.9.3 (from llama-index)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting httpx (from llama-index)
  Downloading httpx-0.25.2-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collect

In [None]:
import os
from getpass import getpass

# Credentials must never be stored in the notebook itself. Each key is taken
# from the environment when it is already exported; otherwise it is requested
# interactively (getpass does not echo the input).
# NOTE: any key that was previously committed in this file should be treated as
# leaked and revoked with the provider.
for _secret_name in ("OPENAI_API_KEY", "REPLICATE_API_TOKEN"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ")

## Step 1: Problem Statement

- Healthcare seekers often rely on online sources for medication side effect information, but the reliability is compromised.
- This project aims to use a Large Language Model (LLM) to address misinformation in information retrieval.
- The project focuses on developing an LLM-based system for accurate and contextually relevant responses to user queries about drug side effects.
- The system will support responses with citations from reputable sources to enhance information credibility.


## Step 2: Load dataset


In [None]:
# Colab only: uncomment to upload drugs_side_effects_drugs_com.csv into the session.
# from google.colab import files
# uploaded = files.upload()

In [None]:
import pandas as pd

# Read the side-effects table from the working directory, then print a
# five-row preview followed by the (rows, columns) shape.
data = pd.read_csv("drugs_side_effects_drugs_com.csv")
for summary in (data.head(), data.shape):
    print(summary)

        drug_name medical_condition  \
0     doxycycline              Acne   
1  spironolactone              Acne   
2     minocycline              Acne   
3        Accutane              Acne   
4     clindamycin              Acne   

                                        side_effects         generic_name  \
0  (hives, difficult breathing, swelling in your ...          doxycycline   
1  hives ; difficulty breathing; swelling of your...       spironolactone   
2  skin rash, fever, swollen glands, flu-like sym...          minocycline   
3  problems with your vision or hearing; muscle o...  isotretinoin (oral)   
4  hives ; difficult breathing; swelling of your ...  clindamycin topical   

                                        drug_classes  \
0         Miscellaneous antimalarials, Tetracyclines   
1  Aldosterone receptor antagonists, Potassium-sp...   
2                                      Tetracyclines   
3  Miscellaneous antineoplastics, Miscellaneous u...   
4       Topical acne a

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model configuration: run on CPU and do not normalise the vectors.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

## Step 3: Retrieval technique
- Define Pinecone index
- Upsert vector embeddings to Pinecone index

In [None]:
import os
from getpass import getpass

import pinecone

index_name = 'langchain-rag'

# The Pinecone key is read from PINECONE_API_KEY when exported, otherwise it is
# prompted for without echo. It must not be written into the notebook; a key
# that was previously committed here should be revoked.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass("PINECONE_API_KEY: "),
    environment="gcp-starter",
)

In [None]:
# Open a handle to the Pinecone index named by `index_name`.
index = pinecone.Index(index_name)
# Bare last expression: the returned statistics are rendered as the cell output.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.02931,
 'namespaces': {'': {'vector_count': 2931}},
 'total_vector_count': 2931}

### Each chunk needs its own ID, which apparently can't be just an integer, so we reuse the SQuAD dataset's IDs.

In [None]:
from datasets import load_dataset

# Record IDs are borrowed from the SQuAD training split: one SQuAD `id` string
# per drug row, taken in row order.
stanforddata = load_dataset('squad', split='train')

stanforddata = stanforddata.to_pandas()

# Size the slice from the frame itself rather than a hard-coded row count, and
# assign positionally so the result does not depend on how `data` is indexed.
n_rows = len(data)
assert n_rows <= len(stanforddata), "not enough SQuAD ids to label every row"
data['id'] = stanforddata['id'].head(n_rows).to_numpy()
print(data.head())

# Inspect the column's dtype and one cell's Python type before coercing.
column_type = data['side_effects'].dtypes
print(column_type)
print(type(data.loc[1, 'side_effects']))
# Force every entry to text. NOTE: astype(str) turns missing values into the
# literal string "nan".
data['side_effects'] = data['side_effects'].astype(str)

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

        drug_name medical_condition  \
0     doxycycline              Acne   
1  spironolactone              Acne   
2     minocycline              Acne   
3        Accutane              Acne   
4     clindamycin              Acne   

                                        side_effects         generic_name  \
0  (hives, difficult breathing, swelling in your ...          doxycycline   
1  hives ; difficulty breathing; swelling of your...       spironolactone   
2  skin rash, fever, swollen glands, flu-like sym...          minocycline   
3  problems with your vision or hearing; muscle o...  isotretinoin (oral)   
4  hives ; difficult breathing; swelling of your ...  clindamycin topical   

                                        drug_classes  \
0         Miscellaneous antimalarials, Tetracyclines   
1  Aldosterone receptor antagonists, Potassium-sp...   
2                                      Tetracyclines   
3  Miscellaneous antineoplastics, Miscellaneous u...   
4       Topical acne a

### Upsert vector embeddings to pinecone index

In [None]:
# NOTE(review): this upsert cell is deliberately commented out. The index stats
# printed earlier in the notebook already report 2931 vectors, so the index
# appears to have been populated in a previous session. Uncomment to rebuild it.
# Review notes before re-enabling:
#   - `texts` and `uuid4` are never used below.
#   - `documents` / `ids` are pandas Series; presumably the embedding and upsert
#     calls expect plain lists (e.g. `.tolist()`) - confirm against the client docs.
#
# from tqdm.auto import tqdm
# from uuid import uuid4

# batch_size = 100

# texts = []
# metadatas = []
# embed = hf
# for i in tqdm(range(0, len(data), batch_size)):
#     # get end of batch
#     i_end = min(len(data), i+batch_size)
#     batch = data.iloc[i:i_end]
#     # first get metadata fields for this record
#     metadatas = [{
#         'title': record['drug_name'],
#         'text': record['side_effects'],
#         'source': record['drug_link']
#     } for j, record in batch.iterrows()]
#     # get the list of contexts / documents
#     documents = batch['side_effects']
#     # create document embeddings
#     embeds = embed.embed_documents(documents)
#     # get IDs
#     ids = batch['id']
#     # add everything to pinecone
#     index.upsert(vectors=zip(ids, embeds, metadatas))

In [None]:
# Re-check the index statistics (dimension, fullness, vector counts) after the upsert step.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.02931,
 'namespaces': {'': {'vector_count': 2931}},
 'total_vector_count': 2931}

## Step 4: Generation using LLM

In [None]:
# Drugs to ask about; every prompt uses the same question template.
_drug_names = (
    "doxycycline",
    "spironolactone",
    "minocycline",
    "Accutane",
    "clindamycin",
    "Aldactone",
    "tretinoin",
    "isotretinoin",
    "Bactrim",
    "Retin-A",
)
questions = [f"What are the side effects of {drug}?" for drug in _drug_names]

### Generation using Mistral 7B

In [None]:
from llama_index.llms import Replicate

# Replicate model reference in "<owner>/<model>:<version hash>" form.
MISTRAL_MODEL = "mistralai/mistral-7b-instruct-v0.1:83b6a56e7c828e667f21fd596c338fd4f0039b46bcfa18d973e8e70e455fda70"

mistral = Replicate(model=MISTRAL_MODEL)

In [None]:
# Ask Mistral every question directly (no retrieved context) and keep only the answer text.
mistral_responses = [mistral.complete(query).text for query in questions]

In [None]:
# Show the answers, numbered from 1, with a blank line after each.
for number, answer in enumerate(mistral_responses, start=1):
    print(f"Response {number}: {answer}\n")

Response 1: Like all medications, doxycycline can cause side effects. The most common side effects include:

1. Nausea and vomiting
2. Diarrhea
3. Stomach upset
4. Headache
5. Dizziness
6. Sore throat
7. Runny or stuffy nose
8. Dry mouth
9. Fatigue
10. Joint pain
11. Muscle weakness

These side effects are usually mild and go away on their own. However, if they persist or worsen, it's important to speak with your

Response 2: Like any medication, spironolactone can cause side effects. Some of the most common side effects include:

1. Nausea and vomiting: These are the most common side effects of spironolactone. They usually occur within the first few weeks of starting the medication and may improve with time.

2. Headache: Some people may experience headaches while taking spironolactone. These headaches may be mild to moderate in intensity and may occur frequently.

3. Dizziness: Spironolactone can cause dizziness, especially when standing up quickly or changing

Response 3: Like any m

### Generation using gpt-3.5-turbo

In [None]:
from llama_index.llms import OpenAI

# Chat model and (low) sampling temperature used for the GPT baseline.
_gpt_settings = {"model": "gpt-3.5-turbo", "temperature": 0.1}
gpt = OpenAI(**_gpt_settings)

In [None]:
# Ask GPT every question directly (no retrieved context) and keep only the answer text.
gpt_responses = [completion.text for completion in map(gpt.complete, questions)]

In [None]:
# Show the answers, numbered from 1, with a blank line after each.
for position, answer in enumerate(gpt_responses):
    print("Response {}: {}\n".format(position + 1, answer))

Response 1: Common side effects of doxycycline include:

1. Nausea and vomiting
2. Diarrhea
3. Upset stomach or abdominal pain
4. Loss of appetite
5. Headache
6. Dizziness or lightheadedness
7. Skin rash or itching
8. Sensitivity to sunlight (increased risk of sunburn)
9. Yeast infections (in women)
10. Discoloration of teeth (in children)
11. Changes in the menstrual cycle (in women)

Less common but more serious side effects may include:

1. Severe allergic reactions (rash, itching, swelling, severe dizziness, difficulty breathing)
2. Severe headache or blurred vision
3. Severe stomach pain or cramping
4. Persistent diarrhea or bloody stools
5. Signs of liver problems (yellowing of the skin or eyes, dark urine, persistent nausea or vomiting, abdominal pain)
6. Signs of kidney problems (change in the amount of urine, blood in the urine, swelling in the ankles or feet)
7. Symptoms of a rare condition called pseudotumor cerebri (severe headache, blurred vision, ringing in the ears, dizz

## Step 5: Verify LLM output

*   Calculate the cosine similarity between the embedding vector of the LLM output and the embedding vectors stored in Pinecone.
*   Set a threshold of 0.8 for the cosine similarity score.

In [None]:
import os
from getpass import getpass

import pinecone

# The Pinecone key is read from PINECONE_API_KEY when exported, otherwise it is
# prompted for without echo. It must not be written into the notebook; a key
# that was previously committed here should be revoked.
api_key = os.environ.get("PINECONE_API_KEY") or getpass("PINECONE_API_KEY: ")
pinecone.init(api_key=api_key, environment="gcp-starter")
# Handle used by the verification step and by the RAG vector store.
pinecone_index = pinecone.Index("langchain-rag")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Encoder used to embed LLM answers for verification: CPU inference,
# vectors not normalised.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs, encode_kwargs = {'device': 'cpu'}, {'normalize_embeddings': False}

_hf_settings = {
    "model_name": model_name,
    "model_kwargs": model_kwargs,
    "encode_kwargs": encode_kwargs,
}
hf = HuggingFaceEmbeddings(**_hf_settings)

In [None]:
def verify(llm_output_embedding, threshold=0.8, top_k=3, index=None):
    """Check an embedded LLM answer against the records stored in Pinecone.

    The index is queried for the `top_k` nearest records, and the `source`
    metadata of every match whose similarity score is strictly greater than
    `threshold` is collected.

    Args:
        llm_output_embedding: Embedding vector of the LLM answer.
        threshold: Minimum score (exclusive) for a match to count as support.
            Defaults to 0.8, the value used throughout the notebook.
        top_k: Number of nearest records to request. Defaults to 3.
        index: Object exposing a Pinecone-style ``query(...)`` method. When
            omitted, the notebook-level ``pinecone_index`` is used.

    Returns:
        A ``(verified, source_links)`` tuple. ``verified`` is True when at
        least one match passed the threshold, and ``source_links`` lists the
        sources of the passing matches in the order they were returned.
    """
    target_index = pinecone_index if index is None else index
    result = target_index.query(
        vector=llm_output_embedding,
        top_k=top_k,
        # Only scores and metadata are read below, so the stored vectors
        # themselves are not requested.
        include_values=False,
        include_metadata=True,
    )
    source_links = [
        match["metadata"]["source"]
        for match in result["matches"]
        if match["score"] > threshold
    ]
    return (bool(source_links), source_links)

### Verify Mistral output

In [None]:
# Embed every Mistral answer with the same encoder used for the queries.
mistral_output_embeddings = [hf.embed_query(answer) for answer in mistral_responses]

In [None]:
# One [verified_flag, source_links] pair per Mistral answer, in answer order.
mistral_verification_list = [list(verify(vector)) for vector in mistral_output_embeddings]

In [None]:
# Number of Mistral answers for which no match passed the threshold.
count = sum(1 for outcome in mistral_verification_list if outcome[0] == False)

print(count)

1


### Verify gpt-3.5-turbo output

In [None]:
# Embed every GPT answer, preserving answer order.
gpt_output_embeddings = list(map(hf.embed_query, gpt_responses))

In [None]:
# One [verified_flag, source_links] pair per GPT answer, in answer order.
gpt_verification_list = [
    [verified, links] for verified, links in map(verify, gpt_output_embeddings)
]

In [None]:
# Number of GPT answers whose verification flag equals False.
count = [outcome[0] for outcome in gpt_verification_list].count(False)

print(count)

0


## Step 6: If LLM output is verified, generate citations.

### Mistral responses with citations

In [None]:
# Append a verification footer to each Mistral answer: either a "not verified"
# notice or the list of supporting source links, one per line.
final_mistral_responses = []
for position, answer in enumerate(mistral_responses):
    verified, links = mistral_verification_list[position]
    if verified:
        footer = "This response is verified. For further information, visit \n" + "\n".join(links)
    else:
        footer = "This response is not verified. \n"
    final_mistral_responses.append(answer + "\n" + "\n" + footer)

In [None]:
# The loop variable is deliberately not called `index`: that name holds the
# Pinecone index handle, and rebinding it to an int would break any later
# (re-)run of a cell that uses the handle.
for i, r in enumerate(final_mistral_responses):
    print(f"Response {i + 1}: {r}\n")

Response 1: Like all medications, doxycycline can cause side effects. The most common side effects include:

1. Nausea and vomiting
2. Diarrhea
3. Stomach upset
4. Headache
5. Dizziness
6. Sore throat
7. Runny or stuffy nose
8. Dry mouth
9. Fatigue
10. Joint pain
11. Muscle weakness

These side effects are usually mild and go away on their own. However, if they persist or worsen, it's important to speak with your

This response is verified. For further information, visit 
https://www.drugs.com/doxycycline.html
https://www.drugs.com/mtm/doxylamine.html

Response 2: Like any medication, spironolactone can cause side effects. Some of the most common side effects include:

1. Nausea and vomiting: These are the most common side effects of spironolactone. They usually occur within the first few weeks of starting the medication and may improve with time.

2. Headache: Some people may experience headaches while taking spironolactone. These headaches may be mild to moderate in intensity and may

### GPT responses with citations

In [None]:
def _with_citation_footer(answer, outcome):
    """Return `answer` followed by a blank line and its verification footer."""
    verified, links = outcome
    footer = (
        "This response is verified. For further information, visit \n" + "\n".join(links)
        if verified
        else "This response is not verified. \n"
    )
    return answer + "\n" + "\n" + footer


# Pair each GPT answer with its verification outcome by position.
final_gpt_responses = [
    _with_citation_footer(answer, gpt_verification_list[position])
    for position, answer in enumerate(gpt_responses)
]

In [None]:
# The loop variable is deliberately not called `index`: that name holds the
# Pinecone index handle, and rebinding it to an int would break any later
# (re-)run of a cell that uses the handle.
for i, r in enumerate(final_gpt_responses):
    print(f"Response {i + 1}: {r}\n")

Response 1: Common side effects of doxycycline include:

1. Nausea and vomiting
2. Diarrhea
3. Upset stomach or abdominal pain
4. Loss of appetite
5. Headache
6. Dizziness or lightheadedness
7. Skin rash or itching
8. Sensitivity to sunlight (increased risk of sunburn)
9. Yeast infections (in women)
10. Discoloration of teeth (in children)
11. Changes in the menstrual cycle (in women)

Less common but more serious side effects may include:

1. Severe allergic reactions (rash, itching, swelling, severe dizziness, difficulty breathing)
2. Severe headache or blurred vision
3. Severe stomach pain or cramping
4. Persistent diarrhea or bloody stools
5. Signs of liver problems (yellowing of the skin or eyes, dark urine, persistent nausea or vomiting, abdominal pain)
6. Signs of kidney problems (change in the amount of urine, blood in the urine, swelling in the ankles or feet)
7. Symptoms of a rare condition called pseudotumor cerebri (severe headache, blurred vision, ringing in the ears, dizz

## Step 7: RAG (Retrieval-augmented generation)

In [None]:
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.vector_stores import PineconeVectorStore

# Wrap the existing Pinecone index handle so LlamaIndex can query it.
# NOTE(review): `add_sparse_vector=True` presumably enables sparse vectors for
# hybrid search - confirm it is needed, since only dense retrieval is used below.
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    add_sparse_vector=True,
)

In [None]:
# Create our retriever: retrieval only, so no LLM is attached (llm=None makes
# LlamaIndex report "LLM is explicitly disabled. Using MockLLM.").
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Same sentence-transformers model name as the encoder used for verification.
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=None)
# NOTE: this rebinds `index`, which earlier cells use for the raw Pinecone handle.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, service_context=service_context)

# Fetch the top 3 most relevant chunks.
retriever = index.as_retriever(similarity_top_k=3)

LLM is explicitly disabled. Using MockLLM.


Now, let's try a sample query and pull the most relevant context.

In [None]:
query = "What are the side effects of doxycycline?"
nodes = retriever.retrieve(query)

# For each retrieved chunk: its printed form, then its source link, then blank lines.
for hit in nodes:
    print(hit, "Source:  " + str(hit.metadata["source"]), '\n', sep="\n")

Node ID: 5733be284776f41900661182
Text: (hives, difficult breathing, swelling in your face or throat) or
a severe skin reaction (fever, sore throat, burning in your eyes, skin
pain, red or purple skin rash that spreads and causes blistering and
peeling). Seek medical treatment if you have a serious drug reaction
that can affect many parts of your body. Symptoms may include: skin
rash,...
Score:  0.840

Source:  https://www.drugs.com/doxycycline.html


Node ID: 56ce75d4aab44d1400b887bd
have very bad and sometimes deadly side effects when taking a drug.
Tell your doctor or get medical help right away if you have any of the
following signs or symptoms that may be related to a very bad side
effect: Signs of an allergic reaction, like rash; hives ; itching;
red, swollen,...
Score:  0.716

Source:  https://www.drugs.com/cdi/doans-pills.html


Node ID: 56bfd8bda10cfb140055132e
have very bad and sometimes deadly side effects when taking a drug.
Tell your doctor or get medical help right away i

### Mistral RAG

In [None]:
from llama_index.llms import Replicate

# Mistral on Replicate as the generator, combined with the notebook's embedding
# model and vector store into a query engine that retrieves 3 chunks per question.
_mistral_settings = {
    "model": "mistralai/mistral-7b-instruct-v0.1:83b6a56e7c828e667f21fd596c338fd4f0039b46bcfa18d973e8e70e455fda70",
}
mistral = Replicate(**_mistral_settings)

service_context = ServiceContext.from_defaults(llm=mistral, embed_model=embed_model)
index = VectorStoreIndex.from_vector_store(
    service_context=service_context,
    vector_store=vector_store,
)
query_engine = index.as_query_engine(similarity_top_k=3)

In [None]:
# Run every question through the Mistral-backed RAG query engine and keep the
# full response objects (the source nodes are read from them later).
# The stray extra `query_engine.query(query)` that followed the loop was dropped:
# its result was never used and it cost one more remote model call.
mistral_rag_responses = []
for question in questions:
    response = query_engine.query(question)
    mistral_rag_responses.append(response)

In [None]:
# Print each RAG answer followed by the text, score and source of every
# retrieved chunk it was given.
for number, rag_answer in enumerate(mistral_rag_responses, start=1):
    print(f"Response {number}: {rag_answer}\n")

    for retrieved in rag_answer.source_nodes:
        chunk = retrieved.node
        print("Text: ", chunk.text)
        print("Score: ", retrieved.score)
        print("Source: ", chunk.metadata["source"])
        print("\n")

Response 1: The side effects of doxycycline can include nausea and vomiting, upset stomach, loss of appetite, mild diarrhea, skin rash or itching, darkened skin color, vaginal itching or discharge, severe stomach pain, diarrhea that is watery or bloody, throat irritation, trouble swallowing, chest pain, irregular heart rhythm, feeling short of breath, little or no urination, low white blood cell counts - fever, chills, swollen glands, body aches, weakness, pale skin, easy bruising or bleeding, severe headaches, ring

Text:  (hives, difficult breathing, swelling in your face or throat) or a severe skin reaction (fever, sore throat, burning in your eyes, skin pain, red or purple skin rash that spreads and causes blistering and peeling). Seek medical treatment if you have a serious drug reaction that can affect many parts of your body. Symptoms may include: skin rash, fever, swollen glands, flu-like symptoms, muscle aches, severe weakness, unusual bruising, or yellowing of your skin or ey

### GPT RAG

In [None]:
from llama_index.llms import OpenAI

In [None]:
# Use OpenAI as the LLM for LlamaIndex (same model and temperature as the
# non-RAG GPT baseline).
gpt = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Rebuild the service context and index so queries are answered by GPT.
# NOTE: `service_context` and `index` are rebound here, replacing the
# Mistral-backed objects created earlier.
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=gpt)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, service_context=service_context)

In [None]:
# Create our query engine on the GPT-backed index, retrieving 3 chunks per question.
# NOTE: this rebinds `query_engine`, which previously pointed at the Mistral engine.
query_engine = index.as_query_engine(similarity_top_k=3)

In [None]:
import time

# Client-side throttle: 3 questions per minute, i.e. a 20-second pause after
# every query.
questions_per_minute = 3
time_interval = 60 / questions_per_minute

gpt_rag_responses = []
for question in questions:
    gpt_rag_responses.append(query_engine.query(question))
    time.sleep(time_interval)

In [None]:
# Print each GPT RAG answer followed by the text, score and source of every
# retrieved chunk. This cell used to iterate over `mistral_rag_responses`
# (a copy-paste slip), so the GPT section re-printed the Mistral answers.
for i, r in enumerate(gpt_rag_responses):
    print(f"Response {i + 1}: {r}\n")
    source_nodes = r.source_nodes

    for node in source_nodes:
        print("Text: ", node.node.text)
        print("Score: ", node.score)
        print("Source: ", node.node.metadata["source"])
        print("\n")

Response 1: The side effects of doxycycline can include nausea and vomiting, upset stomach, loss of appetite, mild diarrhea, skin rash or itching, darkened skin color, vaginal itching or discharge, severe stomach pain, diarrhea that is watery or bloody, throat irritation, trouble swallowing, chest pain, irregular heart rhythm, feeling short of breath, little or no urination, low white blood cell counts - fever, chills, swollen glands, body aches, weakness, pale skin, easy bruising or bleeding, severe headaches, ring

Text:  (hives, difficult breathing, swelling in your face or throat) or a severe skin reaction (fever, sore throat, burning in your eyes, skin pain, red or purple skin rash that spreads and causes blistering and peeling). Seek medical treatment if you have a serious drug reaction that can affect many parts of your body. Symptoms may include: skin rash, fever, swollen glands, flu-like symptoms, muscle aches, severe weakness, unusual bruising, or yellowing of your skin or ey