## Import libraries

In [1]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset
import matplotlib.pyplot as plt

## Dataset

Link: https://www.kaggle.com/datasets/chaitanyakck/medical-text

In [2]:
!pip install datasets transformers



In [3]:
!pip install langchain



In [4]:
import kagglehub

# Fetch the most recent version of the Kaggle dataset; the call returns the
# local location of the downloaded files.
# NOTE(review): the loading cell further down opens "train.txt" by a relative
# name and does not use `path` — confirm where the file is expected to live.
path = kagglehub.dataset_download("chaitanyakck/medical-text")

print(f"Path to dataset files: {path}")

Using Colab cache for faster access to the 'medical-text' dataset.
Path to dataset files: /kaggle/input/medical-text


## Loading and chunking dataset

![](https://miro.medium.com/v2/resize:fit:1127/1*Jq9bEbitg1Pv4oASwEQwJg.png)

In [5]:
# Read the whole training file into a single string.
# The encoding is passed explicitly so the decoded text does not depend on the
# platform's default locale encoding.
# NOTE(review): "train.txt" is resolved against the current working directory,
# not against the `path` returned by the Kaggle download — confirm the file is
# actually placed there before running.
with open("train.txt", "r", encoding="utf-8") as f:
    data = f.read()

In [6]:
data[:100] # Print first 100 characters

'4\tCatheterization laboratory events and hospital outcome with direct angioplasty for acute myocardia'

In [7]:
from langchain.docstore.document import Document as LangchainDocument

# Wrap the complete file contents in one LangChain document; this single
# document is what gets handed to the text splitter later on.
RAW_KNOWLEDGE_BASE = LangchainDocument(page_content=data)

In [8]:
# Ordered list of separators handed to the recursive text splitter, going from
# coarse (markdown-style structure) down to single characters.
# NOTE(review): several entries are written like regular expressions
# (`#{1,6}`, `---+`, escaped `*`). The splitter call in this notebook does not
# pass `is_separator_regex`, so these are presumably matched as literal text —
# confirm against the langchain_text_splitters documentation.
# NOTE(review): the loaded data looks like tab-separated plain text rather than
# markdown, so the markdown-specific entries may never match — verify.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [10]:
# Chunking parameters, both measured in characters. The chunk size was picked
# arbitrarily; the overlap is the number of characters shared by neighbouring chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

splitter_options = dict(
    separators=MARKDOWN_SEPARATORS,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,  # keep each chunk's start index in its metadata
    strip_whitespace=True,  # trim whitespace at both ends of every chunk
)

# Splitter that walks through the separator list until chunks fit the size limit.
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [11]:
# Split the single raw document into chunks; the result is the list of
# documents that are embedded and indexed in the following cells.
docs_processed = text_splitter.split_documents([RAW_KNOWLEDGE_BASE])

In [12]:
!pip install langchain_community
!pip install sentence-transformers



## Tokenizing/Vectorizing the dataset

In [13]:
!pip install tf-keras



In [14]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Model identifier passed as `model_name` to HuggingFaceEmbeddings below.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

In [15]:
# Select the device at runtime: use the GPU when one is visible and fall back
# to the CPU otherwise, so this cell does not fail on CPU-only runtimes.
import torch

EMBEDDING_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Embedding model used both for indexing chunks and for embedding user queries.
# NOTE(review): `multi_process=True` presumably starts a worker pool for every
# embedding call, including single-query calls — confirm whether that is wanted.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": EMBEDDING_DEVICE},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

  embedding_model = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [16]:
# Embed the text of the first chunk as a quick check that the model works.
emb = embedding_model.embed_query(docs_processed[0].page_content)

In [17]:
import numpy as np
# Show the shape of one embedding vector, i.e. its dimensionality.
np.array(emb).shape

(384,)

In [18]:
!pip uninstall -y pinecone-client pinecone
!pip install pinecone

[0mFound existing installation: pinecone 7.3.0
Uninstalling pinecone-7.3.0:
  Successfully uninstalled pinecone-7.3.0
Collecting pinecone
  Using cached pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Using cached pinecone-7.3.0-py3-none-any.whl (587 kB)
Installing collected packages: pinecone
Successfully installed pinecone-7.3.0


## Storing dataset into a vector database

Using: https://pinecone.com

In [19]:
from tqdm.notebook import tqdm
from pinecone import Pinecone

# Credentials must never be stored in the notebook: read the Pinecone API key
# from the PINECONE_API_KEY environment variable and, if it is not set, ask for
# it interactively without echoing it.
# SECURITY: a key was previously hard-coded in this cell; treat that key as
# compromised and rotate it in the Pinecone console.
import os
from getpass import getpass

PINECONE_INDEX_NAME = "lab-rag-index"

pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

# Client handle and the index used for both upserts and queries.
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index(PINECONE_INDEX_NAME)

In [20]:
# DISABLED CELL: the code below is wrapped in a string literal, so none of it
# runs; the cell merely evaluates (and displays) that string.
# If enabled, it would embed the first 10 chunks one at a time and collect one
# record per chunk ("id", "values", "metadata" with the chunk text) in
# `upsert_data`.
'''
upsert_data = []

for i, entry in tqdm(enumerate(docs_processed[:10])):
    text = entry.page_content
    vector = embedding_model.embed_query(text)
    upsert_data.append(
        {
            "id": "vec{}".format(i),
            "values": vector,
            "metadata": {"text": text}
        }
    )
'''

'\nupsert_data = []\n\nfor i, entry in tqdm(enumerate(docs_processed[:10])):\n    text = entry.page_content\n    vector = embedding_model.embed_query(text)\n    upsert_data.append(\n        {\n            "id": "vec{}".format(i),\n            "values": vector,\n            "metadata": {"text": text}\n        }\n    )\n'

In [21]:
# DISABLED CELL: the code below is wrapped in a string literal, so nothing is
# uploaded; the cell merely evaluates (and displays) that string.
# If enabled, it would send `upsert_data` to namespace "ns1" of the index.
# NOTE(review): the question-answering cell queries namespace "ns1", so this
# upload presumably has to have been executed at least once — confirm.
'''
index.upsert(
    vectors=upsert_data,
    namespace= "ns1"
)
'''

'\nindex.upsert(\n    vectors=upsert_data,\n    namespace= "ns1"\n)\n'

## Loading an LLM

In [22]:
!pip install -U bitsandbytes
!pip install accelerate



In [25]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Model identifier used to load both the causal language model and its tokenizer.
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [26]:
# Quantization settings: 4-bit loading with the "nf4" quantization type,
# double quantization enabled, and bfloat16 as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Load the language model with the quantization config, then its tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [27]:
# Generation settings applied to every call of the pipeline.
generation_settings = {
    "do_sample": True,  # sample instead of greedy decoding
    "temperature": 0.4,
    "repetition_penalty": 1.1,
    "return_full_text": False,  # return only the newly generated text
    "max_new_tokens": 500,  # upper bound on generated tokens per call
}

# Text-generation pipeline wrapping the quantized model and its tokenizer.
llm_model = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

Device set to use cuda:0


In [28]:
# Smoke test: call the pipeline with a bare string (no prompt template), just
# to check that generation works end to end.
llm_model("Hey there!")

[{'generated_text': ' It’s been a while since I last wrote. A lot has happened in my life, and I want to share some of it with you.\n\nFirst off, I got a new job! After months of searching, I finally landed a position as a marketing coordinator at a local company. It’s been a huge adjustment, but I’m loving the challenge and the opportunity to learn and grow in my career.\n\nSecondly, I’ve been working on myself a lot lately. I’ve been doing a lot of self-reflection and therapy, which has helped me gain a better understanding of who I am and what I want out of life. I’ve also been prioritizing my mental health by practicing mindfulness and self-care. It hasn’t always been easy, but I’m proud of the progress I’ve made so far.\n\nLastly, I’ve been trying to be more present in my relationships. Whether it’s spending quality time with my loved ones or simply listening when they need to talk, I’ve been making an effort to show up for them in a more meaningful way. It’s not always easy, but 

## Prompting the model

In [29]:
# Prompt template in the Zephyr chat format. It has two positional `{}`
# placeholders, filled with `prompt.format(context_text, question)`:
#   1st -> retrieved context, 2nd -> the user's question.
# The system and user turns are each terminated with the end-of-sequence
# marker `</s>`, as the Zephyr chat format expects; without it the turns are
# not delimited the way the model was trained to see them.
prompt = """
<|system|>
You are a helpful assistant that answers on medical questions based on the real information provided from different sources and in the context.
Give the rational and well written response. If you don't have proper info in the context, answer "I don't know"
Respond only to the question asked.</s>
<|user|>
Context:
{}
---
Here is the question you need to answer.

Question: {}</s>
<|assistant|>
"""

In [30]:
# Retrieval-augmented answer for one question typed by the user:
#   1. embed the question,
#   2. fetch the closest stored chunk from the Pinecone index,
#   3. fill the prompt template with that chunk and the question,
#   4. generate and print the answer.
user_input = input("User: ")

vectorized_input = embedding_model.embed_query(user_input)

context = index.query(
    namespace="ns1",
    vector=vectorized_input,
    top_k=1,
    include_metadata=True
)

# The query can come back without any match (for example when the namespace is
# empty). Use an empty context in that case instead of failing with an
# IndexError; the prompt instructs the model to answer "I don't know" when the
# context holds no proper information.
matches = context['matches']
retrieved_text = matches[0]['metadata']['text'] if matches else ""

answer = llm_model(prompt.format(retrieved_text, user_input))

print("AI response: ", answer[0]['generated_text'])

User: What is cardiogenic shock?
AI response:  Cardiogenic shock is a life-threatening condition where the heart is unable to pump enough blood to meet the body's needs. This can be caused by damage to the heart muscle, such as during a heart attack, which impairs its ability to contract effectively. In this study, we see that cardiogenic shock is more commonly associated with heart attacks affecting the left anterior descending coronary artery, but it can occur with damage to other major coronary arteries as well. The symptoms of cardiogenic shock include low blood pressure, cold and clammy skin, confusion, and organ failure due to lack of oxygen-rich blood. Treatment may involve medications to increase heart function, mechanical support devices like intra-aortic balloon pumps, and emergency surgeries or procedures in severe cases.


In [31]:
# Display the text of the top retrieved match, i.e. the passage that was
# inserted into the prompt as context for the answer above.
context['matches'][0]['metadata']['text']

'artery, 90%). Cardiogenic shock was present in eight patients with infarction of the left anterior descending coronary artery, four with infarction of the right coronary artery, and four with infarction of the circumflex coronary artery. Major catheterization laboratory events (cardioversion, cardiopulmonary resuscitation, dopamine or intra-aortic balloon pump support for hypotension, and urgent surgery) occurred in 10 patients with infarction of the left anterior descending coronary artery, eight with infarction of the right coronary artery, and four with infarction of the circumflex coronary artery (16 of 16 shock and six of 234 nonshock patients, p less than 0.001). There was one in-laboratory death (shock patient with infarction of the left anterior descending coronary artery).'