In [1]:
import pandas as pd
from datasets import load_dataset

from langchain import PromptTemplate, LLMChain
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain import PromptTemplate
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import HuggingFaceEmbeddings

import gpt4all

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download the medical_dialog dataset from Hugging Face
dataset = load_dataset("medical_dialog", "processed.en")

Found cached dataset medical_dialog (/root/.cache/huggingface/datasets/medical_dialog/processed.en/2.0.0/0e925f6f3a036cf46434ddd9e73e9a69bfc91dd467825560d27f04c4e226cba6)
100%|██████████| 3/3 [00:00<00:00, 835.13it/s]


In [3]:
df = pd.DataFrame(dataset["train"])

In [4]:
df.head()

Unnamed: 0,description,utterances
0,throat a bit sore and want to get a good imune...,[patient: throat a bit sore and want to get a ...
1,"hey there i have had cold ""symptoms"" for over ...","[patient: hey there i have had cold ""symptoms""..."
2,i have a tight and painful chest with a dry co...,[patient: i have a tight and painful chest wit...
3,what will happen after the incubation period f...,[patient: what will happen after the incubatio...
4,suggest treatment for pneumonia,[patient: just found out i was pregnant. yeste...


In [5]:
dialog = []
# make each sentence on a seperate row
patient, doctor = zip(*df["utterances"])
for i in range(len(patient)):
    dialog.append(patient[i])
    dialog.append(doctor[i])

dialog_df = pd.DataFrame({"dialog": dialog})
# save the data to txt file
dir_path = "../data/data.txt"
dialog_df.to_csv(dir_path, sep=" ", index=False)

In [6]:
dialog_df.head()

Unnamed: 0,dialog
0,patient: throat a bit sore and want to get a g...
1,doctor: during this pandemic. throat pain can ...
2,"patient: hey there i have had cold ""symptoms"" ..."
3,doctor: yes. protection. it is not enough symp...
4,patient: i have a tight and painful chest with...


In [7]:
# Embed the document and store into chroma DB
# using sentence-transformers and chromadb
loader = TextLoader(dir_path)
index = VectorstoreIndexCreator(embedding=HuggingFaceEmbeddings()).from_loaders(
    [loader]
)

In [8]:
# Callbacks support token-wise streaming
callbacks = [StreamingStdOutCallbackHandler()]

# Verbose is required to pass to the callback manager
llm = GPT4All(
    model="ggml-gpt4all-j-v1.3-groovy.bin",
    callbacks=callbacks,
    verbose=True,
    backend="gptj",
)

Found model file at  /root/.cache/gpt4all/ggml-gpt4all-j-v1.3-groovy.bin
gptj_model_load: loading model from '/root/.cache/gpt4all/ggml-gpt4all-j-v1.3-groovy.bin' - please wait ...
gptj_model_load: n_vocab = 50400
gptj_model_load: n_ctx   = 2048
gptj_model_load: n_embd  = 4096
gptj_model_load: n_head  = 16
gptj_model_load: n_layer = 28
gptj_model_load: n_rot   = 64
gptj_model_load: f16     = 2
gptj_model_load: ggml ctx size = 5401.45 MB
gptj_model_load: kv self size  =  896.00 MB
gptj_model_load:  done
gptj_model_load: model size =   153.08 MB / num tensors = 6


In [9]:
# perform similarity search and retrieve the context from our documents
results = index.vectorstore.similarity_search(
    "what is the solution for soar throat", k=4
)

# join all context information (top 4) into one string
context = "\n".join([document.page_content for document in results])
print(f"Retrieving information related to your question...")
print(f"Found this content which is most similar to your question: {context}")

Retrieving information related to your question...
Found this content which is most similar to your question: "doctor: in brief: standard precautions covid-19 is now official name for the illness caused by the newly discovered coronavirus (coronavirus infectious disease - 2019). so far it is extremely rare in the us (2/12/20). until and unless covid-19 becomes common no special precautions are necessary. in any dormitory or group living situation people with respiratory symptoms (colds, flu, etc.) should cover their coughs and wash hands frequently."
"patient: is gargling with listerine effective against corona virus induced sore throat? will it kill the virus? how about with mixture of warm water and salt, will this also kill virus!"
"doctor: gargling. you can't be sure but it may help if you do those things as well as using zinc lozenges at the first sign of any throat discomfort and stay hydrated also. i recommend them. at least it'll do no harm."
"patient: i have tonsillitis long t