### Globals and Creating Directories


In [8]:
import html
import os
import shutil

# Global configuration shared by every cell below.

# Working directories (relative to the notebook's working directory):
#   cache_dir      - download cache handed to the embedding and generative models
#   vector_db_path - where the Chroma collection is persisted
#   data_path      - where the downloaded HTML pages are written and later loaded from
cache_dir = "cache/"
vector_db_path = "vector_db/"
data_path = "data/"

# Name of the Chroma collection that holds the embedded page chunks.
collection_name = "ih_docs"

# Hugging Face model identifiers: one to embed text chunks, one to generate answers.
embedding_model = "BAAI/bge-base-en-v1.5"
generative_model = "HuggingFaceH4/zephyr-7b-beta"

# `paths` is reused by the clean-up cell at the end of the notebook to delete
# everything created here.
paths = [cache_dir, vector_db_path, data_path]
for path in paths:
    # exist_ok=True makes this cell safe to re-run.
    os.makedirs(path, exist_ok=True)

# Point the Transformers cache environment variable at the same cache directory.
# This is set here, before `transformers` is imported in a later cell.
os.environ["TRANSFORMERS_CACHE"] = cache_dir

### Downloading Intermountain Health Mental Health Webpages

In [None]:
import requests


def get_html(url, timeout=30):
    """Download ``url`` and save the page as a UTF-8 HTML file under ``data_path``.

    The file name is built from the last two path segments of the URL after
    ``?``, ``-`` and ``=`` have been replaced by underscores, for example
    ``.../behavioral-health/access-centers`` is saved as
    ``behavioral_health_access_centers.html``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an error page that would later be
    # chunked and indexed as if it were real content.
    response.raise_for_status()

    file_name = "_".join(
        url.replace("?", "_")
        .replace("-", "_")
        .replace("=", "_")
        .rstrip("/")
        .split("/")[-2:]
    )
    # os.path.join avoids the doubled separator that "data/" + "/" produced.
    file_path = os.path.join(data_path, f"{file_name}.html")
    with open(file_path, "wb") as file:
        # UTF-8 can represent every character of the decoded page; the previous
        # latin1 + errors="ignore" encoding silently dropped characters such as
        # curly quotes and dashes.
        file.write(response.text.encode("utf-8"))


# Intermountain Health pages on depression and behavioral health to download.
pages = [
    "https://intermountainhealthcare.org/health-information/health-library/depression/",
    "https://intermountainhealthcare.org/medical-specialties/behavioral-health/depression/",
    "https://intermountainhealthcare.org/medical-specialties/behavioral-health",
    "https://intermountainhealthcare.org/medical-specialties/behavioral-health/access-centers",
    "https://intermountainhealthcare.org/medical-specialties/behavioral-health/mindfulness",
]

# Fetch each page; get_html writes one .html file per URL into data_path.
for page in pages:
    get_html(page)

### Creating Chroma Vector Store


In [3]:
from langchain.document_loaders import BSHTMLLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Load every file found under data_path, parsing each with BSHTMLLoader.
html_loader = DirectoryLoader(data_path, loader_cls=BSHTMLLoader)
docs = html_loader.load()

# Split the pages into small overlapping chunks so retrieval returns focused
# passages. Sizes are measured by the splitter's length function
# (characters by default — TODO confirm for the installed LangChain version).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=128, chunk_overlap=32)
documents = text_splitter.split_documents(docs)

# Embedding model used to vectorise the chunks; weights are cached in cache_dir.
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model, cache_folder=cache_dir
)

# Embed all chunks and store them in a Chroma collection persisted on disk.
# NOTE(review): re-running this cell against an existing vector_db/ presumably
# adds the same chunks again — confirm, and clear the directory first if so.
db = Chroma.from_documents(
    collection_name=collection_name,
    documents=documents,
    embedding=embeddings,
    persist_directory=vector_db_path,
)
# Write the collection to persist_directory so the next section can reopen it.
db.persist()


KeyboardInterrupt



### Question Answering with an LLM Chain


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Re-open the persisted collection. The embedding model must be the same one
# that was used to build the collection, otherwise query vectors and stored
# vectors would not be comparable.
# HuggingFaceEmbeddings only accepts keyword arguments, so the model is passed
# as model_name=...; cache_folder reuses the weights downloaded earlier.
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model, cache_folder=cache_dir
)
# The Chroma constructor takes `embedding_function` (only the from_documents /
# from_texts factories use the name `embedding`).
db = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
    persist_directory=vector_db_path,
)

# number of chunks to retrieve
retriever = db.as_retriever(search_kwargs={"k": 4})

In [None]:
import torch
import transformers
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


def build_qa_chain():
    """Load the generative model and wrap it in a "stuff" question-answering chain.

    The chain fills the prompt template below with the retrieved paragraphs
    (``context``) and the user's ``question`` and generates up to 128 new
    tokens with sampling (temperature 0.7).

    Returns:
        The chain produced by ``load_qa_chain``; call it with
        ``{"input_documents": [...], "question": "..."}``.
    """
    torch.cuda.empty_cache()
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally; keep it only if the chosen model requires it.
    config = transformers.AutoConfig.from_pretrained(
        generative_model, trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(generative_model, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        generative_model,
        torch_dtype=torch.bfloat16,
        config=config,
        cache_dir=cache_dir,
        trust_remote_code=True,
    )

    template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

    ### Instruction:
    Use only information in the following paragraphs to answer the question. Explain the answer with reference to these paragraphs. If you don't know, say that you do not know.

    {context}

    {question}

    ### Response:
    """

    prompt = PromptTemplate(input_variables=["context", "question"], template=template)

    # Stop on the tokenizer's real end-of-sequence token. The previous
    # `tokenizer.encode("### End")[0]` took the first id of the encoded string,
    # which is the BOS token for tokenizers that prepend one, and the prompt
    # never asks the model to emit "### End" anyway.
    eos_token_id = tokenizer.eos_token_id
    # Some tokenizers define no pad token; fall back to EOS so generate() does
    # not receive None.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = eos_token_id

    # "auto" is not a valid device for a pipeline; use the first GPU when one
    # is available, otherwise the CPU (-1).
    device = 0 if torch.cuda.is_available() else -1

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
        do_sample=True,
        temperature=0.7,
        torch_dtype=torch.bfloat16,
        max_new_tokens=128,
        device=device,
    )

    hf_pipe = HuggingFacePipeline(pipeline=pipe)

    return load_qa_chain(llm=hf_pipe, chain_type="stuff", prompt=prompt)


In [None]:
# Build the chain once; answer_questions reuses this single instance.
# Might take a bit to download the models.
qa_chain = build_qa_chain()

In [None]:
def answer_questions(question):
    """Answer ``question`` from the indexed pages and print the result as HTML.

    Retrieves the most relevant chunks with the module-level ``retriever``,
    passes them with the question to ``qa_chain``, and prints an HTML fragment
    containing the question, the generated answer, and each supporting chunk
    with a link to its source file.

    Args:
        question: The user's question as plain text.

    Returns:
        None. The HTML fragment is written to standard output.
    """
    similar_docs = retriever.get_relevant_documents(question)
    result = qa_chain({"input_documents": similar_docs, "question": question})

    # The question, the model output and the scraped page text are not trusted
    # markup: escape them so "<", ">" and "&" cannot break or inject HTML.
    safe_question = html.escape(question, quote=False)
    safe_answer = html.escape(result["output_text"], quote=False)

    parts = [
        # "24px": a bare "24" is not a valid CSS length.
        f"<p><blockquote style=\"font-size:24px\">{safe_question}</blockquote></p>",
        f"<p><blockquote style=\"font-size:18px\">{safe_answer}</blockquote></p>",
        "<p><hr/></p>",
    ]
    for d in result["input_documents"]:
        # The source is also used inside an attribute, so quotes are escaped too.
        source_id = html.escape(str(d.metadata["source"]))
        chunk_text = html.escape(d.page_content, quote=False)
        parts.append(
            f"<p><blockquote>{chunk_text}<br/>(Source: <a href=\"{source_id}\">{source_id}</a>)</blockquote></p>"
        )

    # NOTE(review): this prints the raw markup; pass the string to a rich HTML
    # display helper instead if rendered output is wanted.
    print("".join(parts))

In [None]:
answer_questions("What is depression?")

In [None]:
answer_questions("I feel like I'm starting to become depressed. Where can I go for help?")

In [None]:
answer_questions("What are the main symptoms of depression?")

In [None]:
answer_questions("What are some of the signs someone is at risk for suicide?")

In [None]:
answer_questions("If my loved one is struggling with suicidal thoughts. Who can I call for help?")

### Remove All Resources

In [None]:
import os

def remove_directory(path):
    """Delete the directory at ``path`` together with everything beneath it.

    Does nothing when ``path`` does not exist, so the clean-up cell can be
    re-run safely.

    Args:
        path: Directory to remove.

    Raises:
        OSError: If ``path`` exists but cannot be removed (for example it is a
            regular file, or permissions are insufficient).
    """
    if os.path.exists(path):
        # shutil.rmtree replaces the hand-written bottom-up os.walk loop. It
        # also unlinks symlinks found inside the tree instead of failing on
        # them, and never touches what they point to.
        shutil.rmtree(path)

# Delete everything this notebook created: the model cache, the persisted
# vector store and the downloaded pages (the three entries of `paths`).
# After this, the earlier cells must be re-run before asking further questions.
for path in paths:
    remove_directory(path)