In [None]:
# %pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
# %pip install langchain
# %pip install huggingface_hub
# %pip install sentence_transformers
# %pip install faiss-cpu
# %pip install unstructured
# %pip install chromadb
# %pip install Cython
# %pip install tiktoken
# %pip install unstructured[local-inference]
# %pip install -q datasets loralib sentencepiece
# %pip -q install bitsandbytes accelerate xformers einops
# %pip install transformers

In [45]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain

from LangChain_chatbot_util import *

import os
from clinic_match import ClinicMatch

In [37]:
key = read_api_key()
os.environ["HUGGINGFACEHUB_API_TOKEN"] = key
cm = ClinicMatch(key)

## Import Model

Models:<p>
meta-llama/Llama-2-7b-chat-hf<p>
lmsys/vicuna-33b-v1.3<p>

In [46]:
tokenizer = AutoTokenizer.from_pretrained("t5-small",
# # tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                          use_auth_token=True,)



tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [40]:
# model = AutoModelForCausalLM.from_pretrained("t5-small",
# # model = AutoModelForTextClassification.from_pretrained("lmsys/vicuna-7b-v1.3",
# # model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
#                                              device_map='auto',
#                                              torch_dtype=torch.float16,
#                                              token=True,
#                                             # load_in_8bit=True,
#                                             # load_in_4bit=True
#                                              )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [51]:
# pipe = pipeline("text-generation",
#                 model=model,
#                 tokenizer= tokenizer,
#                 torch_dtype=torch.bfloat16,
#                 device_map="auto",
#                 max_new_tokens = 250,
#                 do_sample=True,
#                 top_k=30,
#                 num_return_sequences=1,
#                 eos_token_id=tokenizer.eos_token_id
#                 )

pip = pipeline("text-generation", model="lmsys/vicuna-7b-v1.3", tokenizer=tokenizer, device_map="auto",
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [52]:
import textwrap

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
"""

def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

def cut_off_text(text, prompt):
    cutoff_phrase = prompt
    index = text.find(cutoff_phrase)
    if index != -1:
        return text[:index]
    else:
        return text

def remove_substring(string, substring):
    return string.replace(substring, "")


def generate(text, prompt):
    prompt = prompt
    with torch.autocast('cuda', dtype=torch.float16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(**inputs,
                                 max_new_tokens=50,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=tokenizer.eos_token_id,
                                 )
        final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        final_outputs = cut_off_text(final_outputs, '</s>')
        final_outputs = remove_substring(final_outputs, prompt)

    return final_outputs #, outputs

def parse_text(text):
        wrapped_text = textwrap.fill(text, width=100)
        print(wrapped_text +'\n\n')
        # return cut_off_text(wrapped_text, "\n")
        return wrapped_text


## Set up LangChain

In [56]:
# llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})

#### Single prompt only
system_prompt = """\
    You are an internal system key search terms assistant who is searching for keywords to search a table for.
    You categorize information in questions into categories following system format. If nothing is found for a category, please leave the section blank. Do not fill blank categories. Additional Categories do not need to be added if blank. 
    PLEASE do not add additional keywords that are not explicitly stated in the entry.

    System Format:

    Required Categories: -age: (Toddler, Child, Adolescent, Adult), -service: (Therapy, Assessment, Diagnosis, Consultation, Advocacy, Educational)
    Additional Categories: -s: (service specific), -insurance: ( ), -language: ( ), -a: (additional keywords)
    
    End System Format

    If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
    """

instruction = "Please catagorize the keywords in the request. Please start and end the list with START and END tags: \n\n {text}"
template = get_prompt(instruction, system_prompt)
# print(template)

prompt = PromptTemplate(template=template, input_variables=["text"])

llm_chain = LLMChain(prompt=prompt, llm=HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0}))

output = llm_chain.run(text)
# output = generate(text, template)

search = parse_text(output)

=           =           =     **Mental Health Professional**     **Psychologist**     **Speech-
Language Pathologist**     **Occupational Therapist**     **Physical Therapist**
[P] Recommended services:                                                -age: Adolescent
-service: Assessment
Child Development Center                                                  Community Mental Health
Center                                              University Autism Center
Mental Health Crisis Line                                                   ADPKD Support Group
a 10-year-old with a high screening for autism are Assess




In [None]:
output = cm.query(search)
for i in range(4):
    print("\n")
    print(output[i].page_content)

Sat Nov 18 03:44:03 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                        On | 00000000:60:00.0 Off |                    0 |
| N/A   52C    P0               31W /  70W|  14028MiB / 15360MiB |      4%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla T4                        On | 00000000:61:00.0 Off |  