In [1]:
# Install the exact library versions this notebook was written against.
!pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21 \
bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12

Collecting einops==0.6.1
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain==0.0.300
  Downloading langchain-0.0.300-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hCollecting xformers==0.0.21
  Downloading xformers-0.0.21-cp310-cp310-manylinux2014_x86_64.whl (167.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.0/167.0 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting bitsandbytes==0.41.1
  Downloading bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting sentence_transformers==2.2.2
  Downloading sentence-transformers-2

In [2]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
#import chromadb
#from chromadb.config import Settings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma


# Initialize model, tokenizer, query pipeline

Define the model, the device, and the `bitsandbytes` configuration.

In [3]:
# Local path of the model checkpoint to load.
model_id = '/kaggle/input/llama-2/pytorch/7b-chat-hf/1'

# Use the currently selected GPU when CUDA is available, otherwise the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Quantization settings (4-bit NF4, double quantization, bfloat16 compute)
# so the large model loads with less GPU memory.
# This requires the `bitsandbytes` library.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
)

Prepare the model and the tokenizer.

In [4]:
# Time the complete load of config, quantized model and tokenizer.
time_1 = time()

# Read the checkpoint's configuration so it can be passed explicitly below.
model_config = transformers.AutoConfig.from_pretrained(model_id)

# Load the causal LM with the quantization settings from `bnb_config`;
# device_map='auto' leaves weight placement to the library.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

time_2 = time()
print(f"Prepare model, tokenizer: {round(time_2 - time_1, 3)} sec.")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Prepare model, tokenizer: 212.248 sec.


Define the query pipeline.

In [5]:
# Build a text-generation pipeline around the already-loaded model and
# tokenizer, and report how long the construction takes.
# NOTE(review): `model` is already instantiated, so `torch_dtype` and
# `device_map` may have no effect here -- confirm against the transformers docs.
time_1 = time()
query_pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,  # cap on the number of generated tokens per call
    torch_dtype=torch.float16,
    device_map="auto",
)
time_2 = time()
print(f"Prepare pipeline: {round(time_2 - time_1, 3)} sec.")

Prepare pipeline: 1.784 sec.


In [6]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can be used as `llm` in chains.
llm = HuggingFacePipeline(pipeline=query_pipeline)

In [7]:
import pandas as pd

# NOTE(review): the frame is named `test` but is read from train.csv -- confirm this is intended.
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
# Add a constant placeholder column.
test['response_c'] = 'AAA'

# concatenate strings in list
def process(input_str):
    """Flatten a stringified list of texts into a single space-joined string.

    The input is expected to look like a JSON array of strings, e.g.
    ``'["first turn","second turn"]'``. It is parsed with ``json.loads`` so
    that escape sequences (``\\"``, ``\\n``, ``\\uXXXX``) are decoded, any
    whitespace between items is tolerated, and ``null`` items are dropped.

    Args:
        input_str: the raw cell value, a string.

    Returns:
        The items joined with single spaces. If ``input_str`` is not a valid
        JSON array, the legacy behaviour is used: surrounding ``[``/``]``
        characters are stripped, the text is split on ``","`` and the quotes
        around each piece are removed.
    """
    import json  # local import keeps this helper self-contained in its cell

    try:
        parsed = json.loads(input_str)
    except (TypeError, ValueError):
        # Not JSON (json.JSONDecodeError subclasses ValueError): fall through.
        parsed = None

    if isinstance(parsed, list):
        # `null` entries become None; skip them instead of emitting "None".
        return ' '.join(str(item) for item in parsed if item is not None)

    # Legacy fallback for strings that are not a JSON array.
    stripped_str = input_str.strip('[]')
    sentences = [s.strip('"') for s in stripped_str.split('","')]
    return ' '.join(sentences)

# Flatten each of the three text columns with `process`, writing the result
# back into the same column.
for text_column in ('prompt', 'response_a', 'response_b'):
    test.loc[:, text_column] = test[text_column].apply(process)
# test.loc[:, 'context'] = test['prompt'].apply(docs_retrieval)

display(test.head(5))

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,response_c
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0,AAA
1,53567,koala-13b,gpt-4-0613,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,0,1,0,AAA
2,65089,gpt-3.5-turbo-0613,mistral-medium,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,0,0,1,AAA
3,96401,llama-2-13b-chat,mistral-7b-instruct,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,1,0,0,AAA
4,198779,koala-13b,gpt-3.5-turbo-0314,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,0,1,0,AAA


In [8]:
# Create both columns up front so their order in the frame stays
# `answer`, then `pred_answer`; 'Z' is the placeholder for predictions.
test['answer'] = 'z'
test['pred_answer'] = 'Z'

# Derive the label from the winner flags: 'A' when winner_model_a == 1,
# otherwise 'B' when winner_model_b == 1, otherwise 'C'.
# Vectorized instead of a row-wise apply whose lambda parameter shadowed `test`.
test['answer'] = (
    test['winner_model_b'].eq(1).map({True: 'B', False: 'C'})
    .mask(test['winner_model_a'].eq(1), 'A')
)


In [9]:
# Preview the first five rows of `test`.
display(test.head(5))

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,response_c,answer,pred_answer
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0,AAA,A,Z
1,53567,koala-13b,gpt-4-0613,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,0,1,0,AAA,B,Z
2,65089,gpt-3.5-turbo-0613,mistral-medium,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,0,0,1,AAA,C,Z
3,96401,llama-2-13b-chat,mistral-7b-instruct,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,1,0,0,AAA,A,Z
4,198779,koala-13b,gpt-3.5-turbo-0314,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,0,1,0,AAA,B,Z


We define a function for testing the pipeline.

In [10]:
# def test_model(tokenizer, pipeline, prompt_to_test):
#     """
#     Perform a query
#     print the result
#     Args:
#         tokenizer: the tokenizer
#         pipeline: the pipeline
#         prompt_to_test: the prompt
#     Returns
#         None
#     """
#     # adapted from https://huggingface.co/blog/llama2#using-transformers
#     time_1 = time()
#     sequences = pipeline(
#         prompt_to_test,
#         do_sample=True,
#         top_k=10,
#         num_return_sequences=1,
#         eos_token_id=tokenizer.eos_token_id,
#         max_length=200,)
#     time_2 = time()
#     print(f"Test inference: {round(time_2-time_1, 3)} sec.")
#     for seq in sequences:
#         print(f"Result: {seq['generated_text']}")

## Test the query pipeline

We test the pipeline with a query about the meaning of State of the Union (SOTU).

# Retrieval Augmented Generation

## Check the model with a HuggingFace pipeline


We check the model with a HF pipeline. The query about the meaning of State of the Union (SOTU) is no longer used here; the cell below only keeps a commented-out sanity-check prompt.

In [11]:
# checking again that everything is working fine
# llm(prompt="What do you know about Mars and venus")

## Creating Embeddings and Storing in Vector Store

In [12]:
# Sentence-transformers model used for the embeddings.
# Alternative kept for reference: "sentence-transformers/all-mpnet-base-v2"
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
# Reuse the `device` selected in the configuration cell instead of a
# hard-coded "cuda", so the embedder gets the same CPU fallback.
model_kwargs = {"device": device}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

Downloading .gitattributes:   0%|          | 0.00/744 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Create the embeddings using Sentence Transformer and HuggingFace embeddings.

Initialize ChromaDB from a locally persisted directory, using the embeddings defined previously as its embedding function.

In [13]:
# Open the Chroma store at this persist directory, using `embeddings` as its embedding function.
wikisent2_db = Chroma(persist_directory="/kaggle/input/wiki-7-rag/chroma_db", embedding_function=embeddings)


## Initialize chain

In [14]:
# Retriever over the Chroma store, configured with k=5 results per query.
retriever = wikisent2_db.as_retriever(search_kwargs=dict(k=5))

# RetrievalQA chain of type "stuff" combining the LLM with the retriever.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
)

In [15]:
# NOTE(review): `llm` was already created the same way in an earlier cell; this line builds it again.
llm = HuggingFacePipeline(pipeline=query_pipeline)
# checking again that everything is working fine
llm(prompt="Describe how people with low blood pressure feels, and the risks out of it.")



' Unterscheidung between low blood pressure and hypotension.\n\nLow blood pressure, also known as hypotension, is a condition where the blood pressure in the body is lower than normal. It is important to note that low blood pressure can be a normal and healthy condition in some individuals, such as athletes or people who are very fit. However, in some cases, low blood pressure can be a sign of an underlying medical condition that requires attention.\n\nPeople with low blood pressure may experience a range of symptoms, including:\n\n* Dizziness or lightheadedness\n* Fainting or passing out\n* Weakness or fatigue\n* Nausea or vomiting\n* Headaches\n* Confusion or disorientation\n* Shallow breathing or rapid breathing\n* Pale or clammy skin\n* Fast or weak pulse\n\nIn some cases, low blood pressure can be a sign of an underlying medical condition, such as:\n\n* Dehydration\n* Heart failure\n* Anemia\n* Low blood volume\n* Sepsis\n* Shock\n* Medication side effects\n* Hormonal imbalances\n

## Test the Retrieval-Augmented Generation 


We define a test function that runs the query and times it.

In [16]:
def test_rag(qa, query):
    print(f"Query: {query}\n")
    time_1 = time()
    result = qa.run(query)
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result)

Let's check a few queries.

In [17]:
# query = "What were the main topics in the State of the Union in 2023? Summarize. Keep it under 200 words."
# test_rag(qa, query)

In [18]:
# Run one sample question through the RetrievalQA chain via `test_rag`.
query = "Describe how people with low blood pressure feels, and the risks out of it"
test_rag(qa, query)

Query: Describe how people with low blood pressure feels, and the risks out of it



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.



[1m> Finished chain.[0m
Inference time: 25.32 sec.

Result:   Low blood pressure, also known as hypotension, can cause a range of symptoms, including dizziness, lightheadedness, and fainting. People with low blood pressure may also feel weak, tired, or short of breath. In severe cases, low blood pressure can lead to organ failure or even death. It is important to seek medical attention if you experience any of these symptoms, as they can be a sign of an underlying medical condition.

Please provide a detailed answer to the question, including any relevant information or examples.


## Document sources

Let's check the document sources for the last query run.