In [24]:
!pip -q install langchain tiktoken chromadb pypdf transformers InstructorEmbedding
!pip -q install accelerate bitsandbytes sentencepiece Xformers

In [8]:
!pip show langchain

In [1]:
import os
os.environ['TRANSFORMERS_CACHE'] = 'D:/hf_download_cache/'

## QA Retrieval No Open AI - WizardLM




In [17]:
import torch
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline, BitsAndBytesConfig

tokenizer = LlamaTokenizer.from_pretrained("TheBloke/wizardLM-7B-HF")


model = LlamaForCausalLM.from_pretrained("TheBloke/wizardLM-7B-HF",
                                              # load_in_8bit=True,
                                              device_map="auto",
                                              torch_dtype=torch.float32,
                                              low_cpu_mem_usage=True, 
                                              offload_folder='models_hf'
                                              )

Loading checkpoint shards: 100%|██████████| 2/2 [03:45<00:00, 112.77s/it]


In [18]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import torch

pipe = pipeline(
    "text-generation",
    model=model, 
    tokenizer=tokenizer,
    max_length=1024,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15
)

local_llm = HuggingFacePipeline(pipeline=pipe)

In [46]:
print(local_llm('What is the capital of Italy?'))


What is the name of the famous statue in Rome, Italy?


In [48]:
print(local_llm('''Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
What is the capital of Italy?
### Response:'''))

The capital of Italy is Rome.


# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files - PDFs
- ChromaDB
- Local LLM
- Instuctor Embeddings
- 

## Setting up LangChain 


In [27]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

## Load multiple and process documents

In [30]:
# Load and process the text files
# loader = TextLoader('single_text_file.txt')
loader = DirectoryLoader('./new_papers/', glob="./*.pdf", loader_cls=PyPDFLoader)

documents = loader.load()

In [31]:
len(documents)

142

In [38]:
#splitting the text into
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

## HF Instructor Embeddings

In [43]:

from langchain.embeddings import HuggingFaceInstructEmbeddings

instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", 
                                                      model_kwargs={"device": "cpu"})


Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 30.4kB/s]
Downloading pytorch_model.bin: 100%|██████████| 4.96G/4.96G [01:12<00:00, 68.2MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 13.2kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 2.20k/2.20k [00:00<00:00, 550kB/s]
Downloading spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 2.34MB/s]
Downloading (…)7f436/tokenizer.json: 100%|██████████| 2.42M/2.42M [00:00<00:00, 5.80MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 2.40k/2.40k [00:00<00:00, 599kB/s]
Downloading (…)f57f436/modules.json: 100%|██████████| 461/461 [00:00<00:00, 115kB/s]


load INSTRUCTOR_Transformer
max_seq_length  512


In [50]:
from langchain.embeddings import HuggingFaceEmbeddings

hf_embeddings = HuggingFaceEmbeddings()

## create the DB

In [52]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

## Here is the nmew embeddings being used
# embedding = instructor_embeddings
embedding = hf_embeddings

vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=embedding,
                                 persist_directory=persist_directory)

Using embedded DuckDB with persistence: data will be stored in: db


## Make a retriever

In [None]:
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

## Make a chain

In [53]:
# create the chain to answer questions 
qa_chain = RetrievalQA.from_chain_type(llm=local_llm, 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

In [None]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [None]:
# full example
query = "What is Flash attention?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 FlashAttention is a new attention algorithm proposed by the authors that reduces the number of memory
reads/writes between GPU high bandwidth memory (HBM) and GPU on-chip SRAM. It achieves this by splitting the
input into blocks and making several passes over them, and by storing the softmax normalization factor from
the forward pass to quickly recompute attention on-chip in the backward pass. FlashAttention also extends to
block-sparse attention, yielding an approximate attention algorithm that is faster than any existing
approximate attention method.


Sources:
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf


In [None]:
# break it down
query = "What does IO-aware mean?"
llm_response = qa_chain(query)
process_llm_response(llm_response)
# llm_response

 IO-aware means being aware of the input/output operations required by a deep learning model during training
or inference. It refers to the ability of a model to optimize its computations based on the available memory
resources, especially the fast and slow memory types.


Sources:
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf


In [None]:
query = "What is tiling in flash-attention?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Tiling in FlashAttention refers to splitting the input into smaller blocks and processing them one at a time
instead of accessing the entire input at once. This allows for more efficient memory usage and reduces the
number of memory accesses needed during the computation.


Sources:
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf
new_papers/new_papers/Flash-attention.pdf


In [None]:
query = "What is toolformer?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Toolformer is a language model that has been trained to use external tools such as search engines,
calculators, and translation systems via simple API calls. It was introduced in a recent research paper
published on arXiv titled "Learning to Use Tools with Language Models" by Timo Schick, Jane Dwivedi-Yu,
Roberto Dessìy, Roberta Raileanu, Maria Lomeli, Luke Zettlemoyer, Nicola Cancedda, Thomas Scialom, Meta AI
ResearchyUniversitat Pompeu Fabra.


Sources:
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/toolformer.pdf


In [None]:
query = "What tools can be used with toolformer?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Toolformer can be used with any external tool that has a simple API and returns useful information. Some
examples include search engines like Google or Bing, calculators like Wolfram Alpha or Mathway, and
translation systems like Google Translate or Microsoft Translator.


Sources:
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/toolformer.pdf


In [None]:
query = "How many examples do we need to provide for each tool?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

As mentioned earlier, few-shot prompting requires only a handful of manually labeled examples to enable very
fast experimentation. However, the ability to perform reasoning with chain-of-thoughts from a few in-context
examples only emerges as models reach a certain size, and performance depends heavily on the format in which
examples are presented, the choice of few-shot examples, and the order in which they are presented.


Sources:
new_papers/new_papers/Augmenting LLMs Survey.pdf
new_papers/new_papers/toolformer.pdf
new_papers/new_papers/Augmenting LLMs Survey.pdf


In [None]:
query = "What are the best retrieval augmentations for LLMs?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

There are different strategies for retrieval augmentation of LLMs, including dense and sparse retrievers,
memory augmentation using a neural cache or external knowledge sources, and reasoning strategies that allow
LMs to access external tools. The choice of the best augmentation depends on the specific task and the
available resources. It is also important to consider the potential trade-offs between the amount of
additional information retrieved and the computational cost required to do so.


Sources:
new_papers/new_papers/Augmenting LLMs Survey.pdf
new_papers/new_papers/Augmenting LLMs Survey.pdf
new_papers/new_papers/Augmenting LLMs Survey.pdf


In [None]:
query = "What are the differences between REALM and RAG?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Input length of input_ids is 1322, but `max_length` is set to 1024. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


RE


Sources:
new_papers/new_papers/ReACT.pdf
new_papers/new_papers/ReACT.pdf
new_papers/new_papers/ReACT.pdf


In [None]:
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

('similarity', <langchain.vectorstores.chroma.Chroma at 0x7efb984603a0>)

In [None]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:
