RAG Pipeline with LangChain

In [1]:
!pip install datasets
!pip install langchain
!pip install sentence_transformers
!pip install annoy
!pip install langchainhub
!pip3 install pinecone-client==3.0.0rc2
!pip install faiss-cpu


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting annoy
  Using cached annoy-1.17.3.tar.gz (647 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25lerror
  [1;31merror[0m: [

### Load eval data

In [3]:
import json
import sys

sys.path.append("..")

read_files = ["test_Wikipedia.json", "validation_Wikipedia.json"]

from data_preprocessing.preprocessing import create_splits

data_splits = create_splits(create_eval = False)

def read_file(path):
    with open("eval_splits/" + path) as f:
        data = json.load(f)
        return data

test = read_file(read_files[0])
validation = read_file(read_files[1])

Import relevant modules for langchain

In [4]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain import hub
from langchain.schema.runnable import RunnablePassthrough

Create custom LLM class to post requests to t5 (hosted by huggingface)

In [5]:
from langchain import LlamaCpp
from langchain.chains.question_answering import load_qa_chain

model = "/Users/kon/Downloads/orca-2-7b.Q4_0.gguf"
llm = load_qa_chain(LlamaCpp(model_path=model), chain_type="stuff")

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /Users/kon/Downloads/orca-2-7b.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32003,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q4_0     [ 11008,  4096,  

## Implementation of RAG pipeline

Simple paragraph splitter

In [6]:
import string

def retrieve_wiki_headers_and_paragraphs(context, langchain=False):
  data = context.split("\n\n")
  current_header = "General"

  results = []

  for part in data:
    # rule of thumb for detecting headers
    if part[:-1] not in string.punctuation and len(part.split()) < 10:
      current_header = part
    else:
      results.append((current_header, part))

  if not langchain:
    return results
  else:
    return [item[0] + " - " + item[1] for item in results]

Currently most basic version:
- Use Splitter to divide text into paragraphs
- Create Vectorstore with HuggingFaceEmbeddings
- Retrieve most similar chunk for the respective prompt
- Send prompt to specified LLM and print response

-> Can and should be optimized performancewise!

In [25]:

def rag_answer(question, context, log=False):
    paragraphs = retrieve_wiki_headers_and_paragraphs(context, langchain=True)
    vectorstore = FAISS.from_texts(texts=paragraphs, embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"))
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 1}, return_parents=False)
    par = retriever.get_relevant_documents(question)
    
    #print(par)
    
    answer = llm.run(input_documents=par, question=f"You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use one to two words or numbers maximum and keep the answer concise. question: {question}");
    
    return {
       "context": par[0].page_content,
       "answer": answer
    }


Initial test of pipeline

In [26]:
def build_context(item):
    texts = []
    for text in item["entity_pages"]["wiki_context"]:
      texts.append(text)

    context = " ".join(texts)
    return context

In [27]:
def run_prediction(data, log=False):
    prediction = rag_answer(data["question"], build_context(data), log=log)
    return prediction

run_prediction(validation["Data"][5], log=True)

KeyError: 'question'

### Collect results for validation and test data set

In [28]:
import os
def save_file(data, write_path, filename):
    os.makedirs(write_path, exist_ok=True)
    with open(write_path + "/{}.json".format(filename), "w") as f:
        json.dump(data, f)

In [31]:
from tqdm.notebook import tqdm

def evaluate_model(model_name, only_for: list = None):
    # Validation
    context_results = {}
    answers = {}
    failed = []
    
    trail = ('_' + '+'.join(only_for)) if only_for is not None else ''
    
    for item in tqdm(data_splits["validation"], desc="Validation Progress") :
        qid = item["question_id"]
        if only_for is not None and qid not in only_for:
            continue
        try:
            prediction = run_prediction(item)
            print(f"id: {qid} question: {item['question']} prediction answer: {prediction['answer']}")
            context_results[qid] = prediction["context"]
            answers[qid] = prediction["answer"]
            print("##########################################################################")
        except KeyboardInterrupt as error:
            save_file(context_results, "results/rag/"+model_name+"/wiki", "validation_context" + trail)
            save_file(answers, "results/rag/"+model_name+"/wiki", "validation_answers" + trail)
            print("saved")
            raise error
        except Exception as error:
            print(f"Failure for question {qid} ({type(error).__name__}: {error})")
            failed.append(qid)
    print(f"FAILED: {failed}")
    
    save_file(context_results, "results/rag/"+model_name+"/wiki", "validation_context" + trail)
    save_file(answers, "results/rag/"+model_name+"/wiki", "validation_answers" + trail)



In [32]:
evaluate_model("orca-2-7b")

Validation Progress:   0%|          | 0/7900 [00:00<?, ?it/s]

Llama.generate: prefix-match hit

llama_print_timings:        load time =    6681.67 ms
llama_print_timings:      sample time =       0.40 ms /     4 runs   (    0.10 ms per token, 10050.25 tokens per second)
llama_print_timings: prompt eval time =   11716.23 ms /   167 tokens (   70.16 ms per token,    14.25 tokens per second)
llama_print_timings:        eval time =     204.14 ms /     3 runs   (   68.05 ms per token,    14.70 tokens per second)
llama_print_timings:       total time =   11955.37 ms


id: tc_3 question: Where in England was Dame Judi Dench born? prediction answer:  Heworth
##########################################################################


Llama.generate: prefix-match hit

llama_print_timings:        load time =    6681.67 ms
llama_print_timings:      sample time =       0.21 ms /     2 runs   (    0.11 ms per token,  9433.96 tokens per second)
llama_print_timings: prompt eval time =   16328.01 ms /   275 tokens (   59.37 ms per token,    16.84 tokens per second)
llama_print_timings:        eval time =      71.95 ms /     1 runs   (   71.95 ms per token,    13.90 tokens per second)
llama_print_timings:       total time =   16447.60 ms


id: tc_8 question: From which country did Angola achieve independence in 1975? prediction answer:  Portugal
##########################################################################


Llama.generate: prefix-match hit

llama_print_timings:        load time =    6681.67 ms
llama_print_timings:      sample time =       0.26 ms /     2 runs   (    0.13 ms per token,  7751.94 tokens per second)
llama_print_timings: prompt eval time =   11034.31 ms /   183 tokens (   60.30 ms per token,    16.58 tokens per second)
llama_print_timings:        eval time =      65.18 ms /     1 runs   (   65.18 ms per token,    15.34 tokens per second)
llama_print_timings:       total time =   11130.56 ms


id: tc_9 question: Which city does David Soul come from? prediction answer:  Chicago
##########################################################################


Llama.generate: prefix-match hit

llama_print_timings:        load time =    6681.67 ms
llama_print_timings:      sample time =       0.92 ms /     7 runs   (    0.13 ms per token,  7641.92 tokens per second)
llama_print_timings: prompt eval time =   12334.93 ms /   200 tokens (   61.67 ms per token,    16.21 tokens per second)
llama_print_timings:        eval time =     515.27 ms /     6 runs   (   85.88 ms per token,    11.64 tokens per second)
llama_print_timings:       total time =   12892.62 ms


id: tc_10 question: Who won Super Bowl XX? prediction answer:  The Chicago Bears won.
##########################################################################


Llama.generate: prefix-match hit

llama_print_timings:        load time =    6681.67 ms
llama_print_timings:      sample time =       0.32 ms /     3 runs   (    0.11 ms per token,  9345.79 tokens per second)
llama_print_timings: prompt eval time =   10986.75 ms /   168 tokens (   65.40 ms per token,    15.29 tokens per second)
llama_print_timings:        eval time =     136.32 ms /     2 runs   (   68.16 ms per token,    14.67 tokens per second)
llama_print_timings:       total time =   11155.80 ms


id: tc_11 question: Which was the first European country to abolish capital punishment? prediction answer:  Denmark
##########################################################################


Llama.generate: prefix-match hit

llama_print_timings:        load time =    6681.67 ms
llama_print_timings:      sample time =       0.21 ms /     2 runs   (    0.11 ms per token,  9478.67 tokens per second)
llama_print_timings: prompt eval time =    9681.34 ms /   155 tokens (   62.46 ms per token,    16.01 tokens per second)
llama_print_timings:        eval time =      70.36 ms /     1 runs   (   70.36 ms per token,    14.21 tokens per second)
llama_print_timings:       total time =    9779.80 ms


id: tc_15 question: In which country did he widespread use of ISDN begin in 1988? prediction answer:  Japan
##########################################################################


Llama.generate: prefix-match hit


saved


KeyboardInterrupt: 