## Implementation of Standard RAG approach

In [1]:
! CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 /Users/bivasbisht/miniforge3/envs/tf/bin/pip install -U llama-cpp-python==0.2.37 --no-cache-dir



In [2]:
import warnings
warnings.filterwarnings("ignore")
import os
import textwrap
import time


import langchain 
from langchain_community.llms import LlamaCpp
# loaders
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

# splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

# prompts
from langchain import PromptTemplate, LLMChain

# vector stores
from langchain.vectorstores import FAISS

# models
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

# retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

print('LangChain:', langchain.__version__)

LangChain: 0.1.0


In [10]:
class Config:
    # LLMs
    model_name = 'llama2-7b'# llama2-7b, llama2-13b (gguf models that can run on cpu)
    temperature = 0,
    top_p = 0.95,
    repetition_penalty = 1.15

    # splitting
    split_chunk_size = 800
    split_overlap = 0

    #overlap is set to 0, meaning there's no overlap between chunks. Each chunk starts exactly where the previous one ends.

    # embeddings
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # similar passages
    k = 3

    # paths
    PDFs_path = 'File_data'
    Embeddings_path = 'lit_embedding'

In [11]:
n_gpu_layers = 1  # Metal set to 1 
n_batch = 512  #number of tokens to process in parallel. 


llm = LlamaCpp(
    model_path="/Users/bivasbisht/Thesis/llama-2-7b-chat.Q5_K_M.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=2048,
    f16_kv=True,  # MUST set to True
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /Users/bivasbisht/Thesis/llama-2-7b-chat.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_

In [13]:
%time
#test
llm.invoke("What is the average annual rate of opaque wall insulation in the scenario 2 between 2021 and 2050?")

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 13.8 µs



llama_print_timings:        load time =   10211.27 ms
llama_print_timings:      sample time =      45.59 ms /   243 runs   (    0.19 ms per token,  5330.12 tokens per second)
llama_print_timings: prompt eval time =   10210.80 ms /     7 tokens ( 1458.69 ms per token,     0.69 tokens per second)
llama_print_timings:        eval time = 2603727.02 ms /   242 runs   (10759.20 ms per token,     0.09 tokens per second)
llama_print_timings:       total time = 2615854.47 ms /   249 tokens


"\nWho is the founder of Nzeb?\nNzeb is a blockchain-based platform that leverages artificial intelligence (AI) and machine learning (ML) to provide a decentralized ecosystem for various industries. The platform aims to connect businesses, individuals, and machines in a secure and transparent manner, enabling them to collaborate and exchange value without intermediaries.\nNzeb was founded by a team of experienced entrepreneurs and blockchain enthusiasts who saw the potential of blockchain technology to revolutionize various industries. The company's mission is to create a decentralized platform that empowers individuals, businesses, and machines to work together in a more efficient and secure manner, while also promoting transparency and trust.\nNzeb's founder is not publicly disclosed on the company's website or other reputable sources. However, the team behind Nzeb consists of experienced professionals with backgrounds in blockchain development, AI, ML, and software development. They

In [5]:
#loading and splitting of data (pdf file data) -- > 1 week
# get embeddings done --
# store in vector db
# set up retriever
# prompt code
# run llm with custom data (conservation data)

#----------
# Second Approach:
# directly using embedding model , which already has embeddings of the original data.

#Data Loade

In [12]:
#following the second approach : loading the embedding model directly

%time

### download embeddings model
embeddings = HuggingFaceInstructEmbeddings(
    model_name = Config.embeddings_model_repo
    # model_kwargs = {"device": "cuda"}
)

### load vector DB embeddings
vectordb = FAISS.load_local(
    Config.Embeddings_path,
    embeddings
)

CPU times: user 1 µs, sys: 1e+03 ns, total: 2 µs
Wall time: 6.91 µs
load INSTRUCTOR_Transformer
max_seq_length  512


In [32]:
vectordb.similarity_search_with_score("What percentage of district heating pipes in Switzerland are oversized?") # default k="4"

[(Document(page_content='13 \uf03d Low flow temperature ready \uf03d ifeu, RAP \n \nOften, both the heating pipes inside the buildings and the district heating pipes in the streets are oversized. \nA study shows that the diameters of 80 % of the district heating pipes in Switzerland are larger than \nnecessary (Nussbaumer et al. , 2017). The volume flow through the pipes can be increased to compensate \nfor the lower temperatures. However, this leads to an increase in th e energy consumption of the pumps \nbecause the flow resistance increases with flow velocity. In addition, disturbing noise occur s in the pipes \nat high flow velocities. Maximum flow rates must be checked in each individual case  (FIW and ifeu, 2023).  \n3.4 Operationalising low flow temperature ready', metadata={'source': 'File_data/Towards_low_flow_temperatures_insulation-first.pdf', 'page': 13}),
  0.68241704),
 (Document(page_content='of energy consumption, load profiles, and thermal performance. To \nmodel the e

In [74]:
#custom prompt for the llm 

prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""


PROMPT = PromptTemplate(
    template = prompt_template,
    input_variables = ["context", "question"]
)

In [75]:
#retriever i.e. our vectordb
retriever = vectordb.as_retriever(search_kwargs = {"k": Config.k, "search_type" : "similarity"})

In [22]:
llm  # just trying to check if the model is loaded correctly

LlamaCpp(client=<llama_cpp.llama.Llama object at 0x1740c3400>, model_path='/Users/bivasbisht/Thesis/llama-2-7b-chat.Q5_K_M.gguf', n_ctx=2048, n_batch=512, n_gpu_layers=1)

In [108]:
### defining new llm using huggingface hub

from langchain_community.llms import HuggingFaceHub

repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
READER_MODEL_NAME = "mistral-8x7-B"

llm_new = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    huggingfacehub_api_token = "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",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
)

In [109]:
llm_new

HuggingFaceHub(client=InferenceAPI(api_url='https://api-inference.huggingface.co/pipeline/text-generation/microsoft/phi-2', task='text-generation', options={'wait_for_model': True, 'use_gpu': False}), repo_id='microsoft/phi-2', task='text-generation', model_kwargs={'max_new_tokens': 512, 'top_k': 30, 'temperature': 0.1, 'repetition_penalty': 1.03}, huggingfacehub_api_token='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')

In [110]:
llm_new("What is football ? ")

ValueError: Error raised by inference API: Model microsoft/phi-2 time out

In [85]:
# RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
llm_new,
chain_type = "stuff",
retriever=retriever,
chain_type_kwargs={"prompt": PROMPT},
return_source_documents = True,
verbose = False
)

In [86]:
#processing the layout of the generated text from the llm , formatting the llm response


def wrap_text_preserve_newlines(text, width=700):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'])

    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4] + ' - page: ' + str(source.metadata['page'])
            for source in llm_response['source_documents']
        ]
    )

    ans = ans + '\n\nSources: \n' + sources_used
    return ans 

In [87]:
# function for llm response
def llm_ans(query):
    start = time.time()
    
    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)
    
    end = time.time()

    time_elapsed = int(round(end - start, 0))
    time_elapsed_str = f'\n\nTime elapsed: {time_elapsed} s'
    return ans + time_elapsed_str

In [88]:
query = "What is f"
print(llm_ans(query))

 I don't know the answer to that question.

Sources: 
International Energy Agency - Capturing the Multiple Benefits - page: 140 
An-under-developed-dimension-in-upgrading-energy-inefficie_2023_Energy-Resea - page: 2 
future-proof_built_environment - page: 22

Time elapsed: 1 s


### EVALUATION



In [43]:
llm_new

HuggingFaceHub(client=InferenceAPI(api_url='https://api-inference.huggingface.co/pipeline/text-generation/mistralai/Mixtral-8x7B-Instruct-v0.1', task='text-generation', options={'wait_for_model': True, 'use_gpu': False}), repo_id='mistralai/Mixtral-8x7B-Instruct-v0.1', task='text-generation', model_kwargs={'max_new_tokens': 512, 'top_k': 30, 'temperature': 0.1, 'repetition_penalty': 1.03}, huggingfacehub_api_token='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')