In [128]:
import warnings
warnings.filterwarnings("ignore")


import matplotlib.pyplot as plt
import seaborn as sns
import os
import textwrap
import time
import pandas as pd

import langchain 
from langchain_community.llms import LlamaCpp
# loaders
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

# splits
from langchain.text_splitter import RecursiveCharacterTextSplitter

# prompts
from langchain import PromptTemplate, LLMChain

# vector stores
from langchain.vectorstores import FAISS
import faiss

# models
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

#sentence transformer library
from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity


# retrievers
from langchain.chains import RetrievalQA

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

print('LangChain:', langchain.__version__)

LangChain: 0.1.10


In [265]:
class Config:
    # LLMs
    model_name = 'mistralai-8x7B'# llama2-7b, llama2-13b , mistral-ai(gguf models that can run on cpu)
    temperature = 0,
    top_p = 0.95,
    repetition_penalty = 1.15

    # splitting
    split_chunk_size = 500
    split_overlap = 100 # represents the overlap b/w chunks 

    #if overlap is set to 0, meaning there's no overlap between chunks. Each chunk starts exactly where the previous one ends.

    # embeddings
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # similar passages
    k = 3

    # paths
    PDFs_path = 'File_data'
    Embeddings_path = 'lit_embedding_500_100'

In [266]:
## loading mistral-8x7B from huggingface hub API

from langchain_community.llms import HuggingFaceHub

repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
READER_MODEL_NAME = "mistral-8x7-B"

llm_new = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    huggingfacehub_api_token = "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",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
)

In [67]:
#Document Load

loader = DirectoryLoader(
    Config.PDFs_path,
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True
)

documents = loader.load()

100%|██████████| 500/500 [15:44<00:00,  1.89s/it]


In [250]:
documents[0]

[Document(page_content='ScienceDirect\nAvailable online at www.sciencedirect.com\nAvailable online at www.sciencedirect.com\nScienceDirect\nEnergy Procedia 00 (201 7) 000 –000\nwww.elsevier.com/locate/procedia\n1876- 6102 © 201 7The Authors. Published by Elsevier Ltd.\nPeer-review under responsibility of the Scientific Committee of The 15th International Symposium on District Heating and Cooling .\nThe 15th International Symposium on District Heating and Cooling\nAssessing the feasibility of using the heat demand -outdoor \ntemperature function for a long- term district heat demand forecast\nI. Andrića,b,c*, A. Pinaa, P. Ferrãoa, J. Fournierb., B. Lacarrièrec, O. Le Correc\naIN+ Center for Innovation, Technology and Policy Research - Instituto Superior Técnico ,Av. Rovisco Pais 1, 1049- 001 Lisbon, Portugal\nbVeolia Recherche & Innovation, 291 Avenue Dreyfous Daniel, 78520 Limay , France\ncDépartement Systèmes Énergétiques et Environnement -IMT Atlantique, 4 r ue Alfred Kastler, 44300 

In [217]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = Config.split_chunk_size,
    chunk_overlap = Config.split_overlap,
)


texts = []
for i in range(len(documents)):
    texts += text_splitter.split_documents(documents[i])


In [218]:
len(texts)

"File_data/DistrictEES_Periodic Reporting for period 1.pdf"

109230

In [248]:
texts[3]

Document(page_content='greenhouse gas emissions from the building sector. These systems require high investments which are returned through the heatsales . Due to the changed climate conditions and building renovation policies, heat demand in the future could decrease, \nprolonging the investment return period. The main scope of this paper is to assess the feasibility of using the heat demand –outdoor tempe rature function for heat demand', metadata={'source': 'File_data/Criteria-and-structure-of-a-harmonised-data-collection-for-NZ_2017_Energy-Pr.pdf', 'page': 0})

In [251]:
texts[2]

Document(page_content='bVeolia Recherche & Innovation, 291 Avenue Dreyfous Daniel, 78520 Limay , France\ncDépartement Systèmes Énergétiques et Environnement -IMT Atlantique, 4 r ue Alfred Kastler, 44300 Nantes, France\nAbstract\nDistrict heating networks are commonly addressed in the literature as one of the most effective solutions for decreasing the', metadata={'source': 'File_data/Criteria-and-structure-of-a-harmonised-data-collection-for-NZ_2017_Energy-Pr.pdf', 'page': 0})

In [261]:
# # creating embeddings
# embeddings = HuggingFaceInstructEmbeddings(
#     model_name = Config.embeddings_model_repo,
#     # model_kwargs = {"device": "cuda"}
# )

# ### create embeddings and DB
# vectordb = FAISS.from_documents(
#     documents = texts, 
#     embedding = embeddings
# )

# ### create and save vector database
# vectordb.save_local("lit_embedding_500_100")

load INSTRUCTOR_Transformer
max_seq_length  512


In [267]:
%time
### download embeddings model
embeddings = HuggingFaceInstructEmbeddings(
    model_name = Config.embeddings_model_repo
    # model_kwargs = {"device": "cuda"}
)

### load vector DB embeddings
vectordb = FAISS.load_local(
    Config.Embeddings_path,
    embeddings,
    allow_dangerous_deserialization = True
)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 10 µs
load INSTRUCTOR_Transformer
max_seq_length  512


In [268]:
#custom prompt for the llm 

prompt_template = """
Don't try to make up an answer, if you don't know just say that you don't know.
Answer in the same language the question was asked.
Use only the following pieces of context to answer the question at the end. If you don't find the context to answer the question just say the context is not provided.

{context}

Question: {question}
Answer:"""


PROMPT = PromptTemplate(
    template = prompt_template,
    input_variables = ["context", "question"]
)

In [280]:
retriever = vectordb.as_retriever(search_kwargs = {"k": 3, "search_type" : "similarity"})

In [281]:
retriever.get_relevant_documents("What is the current average renovation rate in the EU ?")

[Document(page_content='13  \nEstimates about the full renovation market size of the EU revolve around 100 billion EUR per year15 \n(2016 ), concentrated in Germany, Italy, France, and the UK, and huge disparities across Member States (see \nFigure 6 ). The residential sector represented 65% of this potential  and 45% belonged to the tertiary \nbuilding sector.  \nFigure 6. Estimated value of the renovation market in selected Member States.  \n \nSource: Saheb (2016) in Fabbri (2018 ).', metadata={'source': 'File_data/One-stop shops for residential building.pdf', 'page': 16}),
 Document(page_content='or trends in renovation rates in Europe. In 2011 renovation rate of the building stock per year was  assessed between \n0.5% and 2.5% [8]. An average European renovation rate of 1% has been fixed, a value in line with the study carried \nout for the European Commission  [9] where refurbishment rates of 1.2%, 0.9% and 0.5% per year were found for \nNorth -Western  Europe, Southern Europe an

In [108]:
# #Test: priting filenames of docs from retriever

# for i in range (len(docs)):
#     print(docs[i].metadata['source'])

In [282]:
# RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
llm_new,
chain_type = "stuff",
retriever=retriever,
chain_type_kwargs={"prompt": PROMPT},
return_source_documents = True,
verbose = False
)

In [283]:
def wrap_text_preserve_newlines(text, width=700):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text


def process_llm_response(llm_response):
    ans = wrap_text_preserve_newlines(llm_response['result'].split("Answer:")[1])
    
    sources_used = ' \n'.join(
        [
            source.metadata['source'].split('/')[-1][:-4] + ' - page: ' + str(source.metadata['page'])
            for source in llm_response['source_documents']
        ]
    )
    
    ans = ans + '\n\nSources: \n' + sources_used
    return ans

In [273]:
def llm_ans(query):
    start = time.time()
    llm_response = qa_chain.invoke(query)
    ans = process_llm_response(llm_response)
    end = time.time()

    time_elapsed = int(round(end - start, 0))
    time_elapsed_str = f'\n\nTime elapsed: {time_elapsed} s'
    return ans 

In [274]:
query = "What is the current average renovation rate in the EU ?"
print(llm_ans(query))

 The average renovation rate in the EU is 1%.

Sources: 
One-stop shops for residential building - page: 16 
Criteria-and-structure-of-a-harmonised-data-collection-for-NZ_2017_Energy-Pr - page: 0 
Criteria-and-structure-of-a-harmonised-data-collection-for-NZ_2017_Energy-Pr - page: 1


In [277]:
# pp.split("\n\nSources:")[0]

In [278]:
filtered_df =  pd.read_csv("eval_filtered_dataset.csv", delimiter=";")

In [279]:
fil_question = list(filtered_df["question"])
answers = list(filtered_df["answer"])
context_sources = list(filtered_df["source_doc"])

questions = []
for text in fil_question:
    questions.append(text.replace("\n","").replace("?"," ?"))

### Retrieval Evaluation

In [284]:
file_src = []
retriever = vectordb.as_retriever(search_kwargs = {"k": 3, "search_type" : "similarity"})
for quest in questions:
    
    rel_docs = retriever.get_relevant_documents(quest)

    sources_for_question = [doc.metadata['source'] for doc in rel_docs]
    
    file_src.append(sources_for_question)

In [285]:
sources_dataframe =  pd.DataFrame()
sources_dataframe["Questions"] = questions
sources_dataframe["context_sources"] = context_sources
sources_dataframe["methodology_sources"] = file_src

In [286]:
sources_dataframe.head()

Unnamed: 0,Questions,context_sources,methodology_sources
0,What percentage of district heating pipes in Switzerland are oversized ?,File_data/Towards_low_flow_temperatures_insulation-first.pdf,"[File_data/Towards_low_flow_temperatures_insulation-first.pdf, File_data/Linking-energy-efficiency-indicators-with-policy-evaluation---A_2021_Energy-.pdf, File_data/What-is-the-optimal-robust-environmental-and-cost-effective-_2021_Energy-and.pdf]"
1,What is the recommended charge range for an electric vehicle to mitigate battery degradation and electricity loss ?,File_data/An-approach-for-energy-management-of-renewable-energy-sources-_2023_Energy-a.pdf,"[File_data/An-approach-for-energy-management-of-renewable-energy-sources-_2023_Energy-a.pdf, File_data/An-approach-for-energy-management-of-renewable-energy-sources-_2023_Energy-a.pdf, File_data/An-approach-for-energy-management-of-renewable-energy-sources-_2023_Energy-a.pdf]"
2,"What is the heat pump power consumption on December 19, 2017 ?",File_data/Model-predictive-control-of-distributed-energy-resources-in-_2024_Energy-and.pdf,"[File_data/Energy-saving-and-thermal-comfort-interventions-based-on-occ_2018_Energy-and.pdf, File_data/Introducing the Heat Pump Readiness Indicator.pdf, File_data/Statistical-method-to-identify-robust-building-renovation-_2020_Building-and.pdf]"
3,What is the objective of the optimisation model ?,File_data/Profitability-of-active-retrofitting-of-multi-apartment-buildings_2019.pdf,"[File_data/Sustainable--green--or-smart--Pathways-for-energy-e_2024_Sustainable-Cities-.pdf, File_data/International Energy Agency - Capturing the Multiple Benefits.pdf, File_data/Developing-a-multi-objective-optimization-model-for-improvi_2023_Building-an.pdf]"
4,What is one way the US aims to promote sustainable energy policies ?,File_data/International Energy Agency - Capturing the Multiple Benefits.pdf,"[File_data/International Energy Agency - Capturing the Multiple Benefits.pdf, File_data/International Energy Agency - Capturing the Multiple Benefits.pdf, File_data/International Energy Agency - Capturing the Multiple Benefits.pdf]"


In [289]:
def calculate_metrics(eval_dataset):
    total_precision = 0
    total_recall = 0
    total_rr = 0  # Total reciprocal rank
    total_queries = len(eval_dataset)

    for _, row in eval_dataset.iterrows():
        relevant_document = row['context_sources']  # Actual relevant document
        retrieved_documents = row['methodology_sources']  # Retrieved documents (all 3)

        # Precision@3
        num_relevant_retrieved = 1 if relevant_document in retrieved_documents else 0
        precision_at_3 = num_relevant_retrieved / 3
        # print(precision_at_3)
        total_precision += precision_at_3

        # Recall@3
        recall_at_3 = num_relevant_retrieved / 1  # Since there's only one relevant document
        total_recall += recall_at_3

        # Reciprocal rank (RR)
        rr = 0  # Initialize reciprocal rank for the current query
        for i, doc in enumerate(retrieved_documents):
            if doc == relevant_document:
                rr = 1 / (i + 1)  # Reciprocal rank of the first relevant document
                total_rr += rr
                break  # Stop iterating after finding the first relevant document

    # Calculate average metrics
    avg_precision = total_precision / total_queries
    avg_recall = total_recall / total_queries
    f1_score = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0
    mrr = total_rr / total_queries

    return avg_precision, avg_recall, f1_score, mrr

# Call the function to calculate metrics
avg_precision, avg_recall, f1_score, mrr = calculate_metrics(sources_dataframe)

print("Retriever score for chunk_size : 500 and chunk_overlap of  : 100" + "\n")

print("Average Precision@3:", avg_precision)
print("Average Recall@3:", avg_recall)
print("F1 Score@3:", f1_score)
print("Mean Reciprocal Rank (MRR):", mrr)

Retriever score for chunk_size : 500 and chunk_overlap of  : 100

Average Precision@3: 0.2026666666666665
Average Recall@3: 0.608
F1 Score@3: 0.3039999999999998
Mean Reciprocal Rank (MRR): 0.5773333333333335


#### Answer Evaluation

In [None]:
rag_ans = []
rag_sources = []
for query in questions:
    aa = llm_ans(query)
    rag_ans.append(aa.split("\n\nSources:")[0])