In [55]:
import os
import pickle
from typing import Any
import uuid
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from pydantic import BaseModel
from unstructured.partition.docx import partition_docx
import google.generativeai as genai
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.documents import Document as dc
from langchain.text_splitter import RecursiveCharacterTextSplitter
import streamlit as st
from langchain_community.embeddings import OllamaEmbeddings
#model preperation
os.environ["GOOGLE_API_KEY"]="AIzaSyCxIFTj4hpP2ova-B7j8VGzAb0YPPzlIcY"

model = ChatGoogleGenerativeAI(
                                model="gemini-pro", 
                                temperature=0.5, 
                                convert_system_message_to_human=True
                            )
    
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
from xml.etree.ElementTree import Element, SubElement, tostring
from docx import Document

def table_to_xml(table):
    root = Element('table')
    for row in table.rows:
        row_element = SubElement(root, 'row')
        for cell in row.cells:
            cell_element = SubElement(row_element, 'cell')
            cell_element.text = cell.text.strip()  # Use cell.text directly
    return root

def get_paragraphs_before_tables(doc_path):
    doc = Document(doc_path)
    paragraphs_and_tables = []
    last_paragraph = None 
    for element in doc.element.body:
        if element.tag.endswith('p'):
            last_paragraph = element
        elif element.tag.endswith('tbl'):
            # Find the table object corresponding to this element
            for table in doc.tables:
                if table._element == element:
                    if last_paragraph is not None:
                        xml_root = table_to_xml(table)
                        xml_str = tostring(xml_root, encoding='unicode')
                        langchain_document = "Title: "+ last_paragraph.text + "Content: " + xml_str
                        paragraphs_and_tables.append(langchain_document)
                    break

    return paragraphs_and_tables

# Example usage:
docx_file_path = "./rep.docx"  # Path to your .docx file
table_elements = get_paragraphs_before_tables(docx_file_path)


# Prompt
prompt_text = """You are an assistant tasked with summarizing tables.\ 
Summerize it and keep the most important information.
Also you must put the title at the beginning of the summary. \
If you encounter any table name that has Sc. that means it's a senario \
Give a summary of the table. Table chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

tables = table_elements
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})




with open("./output.txt",encoding='utf-8') as f:
    state_of_the_union = f.read()

#text spliter
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=5000,
    chunk_overlap=0 ,
    separators=["\n\n","\n", " ",""],
)

texts = text_splitter.create_documents([state_of_the_union])

In [56]:
import random

table_summaries_documents = [dc(summary) for summary in table_summaries]
grand_context = table_summaries_documents + texts
random.shuffle(grand_context)

In [57]:
print(grand_context[5])
print(grand_context[-1])

page_content="Value for Money Assessment\nIntroduction\nThe Value for Money (hereafter “VfM”) analysis consists of carrying out an assessment of the overall cost of the project, taking into account the value of the inherent risks, depending on whether the project is carried out under a public contract or a PPP.\nThe analysis and comparison of the financial profitability of different options is ultimately measured by means of the VfM. In order to assess the VfM of each option, we use the results of the financial model which adopts the point of view of the MITC and the private partner by estimating the overall cost of the project for the public sector of each option over the project duration.\nThis cost is then risk-adjusted, i.e., the cost of risks borne by the public sector (and which have not been transferred to the private partner) is added. The cost of risks is provided by the risk register, the results of which are in Appendix 3 section.\nBest practice in financial analysis and mod

In [58]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

question_schema = ResponseSchema(
    name="question",
    description="a question about the context."
)

question_response_schemas = [
    question_schema,
]

In [59]:
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
format_instructions = question_output_parser.get_format_instructions()

In [60]:
bare_prompt_template = "{content}"
bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)

In [61]:
from langchain.prompts import ChatPromptTemplate

qa_template = """\
You are a University Professor creating a test for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.
Don't ask question starting with based on..., the context is provided automaticly.
question: a question about the context.

Format the output as JSON with the following keys:
question

context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=grand_context[0],
    format_instructions=format_instructions
)

question_generation_chain = bare_template | model

response = question_generation_chain.invoke({"content" : messages})
output_dict = question_output_parser.parse(response.content)

In [62]:
for k, v in output_dict.items():
  print(k)
  print(v)

question
What are the provisions of the Public-Private Partnership Act (PPP Act) No. 23 of 2022 in Malawi?


In [63]:
from tqdm import tqdm

qac_triples = []

for text in tqdm(grand_context[:50]):
  messages = prompt_template.format_messages(
      context=text,
      format_instructions=format_instructions
  )
  response = question_generation_chain.invoke({"content" : messages})
  try:
    output_dict = question_output_parser.parse(response.content)
  except Exception as e:
    continue
  output_dict["context"] = text
  qac_triples.append(output_dict)

100%|██████████| 50/50 [02:28<00:00,  2.96s/it]


In [64]:
qac_triples[5]

{'question': 'What is the definition of Value for Money (VfM) analysis in the context of public infrastructure projects?',
 'context': Document(page_content="Value for Money Assessment\nIntroduction\nThe Value for Money (hereafter “VfM”) analysis consists of carrying out an assessment of the overall cost of the project, taking into account the value of the inherent risks, depending on whether the project is carried out under a public contract or a PPP.\nThe analysis and comparison of the financial profitability of different options is ultimately measured by means of the VfM. In order to assess the VfM of each option, we use the results of the financial model which adopts the point of view of the MITC and the private partner by estimating the overall cost of the project for the public sector of each option over the project duration.\nThis cost is then risk-adjusted, i.e., the cost of risks borne by the public sector (and which have not been transferred to the private partner) is added. 

In [65]:
answer_schema = ResponseSchema(
    name="answer",
    description="an answer to the question"
)

answer_response_schemas = [
    answer_schema,
]

answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)
format_instructions = answer_output_parser.get_format_instructions()

qa_template = """\
You are a University Professor creating a test for advanced students. For each question and context, create an answer.

answer: a answer about the context.

Format the output as JSON with the following keys:
answer

question: {question}
context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=qac_triples[0]["context"],
    question=qac_triples[0]["question"],
    format_instructions=format_instructions
)

answer_generation_chain = bare_template | model

response = answer_generation_chain.invoke({"content" : messages})
output_dict = answer_output_parser.parse(response.content)

In [66]:
for k, v in output_dict.items():
  print(k)
  print(v)

answer
The Public-Private Partnership Act (PPP Act) No. 23 of 2022 provides the legal framework for Public-Private Partnerships (PPPs) in Malawi. The key provisions of the PPP Act include:

- Establishing the Public-Private Partnership Commission
- Facilitating the development and implementation of PPP arrangements for efficient delivery of infrastructure and services
- Promoting expedited and efficient implementation of PPP arrangements
- Allowing for the structuring of PPPs through Special Purpose Vehicles or Joint Venture agreements
- Providing for the possibility of state-owned enterprises, such as the Malawi Investment and Trade Centre (MITC), to enter into PPP contracts


In [67]:
for triple in tqdm(qac_triples):
  messages = prompt_template.format_messages(
      context=triple["context"],
      question=triple["question"],
      format_instructions=format_instructions
  )
  response = answer_generation_chain.invoke({"content" : messages})
  try:
    output_dict = answer_output_parser.parse(response.content)
  except Exception as e:
    continue
  triple["answer"] = output_dict["answer"]

100%|██████████| 49/49 [02:30<00:00,  3.06s/it]


In [68]:
import pandas as pd
from datasets import Dataset

ground_truth_qac_set = pd.DataFrame(qac_triples)
ground_truth_qac_set["context"] = ground_truth_qac_set["context"].map(lambda x: str(x.page_content))
ground_truth_qac_set = ground_truth_qac_set.rename(columns={"answer" : "ground_truth"})


eval_dataset = Dataset.from_pandas(ground_truth_qac_set)

In [69]:
# visulize it eval_dataset into a pandas dataframe
eval_dataset.to_csv("groundtruth_eval_dataset.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

123813

In [70]:
from datasets import Dataset
import pandas as pd
# load the eval_dataset
eval_dataset = Dataset.from_pandas(pd.read_csv("groundtruth_eval_dataset.csv", encoding='ISO-8859-1'))

In [71]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
from ragas import evaluate

def create_ragas_dataset(rag_pipeline, eval_dataset):
  rag_dataset = []
  for row in tqdm(eval_dataset):
    answer = rag_pipeline.invoke({"question" : row["question"]})
    rag_dataset.append(
        {"question" : row["question"],
         "answer" : answer["response"].content,
         "contexts" : [context.page_content for context in answer["context"]],
         "ground_truths" : [row["ground_truth"]]
         }
    )
  rag_df = pd.DataFrame(rag_dataset)
  rag_eval_dataset = Dataset.from_pandas(rag_df)
  return rag_eval_dataset
from langchain_core.language_models import BaseLanguageModel
from langchain_core.embeddings import Embeddings


def evaluate_ragas_dataset(ragas_dataset):
  result = evaluate(
    ragas_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity
    ],
    llm=model, embeddings=OllamaEmbeddings(model="nomic-embed-text")
  )
  return result

In [72]:
from operator import itemgetter
import os
import pickle
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.vectorstores import Chroma
from langchain.retrievers.multi_vector import MultiVectorRetriever

from langchain_community.embeddings import OllamaEmbeddings

# The vectorstore to use to index the child chunks
vectorstore1 = Chroma(collection_name="table_summaries", embedding_function=OllamaEmbeddings(model="nomic-embed-text"),persist_directory=r"C:\Users\Bohmid\Desktop\poc-01\doc1\v1")


with open(r"C:\Users\Bohmid\Desktop\poc-01\doc1\store1.pkl", 'rb') as f:
    store1 = pickle.load(f)
# The storage layer for the parent documents
id_key1 = "doc_id"

# The retriever (empty to start)
retriever1 = vectorstore1.as_retriever()

vectorstore2 = Chroma(collection_name="child_chunks", embedding_function=OllamaEmbeddings(model="nomic-embed-text"),persist_directory=r"C:\Users\Bohmid\Desktop\poc-01\doc1\v2")

with open(r"C:\Users\Bohmid\Desktop\poc-01\doc1\store2.pkl", 'rb') as f:
    store2 = pickle.load(f)
id_key2 = "doc_id"
# The retriever (empty to start)
retriever2 = MultiVectorRetriever(
    vectorstore=vectorstore2,
    byte_store=store2,
    id_key=id_key2,
)


from langchain_core.runnables import RunnablePassthrough

# Prompt template
template = """
You are a Private-Public Partnership (PPP) feasibility expert. You are tasked with answering questions the feasibility of a PPP project.\n
Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
from langchain.retrievers import EnsembleRetriever
ensemble=EnsembleRetriever(retrievers=[retriever1,retriever2],weights=[0.5,0.5])
# RAG pipeline
chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | ensemble, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": prompt | model, "context": itemgetter("context")}
)

In [73]:
from tqdm import tqdm
import pandas as pd

basic_qa_ragas_dataset = create_ragas_dataset(chain, eval_dataset)

100%|██████████| 47/47 [07:10<00:00,  9.16s/it]


In [75]:
basic_qa_ragas_dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truths'],
    num_rows: 47
})

In [76]:
basic_qa_ragas_dataset.to_csv("basic_qa_ragas_dataset.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

770520

In [77]:
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`


Evaluating:   0%|          | 0/329 [00:00<?, ?it/s]

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._achat_with

In [78]:
basic_qa_result

{'context_precision': 0.5573, 'faithfulness': 0.9342, 'answer_relevancy': 0.5130, 'context_recall': 0.8928, 'context_relevancy': 0.0497, 'answer_correctness': 0.6443, 'answer_similarity': 0.8781}