In [23]:
from DataPrepare.PrepareDocx import DocxParser as dp
from DataPrepare.BaseGenerator import BaseGenerator
from DataPrepare.Metrics import EvaluationMetrics

import pandas as pd

In [34]:
# Build the paths of the three sample documents (sample1 .. sample3), then
# flatten whatever each parse yields into a single list. Entries are later
# read through their 'text' key when the vector index is built.
sample_paths = [f"WordDocuments/sample{doc + 1}.docx" for doc in range(3)]
data = [entry for path in sample_paths for entry in dp.parse_docx(path)]

In [24]:
import os

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [45]:
from uuid import uuid4

from langchain_core.documents import Document

# Wrap every parsed entry in a LangChain Document. Only the 'text' field is
# indexed, and every entry carries the same "documentation" source tag.
documents = [
    Document(page_content=ref['text'], metadata={"source": "documentation"})
    for ref in data
]
if not documents:
    # Fail early with a readable message instead of trying to index nothing.
    raise ValueError("No parsed entries to index: `data` is empty.")

# One explicit ID per document. The IDs are handed to the vector store below
# (previously they were generated and then silently discarded), so entries can
# be addressed by a known ID afterwards.
uuids = [str(uuid4()) for _ in documents]

# Embed the documents with the local Ollama embedding model and index them.
db = FAISS.from_documents(
    documents,
    OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
    ids=uuids,
)
# Similarity search returning the 4 closest documents per query.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# Set up the local model:
# The stop sequences are the same special header / end-of-turn markers that
# appear in the prompt template below.
local_model = "llama3"
llm = ChatOllama(model=local_model, num_predict=400,
                 stop=["<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>"])

# Set up the RAG chain:
# The template text is sent to the model verbatim; {question} and {context}
# are the two placeholders filled in by the chain.
prompt_template = """
<|start_header_id|>user<|end_header_id|>
Responds user questions taking into accoun the given context, give a precise and short answer without referring to this part of prompt.
Give as little additional information as possible.
Question: {question}
Context: {context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank
        line. An empty iterable yields an empty string.
    """
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)

# Compose the RAG pipeline: the incoming question is passed through unchanged
# and is also sent to the retriever, whose hits are merged into one context
# string by format_docs. Both values fill the prompt, which is sent to the
# LLM; the reply is finally reduced to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Querying the LLM (obviously, to test this you must ask a question relevant to your data)
question = "What share percent have VoiceOver screen reader?"
print(question)
# Invoke the chain exactly once and print that same answer. A second invoke
# would repeat retrieval and generation and could print a reply that differs
# from the `res` evaluated in the following cells.
res = rag_chain.invoke(question)
print(res)

OllamaEmbeddings: 100%|██████████| 100/100 [00:00<00:00, 185.07it/s]


What share percent have VoiceOver screen reader?


OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 175.71it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 163.21it/s]


9%


In [46]:
# Expected answer for the question asked above.
reference = "9%"

# Score the single generated answer `res` against that single reference.
# Both arguments are one-element lists, passed positionally in the order
# (reference list, generated-answer list).
reference_batch = [reference]
answer_batch = [res]

evaluator = EvaluationMetrics()
results = evaluator.evaluate(reference_batch, answer_batch)

In [47]:
# Show the "rouge" entry of the evaluation results as a table.
print("ROUGE scores")
rouge_table = pd.DataFrame(results["rouge"])
rouge_table

ROUGE scores


Unnamed: 0,rouge-1,rouge-2,rouge-l
r,1.0,0.0,1.0
p,1.0,0.0,1.0
f,1.0,0.0,1.0


In [48]:
# Show the "bertscore" entry of the evaluation results as a single 'score'
# row, then transpose it so each metric becomes its own row.
print("BERTScore")
bertscore_table = pd.DataFrame(results["bertscore"], index=["score"]).transpose()
bertscore_table

BERTScore


Unnamed: 0,score
precision,1.0
recall,1.0
f1,1.0
