In [2]:
from docx import Document
import json

import os

## Load and read the data

In [3]:
def extract_text_from_docx(docx_path):
    """Return the text of a .docx file as a single newline-separated string.

    Only the entries of the document's ``paragraphs`` collection are read;
    each paragraph's ``text`` becomes one line of the result.

    Args:
        docx_path: Path of the .docx file, passed straight to ``Document``.

    Returns:
        The paragraph texts joined with ``'\\n'``, in document order.
    """
    paragraphs = Document(docx_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

# Load the source document; the relative path is resolved against the
# notebook's working directory.
vastu_text = extract_text_from_docx('vastu-shastra-V1.docx')


In [4]:
# Full stops are deliberately kept so that sentence boundaries survive the cleaning step

def clean_text(text):
    """Drop every character that is not alphanumeric, whitespace, or a full stop.

    Args:
        text: The string to clean.

    Returns:
        A new string holding the kept characters in their original order.
    """
    def is_kept(char):
        # Same three acceptance rules as before, just grouped in a predicate.
        return char.isalnum() or char.isspace() or char == '.'

    return ''.join(filter(is_kept, text))

# Rebinds vastu_text to its cleaned form; the raw extracted text is no longer
# reachable under this name once the cell has run.
vastu_text = clean_text(vastu_text)


## LLM OpenAI model

In [5]:
from langchain.llms import OpenAI
from langchain import PromptTemplate, LLMChain



In [6]:
# Read the API keys from a local JSON file so the secret itself never appears
# in the notebook. The file must contain an 'API OPENAI' entry.
with open('api_keys.json') as api_file:
    api_dict = json.load(api_file)

# Publish the key through the process environment for the cells that follow.
os.environ['OPENAI_API_KEY'] = api_dict['API OPENAI']

In [7]:
# Initialize the language model. No arguments are passed, so the library's
# defaults apply.
# NOTE(review): presumably the client picks up OPENAI_API_KEY from the
# environment set in the previous cell; confirm against the installed version.
llm = OpenAI()


In [8]:
template_for_code = """
Answer question from Vastu Shashtra document:


Question:
    {question}

Instructions:
    1. Be very specific to the document I have in conversation history
    2. Do not refer to anything else external from the source

Answer:
"""

In [9]:
prompt_repo = PromptTemplate(
    template = template_for_code,
    input_variables=['question']
)
question = "What does Vastu say about kitchen placement?"



In [10]:
# Create a LLMChain instance for generating the summary of the repository
llm_chain = LLMChain(prompt=prompt_repo, llm=llm)
answer_openai = llm_chain.run(
    conversation_history = vastu_text,
    question = question,
)
answer_openai

'According to the Vastu Shashtra document, the kitchen should be located in the south-east corner of the house. It should be well ventilated and the cook should face east while cooking. The sink should be placed in the north-east corner of the kitchen.'

## RAG model

In [11]:
from sentence_transformers import SentenceTransformer

from sklearn.neighbors import NearestNeighbors


In [12]:
# Several chunking strategies are defined below; each one leads to a different retrieved response

def chunk_by_sentence(text, max_chunk_length=200):
    """Split ``text`` into sentence chunks of at most ``max_chunk_length`` characters.

    Every ``.`` is treated as a sentence boundary, including full stops inside
    numbers or abbreviations. A sentence that fits the limit becomes one chunk.
    A longer sentence is broken on whitespace into consecutive pieces that each
    fit the limit.

    Args:
        text: The text to split.
        max_chunk_length: Maximum chunk size, measured in characters.

    Returns:
        A list of non-empty chunks in document order, with surrounding
        whitespace removed. A single word longer than the limit is kept whole
        as its own chunk rather than being cut mid-word.
    """
    chunks = []

    for sentence in text.split("."):
        sentence = sentence.strip()
        if not sentence:
            # Blank fragments (e.g. after the final full stop) carry no content.
            continue

        if len(sentence) <= max_chunk_length:
            chunks.append(sentence)
            continue

        # The limit is in characters, so long sentences are packed word by
        # word against a character budget rather than sliced by word count.
        current = ""
        for word in sentence.split():
            if not current:
                current = word
            elif len(current) + 1 + len(word) <= max_chunk_length:
                current += " " + word
            else:
                chunks.append(current)
                current = word
        if current:
            chunks.append(current)

    return chunks


def chunk_by_word_count(text, max_words=30):
    """Split ``text`` into chunks of at most ``max_words`` whitespace-separated words.

    Args:
        text: The text to split.
        max_words: Number of words per chunk; the last chunk may be shorter.

    Returns:
        A list of chunks, each holding its words joined by single spaces.
    """
    tokens = text.split()
    starts = range(0, len(tokens), max_words)
    return [" ".join(tokens[start:start + max_words]) for start in starts]


def chunk_by_paragraph(text, max_chunk_length=200):
    """Split ``text`` into paragraph chunks of at most ``max_chunk_length`` characters.

    Paragraphs are separated by newlines. A paragraph that fits the limit is
    kept unchanged. A longer one is broken on whitespace into consecutive
    pieces that each fit the limit.

    Args:
        text: The text to split.
        max_chunk_length: Maximum chunk size, measured in characters.

    Returns:
        A list of non-empty chunks in document order. A single word longer than
        the limit is kept whole as its own chunk.
    """
    chunks = []

    for paragraph in text.split("\n"):
        if not paragraph.strip():
            # Blank lines would only contribute empty chunks.
            continue

        if len(paragraph) <= max_chunk_length:
            chunks.append(paragraph)
            continue

        # Pack words greedily. Starting each piece from the word itself, not
        # from a space, keeps the length check exact and never flushes an
        # empty piece when the first word is already over the limit.
        current = ""
        for word in paragraph.split():
            if not current:
                current = word
            elif len(current) + 1 + len(word) <= max_chunk_length:
                current += " " + word
            else:
                chunks.append(current)
                current = word
        if current:
            chunks.append(current)

    return chunks


def chunk_by_line_length(text, max_line_length=80):
    """Split ``text`` into line chunks of at most ``max_line_length`` characters.

    Lines are separated by newlines. A line that fits the limit is kept
    unchanged. A longer one is broken on whitespace into consecutive pieces
    that each fit the limit.

    Args:
        text: The text to split.
        max_line_length: Maximum chunk size, measured in characters.

    Returns:
        A list of non-empty chunks in document order. A single word longer than
        the limit is kept whole as its own chunk.
    """
    chunks = []

    for line in text.split("\n"):
        if not line.strip():
            # Blank lines would only contribute empty chunks.
            continue

        if len(line) <= max_line_length:
            chunks.append(line)
            continue

        # Pack words greedily. Starting each piece from the word itself, not
        # from a space, keeps the length check exact and never flushes an
        # empty piece when the first word is already over the limit.
        current = ""
        for word in line.split():
            if not current:
                current = word
            elif len(current) + 1 + len(word) <= max_line_length:
                current += " " + word
            else:
                chunks.append(current)
                current = word
        if current:
            chunks.append(current)

    return chunks


# Sentence chunking with the default size limit is the strategy used for the
# rest of the notebook.
document_chunks = chunk_by_sentence(vastu_text)


In [13]:

# Load the sentence-embedding model and encode every document chunk; the
# resulting embeddings are what the nearest-neighbour index is fitted on.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(document_chunks)


In [14]:

# Index the chunk embeddings for lookup. n_neighbors=1 means a query returns
# only the single closest chunk; no metric is given, so the library default
# distance is used.
vector_store = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
vector_store.fit(embeddings)


NearestNeighbors(algorithm='ball_tree', n_neighbors=1)

In [15]:
def query_rag(question, model, vector_store, document_chunks):
    """Return the document chunk closest to ``question`` in embedding space.

    Args:
        question: The query text.
        model: Object whose ``encode`` turns a list of texts into embeddings.
        vector_store: Fitted index whose ``kneighbors`` returns a
            ``(distances, indices)`` pair for the query embeddings.
        document_chunks: The chunks the index was fitted on, in the same order.

    Returns:
        The chunk at the first neighbour index reported for the question.
    """
    query_vector = model.encode([question])
    _distances, neighbour_ids = vector_store.kneighbors(query_vector)
    # One query was submitted, so its neighbours sit in row 0.
    first_query_neighbours = neighbour_ids[0]
    return document_chunks[first_query_neighbours[0]]

# Reuse the question that was put to the plain LLM, so both answers address
# exactly the same query even if the wording is edited later.
answer_rag = query_rag(question, model, vector_store, document_chunks)
print(answer_rag)



Vastu dining room
Dining Room
The dining room should be located on the west side of the house or on the east or north side of the building


## Comparison of results

In [1]:
from bert_score import score

def compare_responses(baseline_response, rag_response):
    """Score the RAG answer against the baseline answer with BERTScore.

    Args:
        baseline_response: Reference text the candidate is compared to.
        rag_response: Candidate text being evaluated.

    Returns:
        A ``(precision, recall, f1)`` tuple of Python floats.
    """
    # The candidate list comes first and the reference list second.
    P, R, F1 = score([rag_response], [baseline_response], lang='en')
    # mean() reduces each result to a zero-dimensional tensor, which cannot be
    # indexed with [0]; item() extracts the scalar value instead.
    return P.mean().item(), R.mean().item(), F1.mean().item()

# Pass the answers themselves. Quoting the names would score the literal
# strings 'answer_openai' and 'answer_rag' against each other.
compare_responses(answer_openai, answer_rag)


: 