In [1]:
#data reading and processing
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
from langchain.docstore.document import Document
import re

#read the additional web documents
docs_add_path = ["../data/qa/residensi-zig.txt", "../data/qa/the-connaught-one.txt", "../data/qa/the-minh.txt"]
property_names = ["Residensi Zig", "The Connaught One", "The Minh"]
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 500,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

docs_add = []
for i in range(len(docs_add_path)):
    with open(docs_add_path[i], "r") as f:
        data_add = f.read()
    d = text_splitter.create_documents([data_add])
    docs_add += [Document(page_content="For Property " +property_names[i]+ ": \n"+e.page_content) for e in d]
    

#read property chat inquiry - converted to text 
with open("../data/faq/property_inquiry_for_chatbot 1.txt", "r") as f:
    data_inquiry = f.read()
docs_multi_prop = [Document(page_content=i) for i in re.split("Images:", data_inquiry)]



docs = docs_add #docs_multi_prop + 


In [12]:
#setup model
#setup
import os
import re
import fitz
import copy
import pathlib
import pandas as pd
from tqdm import tqdm
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.extensions.langchain import (
    WatsonxLLM,
)
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.schema.embeddings import Embeddings
from typing import List
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

load_dotenv()


class MiniLML6V2EmbeddingFunctionLangchain(Embeddings):
    MODEL = SentenceTransformer("all-mpnet-base-v2")

    def embed_documents(self, texts):
        return MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode(texts).tolist()

    def embed_query(self, query):
        return MiniLML6V2EmbeddingFunctionLangchain.MODEL.encode([query]).tolist()[0]
    

#set up vector db
embeddings = MiniLML6V2EmbeddingFunctionLangchain()
db = FAISS.from_documents(docs, embeddings)
db.save_local("../code-engine/db")
db = FAISS.load_local("../code-engine/db", embeddings)

In [13]:
#method 2: generate questions from unidentified export, and generate answers from informative documents (pdf and property inquiry)

import pandas as pd
import numpy as np
import re
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from langchain.output_parsers.json import SimpleJsonOutputParser

def generate_questions(rep_words):
    parser = SimpleJsonOutputParser()
    model = Model(
        model_id=ModelTypes.LLAMA_2_70B_CHAT,
        credentials={
            "apikey": os.getenv("IBM_API_KEY"),
            "url": "https://us-south.ml.cloud.ibm.com",
        },
        params={
            GenParams.DECODING_METHOD: "greedy",
            GenParams.MAX_NEW_TOKENS: 800,
            GenParams.TEMPERATURE: 0,
            GenParams.RANDOM_SEED: 12345,
            GenParams.STOP_SEQUENCES:["]"]
        },
        project_id=os.getenv("PROJECT_ID"),
    )
    llm = WatsonxLLM(model=model)
    prompt = f"""
Provide the responses as a list of JSONs with "question".

Query: The following Questions Database provides by questions asked by many users. Analyze the questions and provide templates for 2-3 commonly asked questions that have been asked at least twice. Ensure that the selected questions are complete sentences. Let us think step by step.

Example:
Questions Database:
["i want to move to malaysia. how?", "how can i move to malaysia?", "where is hamilton playing?", "hamilton is playing in which city?", "what is for lunch", "what is for lunch", "what is for lunch", "where is cappadocia", "who is elon musk"]

JSON output:
[{{"question":"what is for lunch"}}, {{"question":"how can i move to malaysia?"}}, {{"question":"where is hamilton playing?"}}]

Questions Database:
{list(rep_words)}

JSON output:
"""

    d = llm(prompt)
    # print(d)
    d = parser.invoke(d)
    # print(d)
    prompt2 = f"""
Instructions: 
----------------
When the question is specific, such as requesting properties within a certain price range, templatize the question. 

For example:

Input:
[{{"question": "what are examples of terrace in Taman Tun area?"}}, {{"question": "what are facilities at Emporis?"}}, {{"question": "are there any units with price less than 100k?"}}]

Output:

{{"output":[{{"question": "what are examples of terrace in Taman Tun area?", "template": "what are examples of <house type> in <location>?"}}, {{"question": "what are facilities at Emporis?", "template": "what are facilities at <property name>?"}}, {{"question": "are there any units with price less than 100k?", "template": "are there any units with price less than <amount>k?"}}]}}
----------------

Input: 

{d}

Output:
"""
    d2 = llm(prompt2)
    d2_s = re.findall("\'template\':(.*?)}" ,d2)
    d2_s = [e.replace("'", "") for e in d2_s]
    
    
    # d2_s = parser.invoke(d2.strip())
    # if d2_s is None:
    #     print("raise none error", d2)
    # return d2_s
    return d2_s


# def is_property_details_question(question):
#     model = Model(
#         model_id=ModelTypes.LLAMA_2_70B_CHAT,
#         credentials={
#             "apikey": os.getenv("IBM_API_KEY"),
#             "url": "https://us-south.ml.cloud.ibm.com",
#         },
#         params={
#             GenParams.DECODING_METHOD: "greedy",
#             GenParams.MAX_NEW_TOKENS: 800,
#             GenParams.TEMPERATURE: 0,
#             GenParams.RANDOM_SEED: 12345,
#             GenParams.STOP_SEQUENCES:["\n"]
#         },
#         project_id=os.getenv("PROJECT_ID"),
#     )
#     llm = WatsonxLLM(model=model)
#     prompt = f"""
#     Instruction: Determine the question sounds like it pertains to the features of a specific property, or if it is a question which may involve multiple properties
    
#     Input: what are the amenities?
#     Output: specific property

#     Input: which properties have a bumiputera discount?
#     Output: multiple properties

#     Input: which properties are under 200k?
#     Output: multiple properties

#     Input: which properties are located in Sungai Buloh?
#     Output: multiple properties

#     Input: which properties are terrace?
#     Output: multiple properties

#     Input: which properties have swimming pool?
#     Output: multiple properties

#     Input: where is it located?
#     Output: specific property

#     Input: which properties are located in Singapore?
#     Output: multiple properties
    
#     Input: {question}
#     Output:
    # """

    # answer = llm(prompt)
    # return answer

def customize_questions(questions, db):
    responses = []
    new_questions = []
    for q in tqdm(questions):
        if re.search("<.*?>", q) is not None:
            # print(q)
            for x in re.findall("(<.*?>)", q):
                print(q)
                fill_in_blank = input("This is a dynamic FAQ. Fill in the " + x + "for the question above")
                q = q.replace(x, fill_in_blank)
        # print(q)
        new_questions.append(q)

    return new_questions

def generate_answers(questions, db, faq_cat, debug=False):
    responses = []
    model = Model(
        model_id=ModelTypes.LLAMA_2_70B_CHAT,
        credentials={
            "apikey": os.getenv("IBM_API_KEY"),
            "url": "https://us-south.ml.cloud.ibm.com",
        },
        params={
            GenParams.DECODING_METHOD: "greedy",
            GenParams.MAX_NEW_TOKENS: 800,
            GenParams.TEMPERATURE: 0,
            GenParams.RANDOM_SEED: 12345,
            GenParams.STOP_SEQUENCES: ["\n\n\n"]
        },
        project_id=os.getenv("PROJECT_ID"),
    )
    
    llm = WatsonxLLM(model=model)
    for q in tqdm(questions):
        found_property = False
        for x in property_names:
            if x in q:
                found_property = True
        if found_property == False:
            g = "Please specify a valid property name for your query"
            responses.append(g)
            continue
        if "?" not in q:
            q = q + "?"
        search = db.similarity_search(q, 15)
        search = "\n".join([i.page_content for i in search][::-1])
        if debug:
            print("DEBUG SEARCH LOG: ", search)

        
#         prompt = f"""
# <s>[INST] <<SYS>>
# You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Tell the truth and follow instructions. You work for a company called UEM Sunrise, a property management companuy. Do not bring up other companies/organizations/websites.
# <</SYS>>
# Context:
# {search}

# Guidelines:
# Based on the context above and not on any prior knowledge, answer the question below. If you cannot determine the answer from the provided information, say "I do not know". Answer in five sentences or less and do not provide suggestions.

# Question:
# {q}
# Let us think step by step.

# Response:
# [/INST]
# """

#<s>[INST] <<SYS>>
# You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Tell the truth and follow instructions. You work for a company called UEM Sunrise, a property management company. Do not bring up other companies/organizations/websites such as iProperty.
# <</SYS>>

#[/INST]

        prompt = f"""
Context:
{search}

Guidelines:
Based on the context above and not on any prior knowledge, answer the question below. If you cannot determine the answer based on the context, say "I do not know". Answer in 3 sentences or less and do not provide suggestions.
Ensure that your answers are rooted in the context information provided, and not based on inferences, creativity, or prior knowledge. You are a customer care assistant, so do not use the word "context"


Question:
{q}
Let us think step by step.

Response:
"""
        g = llm(prompt)
        # if "I do not know" in g:
        #     g = "I do not know"
        responses.append(g)

    return responses

def generate_questions_and_answers_type_2(log, db, faq_cat, property=None):
    questions = generate_questions(log)
    if faq_cat=="property recommendation":
        cust = customize_questions(questions, db)
    elif faq_cat=="property details":
        cust = ["For Property Name: "+property+";"+q.replace("<property name>", property) for q in questions]
    # print("cust")
    # print("question")
    answers = generate_answers(cust, db, faq_cat=faq_cat)
    return cust, answers


In [14]:
len(docs_multi_prop)

16

In [10]:
#property-related FAQ
property = input("Name the property for which you want to generate FAQs: ")
raw_log_1 = pd.read_excel("../data/faq/unidentified-export may-8th november_edit.xlsx", sheet_name="property")["property"]
qaa_method_2 = generate_questions_and_answers_type_2(raw_log_1, db, faq_cat="property details", property=property)

for e, i in zip(qaa_method_2[0], qaa_method_2[1]):
    print("\n")
    print("question: ", e)
    print("answer: ", i)

Name the property for which you want to generate FAQs:  Residensi Zig


100%|█████████████████████████████████████████████| 3/3 [00:15<00:00,  5.14s/it]



question:  For Property Name: Residensi Zig; how much is Residensi Zig?
answer:  Residensi Zig's price starts from RM 340,888.





question:  For Property Name: Residensi Zig; may i know the price for each unit at Residensi Zig?
answer:  The price for each unit at Residensi Zig varies depending on the configuration and built-up size. The price range for Residensi Zig is from RM 340,888 to RM 874,888. For specific pricing information, it's best to refer to the floor plan and pricing information provided or contact the developer's customer service.


question:  For Property Name: Residensi Zig; can i get more detail and sales person incharge for Residensi Zig?
answer:  Yes, you can get more details and the salesperson in charge of Residensi Zig. You can visit the showroom or contact the developer, UEM Sunrise, for more information. The salesperson in charge will be able to provide you with detailed information and answer any questions you may have.





In [11]:
#property recommendation - for testing transactional query
raw_log_2 = pd.read_excel("../data/faq/unidentified-export may-8th november_edit.xlsx", sheet_name="rec")["question"]
qaa2_method_2 = generate_questions(raw_log_2)

for e in zip(qaa2_method_2):
    print("\n")
    print("question: ", e)



question:  (' any new project to recommend?',)


question:  (' what is a good house that is less then <amount>k?',)


question:  (' any landed property in <location>?',)


In [6]:
# testing other RAG questions as some additional validation (side task)

pd.set_option('display.max_colwidth', 0)

questions = ["What are the amenities at Residensi Zig?", "What is the completion year of Residensi Zig", "Does Residensi Zig have bumi discount?", "What is the price of the unit?"]

g = generate_answers(questions, db, "property details")

pd.DataFrame({"questions":questions, "generated_answer":g})





100%|█████████████████████████████████████████████| 4/4 [00:14<00:00,  3.58s/it]


Unnamed: 0,questions,generated_answer
0,What are the amenities at Residensi Zig?,"Residensi Zig offers various amenities, including a 50m lap pool, beach entry, sun deck, jacuzzi, leisure deck, kid's pool, kid's aquaplay, lift lobby, main entrance, meeting room, multi-storey car park, multi-purpose hall, pool deck, playground, recreation room, 24-hour security, sky lounge, surau, and swimming pool. Additionally, there are facilities for badminton, barbeques, and a gymnasium. The development also features a jogging track, landscaped garden, and a wading pool."
1,What is the completion year of Residensi Zig,The completion year of Residensi Zig is 2027. It is mentioned in the context that the project's completion year is 2027.
2,Does Residensi Zig have bumi discount?,I do not know.\n\n\n
3,What is the price of the unit?,Please specify a valid property name for your query


In [7]:
#priority 1.1
#today:
#website + ensure answer is correct for generated faqs*** [done]
#need to handle the case of missing property on watson assistant + handle home page property issue [ip]
#update the FAQ document...[done
#for transactional questions, should use Azhar - SQL prompts as well - test on their code***
#ask Randy for confirmation regarding demo


#priority 1.2 (with dependencies on 1.1)
#align the RAG prompt with Indrajit

#priority 2
#clustering of common questions - 

In [8]:
# is_property_details_question("does the property have swimming pool?")