In [1]:
#define the llm models
import sys
sys.path.append("../")
import importlib
import os
import pathlib
import shutil
import re
import action
import prompt
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.extensions.langchain import (
    WatsonxLLM,
)
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
from hf_hub import HuggingFaceHubEmbeddings

load_dotenv()


# Identifiers of the watsonx.ai models this notebook talks to. Kept under its
# own name so that MODELS below is only ever the id -> LLM mapping (it used to
# be bound to this list first and then rebound to the dict).
MODEL_IDS = [
    ModelTypes.FLAN_T5_XXL,
    ModelTypes.LLAMA_2_70B_CHAT,
    "meta-llama/llama-3-70b-instruct",
]


def _build_watsonx_llm(model_id):
    """Return a LangChain ``WatsonxLLM`` wrapping the watsonx model ``model_id``.

    The API key and project id are read from the ``IBM_API_KEY`` and
    ``PROJECT_ID`` environment variables (populated by ``load_dotenv()``).
    Generation is greedy, capped at 300 new tokens, and stops at the first
    ``"\\n\\n"`` sequence. Fresh credential/parameter dicts are built on every
    call so the wrapped models never share mutable configuration.
    """
    return WatsonxLLM(
        model=Model(
            model_id=model_id,
            credentials={
                "apikey": os.getenv("IBM_API_KEY"),
                "url": "https://us-south.ml.cloud.ibm.com",
            },
            params={
                GenParams.DECODING_METHOD: "greedy",
                GenParams.MAX_NEW_TOKENS: 300,
                GenParams.TEMPERATURE: 0,
                GenParams.RANDOM_SEED: 12345,
                GenParams.STOP_SEQUENCES: ["\n\n"],
            },
            project_id=os.getenv("PROJECT_ID"),
        )
    )


# model id -> ready-to-call LLM; looked up by id in the cells below.
MODELS = {model_id: _build_watsonx_llm(model_id) for model_id in MODEL_IDS}

# print(MODELS[ModelTypes.FLAN_T5_XXL]("hello how are you?"))

In [2]:
#embeddings for semantic search

# Remote sentence-embedding model used both to index the documents and to
# embed queries against the index.
embeddings = HuggingFaceHubEmbeddings(
    model="sentence-transformers/all-mpnet-base-v2",
    task="feature-extraction",
    huggingfacehub_api_token=os.getenv("HUGGINGFACEHUB_API_TOKEN"),
)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
text_files = [
    "../data/qa/the-connaught-one.txt",
    "../data/qa/the-minh.txt",
    "../data/qa/residensi-zig.txt",
]
# One Document per source file, tagged with the file stem.
# Path.read_text opens *and closes* the file; the previous open(...).read()
# left every handle open until garbage collection.
raw_docs = [
    Document(
        page_content=pathlib.Path(x).read_text(encoding="utf-8"),
        metadata={"filename": pathlib.Path(x).stem},
    )
    for x in text_files
]
# `docs` holds the split chunks that actually get embedded.
docs = text_splitter.split_documents(raw_docs)
vdb = FAISS.from_documents(docs, embeddings)
vdb.save_local("../../backend/vdb")
# Reload from disk so the notebook uses exactly what was persisted.
# NOTE(review): allow_dangerous_deserialization=True is only acceptable here
# because the index was written by the line above; do not point this at an
# index obtained from an untrusted source.
vdb = FAISS.load_local("../../backend/vdb", embeddings, allow_dangerous_deserialization=True)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#prepare structured data into SQL

# SQLite database file that stores the structured property data.
engine = create_engine("sqlite:///database.db", echo=False)

# sheet_name=None loads every worksheet of the workbook, keyed by sheet name.
data = pd.read_excel(
    "../../backend/data/sql/zig-minh-connaught-sample.xlsx",
    sheet_name=None,
)
for sheet_name, sheet_df in data.items():
    # Column names get underscores instead of spaces (renamed in place on the
    # frames held in `data`).
    sheet_df.columns = [column.replace(" ", "_") for column in sheet_df.columns]
    # The table is named after the first space-separated word of the sheet name.
    table = sheet_name.split(" ")[0]
    # if_exists="replace" makes re-running this cell overwrite the tables.
    sheet_df.to_sql(table, con=engine, index=False, if_exists="replace")

SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
db = SessionLocal()

# Read the three tables back as DataFrames for inspection.
# NOTE(review): db.connection().engine.connect() looks equivalent to
# engine.connect(), except that the session also acquires a connection of its
# own first -- confirm whether that is intentional.
with db.connection().engine.connect() as conn:
    connaught, minh, zig = (
        pd.read_sql(text(query), conn)
        for query in (
            "SELECT * from connaught",
            "SELECT * from minh",
            "SELECT * from zig",
        )
    )
# connaught.head()
# minh.head()
# zig.head()

In [4]:
#generate questions & answers

import pandas as pd
import json
import numpy as np
import re
from langchain.output_parsers.json import SimpleJsonOutputParser
from prompt import ROUTING_TEMPLATE, PROPERTY_TEMPLATE
from action import ACTIONS

# Finds a "template" key written with either quote style and lazily captures
# its value up to the closing brace of the enclosing object.
_TEMPLATE_VALUE_RE = re.compile(r"""["']template["']\s*:\s*(.*?)\s*}""")


def _extract_templates(raw_output):
    """Return the ``template`` values found in the raw templating-step output.

    The model may echo Python-repr style (single quotes) or JSON style (double
    quotes), so both are accepted. Exactly one pair of matching outer quotes is
    removed from each value and surrounding whitespace is trimmed; quotes and
    apostrophes *inside* the template are preserved.
    """
    templates = []
    for value in _TEMPLATE_VALUE_RE.findall(raw_output):
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        templates.append(value.strip())
    return templates


def generate_questions(rep_words):
    """Derive FAQ question templates from a log of user questions.

    Two LLM calls are made with a Llama-2-70b-chat model (greedy decoding, up
    to 800 new tokens, generation stops at the first ``]``):

    1. pick 2-3 commonly asked questions from ``rep_words``;
    2. turn those questions into templates with ``<placeholder>`` slots.

    Args:
        rep_words: iterable of logged question strings; it is rendered into
            the prompt via ``list(rep_words)``.

    Returns:
        list[str]: the template strings extracted from the second response,
        without surrounding quotes or whitespace. Empty if none were found.
    """
    parser = SimpleJsonOutputParser()
    model = Model(
        model_id=ModelTypes.LLAMA_2_70B_CHAT,
        credentials={
            "apikey": os.getenv("IBM_API_KEY"),
            "url": "https://us-south.ml.cloud.ibm.com",
        },
        params={
            GenParams.DECODING_METHOD: "greedy",
            GenParams.MAX_NEW_TOKENS: 800,
            GenParams.TEMPERATURE: 0,
            GenParams.RANDOM_SEED: 12345,
            GenParams.STOP_SEQUENCES: ["]"],
        },
        project_id=os.getenv("PROJECT_ID"),
    )
    llm = WatsonxLLM(model=model)
    # Named question_prompt (not `prompt`) so the imported `prompt` module is
    # not shadowed inside this function.
    question_prompt = f"""
Provide the responses as a list of JSONs with "question".

Query: The following Questions Database provides by questions asked by many users. Analyze the questions and provide templates for 2-3 commonly asked questions that have been asked at least twice. Ensure that the selected questions are complete sentences. Let us think step by step.

Example:
Questions Database:
["i want to move to malaysia. how?", "how can i move to malaysia?", "where is hamilton playing?", "hamilton is playing in which city?", "what is for lunch", "what is for lunch", "what is for lunch", "where is cappadocia", "who is elon musk"]

JSON output:
[{{"question":"what is for lunch"}}, {{"question":"how can i move to malaysia?"}}, {{"question":"where is hamilton playing?"}}]

Questions Database:
{list(rep_words)}

JSON output:
"""

    # Step 1: the parser turns the model's text into Python data.
    common_questions = parser.invoke(llm(question_prompt))

    template_prompt = f"""
Instructions: 
----------------
When the question is specific, such as requesting properties within a certain price range, templatize the question. 

For example:

Input:
[{{"question": "what are examples of terrace in Taman Tun area?"}}, {{"question": "what are facilities at Emporis?"}}, {{"question": "are there any units with price less than 100k?"}}]

Output:

{{"output":[{{"question": "what are examples of terrace in Taman Tun area?", "template": "what are examples of <house type> in <location>?"}}, {{"question": "what are facilities at Emporis?", "template": "what are facilities at <property name>?"}}, {{"question": "are there any units with price less than 100k?", "template": "are there any units with price less than <amount>k?"}}]}}
----------------

Input: 

{common_questions}

Output:
"""
    # Step 2: the response is scanned with a regex rather than parsed as JSON.
    return _extract_templates(llm(template_prompt))


def customize_questions(questions, db):
    """Interactively fill in the ``<placeholder>`` slots of templated questions.

    For every question containing one or more ``<...>`` placeholders, the
    question is printed and the user is asked (via ``input``) for a value for
    each distinct placeholder; every occurrence of that placeholder is then
    replaced. Questions without placeholders are passed through unchanged.

    Args:
        questions: iterable of question strings, possibly containing
            ``<placeholder>`` slots.
        db: not used by this function; kept so existing call sites still work.

    Returns:
        list[str]: the questions, in input order, with placeholders filled in.
    """
    new_questions = []
    # Plain iteration: `tqdm` was never imported in this notebook, so the
    # previous `tqdm(questions)` raised NameError on the first call.
    for q in questions:
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # placeholder that appears twice is only asked for once.
        placeholders = list(dict.fromkeys(re.findall("(<.*?>)", q)))
        if placeholders:
            # Show the template so "the question above" in the prompt has a referent.
            print(q)
        for placeholder in placeholders:
            fill_in_blank = input(
                "This is a dynamic FAQ. Fill in the "
                + placeholder
                + " for the question above: "
            )
            q = q.replace(placeholder, fill_in_blank)
        new_questions.append(q)
    return new_questions

def generate(generate_request):
    """Route a question to an action and return that action's answer.

    The FLAN-T5 model is first asked (via ``ROUTING_TEMPLATE``) for an integer
    action id. For actions 1 and 3 it is asked again (via
    ``PROPERTY_TEMPLATE``) for a property name; a detected name other than
    ``"NONE"`` wins, otherwise the caller's ``current_page`` is used. When a
    property is resolved the question is prefixed with ``"For <property>: "``.

    Args:
        generate_request: mapping with the keys ``"k_docs"``,
            ``"current_page"`` (a property name or ``None``) and ``"question"``.

    Returns:
        dict: ``{"generated_text": ..., "custom_response": ...}``, taken
        positionally from the values of the mapping the action returns
        (assumes the action yields exactly two values in that order).

    Raises:
        ValueError: if the routing model's reply is not an integer.
        KeyError: if a request key or the action id is missing.
    """
    params = {
        "models": MODELS,
        "db": db,
        "vdb": vdb,
        "property": None,
    }
    k_docs = generate_request["k_docs"]  # looked up, but not used further below
    wa_property = generate_request["current_page"]
    question = generate_request["question"]

    routing_prompt = ROUTING_TEMPLATE.replace("{{question}}", question)
    flan = params["models"][ModelTypes.FLAN_T5_XXL]
    action_output = int(flan(routing_prompt).strip())

    if action_output not in [1, 3]:
        # Other actions are not tied to a property.
        params.update({"question": question, "property": None, "detected_property_name": "NONE"})
    else:
        property_name = flan(PROPERTY_TEMPLATE.replace("{{question}}", question)).strip()
        # A name detected in the question overrides the page the user is on.
        resolved_property = property_name if property_name != "NONE" else wa_property
        if resolved_property is None:
            routed_question = question
        else:
            routed_question = f"For {resolved_property}: {question}"
        params.update(
            {
                "question": routed_question,
                "property": resolved_property,
                "detected_property_name": property_name,
            }
        )

    print(f"Question: {question} - [Action: {action_output}]")
    generated_text, custom_response = ACTIONS[action_output](params).values()
    return {"generated_text": generated_text, "custom_response": custom_response}
    

def generate_questions_and_answers(log, property):
    """Build FAQ questions for one property and answer each of them.

    Question templates are produced from ``log`` by ``generate_questions``;
    every ``<property name>`` placeholder is replaced with ``property`` and
    each resulting question is answered through ``generate``.

    Args:
        log: the logged user questions handed to ``generate_questions``.
        property: property name substituted for ``<property name>``.

    Returns:
        tuple[list[str], list[str]]: the concrete questions and, in the same
        order, the ``"generated_text"`` of each answer.
    """
    templates = generate_questions(log)
    cust = list(map(lambda template: template.replace("<property name>", property), templates))

    answers = []
    for c in cust:
        # No current page is supplied; the question text carries the property.
        request = {"k_docs": 3, "current_page": None, "question": c}
        answers.append(generate(request)["generated_text"])
    return cust, answers


The three cells below demonstrate the FAQ generation capability, one property per cell.

In [5]:
# Logged user questions: the "property" column of the "property" sheet.
faq_log_sheet = pd.read_excel(
    "../data/faq/unidentified-export may-8th november_edit.xlsx",
    sheet_name="property",
)
raw_log_1 = faq_log_sheet["property"]
# The (questions, answers) tuple is the cell's displayed result.
generate_questions_and_answers(raw_log_1, "residensi zig")

Question:  how much is residensi zig? - [Action: 3]
Property: residensi-zig
Question:  may i know the price for each unit at residensi zig? - [Action: 1]
Generated SQL Query: SELECT Price FROM zig;
Question:  can i get more detail and sales person incharge for residensi zig? - [Action: 3]
Property: residensi-zig


([' how much is residensi zig?',
  ' may i know the price for each unit at residensi zig?',
  ' can i get more detail and sales person incharge for residensi zig?'],
 ['The price range for Residensi ZIG is from RM 340,888 to RM 874,888, depending on the unit type and built-up size.',
  'The price for each unit at residensi Residensi Zig is 781888.0, 773888.0, 781888.0, 611888.0, 605888.0.\nPlease note that there are more than 5 relevant results. Please refer to https://www.uemsunrise.com/property/region/greater-kuala-lumpur/project/residensi-zig for more information',
  'Sure, you can get more details and contact information for the sales person in charge of Residensi ZIG by visiting the showroom located at Kepong Metropolitan Lake-garden, 52100 Kuala Lumpur, Wilayah Persekutuan Kuala Lumpur, Malaysia, or by calling the phone number 012-8181 259 during operating hours.'])

In [6]:
# Logged user questions: the "property" column of the "property" sheet.
faq_log_sheet = pd.read_excel(
    "../data/faq/unidentified-export may-8th november_edit.xlsx",
    sheet_name="property",
)
raw_log_1 = faq_log_sheet["property"]
# The (questions, answers) tuple is the cell's displayed result.
generate_questions_and_answers(raw_log_1, "the connaught one")

Question:  how much is the connaught one? - [Action: 3]
Property: the-connaught-one
Question:  may i know the price for each unit at the connaught one? - [Action: 1]
Generated SQL Query: SELECT Price FROM connaught;
Question:  can i get more detail and sales person incharge for the connaught one? - [Action: 3]
Property: the-connaught-one


([' how much is the connaught one?',
  ' may i know the price for each unit at the connaught one?',
  ' can i get more detail and sales person incharge for the connaught one?'],
 ['The price of The Connaught One starts from RM290,000 onwards, with units ranging from 42 sqm to 118 sqm (452 sqft to 1,270 sqft) and up to 4 bedrooms.',
  'The price for each unit at the The Connaught One one is 343888, 335888, 338888, 335888.\nPlease note that there are more than 5 relevant results. Please refer to https://www.uemsunrise.com/property/region/greater-kuala-lumpur/project/the-connaught-one for more information',
  'Sure, you can get more details and contact information for the sales person in charge of The Connaught One by visiting the UEM Sunrise website or by contacting their customer care line at 1800-88-8008 or international line at 603-2711-8008.'])

In [7]:
# Logged user questions: the "property" column of the "property" sheet.
faq_log_sheet = pd.read_excel(
    "../data/faq/unidentified-export may-8th november_edit.xlsx",
    sheet_name="property",
)
raw_log_1 = faq_log_sheet["property"]
# The (questions, answers) tuple is the cell's displayed result.
generate_questions_and_answers(raw_log_1, "the minh")

Question:  how much is the minh? - [Action: 3]
Property: the-minh
Question:  may i know the price for each unit at the minh? - [Action: 1]
Generated SQL Query: SELECT Price FROM minh;
Question:  can i get more detail and sales person incharge for the minh? - [Action: 3]
Property: the-minh


([' how much is the minh?',
  ' may i know the price for each unit at the minh?',
  ' can i get more detail and sales person incharge for the minh?'],
 ["The Minh's selling price ranges from RM1,399,800 to RM3,178,370.",
  'The price for each unit at the The MINH is 2452800, 2670800, 1854800, 1447800, 1828800.\nPlease note that there are more than 5 relevant results. Please refer to https://www.uemsunrise.com/property/region/greater-kuala-lumpur/project/the-minh for more information',
  'Sure, you can get more details and contact information for the sales person in charge of The MINH by visiting the showroom or contacting the developer, Laser Tower Sdn Bhd, directly.'])