In [3]:
import json
import uuid
from sqlalchemy import create_engine
from utils import reset_db, get_session, model_to_dict
from data.models import udahub

# Udahub Application

## Core Database

**Init DB**

In [4]:
udahub_db = "data/core/udahub.db"
reset_db(udahub_db)
engine = create_engine(f"sqlite:///{udahub_db}", echo=False)
udahub.Base.metadata.create_all(bind=engine)

✅ Removed existing data/core/udahub.db
2025-12-06 10:09:50,309 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-12-06 10:09:50,309 INFO sqlalchemy.engine.Engine COMMIT
✅ Recreated data/core/udahub.db with fresh schema


**Account**

In [5]:
account_id = "cultpass"
account_name = "CultPass Card"

with get_session(engine) as session:
    account = udahub.Account(
        account_id=account_id,
        account_name=account_name,
    )
    session.add(account)

---

## Integrations

**Use LLM to Generate Sample Data**

In [38]:
import os
import pandas as pd
import random
from dotenv import load_dotenv
from typing import List
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any, Literal, TypedDict, Annotated
from langgraph.prebuilt import create_react_agent
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import SystemMessage
from langchain_chroma import Chroma
from langchain_core.documents import Document

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
llm_base_url = "https://openai.vocareum.com/v1"

llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.0,
    base_url="https://openai.vocareum.com/v1",
    api_key=OPENAI_API_KEY,
)

embeddings_fn = OpenAIEmbeddings(
    model="text-embedding-3-large",
    base_url="https://openai.vocareum.com/v1",
    api_key=OPENAI_API_KEY,
)

**Data Generation Parameters**

In [None]:
num_articles = 20
num_account = 10
num_user = 25
num_ticket = 50
num_msg = num_ticket
num_expected_tag_msg = round(num_msg * 0.8)
num_unexpected_tag_msg = num_msg - num_expected_tag_msg
chromadb_directory = "vectorstore"
collection_name = "knowledge_vecotr_store"

### Sampling Schema

For LLM to produce the data samples

In [8]:
class ArticlesSchema(BaseModel):
    title: Annotated[str, Field(description="Title of the article.")]
    content: Annotated[str, Field(description="Content of the article.")]
    tags: Annotated[str, Field(description="Tags of the article")]

class ArticlesListSchema(BaseModel):
    samples: Annotated[List[ArticlesSchema], Field(description="List of CultpassArticles")]

class AccountSamples(BaseModel):
    """Structured response for account table"""
    company_name: List[str] = Field(description="List of first name and last name")

class UsersSchema(BaseModel):
    name: Annotated[str, Field(description="User's name with first and last name.")]
    email: Annotated[str, Field(description="User's email address.")]
    is_blocked: Annotated[bool, Field(description="User's blocking status.")]
    account_id: Annotated[str, Field(description="Company account ID")]

class UsersListSchema(BaseModel):
    samples: Annotated[List[UsersSchema], Field(description="List of users")]

class SingleTicket(BaseModel):
    """Structured response for a ticket"""
    status: Annotated[Literal['open','pending','closed'], Field(description="Status of the ticket")]
    content: Annotated[str, Field(description="The ticket body message content.")]
    owner_id: Annotated[str, Field(description="User ID")]
    owner_name: Annotated[str, Field(description="User's name")]
    channel: Annotated[Literal['email','chat','web','sms','social','phone'], Field(description="Ticket priority")]
    tags: Annotated[str, Field(description="List of tags used for creating this message")]
    account_id: Annotated[str, Field(description="Company account ID")]
    urgency_score: Annotated[float, Field(description="Urgency of the ticket")]

class TicketSamples(BaseModel):
    """Structured response of ticket list"""
    samples: List[SingleTicket]

## Loading Default Data

* Knowledge Base
* Users Data

In [9]:
# Load Sample Articles
cultpass_articles = []
with open('data/external/cultpass_articles.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        cultpass_articles.append(json.loads(line))

# Extract Tags from Samples
sample_tags = [ article['tags'].split(', ') for article in cultpass_articles ]
sample_tags = set([ item for subtags in sample_tags for item in subtags ])
sample_tags_str = ", ".join(sample_tags)

# Define Tags with Expected Knowledge and Tags without Expected Knowledge
# (This is for sampling ticket questions "with vs without knowledge to answer" using LLM)
expected_tags = sample_tags
unexpected_tags = ['weather', 'stock price', 'dietary', 'health advice']

In [10]:
# Load Users Data
cultpass_users = []
with open('data/external/cultpass_users.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        cultpass_users.append(json.loads(line))

#### Generate Knowledge Articles

In [11]:
# Create Additional Knowledge Articles
num_cultpass_articles = len(cultpass_articles)
num_new_articles = num_articles - num_cultpass_articles
sample_articles_template = """
    You are an helpful expert creating sample of python sample article data based on given examples.  Do not provide any additional description or explanation.
    There is a mock-up project, it has the following description:

    You’ve joined a fast-growing AI startup building the next frontier in customer support automation. Your team is responsible for building UDA-Hub, 
    a Universal Decision Agent designed to plug into existing customer support systems (Zendesk, Intercom, Freshdesk, internal CRMs) and intelligently 
    resolve tickets. You need to generate sample knowledge articles in area including: {sample_tags}.  
    
    The knowledge areas given above will be used as tags for the generated articles.
    
    The first UDA-Hub customer Cultpass has the following sample articles to be used as knowledge to be used by the system:
    {samples}

    Provide {num_articles} more example like above and double check if all articles uses tags from {sample_tags}, if not try again.
    """

sample_articles_prompt = PromptTemplate(
    template = sample_articles_template,
    input_variables = ["samples","num_articles","sample_tags"],
    ).invoke({"samples": cultpass_articles,
              "num_articles": num_new_articles,
              "sample_tags": sample_tags}).to_messages()

new_cultpass_articles = llm.with_structured_output(ArticlesListSchema).invoke(sample_articles_prompt).model_dump()['samples']
cultpass_articles_ls = cultpass_articles + new_cultpass_articles

# Extracting Sample Tags for Each Sample
# new_sample_tags = [ article['tags'].split(', ') for article in cultpass_articles_ls ]
# new_sample_tags = set([ item for subtags in new_sample_tags for item in subtags ])
# new_sample_tags_str = ", ".join(new_sample_tags)

if len(cultpass_articles_ls) < 14:
    raise AssertionError("You should load the articles with at least 14 records")

# cultpass_articles_ls

Creating Vector Embedding for RAG

In [None]:
# Create vector store for knowledge articles
vector_store = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings_fn,
    persist_directory=chromadb_directory,
)

cultpass_articles_docs = [ Document(page_content=article['title']+": "+article["content"],
                                      metadata={"tags": article['tags']},
                                      id=i) for i, article in enumerate(cultpass_articles_ls) ]
_ = vector_store.add_documents(documents=cultpass_articles_docs)

In [41]:
question = "Hi, I have a question regarding the billing for the events I attended last month. There seems to be a discrepancy in the charges, and I would like to understand the breakdown of the costs. Could you please provide a detailed billing statement? Thank you."
vector_store.similarity_search(question)

[Document(id='19', metadata={'tags': 'billing, escalation, support'}, page_content='How to Handle Billing Discrepancies: For billing issues:\n\n- Review your billing history in the CultPass app\n- Contact support if discrepancies are found\n- Provide transaction details for faster resolution\n\n**Suggested phrasing:**\n"If you notice any billing discrepancies, review your history and contact support with transaction details."'),
 Document(id='5', metadata={'tags': 'pricing, subscription, benefits'}, page_content='Understanding CultPass Pricing: CultPass offers flexible pricing options:\n\n- Monthly subscription: Access to 4 experiences per month\n- Annual subscription: Discounted rate for a year-long access\n- Additional fees may apply for premium events\n\n**Suggested phrasing:**\n"Explore our flexible pricing plans, including monthly and annual subscriptions. Note that some premium events may incur additional fees."'),
 Document(id='14', metadata={'tags': 'events, booking, cancelatio

#### Generate Users and Tickets

In [12]:
# Create Company List
account_prompt_template = """You are a company naming expert. Please come up with {num_account} company names."""

account_prompt = PromptTemplate(
    template = account_prompt_template,
    input_variables = ["num_account"],
    ).invoke({"num_account": num_account}).to_messages()

account = llm.with_structured_output(AccountSamples).invoke(account_prompt).model_dump()['company_name']
account_table = pd.DataFrame({"company_name": account})
# account_table['id'] = account_table.index
account_table['account_id'] = [ str(uuid.uuid4()) for i in range(num_account) ]
account_table_ls = account_table.to_dict(orient='records')
# account_table

In [13]:
# Create Additional Users
num_cultpass_users = len(cultpass_users)
num_new_users = num_user - num_cultpass_users
user_prompt_template = """
    You are a people naming expert. Please come up with {num_user} names with both first and last name.
    Randomly choose a company for each user from the account list and based on the name of the company,
    create an email address for this person.  Some companies should have more users than others.
    All users from the same company must have the same domain name on their email address.

    The `is_blocked` status should be randomly choosen as 'True' or 'False'

    The account ID is the ID in the provided table, the ID and account name must match.

    List of account (companies): {account_table}"""

user_prompt = PromptTemplate(
    template = user_prompt_template,
    input_variables = ["num_user","account_table"],
    ).invoke({"num_user": num_new_users, "account_table": account_table}).to_messages()

user_ls = llm.with_structured_output(UsersListSchema).invoke(user_prompt).model_dump()['samples']
user_table = pd.DataFrame(user_ls)
user_table['id'] = [ str(uuid.uuid4().hex[:6]) for i in range(num_new_users) ]
user_table = user_table[['id', 'name', 'email', 'is_blocked','account_id']]
cultpass_users_table = pd.DataFrame(cultpass_users)
cultpass_users_table['account_id'] = [ str(uuid.uuid4()) for i in range(num_cultpass_users) ]
user_table = pd.concat([user_table, cultpass_users_table])

In [14]:
user_table

Unnamed: 0,id,name,email,is_blocked,account_id
0,750f4e,Alice Johnson,alice.johnson@innovatechsolutions.com,False,edf1080c-6261-4e20-aa50-0bb6a6c4c312
1,a80643,Michael Smith,michael.smith@ecospherenterprises.com,True,cae1b4b6-71f9-4ada-994b-636612f7866e
2,564f88,Emma Brown,emma.brown@quantumleapdynamics.com,False,afaf9155-f945-4262-bb58-a4f734bd873d
3,5c86c7,Liam Davis,liam.davis@greenwaveinnovations.com,True,a8f251f5-2f8b-4686-96ba-4681fe9c7be9
4,699ac4,Olivia Wilson,olivia.wilson@skylineventures.com,False,18499c3b-3e32-4700-b3c1-21e4563d0737
5,222431,Noah Martinez,noah.martinez@nexgensynergy.com,True,7ac6a0e2-6374-45af-b670-967c95aa7a72
6,4415e7,Sophia Anderson,sophia.anderson@bluehorizonindustries.com,False,53ec7e25-22be-4bea-99be-a832804e5556
7,9f5c6b,James Taylor,james.taylor@pinnaclepathways.com,True,fdd5543e-ff28-41a5-9632-ae3e53cc1f9a
8,fd9572,Isabella Thomas,isabella.thomas@fusioncoretechnologies.com,False,58086760-65e8-4ea7-81f1-67409def359e
9,b8b5ab,Lucas White,lucas.white@brightfutureholdings.com,True,f6cbc957-f2b8-47eb-8607-b2393a7a1d27


In [15]:
# Random Combination of Tags
expected_list_min_expected_tag = 1
expected_list_max_expected_tag = 4
unexpected_list_min_expected_tag = 0
unexpected_list_max_expected_tag = 2
unexpected_list_min_unexpected_tag = 1
unexpected_list_max_unexpected_tag = 2

expected_tag_combination = [ random.sample(list(expected_tags), random.randint(expected_list_min_expected_tag,expected_list_max_expected_tag)) for i in range(num_expected_tag_msg) ]
unexpected_tag_combination = [ random.sample(list(expected_tags), random.randint(unexpected_list_min_expected_tag,unexpected_list_max_expected_tag)) +
                               random.sample(list(unexpected_tags), random.randint(unexpected_list_min_unexpected_tag,unexpected_list_max_unexpected_tag)) for i in range(num_unexpected_tag_msg) ]
tag_combination = expected_tag_combination + unexpected_tag_combination

In [16]:
# Create Tickets
ticket_prompt_template = """
    You are a IT ticket generation emulator focusing on generating the actual message of users.
    
    Step 1: Generate {num_ticket} tickets, 
        (a) Each `content` should be 20 to 200 words
        (b) Make occasional minor typos in the `content`
        (c) Each ticket `content` should be written based on the topic in the followin table:
        {tag_combination}  

    Step 2: For the `owner_id` (use `id` column in the table), `owner_name` (use `name` column in the table), account_id (`account_id`)
            and allow one user to have multiple tickets:
        {user_table}

    Step 3: Randomly selected a `channel` from ['email','chat','web','sms','social','phone'] for each ticket.
    """

ticket_prompt = PromptTemplate(
    template = ticket_prompt_template,
    input_variables = ["num_ticket","num_expected_tag_msg","num_unexpected_tag_msg","expected_tags","unexpected_tags","ticket_table"],
    ).invoke({"num_ticket": num_ticket,
              "tag_combination": tag_combination, 
              "unexpected_tags": unexpected_tags, 
              "user_table": user_table,
            }).to_messages()

ticket_ls = llm.with_structured_output(TicketSamples).invoke(ticket_prompt).model_dump()['samples']
ticket_table = pd.DataFrame(ticket_ls)
# ticket_table['id'] = msg_table.index
ticket_table['role'] = "user"
ticket_table['status'] = "open"
ticket_table['urgency_score'] = [random.random() for _ in range(num_ticket)]
# ticket_table

In [17]:
ticket_table

Unnamed: 0,status,content,owner_id,owner_name,channel,tags,account_id,urgency_score,role
0,open,"Hello, I was reviewing the pricing plans on yo...",750f4e,Alice Johnson,email,"pricing, pause",edf1080c-6261-4e20-aa50-0bb6a6c4c312,0.918419,user
1,open,"Hi, I am trying to book a session for the upco...",a80643,Michael Smith,web,"attendance, booking, password",cae1b4b6-71f9-4ada-994b-636612f7866e,0.698036,user
2,open,"Hello, I am interested in attending the tech e...",564f88,Emma Brown,social,events,afaf9155-f945-4262-bb58-a4f734bd873d,0.264493,user
3,open,"Hi, I am having trouble logging into my accoun...",5c86c7,Liam Davis,chat,"benefits, attendance, login",a8f251f5-2f8b-4686-96ba-4681fe9c7be9,0.307102,user
4,open,"Hello, I am planning to attend the upcoming ev...",699ac4,Olivia Wilson,phone,events,18499c3b-3e32-4700-b3c1-21e4563d0737,0.28345,user
5,open,"Hi, I have a concern regarding the pricing of ...",222431,Noah Martinez,email,"escalation, pricing",7ac6a0e2-6374-45af-b670-967c95aa7a72,0.083928,user
6,open,"Hello, I have an urgent issue that needs escal...",4415e7,Sophia Anderson,sms,escalation,53ec7e25-22be-4bea-99be-a832804e5556,0.437255,user
7,open,"Hi, I have a question regarding the billing fo...",9f5c6b,James Taylor,web,"billing, events",fdd5543e-ff28-41a5-9632-ae3e53cc1f9a,0.097362,user
8,open,"Hello, I am interested in learning more about ...",fd9572,Isabella Thomas,social,"benefits, billing",58086760-65e8-4ea7-81f1-67409def359e,0.376153,user
9,open,"Hi, I have a question about the pricing of you...",b8b5ab,Lucas White,chat,pricing,f6cbc957-f2b8-47eb-8607-b2393a7a1d27,0.86985,user


In [18]:
len(ticket_table)

50

## Load Samples into Database

In [19]:
with get_session(engine) as session:
    kb = []
    for article in cultpass_articles:
        knowledge = udahub.Knowledge(
            article_id=str(uuid.uuid4()),
            account_id=account_id,
            title=article["title"],
            content=article["content"],
            tags=article["tags"]
        )
        kb.append(knowledge)
    session.add_all(kb) 
    

**Ticket**

In [20]:
def add_ticket(ticket_info):
    """
    Adding a single ticket to the database
    """
    with get_session(engine) as session:
        user = session.query(udahub.User).filter_by(
            account_id=account_id,
            external_user_id=ticket_info["owner_id"],
        ).first()

        if not user:
            user = udahub.User(
                user_id=str(uuid.uuid4()),
                account_id=account_id,
                external_user_id=ticket_info["owner_id"],
                user_name=ticket_info["owner_name"],
            )
        
        ticket = udahub.Ticket(
            ticket_id=str(uuid.uuid4()),
            account_id=account_id,
            user_id=user.user_id,
            channel=ticket_info["channel"],
        )
        metadata = udahub.TicketMetadata(
            ticket_id=ticket.ticket_id,
            status=ticket_info["status"],
            main_issue_type=None,
            tags=ticket_info["tags"],
            urgency_score=ticket_info["urgency_score"]
        )

        first_message = udahub.TicketMessage(
            message_id=str(uuid.uuid4()),
            ticket_id=ticket.ticket_id,
            role=ticket_info["role"],
            content=ticket_info["content"],
        )

        session.add_all([user, ticket, metadata, first_message])


In [21]:
for index, row in ticket_table.iterrows():
    add_ticket(row)

# Tests

In [22]:
with get_session(engine) as session:
    account = session.query(udahub.Account).filter_by(
        account_id=account_id
    ).first()
    print(account)

<Account(account_id='cultpass', account_name='CultPass Card')>


In [23]:
with get_session(engine) as session:
    account = session.query(udahub.Account).filter_by(
        account_id=account_id
    ).first()
    for article in account.knowledge_articles:
        print(article)

<Knowledge(article_id='405b7e33-c8f9-458b-bf4f-e3fa4b9294a7', title='How to Reserve a Spot for an Event')>
<Knowledge(article_id='6d0d2679-f38f-4cc1-ac6b-7fc5fb3cdd4d', title='What's Included in a CultPass Subscription')>
<Knowledge(article_id='9c38bf5a-c933-4a4c-9bd9-cf8e80d9723e', title='How to Cancel or Pause a Subscription')>
<Knowledge(article_id='0f2d93c0-d106-41a2-8080-3e38a7bd7745', title='How to Handle Login Issues?')>


In [24]:
with get_session(engine) as session:
    users = session.query(udahub.User).all()
    for user in users:
        print(user)

<User(user_id='9d4306e3-2290-4ca4-85bd-8ba4f29ec027', user_name='Alice Johnson', external_user_id='750f4e')>
<User(user_id='5016c864-ae5a-4429-a197-943475f4f769', user_name='Michael Smith', external_user_id='a80643')>
<User(user_id='48c6b24e-3f57-4a48-8042-56d67f4ba42b', user_name='Emma Brown', external_user_id='564f88')>
<User(user_id='e456ce99-192a-445c-ab11-e1ce2919dcc4', user_name='Liam Davis', external_user_id='5c86c7')>
<User(user_id='a08e88bb-af0a-4157-8f83-40a7b41cfdd3', user_name='Olivia Wilson', external_user_id='699ac4')>
<User(user_id='6de645f2-73d5-4bb6-b5d8-cac327f16957', user_name='Noah Martinez', external_user_id='222431')>
<User(user_id='612e7573-3902-4ad4-aa3b-8e82c541dc3e', user_name='Sophia Anderson', external_user_id='4415e7')>
<User(user_id='fa32b358-3f12-4746-9f90-acda68ab5e5d', user_name='James Taylor', external_user_id='9f5c6b')>
<User(user_id='3fac9204-fe94-4c16-ad9a-cafec93d25b2', user_name='Isabella Thomas', external_user_id='fd9572')>
<User(user_id='090890d

In [25]:
with get_session(engine) as session:
    user = session.query(udahub.User).filter_by(
        account_id=account_id,
        external_user_id=ticket_table.iloc[2].to_dict()["owner_id"],
    ).first()
    
    ticket:udahub.Ticket = user.tickets[0]
    for message in ticket.messages:
        print(message)

<TicketMessage(message_id='9f9928af-ae43-4e42-b3e2-76a5c681df94', role='user', content='Hello, I am interested in atte...')>
