In [6]:
# Load the raw simulation JSON files and build the transcript DataFrame.
from symtrain.data import create_transcript_dataframe, load_json_files

# Directory holding the raw JSON exports, relative to the notebook.
raw_data_dir = "../data/raw"

json_list = load_json_files(raw_data_dir)
df = create_transcript_dataframe(json_list)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,name,transcript
0,Copy OSA - Parity 2.0 1,TRAINEE: Thank you for calling Northwestern Mu...
1,Return Request (Bed Frame/Toppers/Misc),"TRAINEE: Thank you for calling Zinus, this is ..."
2,Startek_Compassion- Gift Donation. BEG,"SYM: In this simulation, you will assist the c..."
3,Sales U - Closing Conclusion,SYM: It’s closing time for this module.\nSYM: ...
4,Startek_Telco_Visual_BEG_Demo,"TRAINEE: Thank you for calling Xfinity Mobile,..."
5,Arise_Carnival DS - Sales Inquiries - Booking ...,"TRAINEE: Carnival Fun Ships, this is [Agent Na..."
6,H21 Health insurance coverage question – Step ...,"SYM: In this simulation, you will learn how to..."
7,TRAINEE STARTS HERE: How did you do? View your...,SYM: Congratulations on running your first sym...
8,H21 Health insurance coverage question – Step ...,SYM: You have already opened the call and veri...
9,UCSL eApp – State Specifics,"SYM: In this sym, we will review how the UCSL ..."


In [7]:
# Embed every transcript; the helper returns the frame, which is rebound to `df`.
from symtrain.embeddings import embed_dataframe_column

# Text column whose contents are embedded.
text_column = "transcript"
df = embed_dataframe_column(df, text_column)

Generated 38 embeddings → 'transcript_emb' (dim=768)


In [8]:
# Similarity search
from symtrain.search import find_similar

# Example: find simulations related to payment issues.
# The query is assembled from adjacent string literals so that wrapping it across
# source lines does not insert a newline into the text being searched. A
# triple-quoted literal split over two lines would keep the line break
# mid-sentence and make this query differ from the single-line version of the
# same text used as "test_1" in the few-shot pipeline cell.
payment_query = (
    "Hi, I ordered a shirt last week and paid with my American Express card. "
    "I need to update the payment method because there is an issue with that card. "
    "Can you help me?"
)

# Search against the embedding column; the bare call is the cell's last
# expression so its result is rendered.
find_similar(df, "transcript_emb", payment_query)

Unnamed: 0,name,similarity
36,BN_CANCEL_RELEASED_ORDER_ADV,0.895486
37,C2 Car insurance claim - FNOL – Step 1 (Opening),0.879906
23,Startek_Walmart - Demo - 2025,0.877854
16,BN_ORDER_LOST_REPLACE_ADV,0.877653
6,H21 Health insurance coverage question – Step ...,0.877434


In [9]:
# Approach 1: cluster the transcript embeddings and list the members of each cluster.
from symtrain.clustering import cluster_embeddings, print_cluster_summary

# Column holding the embeddings, and the column the summary reads cluster labels from.
embedding_col = "transcript_emb"
cluster_col = "transcript_emb_cluster"

df = cluster_embeddings(df, embedding_col, n_clusters=5)
print_cluster_summary(df, "name", cluster_col)


=== Cluster 0 ===
  - Sales U - Closing Conclusion
  - TRAINEE STARTS HERE: How did you do? View your Playback!  
  - UCSL eApp – State Specifics

=== Cluster 1 ===
  - Startek_Compassion- Gift Donation. BEG
  - Startek_Telco_Visual_BEG_Demo 
  - Arise_Carnival DS - Sales Inquiries - Booking created ~ BEG

=== Cluster 2 ===
  - H21 Health insurance coverage question – Step 1 (Opening)
  - H21 Health insurance coverage question – Step 2 (Gather Information)
  - H21 Health insurance coverage question – (Real human voice)

=== Cluster 3 ===
  - Zinus Knowledge Check 6 Version 2

=== Cluster 4 ===
  - Copy OSA - Parity 2.0 1
  - Return Request (Bed Frame/Toppers/Misc)
  - OSA - Parity 2.0


In [6]:
# Approach 2: have the LLM assign a category to each transcript.
from symtrain.llm import categorize_with_ollama

# Call the categorizer once per transcript and store the result in a new column.
transcripts = df["transcript"]
df["llm_category"] = transcripts.apply(categorize_with_ollama)

# Show only the simulation name next to its assigned category.
summary_columns = ["name", "llm_category"]
df[summary_columns]

Unnamed: 0,name,llm_category
0,Copy OSA - Parity 2.0 1,Insurance Claims
1,Return Request (Bed Frame/Toppers/Misc),Returns
2,Startek_Compassion- Gift Donation. BEG,Account Issues
3,Sales U - Closing Conclusion,Training
4,Startek_Telco_Visual_BEG_Demo,Account Issues
5,Arise_Carnival DS - Sales Inquiries - Booking ...,Booking
6,H21 Health insurance coverage question – Step ...,Insurance Claims
7,TRAINEE STARTS HERE: How did you do? View your...,Technical Support
8,H21 Health insurance coverage question – Step ...,Insurance Claims
9,UCSL eApp – State Specifics,Account Issues


In [None]:
# Task 5: Few-shot learning pipeline
import json
from symtrain.llm import generate_steps_with_ollama

# Full catalogue of sample customer queries, keyed by test ID. Kept as live data
# (rather than a commented-out block) so individual cases can be enabled by ID.
all_test_inputs = {
    "test_1": "Hi, I ordered a shirt last week and paid with my American Express card. I need to update the payment method because there is an issue with that card. Can you help me?",
    "test_2": "Hi, I need to update the payment method for one of my recent orders. Can you help me with that?",
    "test_3": "Hi, I am Sam. I was in a car accident this morning and need to file an insurance claim. Can you help me?",
    "test_4": "Hi, can you help me file a claim?",
    "test_5": "Hi, I recently ordered a book online. Can you give me an update on the order status?",
    "test_6": "Hi, I have been waiting for two weeks for the book I ordered. What is going on with it? Can you give me an update?",
}

# IDs of the queries that are actually run. Only the IDs listed here are
# processed; use list(all_test_inputs) to run the whole catalogue.
active_test_ids = ["test_1"]
test_inputs = {tid: all_test_inputs[tid] for tid in active_test_ids}

# Run the pipeline on each selected test input and pretty-print its result as JSON.
results = {}
for test_id, query in test_inputs.items():
    print(f"\nProcessing {test_id}...")
    print(f"test_input: {query}")
    results[test_id] = generate_steps_with_ollama(query, df)
    print(json.dumps(results[test_id], indent=2))


Processing test_1...
test_input: Hi, I ordered a shirt last week and paid with my American Express card. I need to update the payment method because there is an issue with that card. Can you help me?
{
  "category": "Payment Issues",
  "reason": "The customer wants to change the payment method for an existing order due to a problem with the original card.",
  "steps": [
    "Greet the customer and confirm their name and contact information.",
    "Ask for the order number and verify the order details (item, date, amount, original payment method).",
    "Check the payment status of the order in the system.",
    "Explain whether the payment can be modified directly (e.g., if the order is still pending) or if the order must be canceled/refunded and a new order placed.",
    "If the order can be updated, process the new payment method (American Express replacement or another card) and confirm the transaction.",
    "If a cancellation/refund is required, initiate the cancellation, process