In [1]:
import ast

import pandas as pd

# Path of the ontology table that drives slot sampling.
raw_path = "./data/schema/ontology.csv"

colors = [
    "#ff6961",
    "#ffb480",
    "#f8f38d",
    "#42d6a4",
    "#08cad1",
    "#59adf6",
    "#5BC236",
    "#c780e8",
    "#f66d9b",
    "#9561e2",
]

# Source column name -> column name used throughout this notebook.
# Only the columns listed here are kept.
col_map = {
    "domain": "domain",
    "slot_name": "slot",
    "possible_slot_values": "value",
    "categorical_values": "categorical_values",
    "extractive_values": "extractive_values",
    "abstractive_values": "abstractive_values",
    "time_values": "time_values",
    "Group ID": "group_id",
    "Must have": "must_have",
    "Global": "global",
    "Important slot(have chance to be improved)": "important_slot",
    "Parking Lot": "Parking Lot",
    "Driveway": "Driveway",
    "Highway": "Highway",
    "Roadway": "Roadway",
    "Intersection": "Intersection",
}


def _normalize_slot_values(serialized):
    """Parse a stringified list literal; title-case each entry unless it is all upper-case."""
    return [
        entry if entry.isupper() else entry.title()
        for entry in ast.literal_eval(serialized)
    ]


df = pd.read_csv(raw_path)[col_map.keys()].rename(columns=col_map)
df["value"] = df["value"].apply(_normalize_slot_values)
# Keep the first row of every (domain, slot) pair, then replace missing cells with 0.
df = df.drop_duplicates(subset=["domain", "slot"]).fillna(0)

# The "Accident Location" row is pulled out of the table: its value list
# (minus "Other") becomes the list of locations iterated over later on.
cond = (df["domain"] == "AccidentDetails") & (df["slot"] == "Accident Location")
accident_locations = df.loc[cond, "value"].tolist()[0]
df = df.drop(index=df[cond].index)
accident_locations.remove("Other")

In [2]:
# Named places (apartment complexes, retail stores, restaurants). get_tuples()
# picks one at random and appends " Parking Lot" to build the accident
# location name when the accident location is "Parking Lot".
popular_physical_stores = [
    "The Olivian Apartment",
    "The Century Apartment",
    "The Lyric Apartment",
    "Stratus Apartment",
    "The Wave Apartment",
    "M Street Apartments",
    "The Nolo Apartment",
    "The Cairns Apartment",
    "The Whittaker Apartment",
    "Strada Apartments",
    "CVS Pharmacy",
    "Dollar Tree",
    "Ikea",
    "Trader Joe's",
    "Target",
    "7-Eleven",
    "Macy's",
    "Apple Store",
    "Sam's Club",
    "Ace Hardware",
    "Marshalls",
    "Staples",
    "Costco",
    "Kohl's",
    "GameStop",
    "Sephora",
    "Bed Bath & Beyond",
    "Petco",
    "Microsoft Store",
    "Safeway",
    "Best Buy",
    "PetSmart",
    "Whole Foods Market",
    "Office Depot",
    "Sears",
    "Lowe's",
    "OfficeMax",
    "Home Depot",
    "Kroger",
    "T.J. Maxx",
    "Walmart",
    "McDonald's",
    "Subway",
    "Burger King",
    "Wendy's",
    "KFC",
    "Taco Bell",
    "Pizza Hut",
    "Domino's Pizza",
    "Papa John's Pizza",
    "Dunkin' Donuts",
    "Starbucks",
    "Chipotle Mexican Grill",
    "Chick-fil-A",
    "Sonic Drive-In",
    "In-N-Out Burger",
]


In [3]:
# State -> police departments located in that state.
# get_tuples() samples a state, then a department from it, and derives the
# caller's city from the department name, so every department must be filed
# under the state it actually belongs to.
police_departments = {
    "New York": ["New York City Police Department"],
    "California": [
        "Los Angeles Police Department",
        "San Diego Police Department",
        "San Francisco Police Department",
        "Oakland Police Department",
    ],
    "Illinois": ["Chicago Police Department"],
    "Texas": [
        "Houston Police Department",
        "Dallas Police Department",
        "San Antonio Police Department",
        "Fort Worth Police Department",
        "Austin Police Department",
    ],
    # Wichita is in Kansas (it was previously listed under Texas).
    "Kansas": ["Wichita Police Department"],
    "Pennsylvania": ["Philadelphia Police Department"],
    "Arizona": ["Phoenix Police Department", "Tucson Police Department"],
    "District of Columbia": ["Washington D.C. Metropolitan Police Department"],
    "Michigan": ["Detroit Police Department"],
    "Massachusetts": ["Boston Police Department"],
    "Washington": ["Seattle Police Department"],
    "Colorado": ["Denver Police Department"],
    "Maryland": ["Baltimore Police Department"],
    "Oregon": ["Portland Police Department"],
    "Indiana": ["Indianapolis Metropolitan Police Department"],
    "Tennessee": [
        "Nashville Metropolitan Police Department",
        "Memphis Police Department",
    ],
    "Georgia": ["Atlanta Police Department"],
    "North Carolina": [
        "Charlotte-Mecklenburg Police Department",
        "Raleigh Police Department",
    ],
    # Virginia Beach is in Virginia (it was previously listed under North Carolina).
    "Virginia": ["Virginia Beach Police Department"],
    "Kentucky": ["Louisville Metro Police Department"],
    "Wisconsin": ["Milwaukee Police Department"],
    "Louisiana": ["New Orleans Police Department"],
    "Oklahoma": ["Oklahoma City Police Department"],
    "Missouri": ["St. Louis Metropolitan Police Department"],
    "Nevada": ["Las Vegas Metropolitan Police Department"],
    "Florida": ["Miami-Dade Police Department", "Tampa Police Department"],
}


In [4]:
# (name, description) pairs describing how the caller behaves during the call.
_PERSONALITY_PROFILES = (
    (
        "Upset",
        "feeling distressed or frustrated due to the accident and its consequences.",
    ),
    (
        "Aggressive",
        "feeling angry and confrontational about the accident, may place blame on others or use aggressive language.",
    ),
    (
        "Confused",
        "unsure about what happened during the accident or what to do next, may ask a lot of questions.",
    ),
    (
        "Cooperative",
        "willing to work with the insurance company and other parties involved in resolving the claim.",
    ),
    (
        "Evasive",
        "hesitant to provide information or answer questions about the accident, may be trying to conceal something.",
    ),
    (
        "Emotional",
        "experiencing strong emotions related to the accident, may be crying or struggling to maintain composure during the call.",
    ),
    (
        "Analytical",
        "focused on the details and logistics of the claim process, may ask for precise information and explanations.",
    ),
    (
        "Defensive",
        "feeling the need to justify their actions or place blame on others, may be unwilling to take responsibility for the accident.",
    ),
    (
        "Reassuring",
        "trying to maintain a positive and optimistic outlook during the call, may express gratitude for the assistance being provided.",
    ),
    (
        "Impatient",
        "feeling frustrated with the claim process or the speed at which it is progressing, may express irritation or urgency in their language.",
    ),
)

# One dict per personality, with the keys "name" and "description".
personalities = [
    {"name": profile_name, "description": profile_description}
    for profile_name, profile_description in _PERSONALITY_PROFILES
]


In [5]:
# Candidate values for the "Explain Coverages" slot; get_tuples() samples
# several of these and joins them with ", ".
coverage_types = [
    "Liability insurance",
    "Collision insurance",
    "Comprehensive insurance",
    "Personal injury protection (PIP)",
    "Uninsured/underinsured motorist coverage",
    "MedPay",
    "Rental car coverage",
    "Roadside assistance",
    "Gap insurance",
]


In [6]:
import random


def weighted_sample_no_overlap(items, weights, num_samples):
    """
    Randomly selects a specified number of elements from a list with weights,
    ensuring that no element is selected more than once.

    Each draw removes the chosen entry from the pool, so the function always
    terminates. (Re-drawing until an unseen item shows up can loop forever
    when the remaining unseen items all have zero weight.)

    Arguments:
    items -- the list of items to sample from (expected to be unique)
    weights -- a list of weights corresponding to each item
    num_samples -- the number of samples to select

    Returns:
    A list of randomly selected items with the specified number of samples,
    without duplicates. A non-positive num_samples yields an empty list.

    Raises:
    ValueError -- if items and weights differ in length, if num_samples is
    greater than the number of items, or if fewer than num_samples items
    have a positive weight.
    """
    pool_items = list(items)
    pool_weights = list(weights)

    if len(pool_items) != len(pool_weights):
        raise ValueError("Number of weights must match number of items.")
    if num_samples > len(pool_items):
        raise ValueError("Number of samples cannot be greater than number of items.")

    selected_items = []
    for _ in range(num_samples):
        if sum(pool_weights) <= 0:
            raise ValueError(
                "Not enough items with a positive weight to draw the requested number of samples."
            )
        # Draw a position in the shrinking pool, then remove it so it cannot
        # be drawn again.
        position = random.choices(range(len(pool_items)), weights=pool_weights)[0]
        selected_items.append(pool_items.pop(position))
        pool_weights.pop(position)

    return selected_items

In [7]:
import copy
import faker
import faker_vehicle as fv
import names

# Shared Faker instance used by get_address() and get_tuples().
# The vehicle provider is registered here; presumably it is what supplies
# fake.vehicle_year_make_model() used in get_tuples() -- TODO confirm.
fake = faker.Faker()
fake.add_provider(fv.VehicleProvider)


def get_phone_number():
    """Return a fake ``en_US`` phone number that does not contain the letter "x".

    Numbers containing "x" (presumably an extension marker) are rejected and
    regenerated. The Faker generator is built once and reused for every
    attempt instead of being re-created on each draw.
    """
    generator = faker.Faker(locale="en_US")
    number = generator.phone_number()
    while "x" in number:
        number = generator.phone_number()
    return number


def get_address(city, state):
    """Build a ``"<street>, <city>, <state> <zipcode>"`` string.

    The street address and the postcode come from the shared ``fake``
    generator; ``city`` and ``state`` are inserted as given.
    """
    street_line = fake.street_address()
    postal_code = fake.postcode()
    return "{}, {}, {} {}".format(street_line, city, state, postal_code)


def sample_slots(df, num_total_slots, location, additional_slots=None):
    """Pick the ontology rows (slots) to use for one example.

    The selection is made of:
    * every row with ``must_have == 1`` or whose ``slot`` is listed in
      ``additional_slots``;
    * a weighted random sample of the remaining ``must_have == 0`` rows, where
      a row's weight is ``important_slot * 2 + 1``;
    * every row sharing a non-zero ``group_id`` with a sampled row.

    Rows flagged with ``1`` in the ``location`` column are removed from the
    result.

    Arguments:
    df -- ontology frame with the columns ``domain``, ``slot``, ``must_have``,
          ``important_slot``, ``group_id`` and one column per location
    num_total_slots -- target number of forced plus sampled slots
    location -- name of the location column used for removal
    additional_slots -- optional iterable of slot names that are always kept

    Returns:
    A DataFrame of the selected rows, de-duplicated on (domain, slot). It
    carries an extra ``weight`` column that is NaN for rows that did not come
    from the weighted sample.
    """
    additional_slots = [] if additional_slots is None else list(additional_slots)

    removal_slots = df[df[location] == 1]

    # Both comparisons need their own parentheses: `|` binds tighter than
    # `==`, so `a == 1 | b` is evaluated as `a == (1 | b)` and ignores `b`.
    forced_mask = (df["must_have"] == 1) | df["slot"].isin(additional_slots)
    must_have_slots = df[forced_mask]
    num_normal_slots = max(num_total_slots - len(must_have_slots), 0)

    # Candidates for the weighted sample: optional rows not already forced in.
    slots = df[(df["must_have"] == 0) & ~forced_mask].copy()
    slots["weight"] = slots["important_slot"] * 2 + 1
    normal_idxs = weighted_sample_no_overlap(
        slots.index.tolist(), slots["weight"].tolist(), num_normal_slots
    )
    normal_slots = slots.loc[normal_idxs]

    # Pull in the whole group of every sampled row (group_id 0 means "no group").
    group_ids = set(normal_slots.loc[normal_slots["group_id"] != 0, "group_id"].tolist())
    group_slots = df[df["group_id"].isin(group_ids)]

    selected_slots = pd.concat(
        [must_have_slots, normal_slots, group_slots]
    ).drop_duplicates(subset=["domain", "slot"])

    return selected_slots.drop(
        selected_slots.index.intersection(removal_slots.index), axis=0
    )


# Candidate values for the "Color" slot when the ontology lists none.
_CAR_COLORS = [
    "red",
    "blue",
    "green",
    "black",
    "white",
    "silver",
    "gray",
    "yellow",
    "orange",
    "purple",
    "pink",
    "gold",
    "beige",
    "brown",
    "teal",
    "burgundy",
    "navy blue",
    "dark green",
    "light blue",
    "dark gray",
]

# Candidate values for "Origin of Trip" / "Destination of Trip".
_TRIP_PLACES = [
    "home",
    "work",
    "school",
    "grocery store",
    "restaurant",
    "gas station",
    "bank",
    "shopping mall",
    "park",
    "gym",
    "pharmacy",
    "church",
    "library",
    "movie theater",
    "bar",
    "club",
    "hotel",
    "airport",
    "train station",
    "bus station",
]

# Slots that get no concrete value (value is None, not mandatory).
# NOTE(review): "Purpose of Trip" used to have a second, dedicated branch
# (commuting / errands / recreation / special events / emergency situations)
# that could never run because this set matched first. The behavior that
# actually ran is kept here; confirm which one was intended.
_UNVALUED_SLOTS = {
    "Speed",
    "Speed Limit",
    "Date of Accident",
    "Time of Accident",
    "Date of Birth",
    "Email Address",
    "Policy Number",
    "Police Report Number",
    "Purpose of Trip",
}

# Slots whose value is several sampled candidates joined with ", ".
_MULTI_VALUE_SLOTS = {"Explain Coverages", "Car Motion"}


def _city_from_police_department(police_department):
    """Derive a city name from a police department name.

    Drops the " Police Department" suffix and a trailing " Metropolitan" or
    " Metro", so "Indianapolis Metropolitan Police Department" yields
    "Indianapolis" rather than "Indianapolis Metropolitan".
    """
    city = police_department.split(" Police Department")[0]
    for suffix in (" Metropolitan", " Metro"):
        if city.endswith(suffix):
            return city[: -len(suffix)]
    return city


def _fallback_values(slot, persona):
    """Return ``(potential_values, mandatory_value)`` for a slot with no ontology values.

    ``persona`` holds the values generated once per call of get_tuples (name,
    vehicle, city, state, police department).

    Raises ValueError for a slot that has no fallback defined.
    """
    if slot == "Make/Model":
        return [persona["make_model"]], True
    if slot == "Make Year":
        return [persona["year"]], True
    if slot == "Color":
        return list(_CAR_COLORS), True
    if slot == "Car Mileage":
        return [f"{miles} miles" for miles in range(1000, 100000, 1000)], True
    if slot == "Home Address":
        return [get_address(persona["city"], persona["state"])], True
    if slot == "Phone Number":
        return [get_phone_number()], True
    if slot == "First Name":
        return [persona["first_name"]], True
    if slot == "Last Name":
        return [persona["last_name"]], True
    if slot in _UNVALUED_SLOTS:
        return [None], False
    if slot in {"Origin of Trip", "Destination of Trip"}:
        return list(_TRIP_PLACES), False
    if slot == "Police Department Name":
        return [persona["police_department"]], False
    if slot == "Explain Coverages":
        # Copy so the module-level list can never be mutated through this path.
        return list(coverage_types), False
    raise ValueError(f"No potential values for {slot}")


def get_tuples(df, location, num_total_slots, skip_global=False):
    """Generate the slot/value tuples describing one party of an accident.

    Arguments:
    df -- ontology frame (a deep copy is handed to sample_slots)
    location -- accident location; also the name of the removal column used
                by sample_slots
    num_total_slots -- target number of slots passed to sample_slots
    skip_global -- when True, rows with ``global == 1``, rows of the "Trip"
                   domain and the "Police Department Name" slot are skipped,
                   and no "Accident Location" tuple is prepended

    Returns:
    ``(tuples, first_name)`` where ``tuples`` is a list of dicts with the keys
    "domain", "slot", "value" and "mandatory_value", and ``first_name`` is the
    first name generated for this party.

    Raises:
    ValueError -- if a selected slot has an empty value list and no fallback.
    """
    vehicle_year_make_model = fake.vehicle_year_make_model()
    first_name = names.get_first_name()
    last_name = names.get_last_name()

    # random.sample() no longer accepts dict views (TypeError on Python 3.11+),
    # so draw from an explicit list with random.choice().
    state = random.choice(list(police_departments))
    police_department = random.choice(police_departments[state])

    persona = {
        # The generated string starts with a 4-character year.
        "year": vehicle_year_make_model[:4].strip(),
        "make_model": vehicle_year_make_model[4:].strip(),
        "first_name": first_name,
        "last_name": last_name,
        "state": state,
        "police_department": police_department,
        "city": _city_from_police_department(police_department),
    }

    tuples = []
    selected_slots = sample_slots(
        copy.deepcopy(df),
        num_total_slots,
        location=location,
    )
    for _, row in selected_slots.iterrows():
        slot = row["slot"]

        if skip_global and (
            row["global"] == 1
            or row["domain"] in {"Trip"}
            or slot in {"Police Department Name"}
        ):
            continue

        # Work on a copy: the list may be extended below.
        potential_values = copy.deepcopy(row["value"])
        mandatory_value = True
        if not potential_values:
            potential_values, mandatory_value = _fallback_values(slot, persona)

        if slot == "Permission to Record":
            # Heavily bias the draw towards "Yes".
            potential_values.extend(["Yes"] * 100)

        if slot in _MULTI_VALUE_SLOTS:
            # Never ask for more values than are available.
            num_samples = min(random.randint(2, 4), len(potential_values))
            value = ", ".join(random.sample(potential_values, num_samples))
        else:
            value = random.choice(potential_values)

        tuples.append(
            {
                "domain": row["domain"],
                "slot": slot,
                "value": value,
                "mandatory_value": mandatory_value,
            }
        )

    if not skip_global:
        if location == "Parking Lot":
            location_name = f"{random.choice(popular_physical_stores)} Parking Lot"
        else:
            location_name = None
        tuples.insert(
            0,
            {
                "domain": "AccidentDetails",
                "slot": "Accident Location",
                "value": location_name,
                "mandatory_value": True,
            },
        )
    return tuples, first_name


def get_role_to_party():
    """Draw two different first names and map them to the "agent" and "user" roles."""
    agent_name = names.get_first_name()
    while True:
        user_name = names.get_first_name()
        # Re-draw until the user's name differs from the agent's.
        if user_name != agent_name:
            return {"agent": agent_name, "user": user_name}

# Start creating examples (tasks).


In [8]:
# Probability that a flow keeps the generated other-driver tuples
# (otherwise its "story_other_driver_tuples" list is empty).
PROB_OF_HAVING_OTHER_DRIVER = 0.95
# Probability that a mandatory caller slot value is blanked (set to None)
# in the per-domain tuple lists.
PROB_OF_REMOVING_VALUE = 0.85

# Set number of examples (tasks) you want to create.
# Default is 1 for each accident location.
# Locations that are missing here, or mapped to 0, are skipped.
accident_location_to_num_examples = {
    "Parking Lot": 1,
    "Driveway": 1,
    "Highway": 1,
    "Roadway": 1,
    "Intersection": 1,
}


In [9]:
import collections
import random
import tqdm

# Caller slots whose generated value is always kept as is.
untouched_slots = {
    "Accident Location",
    "Home Address",
    "Phone Number",
    "First Name",
    "Last Name",
}

# Accident location -> list of flows (one dict per example/task).
location_to_flows = collections.defaultdict(list)
for location in tqdm.tqdm(accident_locations):
    num_examples = accident_location_to_num_examples.get(location)
    if not num_examples:
        continue
    for _ in range(num_examples):
        num_total_slots = random.randint(25, 35)

        caller_tuples, user_first_name = get_tuples(
            df, location, num_total_slots, False
        )
        other_driver_tuples, _ = get_tuples(df, location, num_total_slots, True)

        # The agent must not share the caller's first name.
        agent_first_name = names.get_first_name()
        while agent_first_name == user_first_name:
            agent_first_name = names.get_first_name()
        role_to_party = {"agent": agent_first_name, "user": user_first_name}

        flow = collections.defaultdict(list)
        flow["role_to_party"] = role_to_party
        # The story keeps the full values; the copies below may be blanked.
        flow["story_caller_tuples"] = copy.deepcopy(caller_tuples)
        has_other_driver = random.random() < PROB_OF_HAVING_OTHER_DRIVER
        flow["story_other_driver_tuples"] = (
            copy.deepcopy(other_driver_tuples) if has_other_driver else []
        )

        for tup in caller_tuples:
            if tup["slot"] not in untouched_slots:
                # Non-mandatory values are always removed; mandatory values
                # are removed with probability PROB_OF_REMOVING_VALUE.
                if (
                    not tup["mandatory_value"]
                    or random.random() < PROB_OF_REMOVING_VALUE
                ):
                    tup["value"] = None

            key = f"[{tup['domain']}_TUPLES]"
            flow[key].append(tup)

        location_to_flows[location].append(dict(flow))

100%|██████████| 5/5 [00:00<00:00, 13.39it/s]


In [10]:
# Prompt templates shared by every flow (they are attached to each flow below).
# "[<Domain>_TUPLES]" matches the per-domain keys built when the flows are
# created; "[AGENT]" and "[USER]" are placeholders that this notebook never
# substitutes (presumably filled in downstream -- TODO confirm).

# One sentence per ontology domain, to be filled with that domain's tuples.
information_summaries = [
    "Accident details: [AccidentDetails_TUPLES].",
    "Evidences of the car accident: [Evidences_TUPLES].",
    "Traffic condition: [TrafficEnvironment_TUPLES].",
    "Caller's driver action: [DriverActions_TUPLES].",
    "Caller's car information: [CarInfo_TUPLES].",
    "Caller's injury details: [InjuryDetails_TUPLES].",
]

# Steps the generated role-play conversation is asked to follow.
step_summaries = [
    "Have role play car accident claim call. One person is an agent [AGENT] from a car insurance company and the other is the caller [USER] who wants to file a claim.",
    "At beginning of the call, have [AGENT] ask for [USER]'s permission to record the call and proceeds with the conversation.",
    "Within some <p> </p>, have simulate poor phone connection. Have [AGENT] and [USER] can not hear each other and need to repeat what they said.",
    "Have [AGENT] verify [USER] personal information to access account information at the beginning of the call.",
    "Have [USER] describe the car accident by using story and tuples above to describe the accident.",
    "Have [AGENT] confirm new information with [USER] during the call to ensure consistency.",
    "Have [AGENT] and [USER] engage in small talk with each other.",
    "Have [AGENT] explain the insurance coverages to [USER].",
]

# Top-level instruction stored with every flow.
instructions = [
    "Use the story, information, and personality to create a role play script and follow the steps."
]

In [11]:
def tuple_to_string(tup):
    """Render a slot tuple as ``"(<slot> = <value>)"``."""
    return "({} = {})".format(tup["slot"], tup["value"])


def story_tuples_to_string(tuples):
    """Group tuples by domain and render one ``"<domain>: (..), (..)"`` line per domain.

    Domains appear in order of first occurrence and lines are joined with
    newlines. An empty input yields an empty string.
    """
    rendered_by_domain = {}
    for item in tuples:
        rendered_by_domain.setdefault(item["domain"], []).append(
            tuple_to_string(item)
        )
    lines = [
        f"{domain}: " + ", ".join(rendered)
        for domain, rendered in rendered_by_domain.items()
    ]
    return "\n".join(lines)


def ignore_tuple(tup):
    """Filter predicate: True when the tuple should be KEPT.

    Despite the name, this returns False for the tuples to drop: the
    "Permission to Record" slot and any tuple whose value is None.
    """
    return tup["slot"] not in {"Permission to Record"} and tup["value"] is not None


def ignore_tuple_for_caller(tup):
    """Filter predicate: True when the tuple should be KEPT for the caller.

    Returns False (drop) for the "Make Year" and "Make/Model" slots.
    """
    return tup["slot"] not in {"Make Year", "Make/Model"}

# Add info to each flow (task).


In [12]:
# Attach the story prompt, the shared templates and the personalities to
# every flow.
for location, flows in location_to_flows.items():
    for flow in flows:
        # Caller: drop blank / recording-permission tuples and the vehicle
        # year and model.
        story_caller_tuples = [
            tup
            for tup in flow["story_caller_tuples"]
            if ignore_tuple(tup) and ignore_tuple_for_caller(tup)
        ]
        # Other driver: only the generic filter applies.
        story_other_driver_tuples = [
            tup for tup in flow["story_other_driver_tuples"] if ignore_tuple(tup)
        ]

        story_summaries = [
            "Here are the information we have about the caller:",
            story_tuples_to_string(story_caller_tuples),
            "Here are the information we have about the other driver:",
            story_tuples_to_string(story_other_driver_tuples),
            f"Use the information above to create a length and detail-enriched story about a car accident in a {location.lower()}.",
            "Ensure the story cover how the crash happened.",
            "No need to provide a summary of the story.",
            "The story is around 1000 words.",
        ]

        (picked_personality,) = random.sample(personalities, 1)
        user_personality = f"[USER] is {picked_personality['name'].lower()}, {picked_personality['description']}"
        agent_personality = "[AGENT] is conversational, personable, patient, empathetic, sympathetic and professional."

        flow.update(
            {
                "story_summaries": story_summaries,
                "information_summaries": information_summaries,
                "step_summaries": step_summaries,
                "user_personality": user_personality,
                "agent_personality": agent_personality,
                "instructions": instructions,
            }
        )

# Create a story for each flow (task).


In [13]:
from queue import Queue

from src import config
from src import utils

# API keys are used round-robin: take one, generate, put it back at the end.
api_keys = Queue()
for key in config.API_KEYS:
    api_keys.put(key)
if api_keys.empty():
    # Without this check the first api_keys.get() below would block forever.
    raise RuntimeError("config.API_KEYS is empty; at least one API key is required.")

for location, flows in location_to_flows.items():
    print()
    print(location)
    print()
    for flow in tqdm.tqdm(flows):
        story_prompt = "\n".join(flow["story_summaries"])

        api_key = api_keys.get()
        try:
            story = utils.story_generation_via_chatgpt(story_prompt, api_key)
        finally:
            # Return the key even when generation fails, so a retry of this
            # cell does not run with fewer keys.
            api_keys.put(api_key)

        # Use THIS flow's agent name. The global `role_to_party` left over
        # from the flow-building cell only holds the last flow's names.
        agent_name = flow["role_to_party"]["agent"]
        story = story.replace("adjuster", agent_name).replace("Adjuster", agent_name)

        # One entry per non-empty line, stripped of surrounding whitespace.
        stripped_lines = (line.strip() for line in story.split("\n"))
        flow["story"] = [line for line in stripped_lines if line]


Parking Lot



100%|██████████| 1/1 [00:10<00:00, 10.25s/it]



Driveway



100%|██████████| 1/1 [00:10<00:00, 10.75s/it]



Highway



100%|██████████| 1/1 [00:11<00:00, 11.29s/it]



Roadway



100%|██████████| 1/1 [00:11<00:00, 11.55s/it]



Intersection



100%|██████████| 1/1 [00:11<00:00, 11.87s/it]


In [14]:
import os
import random
import string


def generate_random_string(length):
    """Return ``length`` characters drawn at random from ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return "".join(picked)


# Total number of flows across all locations.
num_flows = sum(len(flows) for flows in location_to_flows.values())

# Draw random 5-character ids until there is one distinct id per flow.
unique_strings = set()
while len(unique_strings) < num_flows:
    unique_strings.add(generate_random_string(5))

unique_strings = list(unique_strings)

# Write one YAML file per flow and record every task in the task table.
task_table = {}
task_idx = 0
for location, flows in location_to_flows.items():
    FLOW_DIR = os.path.join(config.FLOW_DIR, location)
    os.makedirs(FLOW_DIR, exist_ok=True)
    for flow in flows:
        task_id = unique_strings[task_idx]
        path = os.path.join(FLOW_DIR, f"{task_id}.yaml")
        utils.save_yaml(path, dict(flow))
        task_idx += 1
        task_table[task_id] = {
            "template_idx": 0,
            "flow_id": task_id,
            "accident_location": location,
            "activate": True,
            "story_generation": True,
        }

utils.save_yaml(config.TASK_TABLE_PATH, task_table)