In [8]:
""" This example demonstrates how to create a dataset from AWS Transcripts
    1. Parse AWS JSON Transcripts
    2. Build Dialog Dataset
"""

' This example demonstrates how to create a dataset from AWS Transcripts\n    1. Parse AWS JSON Transcripts\n    2. Build Dialog Dataset\n'

In [10]:
import json
import os
from llmware.dataset_tools import Datasets
from llmware.library import Library
from llmware.setup import Setup
from llmware.configs import LLMWareConfig

In [12]:
def build_aws_transcribe_datasets(library_name):

    # Setup a library and build a knowledge graph.  Datasets will use the data in the knowledge graph
    print (f"\n > Creating library {library_name}...")
    library = Library().create_new_library(library_name)
    sample_files_path = Setup().load_sample_files()
    library.add_dialogs(os.path.join(sample_files_path ,"AWS-Transcribe"))
    library.generate_knowledge_graph()

    # Create a Datasets object
    datasets = Datasets(library)

    # Build generative conversation dataset
    print (f"\n > Building generative conversation dataset...")
    generative_conversation_dataset = datasets.build_gen_dialog_ds(prompt_wrapper="human_bot", human_first=True)
    dataset_location = os.path.join(library.dataset_path, generative_conversation_dataset["ds_id"])
    print (f"\n > Dataset:")
    print (f"(Files referenced below are found in {dataset_location})")
    print (f"\n{json.dumps(generative_conversation_dataset, indent=2)}")
    sample = datasets.get_dataset_sample(datasets.current_ds_name)
    print (f"\nRandom sample from the dataset:\n{json.dumps(sample, indent=2)}")

    return 0

In [14]:
if __name__ == "__main__":

    LLMWareConfig().set_active_db("sqlite")

    build_aws_transcribe_datasets("aws_transcripts_lib_1")


 > Creating library aws_transcripts_lib_1...

 > Building generative conversation dataset...

 > Dataset:
(Files referenced below are found in C:\Users\hsyyu\llmware_data\accounts\llmware\aws_transcripts_lib_1\datasets\d4c4b9eb-fb20-4820-aef8-359751ea33e5)

{
  "ds_type": "build_gen_ds_dialog",
  "ds_id": "d4c4b9eb-fb20-4820-aef8-359751ea33e5",
  "training_samples": 43,
  "training_files": [
    "training_samples_0.jsonl",
    "training_samples_text_0.txt"
  ],
  "validation_samples": 5,
  "validation_files": [
    "validation_samples_0.jsonl",
    "validation_samples_text_0.txt"
  ],
  "testing_samples": 5,
  "testing_files": [
    "testing_samples_0.jsonl",
    "testing_samples_text_0.txt"
  ],
  "batches": 1,
  "prompt_wrapper": "human_bot",
  "description": "Generative AI fine-tuning dataset, generated in self-supervised process using dialog transcripts to re-create role-based dialog.",
  "features": [
    "text"
  ],
  "time_stamp": "2024-05-15_203732"
}

Random sample from the d

In [None]:
import os
import glob

# Specify the sample file path
sample_file_path = 'C:/Users/hsyyu/llmware_data/sample_files/AWS-Transcribe'

# Use glob to find all the dialog files in the specified path
dialog_files = glob.glob(os.path.join(sample_file_path, 'dialog*.json'))

# Iterate through the dialog files and print their contents
for file in dialog_files:
    with open(file, 'r') as f:
        print(f.read())

In [32]:
import os
import glob
import json

# Specify the sample file path
sample_file_path = 'C:/Users/hsyyu/llmware_data/sample_files/AWS-Transcribe'

# Use glob to find all the dialog files in the specified path
dialog_files = glob.glob(os.path.join(sample_file_path, 'dialog*.json'))

# Iterate through the dialog files and organize messages into pairs
conversation = []
for file in dialog_files:
    with open(file, 'r') as f:
        dialog = json.load(f)
        messages = dialog['results']['items']
        speaker = None
        current_message = ""
        for item in messages:
            if item['type'] == 'pronunciation':
                if item['alternatives'][0]['content'] == "<eos>":
                    conversation.append((speaker, current_message.strip()))
                    current_message = ""
                else:
                    current_message += item['alternatives'][0]['content'] + " "
            elif item['type'] == 'speaker-label':
                speaker = item['speaker_label']
        if current_message.strip():
            conversation.append((speaker, current_message.strip()))

# Format the conversation pairs
for i in range(0, len(conversation), 2):
    if i + 1 < len(conversation):
        print(f"{conversation[i][0]}: {conversation[i][1]}")
        print(f"{conversation[i+1][0]}: {conversation[i+1][1]}")
        print()


None: Good afternoon This is Peter Hi Peter This is Susan So did I catch you in the middle of something Uh just a normal day All right Well uh yeah my name is Susan and I'm with ABC Corp I'm reaching you now because I noticed you're a sales leader over at XYZ Corp And I was hoping to run through a few reasons like yourself why you're speaking with me But if it makes sense we can go a bit further and if not I'll just let you go on with your day Are you comfortable too I take a couple of minutes Sure Ok great Uh Well I'm sorry what's your name And and uh where are you from I didn't catch it Yeah my name is Susan and I'm with ABC And yeah uh we help sales Yeah we uh we help sales leaders similar to yourself solve a few of their key challenges And the first one is you know they're having insufficient time for coaching The second is closing the gap between their A and B players and the third is in within virtual sales and and I was curious out of those three which one is your number one cha

In [None]:
import os
import glob

# Specify the sample file path
sample_file_path = 'C:/Users/hsyyu/llmware_data/sample_files/AWS-Transcribe'

# Use glob to find all the dialog files in the specified path
dialog_files = glob.glob(os.path.join(sample_file_path, 'dialog*.json'))

# Iterate through the dialog files and print their contents in a chatbot format
for file in dialog_files:
    with open(file, 'r') as f:
        print("Chatbot: ")
        for line in f:
            print(line.strip())  # Assuming each line is a separate chatbot response
        print("User: ")