# Formatting Annotations into Training Data Format

This script automatically formats the transcriptions and their annotations into the format that RASA accepts as training data.

## Importing Completed Transcripts

In [1]:
# Names of the transcript files that carry the "Complete" tag
completedTranscripts = []

# Going through all 115 transcript files (transcript_1.txt .. transcript_115.txt)
for i in range(1, 116):
    fileName = "transcript_" + str(i) + ".txt"

    # `with` closes the file even if one of the reads below raises
    with open("./data/transcripts/" + fileName, "r") as file:
        # Skip the first line (the name)
        file.readline()

        # Characters 8..15 of the second line are compared against "Complete"
        complete = file.readline()[8:16]

    # Remember the file name if the transcript is marked as completed
    if complete == "Complete":
        completedTranscripts.append(fileName)

# List of annotated transcripts
print("Number of annotated transcripts:", len(completedTranscripts))
print("Annotated Transcripts:")
for name in completedTranscripts:
    print(" -", name)

Number of annotated transcripts: 14
Annotated Transcripts:
 - transcript_1.txt
 - transcript_10.txt
 - transcript_22.txt
 - transcript_23.txt
 - transcript_24.txt
 - transcript_34.txt
 - transcript_35.txt
 - transcript_44.txt
 - transcript_54.txt
 - transcript_56.txt
 - transcript_74.txt
 - transcript_100.txt
 - transcript_101.txt
 - transcript_102.txt


# Natural Language Understanding (NLU):

## Filtering and Sorting Different Annotations  

**Possible Annotations:**  
Robot to Users (only use these four for the robot + possibly [chit-chat]):  
- _[question]_: Robot asking the question
- _[options]_: Robot giving the answer options to the question
- _[accept-answer]_: Robot confirming that it has received the answer
- _[confirm-agreement]_: Robot asks if the users have agreed, eg. "Final answer?" 

User to Robot: 
- [final-answer()]: A user offering the final answer to the host (as opposed to the other person)
- [confirm-final-answer()]: A user confirms to the robot that it is the final answer they want to submit 

User to User: 
- [offer-answer()]: One of the users offers a specific answer to the other user
- [offer-to-answer]: One of the users says that they would like to answer (without providing a specific answer), eg. "I think I know this one" 
- [check-answer]: A user checks if the other user knows the answer, eg. "You know this, right?"
- [agreement]: General agreement, eg. "Yeah"
- [ask-agreement]: A user asking the other user for agreement, eg. "Do you think so too?"
- [chit-chat]: Non-related talk between the users
- [reject-option()]: A user eliminates one of the answer options, eg. "It's definitely not Germany" 
- [reject-option-agreement()]: A user agrees to reject an option, eg. "Yeah, definitely not"

In [2]:
# Every list below starts with its intent name; the collected examples follow.

# Without Input (examples are plain utterance strings)
question = ["question"]
options = ["options"]
accept_answer = ["accept-answer"]
offer_to_answer = ["offer-to-answer"]
check_answer = ["check-answer"]
agreement = ["agreement"]
ask_agreement = ["ask-agreement"]
chit_chat = ["chit-chat"]
confirm_agreement = ["confirm-agreement"]

# With Input (examples are [utterance, input] pairs)
offer_answer = ["offer-answer"]
final_answer = ["final-answer"]
confirm_final_answer = ["confirm-final-answer"]
reject_option = ["reject-option"]
reject_option_agreement = ["reject-option-agreement"]

# Annotation name -> list that collects its examples
noInputLists = {
    examples[0]: examples
    for examples in (
        question,
        options,
        accept_answer,
        offer_to_answer,
        check_answer,
        agreement,
        ask_agreement,
        chit_chat,
        confirm_agreement,
    )
}
inputLists = {
    examples[0]: examples
    for examples in (
        offer_answer,
        final_answer,
        confirm_final_answer,
        reject_option,
        reject_option_agreement,
    )
}

# For every file
for fileName in completedTranscripts:
    # `with` closes the file even if parsing raises part-way through
    with open("./data/transcripts/" + fileName, "r") as file:
        # Skip the header lines up to and including the "Transcript" marker.
        # readline() returns "" at end of file, so a file without the marker
        # ends the loop instead of spinning forever.
        line = file.readline()
        while line and line[:28] != "-------- Transcript --------":
            line = file.readline()
        file.readline()

        # For every line in the transcript
        for line in file:
            # Remove the system ('S', 3 characters) or user ('U', 4 characters)
            # tag; lines starting with anything else are skipped
            if line[0] == 'S':
                line = line[3:]
            elif line[0] == 'U':
                line = line[4:]
            else:
                continue

            # Locate the "[...]" annotation; lines without one are skipped
            start = line.find('[')
            end = line.find(']')
            if start == -1 or end < start:
                continue

            # The utterance is everything before the annotation. rstrip()
            # removes the separating whitespace without cutting a real
            # character when no space precedes '['.
            text = line[:start].rstrip()
            if not text:
                continue

            body = line[start + 1:end]
            paren = body.find('(')

            if paren == -1:
                # Annotation without input, eg. [agreement]
                target = noInputLists.get(body.replace(" ", ""))
                if target is not None:
                    target.append(text)
            else:
                # Annotation with input, eg. [offer-answer(Germany)]
                target = inputLists.get(body[:paren].replace(" ", ""))
                if target is not None:
                    close = body.find(')', paren)
                    if close == -1:
                        close = len(body)
                    # Only the surrounding whitespace is stripped, so the
                    # spaces inside a multi-word input are preserved
                    target.append([text, body[paren + 1:close].strip()])


## Writing the Annotation Examples into a YAML File

In [3]:
# Annotations without inputs
trainingDataNoInput = [
    offer_to_answer,
    check_answer,
    agreement,
    ask_agreement,
    chit_chat
]

# Annotations with inputs
trainingDataInput = [
    offer_answer,
    final_answer,
    confirm_final_answer,
    reject_option,
    reject_option_agreement
]

# Create a new file and add the proper formatting to the beginning.
# `with` closes the file even if a write fails; the encoding is set explicitly
# so the output does not depend on the platform default.
with open("./RASA_Training_Data/nlu.txt", "w", encoding="utf-8") as file:
    file.write("version: \"3.1\"\n\nnlu:\n")

    # Write all the examples of annotations to the file in the right format.
    # Each list holds its intent name at index 0 followed by the examples; the
    # lists are read with indexing/slicing rather than pop(0) so that they are
    # left untouched and this cell can be re-run.
    for annotation in trainingDataNoInput:
        file.write("- intent: " + annotation[0] + "\n  examples: |\n")
        for example in annotation[1:]:
            file.write("    - " + example + "\n")
        file.write("\n")

    # Write all the examples of annotations to the file in the right format
    # with inputs as entities
    for annotation in trainingDataInput:
        file.write("- intent: " + annotation[0] + "\n  examples: |\n")
        for text, entity in annotation[1:]:
            # Work on lowercase copies; the stored pair is not modified
            text = text.lower()
            entity = entity.lower()
            # str.replace with an empty search string would insert the markup
            # between every character, so empty inputs are left unmarked
            if entity:
                text = text.replace(entity, "[" + entity + "](answer)")
            file.write("    - " + text + "\n")
        file.write("\n")

# Dialogue Management (Stories):

## Extracting Stories from Transcripts

In [4]:
# Collected stories
# NOTE(review): nothing is appended to this list yet; the loop below only
# extracts the annotation of each transcript line.
stories = []

for fileName in completedTranscripts:
    # `with` closes the file even if parsing raises part-way through
    with open("./data/transcripts/" + fileName, "r") as file:
        # Skip the header lines up to and including the "Transcript" marker.
        # readline() returns "" at end of file, so a file without the marker
        # ends the loop instead of spinning forever.
        line = file.readline()
        while line and line[:28] != "-------- Transcript --------":
            line = file.readline()
        file.readline()

        # For every line in the transcript
        for line in file:
            # Extract the annotation: the text between '[' and ']' with all
            # spaces removed
            annotation = line[line.find(
                '['):line.find(']') + 1].replace(" ", "")[1:-1]