### Imports

In [1]:
import json
import os
import re
import sys
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def merge_json_files(input_folder, output_file):
    all_dialogues = []

    for filename in os.listdir(input_folder):
        if filename.endswith(".json"):
            file_path = os.path.join(input_folder, filename)

            with open(file_path, "r", encoding="utf-8") as file:
                dialogues = json.load(file)
                all_dialogues.extend(dialogues)

    with open(output_file, "w", encoding="utf-8") as outfile:
        json.dump(all_dialogues, outfile, indent=2)

input_folder = "train"
output_file = "dialogues.json"
merge_json_files(input_folder, output_file)

In [3]:
def split_user_system_turns(dialogue):
    turns = dialogue.get("turns", [])

    user_turns = [turn["utterance"] for turn in turns if turn["speaker"] == "USER"]
    system_turns = [turn["utterance"] for turn in turns if turn["speaker"] == "SYSTEM"]

    return user_turns, system_turns

def print_user_system_turns(dialogue):
    dialogue_id = dialogue.get("dialogue_id", "N/A")

    if dialogue_id is None or not re.match(r'\d+_\d+', dialogue_id):
        return

    services = dialogue.get("services", [])
    print(f"\nDialogue ID: {dialogue_id}")
    print(f"Services: {', '.join(services)}")
    print("=" * 30)

    user_turns, system_turns = split_user_system_turns(dialogue)

    for user_turn, system_turn in zip(user_turns, system_turns):
        print(f"User Turn: {user_turn}")
        print(f"System Turn: {system_turn}")
        print("-" * 130)


with open("dialogues_info.txt", "w", encoding="utf-8") as output_file:
    sys.stdout = output_file

    with open("dialogues.json", "r", encoding="utf-8") as file:
        dialogues_data = json.load(file)

    for dialogue in dialogues_data:
        print_user_system_turns(dialogue)

    sys.stdout = sys.__stdout__

### Total data length for user and system

In [2]:
def count_metrics(dialogues):
    total_dialogues = len(dialogues)
    total_turns = 0
    total_sentences = 0
    total_words = 0
    total_user_turns = 0
    total_user_sentences = 0
    total_user_words = 0
    total_system_turns = 0
    total_system_sentences = 0
    total_system_words = 0
    

    for dialogue in dialogues:
        user_turns = dialogue.count("User Turn:")
        system_turns = dialogue.count("System Turn:")
        turns_in_dialogue = user_turns + system_turns

        total_turns += turns_in_dialogue
        total_user_turns += user_turns
        total_system_turns += system_turns

        turns = dialogue.split("User Turn:")

        user_turns_list = turns[1::2]
        system_turns_list = turns[2::2]

        for user_turn in user_turns_list:
            sentences = len(sent_tokenize(user_turn))
            words = len(word_tokenize(user_turn))
            
            total_user_sentences += sentences
            total_user_words += words
            total_sentences += sentences
            total_words += words

        for system_turn in system_turns_list:
            sentences = len(sent_tokenize(system_turn))
            words = len(word_tokenize(system_turn))
            
            total_system_sentences += sentences
            total_system_words += words
            total_sentences += sentences
            total_words += words

    return (
        total_dialogues,
        total_turns,
        total_sentences,
        total_words,
        total_user_turns,
        total_user_sentences,
        total_user_words,
        total_system_turns,
        total_system_sentences,
        total_system_words
    )

with open("dialogues_info.txt", "r") as file:
    dialogues_info = file.read()

dialogues = dialogues_info.split("Dialogue ID:")[1:]

(
    total_dialogues,
    total_turns,
    total_sentences,
    total_words,
    total_user_turns,
    total_user_sentences,
    total_user_words,
    total_system_turns,
    total_system_sentences,
    total_system_words
) = count_metrics(dialogues)

print(f"Total dialogues: {total_dialogues} dialogues")
print(f"Total turns: {total_turns} turns")
print(f"Total sentences: {total_sentences} sentences")
print(f"Total words: {total_words} words\n")
print(f"Total user turns: {total_user_turns} turns")
print(f"Total user sentences: {total_user_sentences} sentences")
print(f"Total user words: {total_user_words} words\n")
print(f"Total system turns: {total_system_turns} turns")
print(f"Total system sentences: {total_system_sentences} sentences")
print(f"Total system words: {total_system_words} words")

Total dialogues: 16142 dialogues
Total turns: 329964 turns
Total sentences: 568891 sentences
Total words: 15011509 words

Total user turns: 164982 turns
Total user sentences: 298963 sentences
Total user words: 7885477 words

Total system turns: 164982 turns
Total system sentences: 269928 sentences
Total system words: 7126032 words


### Mean dialogue lengths for user and system

In [3]:
average_total_turns = round(total_turns / total_dialogues, 2)
average_tokens_per_turn = round(total_words / total_turns, 2)
average_total_sentences = round(total_sentences / total_dialogues, 2)
average_total_words = round(total_words / total_dialogues, 2)

average_user_turns = round(total_user_turns / total_dialogues, 2)
average_user_sentences = round(total_user_sentences / total_dialogues, 2)
average_user_words = round(total_user_words / total_dialogues, 2)

average_system_turns = round(total_system_turns / total_dialogues, 2)
average_system_sentences = round(total_system_sentences / total_dialogues, 2)
average_system_words = round(total_system_words / total_dialogues, 2)

print(f"Average total turns: {average_total_turns} turns per dialogue")
print(f"Average tokens per turn: {average_tokens_per_turn} tokens per turn")
print(f"Average total sentences: {average_total_sentences} sentences per dialogue")
print(f"Average total words: {average_total_words} words per dialogue\n")
print(f"Average user turns: {average_user_turns} turns per dialogue for user interactions")
print(f"Average user sentences: {average_user_sentences} sentences per dialogue for user interactions")
print(f"Average user words: {average_user_words} words per dialogue for user interactions\n")
print(f"Average system turns: {average_system_turns} turns per dialogue for system interactions")
print(f"Average system sentences: {average_system_sentences} sentences per dialogue for system interactions")
print(f"Average system words: {average_system_words} words per dialogue for system interactions\n")

Average total turns: 20.44 turns per dialogue
Average tokens per turn: 45.49 tokens per turn
Average total sentences: 35.24 sentences per dialogue
Average total words: 929.97 words per dialogue

Average user turns: 10.22 turns per dialogue for user interactions
Average user sentences: 18.52 sentences per dialogue for user interactions
Average user words: 488.51 words per dialogue for user interactions

Average system turns: 10.22 turns per dialogue for system interactions
Average system sentences: 16.72 sentences per dialogue for system interactions
Average system words: 441.46 words per dialogue for system interactions



### Standard deviation of dialogue lengths for user and system

In [4]:
std_total_turns = round(np.std([len(dialogue.split("User Turn:")) for dialogue in dialogues]), 2)
std_total_sentences = round(np.std([len(sent_tokenize(dialogue)) for dialogue in dialogues]), 2)
std_total_words = round(np.std([len(word_tokenize(dialogue)) for dialogue in dialogues]), 2)

std_user_turns = round(np.std([dialogue.count("User Turn:") for dialogue in dialogues]), 2)
std_user_sentences = round(np.std([len(sent_tokenize(turn)) for dialogue in dialogues for turn in dialogue.split("User Turn:")]), 2)
std_user_words = round(np.std([len(word_tokenize(turn)) for dialogue in dialogues for turn in dialogue.split("User Turn:")]), 2)

std_system_turns = round(np.std([dialogue.count("System Turn:") for dialogue in dialogues]), 2)
std_system_sentences = round(np.std([len(sent_tokenize(turn)) for dialogue in dialogues for turn in dialogue.split("System Turn:")]), 2)
std_system_words = round(np.std([len(word_tokenize(turn)) for dialogue in dialogues for turn in dialogue.split("System Turn:")]), 2)

# Print the results with units
print(f"Standard deviation total turns: {std_total_turns} turns per dialogue")
print(f"Standard deviation total sentences: {std_total_sentences} sentences per dialogue")
print(f"Standard deviation total words: {std_total_words} words per dialogue\n")
print(f"Standard deviation user turns: {std_user_turns} turns per dialogue for user interactions")
print(f"Standard deviation user sentences: {std_user_sentences} sentences per dialogue for user interactions")
print(f"Standard deviation user words: {std_user_words} words per dialogue for user interactions\n")
print(f"Standard deviation system turns: {std_system_turns} turns per dialogue for system interactions")
print(f"Standard deviation system sentences: {std_system_sentences} sentences per dialogue for system interactions")
print(f"Standard deviation system words: {std_system_words} words per dialogue for system interactions\n")

Standard deviation total turns: 3.44 turns per dialogue
Standard deviation total sentences: 10.51 sentences per dialogue
Standard deviation total words: 325.48 words per dialogue

Standard deviation user turns: 3.44 turns per dialogue for user interactions
Standard deviation user sentences: 1.22 sentences per dialogue for user interactions
Standard deviation user words: 26.33 words per dialogue for user interactions

Standard deviation system turns: 3.44 turns per dialogue for system interactions
Standard deviation system sentences: 0.95 sentences per dialogue for system interactions
Standard deviation system words: 22.22 words per dialogue for system interactions



### Vocabulary size for user and system

In [5]:
def calculate_vocabulary_size(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    freq_dist = FreqDist(filtered_words)
    return len(freq_dist)

def calculate_total_vocabulary_size(user_texts, system_texts):
    total_text = ' '.join(user_texts + system_texts)
    total_vocabulary_size = calculate_vocabulary_size(total_text)
    return total_vocabulary_size

user_texts = []
system_texts = []

for dialogue in dialogues:
    turns = dialogue.split("User Turn:")
    user_turns_list = turns[1::2]
    system_turns_list = turns[2::2]
    user_texts.extend(user_turns_list)
    system_texts.extend(system_turns_list)

user_vocabulary_size = calculate_total_vocabulary_size(user_texts, [])
system_vocabulary_size = calculate_total_vocabulary_size([], system_texts)
total_vocabulary_size = calculate_total_vocabulary_size(user_texts, system_texts)

print(f"User vocabulary size: {user_vocabulary_size} words")
print(f"System vocabulary size: {system_vocabulary_size} words")
print(f"Total vocabulary size: {total_vocabulary_size} words")

User vocabulary size: 16909 words
System vocabulary size: 16415 words
Total vocabulary size: 21991 words
