# Data Cleaning and Pre-Processing

In [4]:
import os
import json
from collections import Counter
import csv

In [6]:
data_directory = '../db/raw/dbdc4_en_dev_labeled'

# Only JSON dialogue files are cleaned. Filtering up front keeps the count
# reported below accurate even when the directory holds other entries (later
# cells create 'csv' and 'output' sub-folders inside this same directory).
file_list = [f for f in os.listdir(data_directory) if f.endswith('.json')]

if not file_list:
    print("No files found in the data directory!")

file_count = 0
for file_name in file_list:
    file_path = os.path.join(data_directory, file_name)

    with open(file_path, 'r') as file:
        data = json.load(file)

    # Strip the fields that are not needed downstream. pop(..., None) is a
    # no-op when the key is already gone, so re-running this cell is safe.
    for turn in data.get("turns", []):
        turn.pop("time", None)
        turn.pop("annotation-id", None)
        for annotation in turn.get("annotations", []):
            annotation.pop("comment", None)
            annotation.pop("annotator-id", None)
            annotation.pop("ungrammatical-sentence", None)

    # The cleaned dialogue overwrites the source file in place.
    with open(file_path, 'w') as file:
        json.dump(data, file, indent=4)

    # Count a file only after it has actually been cleaned and written back.
    file_count += 1

print(f"Data cleaning complete for {file_count} files!")

Data cleaning complete for 211 files!


#### Create Majority Voting Function and Probability Distribution
##### Treat Breakdown and Possible Breakdown as Breakdown

In [7]:
def interpret_breakdown(annotation):
    """Collapse an annotation's 'breakdown' label into a binary class.

    Labels 'X' and 'T' both map to "breakdown"; any other label maps to
    "non-breakdown".
    """
    label = annotation['breakdown']
    if label == 'X' or label == 'T':
        return "breakdown"
    return "non-breakdown"

file_count = 0
data_directory = '../db/raw/dbdc4_en_dev_labeled'
file_list = [name for name in os.listdir(data_directory) if name.endswith('.json')]

if file_list:
    for file_name in file_list:
        file_path = os.path.join(data_directory, file_name)

        with open(file_path, 'r') as handle:
            data = json.load(handle)

        for turn in data["turns"]:
            # Binary label ("breakdown" / "non-breakdown") for every annotation of this turn.
            annotations_interpreted = list(map(interpret_breakdown, turn.get("annotations", [])))
            label_counts = Counter(annotations_interpreted)

            if annotations_interpreted:
                # On a tie, most_common() returns the label that was seen first.
                majority_vote = label_counts.most_common(1)[0][0]
                total_annotations = len(annotations_interpreted)
            else:
                # No annotations: no vote, and a divisor of 1 keeps both shares at 0.
                majority_vote = None
                total_annotations = 1

            # Share of annotators behind each label.
            probability_distribution = {
                label: label_counts[label] / total_annotations
                for label in ("breakdown", "non-breakdown")
            }

            turn["majority_voting"] = majority_vote
            turn["probability_distribution"] = probability_distribution

        file_count += 1

        # Write the enriched dialogue back over the source file.
        with open(file_path, 'w') as handle:
            json.dump(data, handle, indent=4)

    print(f"Data processing complete for {file_count} files!")
else:
    print("No JSON files found in the directory.")


Data processing complete for 211 files!


Remove the majority vote and probability distribution from user turns and from the first turn of each dialogue (turn index 0)

In [8]:
file_count = 0
data_directory = '../db/raw/dbdc4_en_dev_labeled'
file_list = [name for name in os.listdir(data_directory) if name.endswith('.json')]

if file_list:
    for file_name in file_list:
        file_path = os.path.join(data_directory, file_name)

        with open(file_path, 'r') as handle:
            data = json.load(handle)

        for turn in data["turns"]:
            # Labels are kept only on turns that are neither spoken by the
            # user ("U") nor the opening turn of the dialogue (turn-index 0).
            if turn["speaker"] != "U" and turn["turn-index"] != 0:
                continue
            for label_key in ("majority_voting", "probability_distribution"):
                turn.pop(label_key, None)

        file_count += 1

        # Write the dialogue back over the source file.
        with open(file_path, 'w') as handle:
            json.dump(data, handle, indent=4)

    print(f"Further data cleaning complete for {file_count} files!")
else:
    print("No JSON files found in the directory.")


Further data cleaning complete for 211 files!


#### Generate Cleaned CSV Files for LLMs

In [10]:
file_count = 0

data_directory = '../db/raw/dbdc4_en_dev_labeled'
output_directory = '../db/raw/dbdc4_en_dev_labeled/csv'

# Create the output folder on first run.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

file_list = [name for name in os.listdir(data_directory) if name.endswith('.json')]

if file_list:
    for file_name in file_list:
        base_name = os.path.splitext(file_name)[0]
        json_file_path = os.path.join(data_directory, file_name)
        csv_file_path = os.path.join(output_directory, base_name + '.csv')

        try:
            with open(json_file_path, 'r') as json_handle:
                data = json.load(json_handle)

            # One CSV per dialogue: a header row, then one row per turn.
            with open(csv_file_path, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['Speaker', 'Utterance'])
                for turn in data["turns"]:
                    speaker = turn["speaker"]
                    # Flatten embedded newlines and trim surrounding whitespace.
                    utterance = turn["utterance"].replace('\n', ' ').strip()
                    writer.writerow([speaker, utterance])

            file_count += 1
        except (json.JSONDecodeError, IOError) as e:
            # A bad file is reported and skipped; the remaining files still convert.
            print(f"An error occurred while processing {file_name}: {e}")

    print(f"Converted {file_count} JSON files to CSV in the '{output_directory}' directory.")
else:
    print("No JSON files found in the directory.")

Converted 211 JSON files to CSV in the '../db/raw/dbdc4_en_dev_labeled/csv' directory.


Export labelled CSV files: each row holds the speaker, the utterance, the majority vote and the probability of the majority label (the last two are left blank for turns that carry no labels)

In [11]:
file_count = 0
data_directory = '../db/raw/dbdc4_en_dev_labeled'
output_directory = '../db/raw/dbdc4_en_dev_labeled/output'

# Create the output folder on first run.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

file_list = [name for name in os.listdir(data_directory) if name.endswith('.json')]

if file_list:
    for file_name in file_list:
        base_name = os.path.splitext(file_name)[0]
        json_file_path = os.path.join(data_directory, file_name)
        csv_file_path = os.path.join(output_directory, base_name + '.csv')

        try:
            with open(json_file_path, 'r') as json_handle:
                data = json.load(json_handle)

            with open(csv_file_path, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['Speaker', 'Utterance', 'Majority Voting', 'Probability Distribution'])
                for turn in data["turns"]:
                    speaker = turn["speaker"]
                    # Flatten embedded newlines and trim surrounding whitespace.
                    utterance = turn["utterance"].replace('\n', ' ').strip()

                    if 'majority_voting' in turn and 'probability_distribution' in turn:
                        majority_voting = turn["majority_voting"]
                        probability_distribution = turn["probability_distribution"]

                        # Only the winning label's share is exported, to one decimal place.
                        majority_voting_probability = probability_distribution.get(majority_voting, 0)
                        formatted_probability = f"{majority_voting_probability:.1f}"
                    else:
                        # Turns without labels get blank label columns.
                        majority_voting = ''
                        formatted_probability = ''

                    writer.writerow([speaker, utterance, majority_voting, formatted_probability])

            file_count += 1
        except (json.JSONDecodeError, IOError) as e:
            # A bad file is reported and skipped; the remaining files still convert.
            print(f"An error occurred while processing {file_name}: {e}")

    print(f"Converted {file_count} JSON files to CSV in the '{output_directory}' directory.")
else:
    print("No JSON files found in the directory.")

Converted 211 JSON files to CSV in the '../db/raw/dbdc4_en_dev_labeled/output' directory.


# Format Files for LLM Analysis

Function to format each utterance by adding a line number and user / bot header

In [None]:
def convert_to_dialogue(json_data, file_path):
    """Write a dialogue as numbered ``Bot:`` / ``User:`` lines to a text file.

    Each turn in ``json_data['turns']`` becomes one line of the form
    ``"<n>. Bot: <utterance>"`` when the turn's speaker is ``'S'`` and
    ``"<n>. User: <utterance>"`` otherwise, where ``n`` starts at 1. Newlines
    inside an utterance are replaced by spaces and surrounding whitespace is
    trimmed. ``json_data`` itself is left unmodified.

    Args:
        json_data: dict with a ``'turns'`` list; every turn needs the keys
            ``'utterance'`` and ``'speaker'``.
        file_path: destination of the text file (overwritten if it exists).

    Returns:
        ``file_path``, unchanged.

    Raises:
        KeyError: if ``'turns'``, ``'utterance'`` or ``'speaker'`` is missing.
    """
    lines = []
    for line_number, turn in enumerate(json_data['turns'], start=1):
        # Keep the cleaned text in a local variable so the caller's data is
        # not overwritten as a side effect of exporting it.
        utterance = turn["utterance"].replace('\n', ' ').strip()
        speaker_label = "Bot" if turn['speaker'] == 'S' else "User"
        lines.append(f"{line_number}. {speaker_label}: {utterance}\n")

    with open(file_path, 'w') as file:
        file.write("".join(lines))

    return file_path

In [None]:
file_count = 0

data_directory = '../db/raw/dbdc4_en_dev_labeled/'
output_directory = '../db/raw/dev/text_eval'

# Create the output folder on first run.
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

file_list = [name for name in os.listdir(data_directory) if name.endswith('.json')]

if file_list:
    for file_name in file_list:
        base_name = os.path.splitext(file_name)[0]
        json_file_path = os.path.join(data_directory, file_name)
        text_path = os.path.join(output_directory, base_name + '.txt')

        try:
            with open(json_file_path, 'r') as json_handle:
                data = json.load(json_handle)

            # One numbered "Bot:" / "User:" text file per dialogue.
            convert_to_dialogue(data, text_path)

            file_count += 1
        except (json.JSONDecodeError, IOError) as e:
            # A bad file is reported and skipped; the remaining files still convert.
            print(f"An error occurred while processing {file_name}: {e}")

    print(f"Converted {file_count} JSON files to Text in the '{output_directory}' directory.")
else:
    print("No JSON files found in the directory.")

# Used to Automate LLM response analysis
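Reformat the labelled CSV files so that every utterance carries the same `N. Bot:` / `N. User:` prefix as the text files above, which lets LLM responses be matched back to the labelled utterances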

In [1]:
import pandas as pd
import os
import glob

def transform_dialogue(file_path, output_folder):
    """Rewrite a dialogue CSV with numbered, speaker-prefixed utterances.

    Reads the CSV at ``file_path`` and replaces every utterance with
    ``"<n>. <speaker>: <utterance>"``, where ``n`` is the 1-based row number
    and the speaker code ``S`` / ``U`` is shown as ``Bot`` / ``User``. The
    ``Speaker`` column is then dropped. In the resulting text, double quotes
    become single quotes, newlines become spaces and commas become semicolons.
    All other columns are carried over. The result is saved under the same
    file name inside ``output_folder``.

    Args:
        file_path: path of the input CSV; it must contain the columns
            ``Speaker`` and ``Utterance``.
        output_folder: existing directory that receives the rewritten CSV.

    Returns:
        The transformed ``pandas.DataFrame`` (without the ``Speaker`` column).
    """
    # Read utterances verbatim as text. With default parsing pandas treats
    # fields such as "n/a", "NA", "null" or an empty string as missing values
    # (they would end up as the text "nan") and may re-type numeric-looking
    # utterances; a converter on the column bypasses both.
    df = pd.read_csv(file_path, converters={'Utterance': str})
    speakers = df['Speaker'].map({'S': 'Bot', 'U': 'User'})

    # Build all prefixed utterances in one pass instead of cell-by-cell writes.
    numbered = [
        f"{number}. {speaker}: {utterance}"
        for number, (speaker, utterance) in enumerate(zip(speakers, df['Utterance']), start=1)
    ]
    # dtype=object keeps the .str accessor usable even for a file with no rows.
    df['Utterance'] = pd.Series(numbered, index=df.index, dtype=object)
    df = df.drop(columns=['Speaker'])

    # All three substitutions are literal, so no regex matching is needed.
    df['Utterance'] = (
        df['Utterance']
        .str.replace('"', "'", regex=False)
        .str.replace('\n', ' ', regex=False)
        .str.replace(',', ';', regex=False)
    )

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    output_file_path = os.path.join(output_folder, f"{base_name}.csv")

    df.to_csv(output_file_path, index=False)

    return df

input_dir = '../db/raw/dbdc4_en_dev_labeled/output'
output_dir = '../db/raw/dev_labelled'

# Make sure the destination exists before any file is written into it.
os.makedirs(output_dir, exist_ok=True)

# Rewrite every labelled CSV produced earlier into the destination folder.
csv_paths = glob.glob(f"{input_dir}/*.csv")
for file_path in csv_paths:
    transform_dialogue(file_path, output_dir)