In [1]:
import os
import random

# Define file paths: the source dataset and the three split files all live in
# the same directory.
dataset_dir = "dataset"
file_path = os.path.join(dataset_dir, "dataset.txt")
train_file_path = os.path.join(dataset_dir, "train.txt")
validation_file_path = os.path.join(dataset_dir, "validation.txt")
test_file_path = os.path.join(dataset_dir, "test.txt")

# Delete train, validation, and test files left over from a previous run.
# The removal is attempted directly and a missing file is tolerated, instead
# of checking os.path.exists() first: a check-then-remove sequence can still
# fail if the file disappears between the check and the removal.
for stale_path in (train_file_path, validation_file_path, test_file_path):
    try:
        os.remove(stale_path)
    except FileNotFoundError:
        pass  # No previous split file at this path; nothing to clean up.

# Read every raw line of the dataset file into memory.
with open(file_path, "r", encoding="utf-8") as dataset_file:
    lines = dataset_file.readlines()

# Group the non-empty lines into one list per topic. A line whose lowercased
# text starts with "topic" is treated as a header that closes the group being
# built and opens a new one; the header line itself is not stored.
# NOTE(review): an utterance that happens to begin with "topic" would also be
# taken for a header -- confirm the dataset format rules this out.
dialogues = []
current_dialogue = []
# NOTE(review): this flag is set below but never read in the visible code.
inside_topic = False

for raw_line in lines:
    text = raw_line.strip()
    if text == "":
        continue  # Blank lines carry no content.

    if not text.lower().startswith("topic"):
        # Ordinary line: part of the current topic's exchange.
        current_dialogue.append(text)
        continue

    # Topic header: flush whatever was collected so far, then start afresh.
    if len(current_dialogue) > 0:
        dialogues.append(current_dialogue)
    current_dialogue = []
    inside_topic = True

# The final topic has no header after it, so flush it explicitly.
if len(current_dialogue) > 0:
    dialogues.append(current_dialogue)

# Flatten every topic into (first, second) tuples of consecutive lines:
# lines at even positions are paired with the line that follows them.
# zip() stops at the shorter slice, so a trailing unpaired line in a topic
# is left out.
# Assumes lines alternate user, bot within a topic -- TODO confirm.
user_bot_pairs = [
    (user_line, bot_line)
    for topic_lines in dialogues
    for user_line, bot_line in zip(topic_lines[0::2], topic_lines[1::2])
]

# Randomise the pair order in place before slicing into splits.
random.shuffle(user_bot_pairs)

# Split sizes: the first 20% of the shuffled pairs form the test set; of the
# remaining 80%, the first 80% go to train and the rest to validation.
# int() truncates, so any rounding remainder ends up in the later slices.
total_pairs = len(user_bot_pairs)
test_split = int(0.2 * total_pairs)
test_dialogues, remaining = (
    user_bot_pairs[:test_split],
    user_bot_pairs[test_split:],
)

train_split = int(0.8 * len(remaining))
train_dialogues, validation_dialogues = (
    remaining[:train_split],
    remaining[train_split:],
)

# Function to save dialogues to a file
def save_dialogues(file_path, dialogues):
    """Write two-item pairs to a UTF-8 text file, one item per line.

    Args:
        file_path: Destination path; the file is created or overwritten.
        dialogues: Iterable of two-item pairs. For each pair the first item
            is written on one line and the second item on the next.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        out.writelines(f"{user}\n{bot}\n" for user, bot in dialogues)

# Save datasets: write each split to its own file before reporting anything.
for split_path, split_pairs in (
    (train_file_path, train_dialogues),
    (validation_file_path, validation_dialogues),
    (test_file_path, test_dialogues),
):
    save_dialogues(split_path, split_pairs)

# Print results: one summary line per split. The reported size counts
# (user, bot) pairs, not lines written.
print(f"Train file saved at: {train_file_path}, Size: {len(train_dialogues)} dialogues")
print(f"Validation file saved at: {validation_file_path}, Size: {len(validation_dialogues)} dialogues")
print(f"Test file saved at: {test_file_path}, Size: {len(test_dialogues)} dialogues")


Train file saved at: dataset/train.txt, Size: 320 dialogues
Validation file saved at: dataset/validation.txt, Size: 80 dialogues
Test file saved at: dataset/test.txt, Size: 100 dialogues
