In [7]:
import json
import argparse
import ast
import argparse
import random
from datetime import date, timedelta

# Seed the RNG so sampling is repeatable (the date boundaries are defined in a later cell)
random.seed(42)


def save_jsonl(data, filename):
    """Serialize every element of ``data`` as one JSON document per line.

    Args:
        data: Iterable of JSON-serializable objects.
        filename: Destination path; the file is created or truncated.
    """
    with open(filename, "w+", encoding="utf-8") as out:
        out.writelines(json.dumps(record) + "\n" for record in data)

# Load the pool of candidate user names. The file content is joined into a
# single string (all spaces inside each line are stripped) and evaluated as a
# Python literal, so it is expected to hold a list-like literal of names.
# NOTE(review): assumes names.txt is UTF-8 encoded -- confirm.
with open("../../Dataset_Helping/names.txt", "r", encoding="utf-8") as file:
    names = ' '.join(line.strip().replace(" ", "") for line in file)
    # De-duplicate, then sort: iteration order of a set of strings changes
    # between interpreter runs (hash randomization), which would make the
    # later random.sample() calls irreproducible despite random.seed(42).
    names_list = sorted(set(ast.literal_eval(names)))

# Load the forum topics, one JSON object per line.
forum_topics = []
with open("../../Dataset_Helping/T_Multi/T_Multi_forum.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        forum_topic = json.loads(line)
        forum_topics.append(forum_topic)

In [4]:
import random
from datetime import datetime, timedelta

# ---------------------------
# Helper Functions
# ---------------------------
def parse_date(date_str):
    """Convert a 'YYYY-MM-DD' string into a ``datetime.date``."""
    parsed = datetime.strptime(date_str, "%Y-%m-%d")
    return parsed.date()

def format_date(dt):
    """Render a date object as a 'YYYY-MM-DD' string."""
    iso_pattern = "%Y-%m-%d"
    return dt.strftime(iso_pattern)

def random_date_between(start_date, end_date):
    """Pick a uniformly random date in the closed range [start_date, end_date]."""
    span_days = (end_date - start_date).days
    offset = timedelta(days=random.randint(0, span_days))
    return start_date + offset

def generate_work_dates(n, overall_start, overall_end, min_gap=2, max_gap=7, end_buffer_days=14):
    """
    Generate a chain of n strictly increasing work dates inside a date window.

    The first date is drawn at random from overall_start onwards; each
    subsequent date lies between min_gap and max_gap days after the previous
    one. The final date is guaranteed to be at least end_buffer_days before
    overall_end: every gap is capped so that the remaining dates can still be
    placed with the minimum gap. (Previously only the start date was bounded,
    using the minimum gap, so chains with larger gaps could run past
    overall_end.)

    Args:
        n: Number of dates to generate. Values <= 0 yield an empty list.
        overall_start: Earliest allowed date (date object).
        overall_end: End of the window (date object).
        min_gap: Minimum number of days between consecutive dates.
        max_gap: Maximum number of days between consecutive dates.
        end_buffer_days: Days that must remain between the last date and
            overall_end.

    Returns:
        List of n date objects in ascending order.

    Raises:
        ValueError: If min_gap > max_gap, or if the window is too short to
            hold n dates separated by min_gap plus the end buffer.
    """
    if n <= 0:
        return []
    if min_gap > max_gap:
        raise ValueError(f"min_gap ({min_gap}) must not exceed max_gap ({max_gap})")

    # Latest date the final work date may take.
    last_allowed = overall_end - timedelta(days=end_buffer_days)
    # Latest start that still leaves room for n - 1 minimum gaps.
    max_initial = last_allowed - timedelta(days=(n - 1) * min_gap)
    if max_initial < overall_start:
        raise ValueError(
            f"Window {overall_start}..{overall_end} is too short for {n} dates "
            f"with a minimum gap of {min_gap} days and a {end_buffer_days}-day end buffer"
        )

    dates = [random_date_between(overall_start, max_initial)]
    # gaps_after = number of gaps that still have to be placed after this one.
    for gaps_after in range(n - 2, -1, -1):
        # Days we may spend now while reserving min_gap for each later gap.
        # By construction slack >= min_gap, so the randint range is never empty.
        slack = (last_allowed - dates[-1]).days - gaps_after * min_gap
        gap = random.randint(min_gap, min(max_gap, slack))
        dates.append(dates[-1] + timedelta(days=gap))
    return dates

def generate_message_dates(work_dates):
    """
    Derive one message date per work date from a sorted list of date objects.

    Every work date except the last is paired with the work date that follows
    it. The last work date is paired with a random date between itself and
    seven days later (both inclusive).
    """
    # Entries 0..n-2: the message is dated on the next work date.
    message_dates = [work_dates[k + 1] for k in range(len(work_dates) - 1)]

    # Final entry: a random day in the week following the last work date.
    final_work = work_dates[-1]
    latest_message = final_work + timedelta(days=7)
    message_dates.append(random_date_between(final_work, latest_message))
    return message_dates


In [14]:

# ---------------------------
# Dataset Generation for Forum-Style Multi-Conversation
# ---------------------------
# Define overall interval for date generation.
OVERALL_START = parse_date("2024-01-01")
OVERALL_END = parse_date("2024-12-31")

# Number of (work_date, message_date) pairs -- and therefore posts and users --
# generated for every topic.
POSTS_PER_TOPIC = 20

# Build one dataset row per forum topic. Each row holds POSTS_PER_TOPIC posts;
# post i uses the i-th sampled user and the i-th entry of the topic's items.
dataset = []
# Names not yet assigned to any topic, so every user appears in one topic only.
# (The list is never actually shuffled; users are drawn with random.sample.)
shuffled_names_list = names_list.copy()

for topic in forum_topics:
    if len(topic["items"]) < POSTS_PER_TOPIC:
        raise ValueError(
            f"Topic {topic['topic']!r} has {len(topic['items'])} items; "
            f"{POSTS_PER_TOPIC} are required"
        )

    dataset_row = {
        'topic': topic["topic"],
        'forum_question': topic["base_question"],
        "posts": []
    }

    # The three random draws below are kept in this order (work dates, users,
    # final message date) so that a seeded run consumes the RNG identically.
    work_dates = generate_work_dates(POSTS_PER_TOPIC, OVERALL_START, OVERALL_END)

    # Draw this topic's users and take them out of the pool in a single pass.
    # random.sample raises ValueError once the pool holds fewer names than needed.
    topic_users = random.sample(shuffled_names_list, POSTS_PER_TOPIC)
    taken = set(topic_users)
    shuffled_names_list = [name for name in shuffled_names_list if name not in taken]

    message_dates = generate_message_dates(work_dates)

    # zip stops after POSTS_PER_TOPIC entries, so surplus items are ignored.
    for work_dt, message_dt, user, forum_item in zip(
        work_dates, message_dates, topic_users, topic["items"]
    ):
        work_date = format_date(work_dt)
        message_date = format_date(message_dt)
        # Days elapsed between doing the work and posting about it.
        offset = (message_dt - work_dt).days
        forum_post = f"{forum_item} , {offset} days ago"
        # The question asks about the day the work was done, so {date} is
        # filled with the work date (not the message date).
        question = topic["question"].replace("{date}", work_date)
        record = {
            "forum_post": (message_date, user, forum_post),
            "question": question,
            # The expected answer is the user who wrote the post.
            "answer": user
        }
        dataset_row["posts"].append(record)

    dataset.append(dataset_row)

# For demonstration, print the first 5 topic rows (each contains all its posts).
for rec in dataset[:5]:
    print(rec)

{'topic': 'Reading Books', 'forum_question': "Hi everyone, I'd love some book recommendations—any favorites?", 'posts': [{'forum_post': ('2024-02-16', 'Valeen', 'Pride and Prejudice , 3 days ago'), 'question': 'Who was reading a book on 2024-02-13?', 'answer': 'Valeen'}, {'forum_post': ('2024-02-18', 'Dysis', '1984 , 2 days ago'), 'question': 'Who was reading a book on 2024-02-16?', 'answer': 'Dysis'}, {'forum_post': ('2024-02-24', 'Iara', 'To Kill a Mockingbird , 6 days ago'), 'question': 'Who was reading a book on 2024-02-18?', 'answer': 'Iara'}, {'forum_post': ('2024-02-29', 'Jimena', 'The Great Gatsby , 5 days ago'), 'question': 'Who was reading a book on 2024-02-24?', 'answer': 'Jimena'}, {'forum_post': ('2024-03-06', 'Elowen', 'Moby-Dick , 6 days ago'), 'question': 'Who was reading a book on 2024-02-29?', 'answer': 'Elowen'}, {'forum_post': ('2024-03-12', 'Roderic', 'War and Peace , 6 days ago'), 'question': 'Who was reading a book on 2024-03-06?', 'answer': 'Roderic'}, {'forum_p

In [15]:
! ls ../../

Data  Dataset_Helping  GeneratingCodes


In [16]:
import json
from pathlib import Path

# Make sure the target folder is present before anything is written into it.
output_dir = Path("../..//Dataset_Helping/T_Multi")
output_dir.mkdir(parents=True, exist_ok=True)

# The file name carries the year in which the generation window starts.
output_file = output_dir / f"T_Multi_{OVERALL_START.year}_Structured.jsonl"

# JSON Lines output: one topic row per line, non-ASCII characters kept verbatim.
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in dataset)

print(f"Data saved to {output_file}")


Data saved to ../../Dataset_Helping/T_Multi/T_Multi_2024_Structured.jsonl


In [17]:
# Spot-check: display the opening forum question of the first generated topic row.
dataset[0]['forum_question']

"Hi everyone, I'd love some book recommendations—any favorites?"

### Evaluating the works

In [1]:
import json
from pathlib import Path

# Base folder of the helper datasets (created if missing, as before).
output_dir = Path("../../Dataset_Helping")
output_dir.mkdir(parents=True, exist_ok=True)

# Path of the structured dataset file to evaluate.
output_file = output_dir / "T_Multi/T_Multi_2024_Structured.jsonl"

# Load the generated data. The file is written as UTF-8 with non-ASCII
# characters kept verbatim, so it must be decoded as UTF-8 explicitly rather
# than with the platform default encoding.
with open(output_file, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

In [2]:
# Flatten the per-topic rows into a single list of post records, in file order.
work_dates_lists = [post for item in data for post in item['posts']]
print(len(work_dates_lists))

500


In [4]:
# Inspect the first flattened post record (forum_post, question, answer).
print(work_dates_lists[0])

{'forum_post': ['2024-02-16', 'Ishmael', 'I was on holiday and read Pride and Prejudice 3 days ago. It was a fantastic escape!'], 'question': 'Who was reading a book on 2024-02-13?', 'answer': 'Ishmael'}


In [6]:
# Step-2 output: one extraction result per line, aligned by position with the
# flattened posts. The `extracted_prices` name is kept because the following
# cell reads it; the records shown below hold 'work' and 'days' fields.
extracted_prices_path = "../../Dataset_Helping/T_Multi/T_Multi_2024_Structured_Generated_step_2.jsonl"
with open(extracted_prices_path, 'r', encoding='utf-8') as f:
    extracted_prices = [json.loads(line) for line in f]
# A bare `len(...)` in the middle of a cell is never displayed; print it instead.
print(f"Loaded {len(extracted_prices)} extraction records")
extracted_prices[0]


{'work': 'read Pride and Prejudice', 'days': ['2024-02-13']}

In [7]:
# Scan every extraction result and report the two failure modes:
#   * the record is the placeholder '-'  -> show the post it belongs to
#   * the record has an empty 'days' list -> show the record itself
for idx, extraction in enumerate(extracted_prices):
    if extraction == '-':
        print('--------------------------------')
        print("Conv issue", idx)
        print(work_dates_lists[idx])
        continue
    if len(extraction["days"]) == 0:
        print('--------------------------------')
        print("Price issue", idx)
        print(extraction)


--------------------------------
Price issue 3
{'work': '', 'days': []}
--------------------------------
Price issue 4
{'work': '', 'days': []}
--------------------------------
Price issue 9
{'work': '', 'days': []}
--------------------------------
Price issue 31
{'work': '', 'days': []}
--------------------------------
Price issue 45
{'work': '', 'days': []}
--------------------------------
Price issue 70
{'work': 'Task not identified', 'days': []}
--------------------------------
Price issue 72
{'work': '', 'days': []}
--------------------------------
Price issue 76
{'work': '', 'days': []}
--------------------------------
Price issue 79
{'work': '', 'days': []}
--------------------------------
Price issue 99
{'work': 'making Banana Pudding or Banana Cream Pie', 'days': []}
--------------------------------
Conv issue 139
{'forum_post': ['2024-05-23', 'Karia', 'I started Splatoon 2 1 days ago. Already can’t wait for my next session!'], 'question': 'Who was playing a video game on 2024

## Merging the generated conversations with the structured data

In [2]:
import json 
# Structured rows: one JSON object per topic.
structured_data_path = "../../Dataset_Helping/T_Multi/T_Multi_2024_Structured.jsonl"

with open(structured_data_path, 'r', encoding='utf-8') as f:
    structured_data = list(map(json.loads, f))

print(f"Loaded {len(structured_data)} records from structured data file")


# Generated conversations: one JSON value per post.
generated_data_path = "../../Dataset_Helping/T_Multi/T_Multi_2024_Structured_Generated_step_1.jsonl"

with open(generated_data_path, 'r', encoding='utf-8') as f:
    generated_data = list(map(json.loads, f))

print(f"Loaded {len(generated_data)} records from generated data file")

Loaded 25 records from structured data file
Loaded 500 records from generated data file


Structure:

topic - forum_question - forum_post(message_date) - forum_post(user) - forum_post(user_response) - question - answer

user_response is the conversation

In [7]:
# Join every structured post with the generated conversation stored at the
# same flat position (topic index * 20 + post index).
dataset = []

x = 0  # number of posts consumed so far; must track the flat position exactly
for i, row in enumerate(structured_data):
    topic = row['topic']
    forum_question = row['forum_question']
    posts = row['posts']
    for j, post in enumerate(posts):
        message_date = post['forum_post'][0]
        user = post['forum_post'][1]

        # Guard: the flat position only lines up when each earlier topic
        # contributed exactly 20 posts.
        flat_idx = i*20 + j
        if flat_idx != x:
            print(flat_idx, x)
        assert flat_idx == x
        x += 1

        user_response = generated_data[flat_idx]
        # '-' marks a post without a usable conversation; abort the merge.
        if user_response == '-':
            print(user_response, flat_idx)
            raise Exception("user_response is '-'")

        # NOTE(review): user_ID holds the topic index, so all posts of one
        # topic share it -- confirm this is the intended identifier.
        merged = {
            "user_ID": i,
            "topic": topic,
            "forum_question": forum_question,
            "message_date": message_date,
            "user": user,
            "user_response": user_response,
            "question": post['question'],
            "answer": post['answer']
        }
        dataset.append(merged)


In [8]:
# Persist the merged records as JSON Lines, keeping non-ASCII text readable.
with open('../../Data/T_Multi.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in dataset)