### Structure Generation

In [1]:
# Show the kernel's working directory; the relative "../../" paths used by later cells resolve against it.
! pwd

/mounts/Users/cisintern/zeinabtaghavi/MetatagIndexing_2/Dataset_Generation/GeneratingCodes/S_Multi


In [13]:
import ast
import json
import random

# Candidate user names. The file is parsed as a single Python literal that may
# span several lines, so the stripped lines are joined before evaluation.
# The encoding is given explicitly so decoding does not depend on the locale.
with open("../../Dataset_Helping/names.txt", "r", encoding="utf-8") as file:
    names = ' '.join(line.strip() for line in file)
    names_list = sorted(ast.literal_eval(names))

# Wikidata place records, one JSON object per line
# (fields such as 'item', 'label', 'score', 'c4_count').
with open("../../Dataset_Helping/S_Both/Wikidata_Final.jsonl", "r", encoding="utf-8") as f:
    wikidata_samples = [json.loads(line) for line in f]


# Forum topics, one JSON object per line with 'topic' and 'forum_question'.
with open("../../Dataset_Helping/S_Multi/S_Multi_forum.jsonl", "r", encoding="utf-8") as f:
    forum_topics = [json.loads(line) for line in f]


In [14]:
# Derive a 'country' field for every record that does not have one yet.
# Labels look like "<place> - <country>" (e.g. 'Kabwe - Zambia'), so the
# country is the text after the last " - " separator. Splitting on the spaced
# separator keeps hyphenated country names (e.g. "Guinea-Bissau") intact; a
# bare "-" is only used as a fallback when the label has no spaced separator.
for sample in wikidata_samples:
    if 'country' not in sample:
        label = sample['label']
        separator = ' - ' if ' - ' in label else '-'
        country = label.rsplit(separator, 1)[-1].strip()
        sample['country'] = country

In [15]:
# Inspect the first Wikidata record; it should now carry a 'country' field.
wikidata_samples[0]

{'item': 'Nkrumah University',
 'label': 'Kabwe - Zambia',
 'score': 0.17619562149047852,
 'c4_count': 8527,
 'country': 'Zambia'}

In [16]:
# Inspect the first forum topic record.
forum_topics[0]

{'topic': 'Solo backpacking essentials',
 'forum_question': "Hi everyone, I'm planning my first solo backpacking trip and could use some advice on what essentials to pack. Any tips for staying safe and traveling light?"}

In [39]:
import random
import datetime

def generate_user_trip_destination(country_list, num_items):
    """Pick one destination from each of `num_items` distinct countries.

    Args:
        country_list: Mapping from a country name to the list of candidate
            destinations for that country.
        num_items: Number of distinct countries to draw.

    Returns:
        A list of `num_items` destinations, one per sampled country, in the
        order in which the countries were sampled.

    Raises:
        ValueError: Propagated from `random.sample` when `num_items` is
            larger than the number of countries.
    """
    # Sorting first makes the draw depend only on the RNG state, not on the
    # insertion order of the mapping.
    candidate_countries = sorted(country_list)
    chosen_countries = random.sample(candidate_countries, num_items)
    return [random.choice(country_list[name]) for name in chosen_countries]


def generate_message_for_each_user(list_of_users, list_of_destinations, year=2024):
    """Build one forum-post record per (user, destination) pair.

    Users and destinations are paired positionally with `zip`, so surplus
    users beyond the number of destinations are ignored.

    Args:
        list_of_users: User names, one per post.
        list_of_destinations: Destination records; each must provide the
            keys 'item' (place name) and 'country'.
        year: Year used for the randomly drawn post dates.

    Returns:
        A list of dicts with the keys:
          - "forum_post": tuple (date, user, destination name), where the
            date is a zero-padded "YYYY-MM-DD" string,
          - "question": "Who has ever been to <country>?",
          - "answer": the user who visited that country.

    Raises:
        AssertionError: If the number of distinct paired countries differs
            from the number of destinations, i.e. two destinations share a
            country or a destination was left without a user. Distinct
            countries are what make each question's answer unambiguous.
    """
    messages = []
    seen_countries = set()
    for user, destination in zip(list_of_users, list_of_destinations):
        # Month is drawn before day (same RNG call order as the date string
        # is read). The day is capped at 28 so it is valid in every month.
        month = random.randint(1, 12)
        day = random.randint(1, 28)
        # Zero-pad so the date is valid ISO 8601 and sorts correctly as text.
        random_date = f"{year}-{month:02d}-{day:02d}"

        destination_name = destination['item']
        destination_country = destination['country']
        seen_countries.add(destination_country)

        messages.append({
            "forum_post": (random_date, user, destination_name),
            "question": f"Who has ever been to {destination_country}?",
            "answer": user
        })

    assert len(seen_countries) == len(list_of_destinations), (
        "every destination must be paired with a user and belong to a distinct country"
    )
    return messages


In [40]:
# Seed the RNG so the sampled users, destinations and dates are reproducible.
# The merging step pairs these posts with separately generated responses by
# position, so a re-run has to produce the same structure.
random.seed(42)

dataset = []
# Despite its name this copy is never shuffled; users are drawn from it with
# random.sample below.
shuffled_names_list = names_list.copy()
num_items = 20  # posts (unique users / distinct countries) per topic

# Group the Wikidata records by country: {country: [record, ...]}.
country_list = {}
for sample in wikidata_samples:
    country_list.setdefault(sample['country'], []).append(sample)

# For every topic, draw `num_items` unique users and give each of them a
# destination in a different country.
for topic in forum_topics:
    dataset_row = {
        'topic': topic["topic"],
        'forum_question': topic["forum_question"],
    }

    list_of_users = random.sample(shuffled_names_list, num_items)
    list_of_destinations = generate_user_trip_destination(country_list, num_items)
    messages = generate_message_for_each_user(list_of_users, list_of_destinations)
    dataset_row["posts"] = messages
    dataset.append(dataset_row)


In [42]:
from pathlib import Path

# Write the structured dataset as JSON Lines: one topic record per line.
output_dir = Path("../../Dataset_Helping/S_Multi")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "S_Multi_Structured.jsonl"

with output_file.open("w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in dataset)

print(f"Step1: Arithmetic Multi Dataset Structure saved to {output_file}")


Step1: Arithmetic Multi Dataset Structure saved to ../../Dataset_Helping/S_Multi/S_Multi_Structured.jsonl


### Merging

In [1]:
import json
import ast
import random
from datetime import date, timedelta
from pathlib import Path
import os


# Fix the global RNG seed. NOTE(review): nothing visible in this cell draws
# random numbers, so this only affects code run later in the same kernel.
random.seed(42)

import json 

def merging_dataset(base_path):
    """Join the structured forum posts with their generated responses.

    Reads, under ``base_path``:
      - ``Dataset_Helping/S_Multi/S_Multi_Structured.jsonl``: one topic per
        line, each with a ``posts`` list;
      - ``Dataset_Helping/S_Multi/S_Multi_Structured_Generated_conversation.jsonl``:
        one generated response per line.

    Responses are matched to posts purely by position: the k-th post, counted
    across all topics in file order, receives the k-th response. One flat
    record per post is written to ``Data/S_Multi.jsonl`` under ``base_path``
    (the directory is created if needed).

    Note that the ``user_ID`` field of each output record holds the index of
    the topic the post belongs to.

    Args:
        base_path: Root directory of the dataset-generation tree.

    Raises:
        IndexError: If there are fewer generated responses than posts.
        Exception: If a response is the literal string ``'-'`` (presumably a
            marker for a missing generation; the offending index is printed).
    """
    helper_dir = os.path.join(base_path, "Dataset_Helping", "S_Multi")

    # Load the structured data
    structured_data_path = os.path.join(helper_dir, "S_Multi_Structured.jsonl")
    with open(structured_data_path, 'r', encoding='utf-8') as f:
        structured_data = [json.loads(line) for line in f]

    print(f"Loaded {len(structured_data)} records from structured data file")

    # Load the generated conversation data
    generated_data_path = os.path.join(
        helper_dir, "S_Multi_Structured_Generated_conversation.jsonl"
    )
    with open(generated_data_path, 'r', encoding='utf-8') as f:
        generated_data = [json.loads(line) for line in f]

    print(f"Loaded {len(generated_data)} records from generated data file")

    # Fail early, with a clear message, instead of part-way through the loop.
    total_posts = sum(len(record['posts']) for record in structured_data)
    if len(generated_data) < total_posts:
        raise IndexError(
            f"expected at least {total_posts} generated responses, "
            f"found {len(generated_data)}"
        )

    dataset = []

    # Running position of the current post across all topics. Using a counter
    # rather than `topic_index * 20 + post_index` keeps the pairing correct
    # for any number of posts per topic.
    response_index = 0
    for topic_index, record in enumerate(structured_data):
        topic = record['topic']
        forum_question = record['forum_question']
        for post in record['posts']:
            message_date = post['forum_post'][0]
            user = post['forum_post'][1]
            user_response = generated_data[response_index]
            if user_response == '-':
                print(user_response, response_index)
                raise Exception("user_response is '-'")
            dataset.append({
                "user_ID": topic_index,
                "topic": topic,
                "forum_question": forum_question,
                "message_date": message_date,
                "user": user,
                "user_response": user_response,
                "question": post['question'],
                "answer": post['answer']
            })
            response_index += 1

    output_path = os.path.join(base_path, "Data", "S_Multi.jsonl")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        for item in dataset:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')
    # Report the file that was actually written.
    print(f"S_Multi: {len(dataset)}, stored in {output_path}")

# Run the merge when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    # Root of the dataset-generation tree, relative to the working directory.
    base_path = '../../../Dataset_Generation/'
    merging_dataset(base_path)

Loaded 25 records from structured data file
Loaded 500 records from generated data file
- 7


Exception: user_response is '-'