### Schedule Generation

In [7]:
import ast

# Define two lists of 50 distinct works each for W_once and W_repeat.
# These lists do not overlap.
with open("../../Dataset_Helping/T_Uni/works_Temp_Uni_Once.txt", "r") as file:
    phrases = ' '.join([line.strip() for line in file.readlines()])
    W_once_list = sorted(list(ast.literal_eval(phrases)))

with open("../../Dataset_Helping/T_Uni/works_Temp_Uni_Repeat.txt", "r") as file:
    phrases = ' '.join([line.strip() for line in file.readlines()])
    W_repeat_list = sorted(list(ast.literal_eval(phrases)))

with open("../../Dataset_Helping/names.txt", "r") as file:
    names = ' '.join([line.strip() for line in file.readlines()])
    names_list = sorted(list(ast.literal_eval(names)))

In [8]:
import random
from datetime import timedelta

random.seed(42)

# Constants
NUM_USERS = 50
WINDOW_DAYS = 14           # Two-week window: days 1 to 14
DAY_START = 7              # 7:00
DAY_END = 19               # 19:00
MIN_DURATION = 2           # hours
MAX_DURATION = 4           # hours
MAX_BLOCK = 4              # Maximum block length to consider from a border


# Simulate 50 users: User1, User2, …, User50.
users = random.sample(names_list, 50)

# --- Free Time Management Functions ---

def initial_free_intervals():
    """Return the initial free interval for a day: [(DAY_START, DAY_END)]."""
    return [(DAY_START, DAY_END)]

def schedule_in_interval(free_interval, duration):
    """
    Given a free interval (start, end) and desired duration,
    choose one of its borders (start or end) as candidate block
    (limited to MAX_BLOCK hours) and schedule the activity.
    Returns a tuple (scheduled_start, scheduled_end) or None if not possible.
    """
    start, end = free_interval
    interval_length = end - start
    # Define candidate blocks from the start and end:
    block_start = start
    block_length_start = min(MAX_BLOCK, interval_length)
    block_end = end
    block_length_end = min(MAX_BLOCK, interval_length)
    choice = random.choice(["start", "end"])
    if choice == "start":
        available_delay = block_length_start - duration
        if available_delay < 0:
            return None
        delay = random.randint(0, available_delay)
        scheduled_start = block_start + delay
        scheduled_end = scheduled_start + duration
    else:
        available_delay = block_length_end - duration
        if available_delay < 0:
            return None
        delay = random.randint(0, available_delay)
        scheduled_end = block_end - delay
        scheduled_start = scheduled_end - duration
    return (scheduled_start, scheduled_end)

def update_interval(free_interval, scheduled_slot):
    """
    Subtracts the scheduled_slot (s_slot, e_slot) from free_interval (s_free, e_free)
    and returns the list of resulting free intervals.
    """
    s_free, e_free = free_interval
    s_slot, e_slot = scheduled_slot
    new_intervals = []
    if s_slot > s_free:
        new_intervals.append((s_free, s_slot))
    if e_slot < e_free:
        new_intervals.append((e_slot, e_free))
    return new_intervals

def intersect_intervals(intervals):
    """
    Given a list of intervals (start, end), returns their intersection,
    or None if they don't overlap.
    """
    common_start = max(interval[0] for interval in intervals)
    common_end = min(interval[1] for interval in intervals)
    if common_end > common_start:
        return (common_start, common_end)
    else:
        return None


In [9]:
def making_schedule(user, WINDOW_DAYS, W_repeat, W_once):
    user_scheduled_correctly = False
    scheduling_attempts = 0
    while not user_scheduled_correctly:
        scheduling_attempts += 1
        # Initialize free intervals for each day (1 to WINDOW_DAYS)
        free_times = {day: initial_free_intervals() for day in range(1, WINDOW_DAYS+1)}
        schedule = []  # List of scheduled activities for this user.
        
        # -----------------------------
        # 1. Repeating-Sequential Activities
        RS_works = random.sample(W_repeat, 3)
        RS_durations = [3, 3, 4]  # consecutive days
        for work, duration in zip(RS_works, RS_durations):
            valid_start_days = list(range(1, WINDOW_DAYS - duration + 2))
            placed = False
            attempts = 0
            while not placed and attempts < 100:
                start_day = random.choice(valid_start_days)
                day_sequence = list(range(start_day, start_day + duration))
                candidate_intervals = []
                for day in day_sequence:
                    if free_times[day]:
                        candidate_intervals.append(free_times[day][0])
                    else:
                        candidate_intervals = []
                        break
                if not candidate_intervals:
                    attempts += 1
                    continue
                common_interval = intersect_intervals(candidate_intervals)
                if common_interval is None or (common_interval[1] - common_interval[0]) < MIN_DURATION:
                    attempts += 1
                    continue
                dur = random.randint(MIN_DURATION, MAX_DURATION)
                slot = schedule_in_interval(common_interval, dur)
                if slot is None:
                    attempts += 1
                    continue
                # Update free intervals for each day in the sequence.
                for day in day_sequence:
                    new_intervals = []
                    for interval in free_times[day]:
                        if interval[0] <= common_interval[0] and interval[1] >= common_interval[1]:
                            new_intervals.extend(update_interval(interval, slot))
                        else:
                            new_intervals.append(interval)
                    free_times[day] = new_intervals
                if duration == 3:
                    qs_day = day_sequence[1]
                else:
                    qs_day = random.choice(day_sequence[1:3])
                s_slot, e_slot = slot
                if e_slot - s_slot > 1:
                    qs_hour = random.randint(s_slot + 1, e_slot - 1)
                else:
                    qs_hour = s_slot
                user_2 = random.choice([s_u for s_u in users if s_u != user])
                schedule.append({
                    "user_2": user_2,
                    "work": work,
                    "activity_type": "Repeating-Sequential",
                    "days": day_sequence,
                    "hours": slot,
                    "question_time": (qs_day, qs_hour)
                })
                placed = True
            if not placed:
                print(f"Warning: Could not place repeating-sequential work {work} for {user}")
        
        # -----------------------------
        # 2. Repeating-Non-Sequential Activities (with attempt loop)
        remaining_repeat = [w for w in W_repeat if w not in RS_works]
        RN_works = random.sample(remaining_repeat, 3)
        RN_durations = [2, 3, 2]  # non-consecutive days
        for work, duration in zip(RN_works, RN_durations):
            placed = False
            attempts = 0
            valid_days = list(range(1, WINDOW_DAYS+1))
            while not placed and attempts < 100:
                day_sequence = sorted(random.sample(valid_days, duration))
                candidate_intervals = []
                for day in day_sequence:
                    if free_times[day]:
                        candidate_intervals.append(free_times[day][0])
                    else:
                        candidate_intervals = []
                        break
                if not candidate_intervals:
                    attempts += 1
                    continue
                common_interval = intersect_intervals(candidate_intervals)
                if common_interval is None or (common_interval[1] - common_interval[0]) < MIN_DURATION:
                    attempts += 1
                    continue
                dur = random.randint(MIN_DURATION, MAX_DURATION)
                slot = schedule_in_interval(common_interval, dur)
                if slot is None:
                    attempts += 1
                    continue
                # Update free intervals for each day in the sequence.
                for day in day_sequence:
                    new_intervals = []
                    for iv in free_times[day]:
                        if iv[0] <= common_interval[0] and iv[1] >= common_interval[1]:
                            new_intervals.extend(update_interval(iv, slot))
                        else:
                            new_intervals.append(iv)
                    free_times[day] = new_intervals
                if len(day_sequence) >= 2:
                    qs_day = day_sequence[1]
                else:
                    qs_day = day_sequence[0]
                s_slot, e_slot = slot
                if e_slot - s_slot > 1:
                    qs_hour = random.randint(s_slot + 1, e_slot - 1)
                else:
                    qs_hour = s_slot
                user_2 = random.choice([s_u for s_u in users if s_u != user])
                schedule.append({
                    "user_2": user_2,
                    "work": work,
                    "activity_type": "Repeating-Non-Sequential",
                    "days": day_sequence,
                    "hours": slot,
                    "question_time": (qs_day, qs_hour)
                })
                placed = True
                break
            if not placed:
                print(f"Warning: Could not place repeating-non-sequential work {work} for {user}")
        
        # -----------------------------
        # 3. One-Time Activities (with attempt loop)
        OT_works = random.sample(W_once, 9)
        for work in OT_works:
            placed = False
            attempts = 0
            while not placed and attempts < 100:
                day = random.randint(1, WINDOW_DAYS)
                if not free_times[day]:
                    attempts += 1
                    continue
                interval = free_times[day][0]
                dur = random.randint(MIN_DURATION, MAX_DURATION)
                slot = schedule_in_interval(interval, dur)
                if slot is None:
                    attempts += 1
                    continue
                new_ints = []
                for iv in free_times[day]:
                    if iv == interval:
                        new_ints.extend(update_interval(iv, slot))
                    else:
                        new_ints.append(iv)
                free_times[day] = new_ints
                s_slot, e_slot = slot
                if e_slot - s_slot > 1:
                    qs_hour = random.randint(s_slot + 1, e_slot - 1)
                else:
                    qs_hour = s_slot
                user_2 = random.choice([s_u for s_u in users if s_u != user])
                schedule.append({
                    "user_2": user_2,
                    "work": work,
                    "activity_type": "One-Time",
                    "days": [day],
                    "hours": slot,
                    "question_time": (day, qs_hour)
                })
                placed = True
                break
            if not placed:
                print(f"Warning: Could not place one-time work {work} for {user}")
        
        if len(schedule) == 15:
            user_scheduled_correctly = True
        # Store the schedule for the user.

        if scheduling_attempts == 100:
            print(f"Warning: Could not schedule activities for {user}")
            raise Exception(f"Could not schedule activities for {user}")

    return schedule

In [10]:
# -----------------------------
# Scheduling for Uni-Conversation (Generating the schedules)
# For each user, we maintain:
# - A two-week window (days 1 to 14).
# - Free intervals for each day (initially [(DAY_START, DAY_END)]).
# - A Scheduling list (each activity with work, type, days, time slot, and message time).
user_schedules = {}

for user in users:
    
    W_once_1 = random.sample(W_once_list, 25)
    W_once_2 = [w for w in W_once_list if w not in W_once_1]
    W_repeat_1 = random.sample(W_repeat_list, 10)
    W_repeat_2 = [w for w in W_repeat_list if w not in W_repeat_1]
    schedule_1 = making_schedule(user, WINDOW_DAYS, W_repeat_1, W_once_1)
    schedule_2 = making_schedule(user, WINDOW_DAYS, W_repeat_2, W_once_2) 

    for i in range(len(schedule_2)):
        schedule_2[i]['days'] = [day + 14 for day in schedule_2[i]['days']]
        schedule_2[i]['question_time'] = (schedule_2[i]['question_time'][0]+14 , schedule_2[i]['question_time'][1])

    schedule = schedule_1 + schedule_2
    user_schedules[user] = {
        "user": user,
        "schedule": schedule
    }



In [11]:
# Verify that all users have exactly 10 activities
for user in users:
    if len(user_schedules[user]['schedule']) != 30:
        raise Exception(f"User {user} has {len(user_schedules[user]['schedule'])} activities instead of required 10")


In [12]:
# Example: Print schedule for the first 2 users.
for user in users[:1]:
    print(f"Schedule for {user}:")
    for activity in user_schedules[user]["schedule"]:
        print(activity)
    print("\n" + "="*50 + "\n")



Schedule for Elita:
{'user_2': 'Astraea', 'work': 'practice a musical instrument', 'activity_type': 'Repeating-Sequential', 'days': [7, 8, 9], 'hours': (15, 19), 'question_time': (8, 17)}
{'user_2': 'Granger', 'work': 'attend a virtual meeting', 'activity_type': 'Repeating-Sequential', 'days': [4, 5, 6], 'hours': (15, 19), 'question_time': (5, 18)}
{'user_2': 'Kiahra', 'work': 'check daily emails', 'activity_type': 'Repeating-Sequential', 'days': [6, 7, 8, 9], 'hours': (9, 11), 'question_time': (8, 10)}
{'user_2': 'Madoc', 'work': 'monitor system performance', 'activity_type': 'Repeating-Non-Sequential', 'days': [3, 11], 'hours': (15, 19), 'question_time': (11, 17)}
{'user_2': 'Urania', 'work': 'schedule routine maintenance', 'activity_type': 'Repeating-Non-Sequential', 'days': [2, 11, 12], 'hours': (11, 15), 'question_time': (11, 12)}
{'user_2': 'Matilda', 'work': 'attend daily stand-up meeting', 'activity_type': 'Repeating-Non-Sequential', 'days': [12, 14], 'hours': (7, 10), 'questio

In [13]:
from datetime import date, timedelta
import random

# Define overall period boundaries.
START_YEAR = 2020
END_YEAR = 2024

overall_start = date(START_YEAR, 1, 1)
overall_end = date(END_YEAR, 12, 31)

# Total number of users: 50.
NUM_USERS = 50

# We need to assign 50 non-overlapping two-week intervals (14-day blocks)
# within the overall period. One approach is to partition the overall period
# into 50 segments and then choose a random start date from each segment.
total_days = (overall_end - overall_start).days
segment_length = total_days // NUM_USERS

user_start_dates = {}
sorted_users = sorted(user_schedules.keys())  # Assuming user_schedules was generated earlier.

for i, user in enumerate(sorted_users):
    # For segment i, the segment start:
    seg_start = overall_start + timedelta(days=i * segment_length)
    # The latest valid start in this segment must allow for a 14-day window.
    seg_end = seg_start + timedelta(days=segment_length - 28)
    if seg_end < seg_start:
        raise ValueError(f"Segment {i} has no valid start date.")
        
    # Choose a random start date within [seg_start, seg_end]
    delta = (seg_end - seg_start).days
    offset = random.randint(0, delta) if delta > 0 else 0
    base_date = seg_start + timedelta(days=offset)
    user_start_dates[user] = base_date

# Now, update the schedules: for every scheduled activity for each user,
# convert the day indices into actual calendar dates by adding that many days
# to the user's base start date. Here, we assume that day "d" in the schedule
# corresponds to: actual_date = base_date + d days.
#
# (For example, if a user's base date is 2024-03-05, then:
#    day 1  => 2024-03-06,
#    day 2  => 2024-03-07, etc.)
#
# Also update the message_time: its day component is replaced with the actual date.

for user, schedule_info in user_schedules.items():
    base_date = user_start_dates[user]
    # Update each scheduled activity
    for activity in schedule_info["schedule"]:
        # Update the list of days
        actual_days = []
        for day in activity["days"]:
            # Map day index to actual calendar date: add "day" days to base_date.
            # (So day 1 becomes base_date + 1 day.)
            actual_date = base_date + timedelta(days=day)
            actual_days.append(actual_date.strftime("%Y-%m-%d"))
        activity["days"] = actual_days
        
        # Update message_time: convert its day (which is a day index) to actual date.
        qs_day_index, qs_hour = activity["question_time"]
        actual_qs_date = base_date + timedelta(days=qs_day_index)
        activity["question_time"] = (actual_qs_date.strftime("%Y-%m-%d"), qs_hour)


In [14]:
# Example: Print schedule for the first 2 users.
for user in users[:2]:
    print(f"Schedule for {user}:")
    for activity in user_schedules[user]["schedule"]:
        print(activity)
    print("\n" + "="*50 + "\n")


Schedule for Elita:
{'user_2': 'Astraea', 'work': 'practice a musical instrument', 'activity_type': 'Repeating-Sequential', 'days': ['2021-08-09', '2021-08-10', '2021-08-11'], 'hours': (15, 19), 'question_time': ('2021-08-10', 17)}
{'user_2': 'Granger', 'work': 'attend a virtual meeting', 'activity_type': 'Repeating-Sequential', 'days': ['2021-08-06', '2021-08-07', '2021-08-08'], 'hours': (15, 19), 'question_time': ('2021-08-07', 18)}
{'user_2': 'Kiahra', 'work': 'check daily emails', 'activity_type': 'Repeating-Sequential', 'days': ['2021-08-08', '2021-08-09', '2021-08-10', '2021-08-11'], 'hours': (9, 11), 'question_time': ('2021-08-10', 10)}
{'user_2': 'Madoc', 'work': 'monitor system performance', 'activity_type': 'Repeating-Non-Sequential', 'days': ['2021-08-05', '2021-08-13'], 'hours': (15, 19), 'question_time': ('2021-08-13', 17)}
{'user_2': 'Urania', 'work': 'schedule routine maintenance', 'activity_type': 'Repeating-Non-Sequential', 'days': ['2021-08-04', '2021-08-13', '2021-08

In [18]:
import random
from datetime import datetime

for user in user_schedules:
    # Get all question times for this user's schedule
    question_times = [
        activity["question_time"] 
        for activity in user_schedules[user]["schedule"]
    ]
    
    # Create a shuffled copy of question times to use as message times
    message_times = question_times.copy()
    
    # Keep shuffling until no item has same question and message time
    valid_shuffle = False
    while not valid_shuffle:
        random.shuffle(message_times)
        # Check if any item would get same question and message time
        valid_shuffle = all(q != m for q, m in zip(question_times, message_times))
    
    # Assign the shuffled message times to activities
    for activity, message_time in zip(user_schedules[user]["schedule"], message_times):
        activity["message_time"] = message_time

        # Calculate offset days between message time and each activity day
        message_date = datetime.strptime(message_time[0], "%Y-%m-%d")
        offset_days = []

        for activity_day in activity["days"]:
            activity_date = datetime.strptime(activity_day, "%Y-%m-%d")
            # Calculate the difference in days
            delta = (activity_date - message_date).days
            offset_days.append(delta)
        
        activity["offset_days"] = offset_days

In [19]:
# Example: Print the post-processed schedule for the first user.
first_user = sorted_users[0]
for schedule in user_schedules[first_user]["schedule"]:
    schedule["message_time"] = (schedule["message_time"][0], datetime.strptime(schedule["message_time"][0], "%Y-%m-%d").strftime("%A").lower() , schedule["message_time"][1])
print(f"User: {first_user}")
print("Base Start Date:", user_start_dates[first_user].strftime("%Y-%m-%d"))
# print("Two-week window:", user_schedules[first_user]["actual_two_week_window"])
print("Scheduled Activities:")
for activity in sorted(user_schedules[first_user]["schedule"], key=lambda x: x["message_time"]):
    print(activity)

User: Alaina
Base Start Date: 2020-01-04
Scheduled Activities:
{'user_2': 'Dariusz', 'work': 'record a podcast episode', 'activity_type': 'One-Time', 'days': ['2020-01-14'], 'hours': (7, 10), 'question_time': ('2020-01-14', 9), 'message_time': ('2020-01-05', 'sunday', 16), 'offset_days': [9]}
{'user_2': 'Keanu', 'work': 'set up a conference call', 'activity_type': 'One-Time', 'days': ['2020-01-07'], 'hours': (12, 15), 'question_time': ('2020-01-07', 13), 'message_time': ('2020-01-06', 'monday', 10), 'offset_days': [1]}
{'user_2': 'Kalina', 'work': 'update a training manual', 'activity_type': 'One-Time', 'days': ['2020-01-26'], 'hours': (8, 11), 'question_time': ('2020-01-26', 9), 'message_time': ('2020-01-07', 'tuesday', 13), 'offset_days': [19]}
{'user_2': 'Omar', 'work': 'conduct a market survey', 'activity_type': 'One-Time', 'days': ['2020-01-19'], 'hours': (7, 9), 'question_time': ('2020-01-19', 8), 'message_time': ('2020-01-08', 'wednesday', 10), 'offset_days': [11]}
{'user_2': 'S

In [62]:
import json
from pathlib import Path

# Create output directory if it doesn't exist
output_dir = Path("../../Dataset_Helping/T_Uni")
output_dir.mkdir(parents=True, exist_ok=True)

# Create filename with date range
output_file = output_dir / f"T_Uni_Structured.jsonl"

with open(output_file, 'w') as f:
    for user, schedule_data in user_schedules.items():
        # Create a copy of the schedule data and add the user
        schedule_with_user = schedule_data.copy()
        schedule_with_user['user'] = user
        for schedule in schedule_with_user['schedule']:
            schedule["message_time"] = (schedule["message_time"][0], datetime.strptime(schedule["message_time"][0], "%Y-%m-%d").strftime("%A").lower() , schedule["message_time"][1])
        # Write each user's schedule as a JSON line
        json.dump(schedule_with_user, f)
        f.write('\n')

print(f"User schedules saved to {output_file}")


User schedules saved to ../../Dataset_Helping/T_Uni/T_Uni_Structured.jsonl


### Reforming

In [63]:
import re

generated_conversation = []
with open("../../Dataset_Helping/T_Uni/T_Uni_Structured_Generated_step_1.jsonl", "r") as file:
    for line in file:
        output = line.strip()
        generated_conversation.append(output)


In [64]:
def filter_generated_conversation_step_1(generated_conversation):

    pattern = r'[\[|\{|\(]\s*[\[|\{|\(](.*?)[\)|\]|\}]\s*,\s*[\[|\{|\(](.*?)[\)|\]|\}]\s*,\s*[\[|\{|\(](.*?)[\)|\]|\}]\s*,\s*[\[|\{|\(](.*?)[\)|\]|\}]\s*,\s*[\[|\{|\(](.*?)[\)|\]|\}]\s*,\s*[\[|\{|\(](.*?)[\)|\]|\}]\s*,\s*[\[|\{|\(](.*?)[\)|\]|\}]\s*,\s*[\[|\{|\(](.*?)[\)|\]|\}]\s*,\s*[\[|\{|\(](.*?)[\)|\]|\}]\s*,\s*[\[|\{|\(](.*?)[\)|\]|\}]\s*[,]?\s*[\]|\)|\}]'
    conversation_list = []
    mistaken_conversation_idx = []
    num_conversation = 10
    for idx , text in enumerate(generated_conversation):
        # Find all matches; each match is a tuple of three strings
        matches = re.findall(pattern, str(text).replace("\\n", '').replace("\\", ''))

        # Option 1: Convert to a list of lists
        conversation_list_extracted = [tuple(match) for match in matches]
        if len(conversation_list_extracted) != 1:
            mistaken_conversation_idx.append(idx)
            conversation_list.extend('-')
        if len(conversation_list_extracted) == 1:
            if len(conversation_list_extracted[0]) == num_conversation:
                final_coversation = ast.literal_eval("[" + " ".join(["(" + conv + ") ," for conv in conversation_list_extracted[0]])[:-1] + "]")
                conversation_list.append(final_coversation)
            else:
                conversation_list.extend('-')
                print('------------')
                print(conversation_list_extracted)

    return mistaken_conversation_idx, conversation_list

mistaken_conversation_idx, conversation_list = filter_generated_conversation_step_1(generated_conversation)
print(len(mistaken_conversation_idx))

0


In [65]:
mistaken_conversation_idx

[]

In [67]:
import json
structured_data = []
with open("../../Dataset_Helping/T_Uni/T_Uni_Structured.jsonl", "r") as file:
    for line in file:
        output = json.loads(line.strip())
        structured_data.append(output)
schedules = []
users_name = []
for i in range(len(structured_data)):
    for schedule in structured_data[i]['schedule']:
        schedules.append(schedule)
        users_name.append(structured_data[i]['user'])


len(schedules)

1500

In [68]:
import re
import ast
generated_response = []
with open("../../Dataset_Helping/T_Uni/T_Uni_Structured_Generated_step_2.jsonl", "r") as file:
    for line in file:
        output = line.strip()
        generated_response.append(output)

In [69]:
def filter_generated_works_parsing_step_2(generated_response, schedules):

    pattern_2 = r'\{["\'\s]*work["\'\s]*:[^}]+,["\'\s]*days["\'\s]*:[^}]+,["\'\s]*hours["\'\s]*:[^}]+\}'
    extracted_work_list = generated_response.copy()
    mistaken_extracted_idx = []
    for idx , text in enumerate(generated_response):
        # Find all matches; each match is a tuple of three strings
        matches = re.findall(pattern_2, text.replace("\\n", '').replace("\\", ''))
        schedule = schedules[idx]
        if len(matches) > 0:
        # Option 1: Convert to a list of lists
            try:
                extracted_item = [(match) for match in matches]
                try:
                    if type(extracted_item) == list:
                        work_dictionary = ast.literal_eval(extracted_item[0])
                    else:
                        work_dictionary = ast.literal_eval(extracted_item)

                    correct_day = schedule['days']
                    extracted_day = work_dictionary['days']

                    correct_hour = schedule['hours']
                    extracted_hour = work_dictionary['hours']

                    correct_hour_flag = False
                    if correct_hour[0] == extracted_hour[0] and correct_hour[1] == extracted_hour[1]:
                        correct_hour_flag = True

                    correct_day_flag = True
                    for day in set(correct_day):
                        if day not in set(extracted_day):
                            correct_day_flag = False

                    if len(set(correct_day)) != len(set(extracted_day)):
                        correct_day_flag = False

                    if not (correct_day_flag and correct_hour_flag):
                        mistaken_extracted_idx.append(idx)
                    else:
                        extracted_work_list[idx] = work_dictionary
                        mistaken_extracted_idx = [i for i in mistaken_extracted_idx if i != idx]

                except:
                    mistaken_extracted_idx.append(idx)
                    extracted_work_list[idx] = '-'
            except:
                mistaken_extracted_idx.append(idx)
                extracted_work_list[idx] = '-'
        else:
            mistaken_extracted_idx.append(idx)
            extracted_work_list[idx] = '-'
    
    return sorted(set(mistaken_extracted_idx)), extracted_work_list


In [70]:
mistaken_works_idx, extracted_work_list = filter_generated_works_parsing_step_2(generated_response, schedules)

In [71]:
[i for i in range(len(extracted_work_list)) if extracted_work_list[i] == '-']

[582, 584, 614, 700, 1245, 1272, 1362]

In [72]:
mistaken_works_idx

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [73]:
handy_edited_convs = {
"i_12" : '''[ ["2020-05-10", "Alaina", "Hey Aquila, how's it going?"], ["2020-05-10", "Alaina", "I feel like I haven't seen you in ages, what's new with you?"], ["2020-05-10", "Alaina", "By the way, I was thinking of trying out a new restaurant this weekend, have you eaten there before?"], ["2020-05-10", "Alaina", "So, I was just thinking about you and I wanted to reach out."], ["2020-05-10", "Aquila", "I've been keeping busy, thanks for asking. I have to monitor system performance today from 10 a.m. for 2 hours."], ["2020-05-10", "Aquila", "I also have to do it a couple of weeks ago, on the 24th of last month, and then I'll do it again soon, on the 11th and 12th of next month."], ["2020-05-10", "Alaina", "That sounds like a lot of work, are you sure you can handle it all?"], ["2020-05-10", "Aquila", "Yeah, I'm confident I can manage it. I just need to make sure I stay on top of things."], ["2020-05-10", "Alaina", "Well, if you need any help don't hesitate to ask, okay?"], ["2020-05-10", "Alaina", "Take care and talk to you soon!"]]''',
"i_76" : '''[ ["2023-10-29", "Alaina", "Hey Serilda, how's it going?"],  ["2023-10-29", "Alaina", "I feel like I haven't seen you in ages. What's new with you?"],  ["2023-10-29", "Alaina", "By the way, have you got any fun plans for the upcoming week?"],  ["2023-10-29", "Alaina", "I'm trying to get a headcount for a party I'm planning."],  ["2023-10-29", "Serilda", "I have to review a research paper in three days later, from 7 in the morning for 2 hours."],  ["2023-10-29", "Alaina", "That sounds like a big task! Make sure you're taking breaks to stretch and stay focused."],  ["2023-10-29", "Serilda", "Yeah, I'm planning to get it done quickly so I can move on to other things."],  ["2023-10-29", "Alaina", "I'm sure you'll do great. You're always so on top of things."],  ["2023-10-29", "Alaina", "So, I was thinking of having the party on a weekend. What do you think?"],  ["2023-10-29", "Alaina", "Let me know if you have any suggestions or ideas!"] ]''',
"i_184" : '''[ ["2020-02-26", "Alaina", "Hey Alair, how's it going?"], ["2020-02-26", "Alaina", "What's on your mind these days?"], ["2020-02-26", "Alaina", "You seem a bit busy lately."], ["2020-02-26", "Alaina", "Do you have any fun plans for the weekend?"], ["2020-02-26", "Alair", "I have to prepare a daily briefing starting from first of the next month for three consecutive days."], ["2020-02-26", "Alair", "I'll be doing it from 3 p.m. until 3 hours later each day."], ["2020-02-26", "Alaina", "That sounds like a lot of work. Are you feeling up to the task?"], ["2020-02-26", "Alaina", "Do you get a chance to take breaks in between?"], ["2020-02-26", "Alaina", "I hope it all pays off in the end. You must be looking forward to wrapping it up."], ["2020-02-26", "Alaina", "Take care and let me know if there's anything I can do to help."] ]''',
"i_189" : '''[ ["2020-03-07", "Person2", "Hey Alair, how's it going?"],["2020-03-07", "Person2", "I feel like I haven't seen you in ages, what's new with you?"], ["2020-03-07", "Person2", "How was your week? Any exciting plans?"], ["2020-03-07", "Person2", "I'm so behind on my own schedule, I need to get my act together."], ["2020-03-07", "Alair", "I have to manage a daily work schedule on the 12th, from 8 in the morning for 2 hours, I have done it a while ago, on the 20th of the last month, same time, for 2 hours."], ["2020-03-07", "Person2", "That sounds like a big task! Are you feeling organized?"], ["2020-03-07", "Alair", "Actually, not really, I have a lot to do."], ["2020-03-07", "Person2", "You're a rockstar, Alair! I'm sure you'll get it done."], ["2020-03-07", "Alair", "Thanks for the support! I'm just trying to stay on top of things."], ["2020-03-07", "Person2", "No problem, that's what friends are for. Let's catch up soon!"] ]''',
"i_280" : '''[ ["2021-09-11", "Alaina", "Hey Giselle, how's it going?"], ["2021-09-11", "Alaina", "I feel like I haven't seen you in ages, what's new with you?"], ["2021-09-11", "Alaina", "By the way, I was thinking of trying out a new coffee shop this weekend, have you tried any good ones lately?"], ["2021-09-11", "Alaina", "I'm so busy with work, I'm looking forward to the weekend already."], ["2021-09-11", "Giselle", "I've been pretty occupied with work on daily planning, actually."], ["2021-09-11", "Giselle", "I have done daily planning on the 23rd of last month, again two days after that, but it did not finish, hence I had to do it again tomorrow, I hope it gets finished."], ["2021-09-11", "Giselle", "It usually takes me a couple of hours, from 9 in the morning until 2 hours later on each of these three days."], ["2021-09-11", "Giselle", "It's been a bit hectic, but I'm managing to stay on top of things."], ["2021-09-11", "Alaina", "That sounds like a lot of work, but I'm sure you'll get it done. Do you have any fun plans for the rest of the weekend?"], ["2021-09-11", "Alaina", "I'm thinking of going for a hike on Sunday, the weather is supposed to be great."] ]''',
"i_402" : '''[ ["2023-02-07", "Alaina", "Hey Nestor, how's it going?"], ["2023-02-07", "Alaina", "I feel like I haven't seen you around the office much lately. What's new with you?"], ["2023-02-07", "Alaina", "By the way, I was thinking of grabbing lunch today. Have you eaten yet?"], ["2023-02-07", "Alaina", "I'm thinking of trying out that new sandwich shop down the street. Have you been there before?"], ["2023-02-07", "Nestor", "I've had a pretty busy schedule lately. I've been monitoring team activity on certain days."], ["2023-02-07", "Nestor", "It's been a bit sporadic, but I've had to keep an eye on things from 1 p.m. until 2 hours later."], ["2023-02-07", "Nestor", "I did it on the 30th of the last month, and then I had to do it again twelve days after today. And actually, I have another one coming up in two days after that."], ["2023-02-07", "Nestor", "It's been a bit of a challenge keeping track of everything, but I'm managing."], ["2023-02-07", "Alaina", "That sounds like a lot of work! I'm sure it's not easy keeping track of everything."], ["2023-02-07", "Alaina", "If you need any help or just want to chat about it, I'm here to listen."], ]''',
"i_424" : '''[ ["2023-08-25", "Alex", "Hey Rionne, how's it going? I feel like I haven't seen you in ages."], ["2023-08-25", "Alex", "So, I was thinking of grabbing lunch this weekend. Have you tried that new sandwich shop downtown?"], ["2023-08-25", "Alex", "By the way, I was meaning to ask you, have you been keeping busy lately?"], ["2023-08-25", "Alex", "I'm a bit worried that I might have some work to do over the weekend, but I'm hoping to get it all done on Friday."], ["2023-08-25", "Rionne", "I've been pretty busy actually. I had to update the task board a few days ago, from 10 in the morning until 4 hours later, and I had to do it for 3 consecutive days, starting on 15 days ago."], ["2023-08-25", "Alex", "That sounds like a lot of work! I'm sure it was worth it in the end, though. Did you get any help with it or was it all on your own?"], ["2023-08-25", "Alex", "I've been meaning to organize my own workspace, maybe I can get some inspiration from you."], ["2023-08-25", "Alex", "So, are you free to hang out this weekend or are you still catching up on work?"], ["2023-08-25", "Rionne", "I'm actually free now that I've finished updating the task board. Let's plan something for this weekend!"], ["2023-08-25", "Alex", "Awesome! I'll send you a message on Friday to finalize plans. Have a great evening!"] ]''',
"i_582" : '''[ ["2022-06-21", "Alaina", "Hey Lilleth, how's it going? I feel like I haven't seen you in ages."], ["2022-06-21", "Alaina", "So, I was thinking of trying out that new coffee shop downtown. Have you been there yet?"], ["2022-06-21", "Alaina", "By the way, I hope you're doing well. I know you've been busy lately."], ["2022-06-21", "Alaina", "I'm thinking of planning a little get-together this weekend. Would you be interested?"], ["2022-06-21", "Lilleth", "I've been doing some checking of routine communications recently."], ["2022-06-21", "Lilleth", "I did it a while ago, 17 days ago, from 7 in the morning until 4 hours later."], ["2022-06-21", "Lilleth", "And then I had to do it four days ago, again from 7 in the morning until 4 hours later."], ["2022-06-21", "Lilleth", "And I've got one more to do, 2 days after that one, same hours, you know, checking routine communications."], ["2022-06-21", "Alaina", "That sounds like a lot of work! I'm sure it's not easy keeping track of all those communications."], ["2022-06-21", "Alaina", "But I'm glad you're on top of it. You're always so organized, Lilleth."] ]''',
"i_584" : '''[ ["2022-06-09", "Steffen", "Hey Lilleth, how's it going?"], ["2022-06-09", "Lilleth", "Hey Steffen, I'm good. Just been busy with some work. I have to monitor quality control metrics tomorrow."], ["2022-06-09", "Steffen", "Ah, that sounds important. How long do you think you'll be tied up with that?"], ["2022-06-09", "Lilleth", "Just a couple of hours, from 11 a.m. until 2 hours after that."], ["2022-06-09", "Steffen", "Okay, got it. And do you have any other commitments like that coming up?"], ["2022-06-09", "Lilleth", "Actually, I should repeat it 7 days after today. Also have done in once in the last day of the last month."], ["2022-06-09", "Steffen", "Alright, well just make sure you stay on top of things. You know how important quality control is."], ["2022-06-09", "Lilleth", "Definitely, I'm on it."], ["2022-06-09", "Steffen", "Great, I'm sure it'll all go smoothly then."], ["2022-06-09", "Steffen", "Anyway, I should let you get back to work. Talk to you later!"] ]''',
"i_614" : '''[ ["2024-01-11", "Alaina", "Hey Shifra, how's it going? Thursday is almost here!"], ["2024-01-11", "Alaina", "I feel like I haven't seen you in ages, what's new with you on almost-thursday?"], ["2024-01-11", "Alaina", "By the way, I was thinking of starting a new exercise routine, have you ever tried anything like that on Thursdays?"], ["2024-01-11", "Alaina", "I'm thinking of trying to get some fresh air and get moving, you know?"], ["2024-01-11", "Shifra", "I've been trying to get some exercise too, actually. I go for a morning jog at various times. At the end of last year, 28th of the last month, I did, then 6 days ago. Another one is scheduled soon, two days later from now. from eleven until a few hours later "], ["2024-01-11", "Alaina", "That sounds like a great plan! I'm sure it'll be worth it. You always seem so energized after your workouts."], ["2024-01-11", "Alaina", "Do you have any favorite jogging spots?"], ["2024-01-11", "Alaina", "I'm always looking for new places to explore."], ["2024-01-11", "Alaina", "Anyway, it was great catching up with you, Shifra. Let's do it again soon!"], ["2024-01-11", "Alaina", "Take care!"] ] ''',
"i_673" : '''[ ["2021-06-28", "Alaina", "Hey Fortune, how's it going?"], ["2021-06-28", "Alaina", "I feel like I haven't seen you in ages, what's new with you?"], ["2021-06-28", "Alaina", "By the way, I was thinking of trying out a new coffee shop this week, have you tried any good ones lately?"], ["2021-06-28", "Alaina", "So, do you have any fun plans for the summer?"], ["2021-06-28", "Fortune", "I've been pretty busy with work lately. I have to prepare a daily briefing sometimes."], ["2021-06-28", "Fortune", "It's a bit sporadic, but I had to do it about 19 days ago, and I have to do it again in a 5 days later."], ["2021-06-28", "Fortune", "It usually takes me a couple of hours, from noon until 2 p.m."], ["2021-06-28", "Fortune", "I'm just trying to stay on top of things, you know?"], ["2021-06-28", "Alaina", "That sounds like a lot of work! I'm sure you'll get it done, though."], ["2021-06-28", "Alaina", "Well, it was great catching up with you, Fortune. Let's do it again soon!"] ]''',
"i_700" : '''[ ["2021-12-29", "Alaina", "Hey Josmar, how's it going? I feel like I haven't seen you in ages."], ["2021-12-29", "Alaina", "So, what's new with you? Any exciting plans or just the usual routine?"], ["2021-12-29", "Alaina", "By the way, I was thinking of trying out a new hobby. Have you ever thought of learning a new skill?"], ["2021-12-29", "Alaina", "I'm thinking of something musical, but I'm not sure if I have the time for it."], ["2021-12-29", "Josmar", "I've been pretty busy lately, but I did manage to practice a musical instrument a few times recently."], ["2021-12-29", "Josmar", "I had to do it about 22 days ago, then two weeks ago, and then I had another session 4 days after today. It is quite a spread-out schedule."], ["2021-12-29", "Josmar", "Each time, I had to do it from 7 in the morning for 3 hours. It's always a bit of a challenge to get started that early!"], ["2021-12-29", "Josmar", "But it's great to have some consistency, even if it's not on consecutive days. How about you, have you been up to anything exciting?"], ["2021-12-29", "Alaina", "That's really cool that you're making time for it. I'm sure it's not easy to practice a musical instrument."], ["2021-12-29", "Alaina", "I'm actually thinking of attending a concert soon. Have you been to any good ones recently?"] ]''',
"i_703" : '''[ ["2021-12-26", "Steffen", "Hey Josmar, how's it going? I feel like I haven't seen you in ages."], ["2021-12-26", "Steffen", "What's new with you? Any exciting plans for the holidays?"], ["2021-12-26", "Steffen", "I'm trying to get some work done before the new year, but it's tough to focus."], ["2021-12-26", "Steffen", "Do you have any fun activities planned for the weekend?"], ["2021-12-26", "Josmar", "I've been quite busy with reviewing project metrics, actually."], ["2021-12-26", "Josmar", "I had to do it a three days ago and I'll have to do it again 6 days later from now."], ["2021-12-26", "Josmar", "It's a couple of hours of work, from 3 p.m. until 6 p.m. on both days."], ["2021-12-26", "Josmar", "I'm just trying to get it out of the way so I can enjoy the rest of my holiday."], ["2021-12-26", "Steffen", "That sounds like a drag. I'm sure it'll be over before you know it."], ["2021-12-26", "Steffen", "Well, I should let you get back to it then. Have a great day and happy holidays!"] ]''',
"i_764" : '''[ ["2023-12-18", "Lara", "Hey Sherise, how's it going? I feel like I haven't seen you all week."], ["2023-12-18", "Sherise", "Hey Lara, I'm doing alright. Just been busy with some work. I have to review operational data for a few days this week."], ["2023-12-18", "Lara", "That sounds like a lot of work! Are you doing it all in one day or spread out over a few days?"], ["2023-12-18", "Sherise", "It's spread out. I did it a while back, on the 5th, and then I have to do it again 7 days ago, and finally 5 days later from now. It's just a few hours each time, from 7 in the morning until 4 hours later."], ["2023-12-18", "Lara", "Okay, that makes sense. I'm sure it's not fun, but at least it's not all at once."], ["2023-12-18", "Lara", "By the way, have you heard about the holiday party plans?"], ["2023-12-18", "Sherise", "No, I haven't. What's going on?"], ["2023-12-18", "Lara", "I heard it's going to be on the 25th. We should grab lunch together before it starts."], ["2023-12-18", "Sherise", "Sounds like a plan!"], ["2023-12-18", "Lara", "Great, I'll send you a message on Monday to finalize plans."] ]''',
"i_820" : '''[ ["2022-07-04", "Lara", "Good morning, Mairwen! How are you doing today?"], ["2022-07-04", "Mairwen", "I'm doing alright, thanks. Just been thinking about some work I have to do."], ["2022-07-04", "Lara", "Oh yeah? What's been on your mind?"], ["2022-07-04", "Mairwen", "I've been tasked with monitoring quality control metrics. I had to do it a while back on the 7 days ago, then again tomorrow, and another time is 15 days later."], ["2022-07-04", "Lara", "That sounds interesting. I'm sure it's keeping you on your toes."], ["2022-07-04", "Mairwen", "It is. On those days, I had to do it from 8 in the morning for 3 hours."], ["2022-07-04", "Lara", "Okay, well you're doing great. I'm sure it'll all pay off in the end."], ["2022-07-04", "Mairwen", "I hope so! I'm just taking it one day at a time."], ["2022-07-04", "Lara", "That's the right attitude. I'm sure you'll do great."], ["2022-07-04", "Lara", "Anyway, I should let you get back to work. Talk to you later!"] ] ''',   
"i_822" : '''[ ["2022-07-21 Thursday", "Alaina", "Hey Mairwen, how's it going? I feel like I haven't seen you in ages."], ["2022-07-21 Thursday", "Alaina", "So, what's new with you? Any exciting plans coming up?"], ["2022-07-21 Thursday", "Alaina", "I'm thinking of planning a weekend getaway. Have you traveled anywhere fun recently?"], ["2022-07-21 Thursday", "Alaina", "By the way, I was thinking of trying out a new restaurant downtown. Have you been there?"], ["2022-07-21 Thursday", "Mairwen", "I've been pretty busy, actually. I have to conduct a daily review session 18 days ago, and then 3 days ago, and finally 2 days later from now, from 7 in the morning until 4 hours later."], ["2022-07-21 Thursday", "Alaina", "That sounds like a lot of work! I'm sure it's keeping you on your toes."], ["2022-07-21 Thursday", "Alaina", "I'm glad you're staying organized, though. That's definitely important for your line of work."], ["2022-07-21 Thursday", "Alaina", "So, do you have any downtime planned or are you always going to be busy like this?"], ["2022-07-21 Thursday", "Mairwen", "I'm just taking things one day at a time, I suppose. I'm just trying to stay focused and get everything done."], ["2022-07-21 Thursday", "Alaina", "Well, you're definitely doing a great job of juggling everything. I'm sure it'll all pay off in the end."] ]''',
"i_842" : '''[ ["2024-11-18", "Alaina", "Hey Zosime, how was your week?"], ["2024-11-18", "Alaina", "Did you have a chance to catch up on anything important recently?"], ["2024-11-18", "Alaina", "I feel like I haven't seen you around much lately."], ["2024-11-18", "Alaina", "How's life been treating you?"], ["2024-11-18", "Zosime", "I've been pretty occupied with work. I had to participate in a daily briefing a few days ago, from 7 in the morning until 2 hours later."], ["2024-11-18", "Zosime", "It was a sequence of days, starting from 12 days ago and lasting for 4 days."], ["2024-11-18", "Zosime", "I had to do it at the same time every day."], ["2024-11-18", "Zosime", "I'm glad that's over now."], ["2024-11-18", "Alaina", "That sounds intense. I'm sure you're relieved it's done."], ["2024-11-18", "Alaina", "Let's catch up soon, maybe over coffee?"] ]''',
"i_1060" : '''[ ["2023-09-25", "Alaina", "Hey Roxandra, how's your week going so far?"], ["2023-09-25", "Alaina", "I feel like I haven't seen you around much lately, what's new?"], ["2023-09-25", "Alaina", "By the way, have you got any fun plans for the weekend?"], ["2023-09-25", "Alaina", "I'm thinking of trying out a new restaurant downtown, have you heard anything about it?"], ["2023-09-25", "Roxandra", "I've been pretty busy with work on daily planning, I have to do it a few times."], ["2023-09-25", "Roxandra", "Actually, I did it a while ago, nine days ago from 11 in the morning for four hours"], ["2023-09-25", "Roxandra", "And I also have to do it four days later, and one day after that"], ["2023-09-25", "Roxandra", "It's always a bit of a challenge to fit it in, but it's worth it in the end"], ["2023-09-25", "Alaina", "That sounds like a lot of work, but I'm sure it's worth it"], ["2023-09-25", "Alaina", "Well, let me know if you want to grab lunch or coffee sometime, I'd love to catch up"] ]''',
"i_1111" : '''[ ["2022-09-28", "Alessia", "Hey Marisolyn, how's it going? I feel like I haven't seen you in ages."], ["2022-09-28", "Alessia", "What have you been up to lately? Any new projects or just the usual routine?"], ["2022-09-28", "Alessia", "I was just thinking about you the other day and wondering if you're doing okay."], ["2022-28-09", "Alessia", "So, I was thinking of grabbing lunch this weekend, want to join me?"], ["2022-09-28", "Marisolyn", "I've been pretty busy actually, I had to check system alerts a few days ago, from 7 in the morning for 2 hours, and I had to do it for 3 consecutive days, starting 12 days ago."], ["2022-09-28", "Marisolyn", "It was a bit of a challenge, but I got it done. I'm just glad it's over now."], ["2022-09-28", "Marisolyn", "I'm looking forward to having some free time again, I haven't had a chance to relax in a while."], ["2022-09-28", "Marisolyn", "Maybe we can catch up then and I can fill you in on everything that's been going on."], ["2022-09-28", "Alessia", "That sounds great, I'd love to hear about what's new with you. I'm looking forward to it!"], ["2022-09-28", "Alessia", "Let's plan something for this weekend and I'll send you a message to confirm."] ]''',
"i_1245" : '''[ ["2024-02-26", "Alaina", "Hey Toliver, how's it going? I feel like I haven't seen you in ages."], ["2024-02-26", "Alaina", "How was your week? Any exciting plans coming up?"], ["2024-02-26", "Alaina", "I'm so behind on work, I feel like I'm drowning in tasks. Do you have any tips on how to stay organized?"], ["2024-02-26", "Alaina", "By the way, I was thinking of trying out that new coffee shop downtown. Have you been there yet?"], ["2024-02-26", "Toliver", "I have to revise a project timeline on 22nd of the next month from 7 in the morning for three hours."), ["2024-02-26", "Alaina", "That sounds like a big task! Make sure you take breaks and stay focused."], ["2024-02-26", "Toliver", "I'm planning to get it done quickly so I can move on to other things."), ["2024-02-26", "Alaina", "That's the right attitude! I'm sure you'll get it done in no time."], ["2024-02-26", "Alaina", "I'm looking forward to hearing about how it goes. Let's catch up soon!"], ["2024-02-26", "Alaina", "Take care and have a great day!"] ]''',
"i_1272" : '''[ ["2020-10-07", "Carnelian", "Hey, I'm getting a bit swamped with work. Just wanted to touch base with you."], ["2020-10-07", "Sasha", "Hey Carnelian! How's it going? What's new with you?"], ["2020-10-07", "Sasha", "You seem a bit busy, need any help or just someone to talk to?"], ["2020-10-07", "Sasha", "By the way, have you had a chance to catch up on any of your routine tasks?"], ["2020-10-07", "Carnelian", "Actually, I've had to check routine communications a few times recently. It's been a bit of a challenge keeping up."], ["2020-10-07", "Carnelian", "I had to do it a while ago, on the 27th of last month, and then I had to do it again 3 days ago."], ["2020-10-07", "Carnelian", "And I still have to do it once again in this month, specifically two days from now."], ["2020-10-07", "Carnelian", "It's always from 8 in the morning for 3 hours, so it's not too bad, but it's just been hard to keep track of everything."], ["2020-10-07", "Sasha", "Yeah, it can get overwhelming. But it sounds like you're on top of it. Do you have any tips for staying organized?"], ["2020-10-07", "Sasha", "I'm sure it's not easy, but you're doing great so far!"] ]''',
"i_1302" : '''[ ["2022-04-08", "Steffen", "Hey Kian, how's it going? I feel like I haven't seen you all week."], ["2022-04-08", "Steffen", "By the way, I was thinking of trying out that new coffee shop downtown. Have you been there yet?"], ["2022-04-08", "Steffen", "I'm so busy with work right now, I'm not sure how I'm going to get everything done."], ["2022-04-08", "Steffen", "Do you have any fun plans for the weekend?"], ["2022-04-08", "Kian", "I've had a pretty packed schedule lately. I have to conduct a daily review session in a few days, and also yesterday, and some time before that."], ["2022-04-08", "Kian", "The one a few days ago was quite a while ago, and the one yesterday was a bit of a break, but the one before that was quite some time before."], ["2022-04-08", "Kian", "I had to conduct a daily review session 28th of the last month at 10 in the morning for 2 hours."], ["2022-04-08", "Kian", "Then I have to do it again 4 days later from now, and then again 5 days after that, both at the same time."], ["2022-04-08", "Steffen", "That sounds like a lot of work! I'm sure you'll get it all done."], ["2022-04-08", "Steffen", "I'm looking forward to trying out that coffee shop with you soon. Let's plan something for next week!"] ]''',
"i_1362" : '''[ ["2022-03-02", "Steffen", "Hey Kenji, how's it going? I feel like I haven't seen you in ages."], ["2022-03-02", "Kenji", "Hey Steffen, I'm good. I've been pretty busy lately. I have to check inventory levels on the 12th of last month from 11 in the morning for 4 hours."], ["2022-03-02", "Steffen", "That sounds like a lot of work. Do you have any fun plans coming up?"], ["2022-03-02", "Kenji", "Actually, I have to do the same thing a few days later, on the 15th, and then again a week later, on the 21st."], ["2022-03-02", "Steffen", "Wow, you're really staying on top of things. I'm a bit jealous, to be honest. I've been slacking off lately."], ["2022-03-02", "Kenji", "Thanks, I guess it's just part of my job. Anyway, I should probably get back to work."], ["2022-03-02", "Steffen", "Alright, well, take care and talk to you later."], ["2022-03-02", "Steffen", "By the way, have you tried that new coffee shop downtown?"], ["2022-03-02", "Kenji", "Not yet, I haven't had a chance. I'll have to check it out soon."], ["2022-03-02", "Steffen", "Definitely do that. I'll hold you to it."] ]''',
}

In [74]:
for i in range(len(mistaken_works_idx)):
    conversation_list[mistaken_works_idx[i]] = handy_edited_convs[f"i_{mistaken_works_idx[i]}"]
    if extracted_work_list[mistaken_works_idx[i]] == '-':
        extracted_work_list[mistaken_works_idx[i]] = {'work': schedules[mistaken_works_idx[i]]['work'], 'days': schedules[mistaken_works_idx[i]]['days'], 'hours': schedules[mistaken_works_idx[i]]['hours']}

KeyError: 'i_0'

In [47]:
# testing
a = [str(item) for item in extracted_work_list]
mistaken_works_idx, a = filter_generated_works_parsing_step_2(a, schedules)
mistaken_works_idx

[]

In [48]:
mistaken_conversation_idx, conversation_list = filter_generated_conversation_step_1(conversation_list)
mistaken_conversation_idx

[]

### Saving

In [76]:
import json

# Store conversations
with open("../../Dataset_Helping/T_Uni/T_Uni_conversations.jsonl", "w") as file:
    for conv in conversation_list:
        # Convert tuples to lists and save as proper JSON
        file.write(json.dumps(conv) + "\n")



In [77]:
# Load the files again
with open("../../Dataset_Helping/T_Uni/T_Uni_conversations.jsonl", "r") as f:
    conversation_list = [json.loads(line) for line in f]

with open("../../Dataset_Helping/T_Uni/T_Uni_works.jsonl", "r") as f:
    extracted_work_list = [json.loads(line) for line in f]


### Evaluation

In [35]:
import json
structured_data = []
with open("../Data/T_Uni_2020_2024_Structured.jsonl", "r") as file:
    for line in file:
        output = json.loads(line.strip())
        structured_data.append(output)
schedules = []
users_name = []
for i in range(len(structured_data)):
    for schedule in structured_data[i]['schedule']:
        schedules.append(schedule)
        users_name.append(structured_data[i]['user'])

In [36]:
# transforming work phrases to different forms

def transform_phrase(phrase):
    """
    Given a work phrase in the form "verb rest_of_phrase", returns a dictionary with keys:
    - base: the original phrase
    - ing: present continuous (verb in -ing form)
    - past: simple past form
    - pp: present perfect (has + past participle)
    - ppc: present perfect continuous (has been + present participle)
    - future: future continuous (will be + present participle)
    - infinitive: "to" + verb
    - modal_should: "should" + verb
    """
    # Split the phrase into verb and the rest (object, etc.)
    tokens = phrase.split()
    if not tokens:
        return {}
    verb = tokens[0]
    rest = " ".join(tokens[1:])  # may be empty

    # Helper to form present participle (simple rule)
    if verb.lower() != "be" and verb.endswith("e"):
        present_participle = verb[:-1] + "ing"
    else:
        present_participle = verb + "ing"
    
    # Helper to form simple past (very naive rule)
    if verb.endswith("e"):
        simple_past = verb + "d"
    else:
        simple_past = verb + "ed"
    
    # Build the various forms; we simply attach the unchanged remainder.
    # In a full solution, you might need a more robust morphological transformation.
    def combine(form):
        return f"{form} {rest}".strip()  # strip to remove extra space if rest is empty

    transformed = {
        "base": phrase,
        "ing": combine(present_participle),
        "past": combine(simple_past),
        "pp": "has " + combine(simple_past),
        "ppc": "has been " + combine(present_participle),
        "future": "will be " + combine(present_participle),
        "infinitive": "to " + verb + (" " + rest if rest else ""),
        "modal_should": "should " + combine(verb)
    }
    return transformed

In [37]:
gold_asnwers = [transform_phrase(schedule['work'])['base'] for schedule in schedules]
required_information = [schedule['question_time'] for schedule in schedules]
gold_asnwers[:5]

['prepare a daily briefing',
 'perform routine testing',
 'prepare a daily summary',
 'maintain a training schedule',
 'review daily feedback']

In [119]:
# results_names = ['Msg_def_wCoT_output', 'Msg_def_woCoT_output', 'Msg_det_wCoT_output', 'Msg_det_woCoT_output',
#                  'Msg_def_wCoT_woConv_output', 'Msg_def_woCoT_woConv_output', 'Msg_det_wCoT_woConv_output', 'Msg_det_woCoT_woConv_output']

results_names = ['test_3', 'test_4']
resutls = []



for results_name in results_names:
    answer_missed = 0
    with open(f"../Data/T_Uni_2020_2024_E1_LC_{results_name}.jsonl", "r") as f:
        dataset = []
        for line in f:
            parsed = json.loads(line)
            parts = parsed.split('Answer:')
            if len(parts) > 1:
                dataset.append(parts[-1])
            else:
                dataset.append("")
                answer_missed += 1
        for item in dataset:
            if len(item.strip()) == 1:
                print(f"Line {i}")
                print(item)
        resutls.append(dataset)
    print(f"Answer missed: {answer_missed}")
len(resutls) 

Answer missed: 862
Answer missed: 1177


2

## Merging Conversations and structured

In [79]:
import json 
# Load the structured data
structured_data_path = "../../Dataset_Helping/T_Uni/T_Uni_Structured.jsonl"

with open(structured_data_path, 'r', encoding='utf-8') as f:
    structured_data = [json.loads(line) for line in f]

print(f"Loaded {len(structured_data)} records from structured data file")

# Load the generated conversation data
generated_data_path = "../../Dataset_Helping/T_Uni/T_Uni_Structured_Generated_step_1.jsonl"

with open(generated_data_path, 'r', encoding='utf-8') as f:
    generated_data = [json.loads(line) for line in f]

print(f"Loaded {len(generated_data)} records from generated data file")


Loaded 50 records from structured data file
Loaded 1500 records from generated data file


Struture:

user_ID - user - user_2 - conversation - extra_info - question - answer

conversation is a list of 10 utterances
extra_info is a dictionary with keys ['shopping_type' , 'item_to_buy', 'prices', 'payment cost']
the question has the following format: What did 'user' bought that had the price of 'payment cost'?


In [98]:
structured_data[0]['schedule'][0]

{'user_2': 'Astraea',
 'work': 'practice a musical instrument',
 'activity_type': 'Repeating-Sequential',
 'days': ['2021-08-09', '2021-08-10', '2021-08-11'],
 'hours': [15, 19],
 'question_time': ['2021-08-10', 17],
 'message_time': ['2021-08-04', 'wednesday', 8]}

In [99]:
import ast
from datetime import datetime, timedelta
import random

dataset = []


x = 0
for i in range(len(structured_data)):
    user_ID = i
    user = structured_data[i]['user']
    schedule_info_list = structured_data[i]['schedule']

    for j in range(len(schedule_info_list)):
        user_2 = schedule_info_list[j]['user_2']
        if i*30 + j != x:
            print(i*30 + j, x)
        assert i*30 + j == x
        x += 1
        conversation = generated_data[int(i*20 + j)]
        conversation = ast.literal_eval(conversation.replace('\\', ''))


        hour = random.randint(8, 17)
        minute = sorted(random.sample(range(0, 60), 10))

        conversation = [(f"{conversation[s][0]} {hour:02d}:{minute[s]:02d}", conversation[s][1], conversation[s][2]) for s in range(len(conversation))]


        # user_response = generated_data[int(i*20 + j)]
        if conversation == '-':
            print(conversation, i*20 + j)
            raise Exception("conversation is '-'")
        
        question = f"What exactly did {user} mentioned in the messages that had scheduled to do on  {schedule_info_list[j]['question_time'][0]} at {schedule_info_list[j]['question_time'][1]}?"
        dataset.append({
            "user_ID": user_ID,
            "user": user,
            "user_2": user_2,
            "conversation": conversation,
            "extra_info": {k:v for k,v in schedule_info_list[j].items() if k in ['activity_type', 'days', 'hours']},
            "question": question,
            "answer": schedule_info_list[j]['work']
        })


In [100]:
with open('../../Data/T_Uni.jsonl', 'w', encoding='utf-8') as f:
    for item in dataset:
        json.dump(item, f, ensure_ascii=False)
        f.write('\n')

### Recall

In [120]:
from rouge_score import rouge_scorer

def calculate_rouge_scores(predictions, evaluation_text):
    # Initialize the ROUGE scorer with specific metrics
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Calculate ROUGE scores for each pair
    rouge_scores = []
    for pred, gold in zip(predictions, evaluation_text):

        # Clean up prediction text
        if len(pred)!=0:
            pred = transform_phrase(pred)['base'].strip().lower()
        # Clean up gold answer text
        gold = gold.strip().lower()
        scores = scorer.score(gold, pred)
        rouge_scores.append(scores)

    # Calculate average scores using recall instead of f-measure
    avg_rouge1 = sum(score['rouge1'].recall for score in rouge_scores) / len(rouge_scores)
    avg_rouge2 = sum(score['rouge2'].recall for score in rouge_scores) / len(rouge_scores)
    avg_rougeL = sum(score['rougeL'].recall for score in rouge_scores) / len(rouge_scores)

    return avg_rouge1, avg_rouge2, avg_rougeL

for i in range(len(results_names)):
    avg_rouge1, avg_rouge2, avg_rougeL = calculate_rouge_scores(resutls[i], gold_asnwers)
    print(results_names[i])
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougeL:.4f}")


test_3
Average ROUGE-1: 0.0444
Average ROUGE-2: 0.0344
Average ROUGE-L: 0.0442
test_4
Average ROUGE-1: 0.0392
Average ROUGE-2: 0.0332
Average ROUGE-L: 0.0392


In [121]:
note_mentioned = ['none' for _ in range(len(resutls[0]))]
having_conversation = ['Having a conversation with' for _ in range(len(resutls[0]))]

In [122]:
for i in range(len(results_names)):
    avg_rouge1, avg_rouge2, avg_rougeL = calculate_rouge_scores(resutls[i], note_mentioned)
    print(results_names[i])
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougeL:.4f}")

test_3
Average ROUGE-1: 0.2947
Average ROUGE-2: 0.0000
Average ROUGE-L: 0.2947
test_4
Average ROUGE-1: 0.1280
Average ROUGE-2: 0.0000
Average ROUGE-L: 0.1280


In [123]:
for i in range(len(results_names)):
    avg_rouge1, avg_rouge2, avg_rougeL = calculate_rouge_scores(resutls[i], having_conversation)
    print(results_names[i])
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougeL:.4f}")

test_3
Average ROUGE-1: 0.0170
Average ROUGE-2: 0.0002
Average ROUGE-L: 0.0170
test_4
Average ROUGE-1: 0.0103
Average ROUGE-2: 0.0000
Average ROUGE-L: 0.0100


### EM

In [124]:
def calculate_exact_match(predictions, evaluation_text, term = 'dfadsfadsfdasfa'):
    # Calculate exact matches for each pair
    matches = []
    i = 0
    for pred, gold in zip(predictions, evaluation_text):
        # Clean up prediction text
        if len(pred)!=0:
            pred = transform_phrase(pred)['base'].strip().lower()
        # Clean up gold answer text 
        gold = gold.strip().lower()
        # Check for exact match
        matches.append(int((gold in pred)))
        if term in pred.lower(): 
            i += 1
    if i>0:
        print(i)
    # Calculate average exact match score
    avg_exact_match = sum(matches) / len(matches)
    return avg_exact_match

for i in range(len(results_names)):
    exact_match_score = calculate_exact_match(resutls[i], gold_asnwers)
    print(results_names[i])
    print(f"Exact Match Score: {exact_match_score:.4f}")


test_3
Exact Match Score: 0.0167
test_4
Exact Match Score: 0.0200


In [125]:
for i in range(len(results_names)):
    exact_match_score = calculate_exact_match(resutls[i], note_mentioned, "none")
    print(results_names[i])
    print(f"Exact Match Score: {exact_match_score:.4f}")

442
test_3
Exact Match Score: 0.2947
192
test_4
Exact Match Score: 0.1280


In [126]:
for i in range(len(results_names)):
    exact_match_score = calculate_exact_match(resutls[i], having_conversation, "conversation")
    print(results_names[i])
    print(f"Exact Match Score: {exact_match_score:.4f}")

9
test_3
Exact Match Score: 0.0000
4
test_4
Exact Match Score: 0.0000


In [8]:
import json
# Load the JSONL files
def load_jsonl(filename):
    with open(filename, 'r') as file:
        return [json.loads(line) for line in file]

In [9]:
dataset = load_jsonl(f'../Data/T_Multi_2024_Structured.jsonl')