In [1]:
import json
import pandas as pd
import glob
import os
import ast
import numpy as np
from collections import Counter

In [32]:
##### Load the Odin extractions #####

combined_ie = pd.read_csv("combined_data_ie_label.csv")
# Keep only the five fields that the CLC extraction unpacks, in this exact order.
ie_extraction = combined_ie.loc[
    :,
    ['message_id', 'participant', 'start_timestamp', 'end_timestamp', 'ie_extraction'],
]
# Flatten the frame into a plain nested list: one 5-element list per row.
list_ie_data = ie_extraction.values.tolist()

In [3]:
# Load the callout, check-back and closing label selections from disk
with open('label_selection.json', 'r') as file:
    label_selection = json.load(file)

# Pull the three selections out of the parsed JSON in one pass.
callout_selection, checkback_selection, closing_selection = (
    label_selection[selection_name]
    for selection_name in ("callout_selection", "checkback_selection", "closing_selection")
)

In [55]:
######## CLC extraction #######

def _has_label(ie_extraction, selection):
    '''Return True if any ', '-separated label in `ie_extraction` is in `selection`.'''
    # A missing extraction arrives as NaN (a float), never as a string.
    if not isinstance(ie_extraction, str):
        return False
    return any(label in selection for label in ie_extraction.split(', '))


def _effective_start(start, end):
    '''Return `start`, falling back to `end` when `start` is NaN.'''
    # NaN is the only value that does not compare equal to itself.
    return start if start == start else end


def extract_clc(list_ie_data, win_size, callout_labels=None,
                checkback_labels=None, closing_labels=None):
    '''Extract closed-loop-communication (CLC) events with a sliding window.

    Every row carrying a callout label opens an event ('a'). Within the next
    `win_size - 1` rows, a row from a *different* participant that starts later
    and carries a check-back label is marked 'b'; once at least one 'b' exists,
    a row from the *same* participant as the callout that starts after the
    latest 'b' and carries a closing label is marked 'c'.

    Parameters
    ----------
    list_ie_data : list of 5-element rows
        [message_id, participant, start_timestamp, end_timestamp, ie_extraction].
        `ie_extraction` is a ', '-separated label string, or NaN when empty.
        A NaN start timestamp is replaced by the end timestamp.
    win_size : int
        Window length *including* the callout row itself.
    callout_labels, checkback_labels, closing_labels : container, optional
        Label collections to match against. When omitted, the module-level
        `callout_selection`, `checkback_selection` and `closing_selection`
        are used.

    Returns
    -------
    dict
        {row_index_of_callout: {message_id: 'a' | 'b' | 'c', ...}}
        e.g. {5: {'id1': 'a', 'id2': 'b', 'id3': 'c'}}. A message that
        qualifies as both 'b' and 'c' ends up as 'c' (same dict key).
    '''
    # Fall back to the notebook-level selections only when not supplied.
    if callout_labels is None:
        callout_labels = callout_selection
    if checkback_labels is None:
        checkback_labels = checkback_selection
    if closing_labels is None:
        closing_labels = closing_selection

    clc_extraction = {}

    # Visit every row: windows near the end are simply shorter, so callouts in
    # the last rows are no longer dropped.
    for n, row in enumerate(list_ie_data):
        message_id, participant, start, end, ie_extraction = row

        # Find a
        if not _has_label(ie_extraction, callout_labels):
            continue
        start = _effective_start(start, end)
        event = clc_extraction[n] = {message_id: 'a'}

        # Rows that follow the callout inside the window.
        followers = list_ie_data[n + 1 : n + max(win_size, 1)]

        # Find b
        found_b = False
        for next_id, next_participant, next_start, next_end, next_ie in followers:
            next_start = _effective_start(next_start, next_end)
            if (next_participant != participant
                    and next_start > start
                    and _has_label(next_ie, checkback_labels)):
                event[next_id] = 'b'
                # Advance the reference time so that a closing which occurs
                # before this check-back is not accepted as 'c'.
                start = next_start
                found_b = True

        # Find c -- only meaningful once some check-back was seen anywhere in
        # the window (not merely on the last row of it).
        if not found_b:
            continue
        for next_id, next_participant, next_start, next_end, next_ie in followers:
            next_start = _effective_start(next_start, next_end)
            if (next_participant == participant
                    and next_start > start
                    and _has_label(next_ie, closing_labels)):
                event[next_id] = 'c'

    return clc_extraction

In [56]:
# Run the extraction with a 5-row window (the callout row plus the 4 rows after it).
clc_extraction = extract_clc(list_ie_data, win_size=5)

In [59]:
# Display the result: each key is the row index of a callout ('a'), and each value
# maps message_id -> 'a' / 'b' / 'c' for the messages tagged within that window.
clc_extraction

{5: {'f2138136-d378-497a-b7a4-7376895e2143': 'a',
  '4e12ebb4-828d-4807-90a7-684ef7580dc3': 'b'},
 7: {'da7c5c01-5a72-46e0-93ff-4c59e1284e5f': 'a',
  '4e12ebb4-828d-4807-90a7-684ef7580dc3': 'b',
  'fb51ecec-3f8f-437f-8191-26e1b35b8cc7': 'b'},
 8: {'4e12ebb4-828d-4807-90a7-684ef7580dc3': 'a',
  'd8f40c5f-afd0-478b-8e8f-f2e4eeca4264': 'b'},
 11: {'fb51ecec-3f8f-437f-8191-26e1b35b8cc7': 'a',
  '2583f1c2-18ff-49c9-a23e-8d08f3ff59b2': 'b'},
 13: {'c4c5c5e2-830d-4569-8232-528bf5a5e947': 'a',
  '2583f1c2-18ff-49c9-a23e-8d08f3ff59b2': 'b',
  'afc8d49a-7d10-4fb9-820d-eedaee49ec6b': 'b'},
 16: {'afc8d49a-7d10-4fb9-820d-eedaee49ec6b': 'a',
  '2cbe08ab-9571-47f4-b10f-2e1fee45fb45': 'b',
  'ce97fd05-3d0d-4e6c-b9fd-4c8ecf6d6fc6': 'b'},
 19: {'6a1ad81e-2e2b-4b6c-b523-7bd67b81f7de': 'a',
  'ce97fd05-3d0d-4e6c-b9fd-4c8ecf6d6fc6': 'b',
  'f9ba47ea-6aef-450f-9d56-8486c2d38b11': 'b'},
 20: {'ce97fd05-3d0d-4e6c-b9fd-4c8ecf6d6fc6': 'a',
  'ccae358c-a446-4538-8aab-e64640b493ea': 'b',
  'f9ba47ea-6aef-450f-9d