In [2]:
# Input: 1) A system NLG database (containing system utterances along with conversational strategies)
# Input: 2) Dialog transcripts containing user and system turns. 
# Output: Each system turn tagged with an appropriate set of conversational strategies
import pandas as pd
import os
from nltk.tokenize import word_tokenize
import string
import json

parent_path = os.path.abspath('../')
map_to_none = ['BC', 'OUT', 'REC', 'RSE', 'o']
data_dir = parent_path + '/data/davos/'
nlg_db_fname = 'nlg_database.csv'
sentence_col_name = 'SENTENCE'

df = pd.read_csv(data_dir + nlg_db_fname)
utterances = df[sentence_col_name].tolist()
translator = str.maketrans('', '', string.punctuation)
tokenized_utterances = []
clean_utterances = []
for i, u in enumerate(utterances):
    # Remove punctuations and make all characters lower case
    cu = u.translate(translator).lower()
    clean_utterances.append(cu)
    tu = word_tokenize(cu)
    tokenized_utterances.append(tu)

In [3]:
transcripts_dir = data_dir + 'transcripts/'
processed_files_dir = transcripts_dir + 'processed/'
agent_cs_dir = data_dir + 'agent_cs/'
txt_suffix = '.txt'

def make_dir(dir_name):
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)

all_files = os.listdir(transcripts_dir)
session_ids = []
for file in all_files:
    if file.endswith(txt_suffix):
        session_ids.append(file.split(txt_suffix)[0])

make_dir(processed_files_dir)
make_dir(agent_cs_dir)

Session IDs:  ['514007', '510275', '1001444', '510881', '2016938', '513033', '1001335', '514819', '1002013', '513024', '1001877', '513961', '510918', '510688', '510500', '516417', '2001299', '509907', '1002648', '510410', '515233', '515151', '511862', '511447', '513324', '511250', '516517', '2004828', '2002040', '1001636', '510398', '510401', '509969', '513058', '1002655', '510196', '515762', '509997', '516135', '2005133', '2008165', '511512', '1003158', '511665', '510999', '1002281', '510422', '1002242', '2001541', '1003016', '514928', '513935', '513210', '509921', '513762', '512508', '510849', '514232', '1002422', '515289', '2005036', '510910', '513969', '510457', '1001918', '2001521', '510734', '515843']


In [15]:
import json
import operator
import time

matches = []
sys_turns = []
step = 0.05
base_thresh = 0.8
options_suffix = '_options.csv'
agent_name = 'SARA'
time_format = '%M:%S.%f'
time_alt_format = '%M:%S:%f'
min_to_sec = 60
min_id = 4
sec_id = 5
agent_name_col = 0
agent_turn_col = 2

participants = ['sara', 'user']
line_num = {}

for sid in session_ids:
    print("Getting potential matches for file %s.txt..." % (sid), end='')
    
    prev = None
    curr = None
    for p in participants:
        line_num[p] = 0
    
    options_fname = processed_files_dir + sid + options_suffix
    if os.path.exists(options_fname):
        os.remove(options_fname)
    fw = open(options_fname, 'w', encoding='utf-8-sig')

    with open(transcripts_dir + sid + txt_suffix, 'r') as fr:
        rows = fr.readlines()
        line = 0
        for rid, r in enumerate(rows):
            r = r.rstrip("\n")
            r = r.split(",")
            line += 1
            
            curr = r[agent_name_col].strip().lower()
            if curr in participants:
                line_num[curr] += 1
                if curr == prev:
                    print("Session: %s, Row: %d, Repeated by: %s" %(sid, line_num[curr]-1, curr))
                    print(r)
            prev = curr
            
            if r[agent_name_col] == agent_name:
                # Picking end of turn (last column) as the time stamp for the turn
                try:
                    struct_t = time.strptime(r[-1].strip(), time_format)
                except ValueError:
                    try:
                        struct_t = time.strptime(r[-1].strip(), time_alt_format)
                    except ValueError:
                        print(r)
                    
                tsec = struct_t[min_id] * min_to_sec + struct_t[sec_id]
                fw.write(str(tsec) + ",")

                clean_sys_turn = str(r[agent_turn_col].strip().translate(translator).lower())
                tokenized_sys_turn = word_tokenize(clean_sys_turn)
                fw.write(clean_sys_turn + ", ")
                n_sys_turn = len(tokenized_sys_turn)
                
                scores = {}
                for i, tu in enumerate(tokenized_utterances):
                    n_utterance = len(tu)
                    n_common = len(set(tokenized_sys_turn).intersection(set(tu)))
                    scores[i] = (n_common/n_utterance, n_utterance)
                
                sorted_scores = sorted(scores.items(), key= lambda x: x[1][0], reverse=True)
                potential_matches = []
                thresh = base_thresh
                while len(potential_matches) == 0:
                    potential_matches = sorted([s for s in sorted_scores if s[1][0] >= thresh], key= lambda x: x[1][1], reverse=True)
                    thresh -= step
            
                # Write to file.
                for i, pm in enumerate(potential_matches):
                    end_str = "," if i < len(potential_matches)-1 else "\n"
                    fw.write(clean_utterances[pm[0]] + ": " + str(pm[0]) + end_str)   
    fw.close()
    print("Done.")

Getting potential matches for file 514007.txt...Session: 514007, Row: 10, Repeated by: sara
['SARA', ' 01:13.1', ' I can recommend to you people to meet or sessions to attend', ' 01:16.2']
Session: 514007, Row: 26, Repeated by: user
['User', ' 03:45.5', ' nothing', ' 03:45.7']
Session: 514007, Row: 29, Repeated by: user
['User', ' 04:01.6', ' ah um hmm', ' 04:02.7']
Done.
Getting potential matches for file 510275.txt...Done.
Getting potential matches for file 1001444.txt...Done.
Getting potential matches for file 510881.txt...Done.
Getting potential matches for file 2016938.txt...Done.
Getting potential matches for file 513033.txt...Done.
Getting potential matches for file 1001335.txt...Done.
Getting potential matches for file 514819.txt...Done.
Getting potential matches for file 1002013.txt...Done.
Getting potential matches for file 513024.txt...Done.
Getting potential matches for file 1001877.txt...Done.
Getting potential matches for file 513961.txt...Done.
Getting potential matches 

In [5]:
import json
task_intentions_map_fname = 'all_task_intentions_act_map_final.csv'
df_map = pd.read_csv(data_dir + task_intentions_map_fname)

intention_act_map = dict(zip(df_map['col_1'], df_map['act']))
print(json.dumps(intention_act_map, indent=2))
all_acts = list(set(df_map['act']))
print(all_acts)

{
  "acknowledgement": "ack()",
  "feedback_person_recommendation_1st_time_yes": "glad()",
  "request(send_msg_tlink)": "request(send_msg_tlink)",
  "outcome_person_recommendation_1st_time": "inform(info)",
  "feedback_session_recommendation_2nd_time_no": "sorry()",
  "do_selfie": "take_selfie()",
  "ready_selfie": "take_selfie()",
  "farewell": "bye()",
  "do_attendance_elicitation": "request(first_time)",
  "greeting": "greeting()",
  "finish_selfie": "take_selfie()",
  "feedback_interest_elicitation_session_recommendation": "give_feedback()",
  "elicit_feedback_person_recommendation_1st_time": "request(feedback)",
  "feedback_goal_elicitation": "give_feedback()",
  "pleasure_coming_together": "greeting()",
  "do_person_recommendation_1st_time": "request(goal)",
  "start_goal_elicitation": "request(primary_goal)",
  "do_interest_elicitation_session_recommendation": "request(interest)",
  "introduce": "introduce()",
  "pre_closing": "request(anything_else)",
  "bye": "bye()",
  "elici

In [8]:
import pickle
from collections import Counter

import pandas as pd

final_files_dir = transcripts_dir + 'final/'
final_suffix = '_final.csv'
tsec_col = 0
matches_col_start = 2
utterance_id = 1
agent_timestamp_suffix = '_agent_timestamps.pkl'
cs_suffix = '_agent_cs.pkl'
intention_suffix = '_agent_intention.pkl'
all_agent_task_intentions = []
count_no_matches = 0

for sid in session_ids:
    print("Getting CSs for file %s_final.csv..." % (sid), end='')

    with open(final_files_dir + sid + final_suffix, 'r', encoding='utf-8-sig', errors='ignore') as fr:
        rows = fr.readlines()
        tsec = {}
        cs = {}
        task_intention = {}
        for i, r in enumerate(rows):
            r = r.rstrip("\n")
            r = r.split(",")
            tsec[i] = int(r[tsec_col])
            matches = r[matches_col_start:]
            n_matches = 0
            for m in matches:
                if m != '':
                    n_matches += 1
            
            for m in matches:
                if m != '':
                    if i not in cs.keys():
                        cs[i] = []
                        task_intention[i] = []
                    m_id = m.split(':')[utterance_id].strip()
                    try:
                        cs_candidate = df.loc[int(m_id)]['STRATEGY']
                        task_intention_candidate = df.loc[int(m_id)]['SYSTEM_INTENTION']
                        cs[i].append('NONE' if cs_candidate in map_to_none else cs_candidate)
                        # all_agent_task_intentions.append(task_intention_candidate)
                        task_intention[i].append(intention_act_map[task_intention_candidate])
#                         if sid == '514928':
#                             print(i)
#                             print(task_intention_candidate)
#                             print(task_intention[i])
                    except ValueError:
                        # print(r)
                        task_intention_candidate = m.split(':')[0].strip()
                        if task_intention_candidate != 'No match':
                            count_no_matches += 1
                            # all_agent_task_intentions.append(task_intention_candidate)
                            task_intention[i].append(intention_act_map[task_intention_candidate])
                        cs[i].append(m_id)
                        
    with open(agent_cs_dir + sid + agent_timestamp_suffix, 'wb') as f:
        pickle.dump(tsec, f, pickle.HIGHEST_PROTOCOL)

    with open(agent_cs_dir + sid + cs_suffix, 'wb') as f:
        pickle.dump(cs, f, pickle.HIGHEST_PROTOCOL)
    
    with open(agent_cs_dir + sid + intention_suffix, 'wb') as f:
        pickle.dump(task_intention, f, pickle.HIGHEST_PROTOCOL)
    
    print("Done.")
print(count_no_matches)
# counter_dict = dict(Counter(all_agent_task_intentions))
# df_new = pd.DataFrame({'col_1': list(counter_dict.keys()), 'col_2': list(counter_dict.values())})
# df_new.to_csv(data_dir + 'all_task_intentions.csv', index=False)

Getting CSs for file 514007_final.csv...{0: ['greeting()'], 1: ['greeting()'], 2: ['tired()'], 3: ['other()'], 4: ['request(first_time)'], 5: ['give_feedback()'], 6: ['give_feedback()'], 7: ['request(primary_goal)'], 8: ['request(primary_goal)'], 9: ['request(primary_goal)'], 10: ['request(primary_goal)'], 11: ['request(primary_goal)'], 12: ['ack()'], 13: ['ack()'], 14: ['request(goal)'], 15: ['request(goal)'], 16: ['inform(info)', 'inform(info)'], 17: ['ack()'], 18: ['request(feedback)'], 19: ['glad()'], 20: ['other()'], 21: ['request(another_reco)', 'ack()'], 22: ['inform(info)'], 23: ['glad()', 'ack()'], 24: ['other()'], 25: ['other()'], 26: ['ack()'], 27: ['ack()'], 28: ['inform(info)'], 29: ['glad()'], 30: ['other()'], 31: ['request(selfie)'], 32: ['glad()', 'take_selfie()'], 33: ['take_selfie()'], 34: ['take_selfie()'], 35: ['bye()'], 36: ['bye()'], 37: ['bye()', 'bye()']}
Getting CSs for file 510275_final.csv...{0: ['greeting()'], 1: ['greeting()'], 2: ['request(first_time)'], 3

Getting CSs for file 511447_final.csv...{0: ['greeting()'], 1: ['greeting()', 'greeting()'], 2: ['request(first_time)', 'give_feedback()', 'introduce()'], 3: ['give_feedback()'], 4: ['request(primary_goal)'], 5: ['request(primary_goal)'], 6: ['give_feedback()', 'give_feedback()', 'request(interest)'], 7: ['give_feedback()', 'request(goal)'], 8: ['request(goal)', 'request(feedback)', 'inform(info)'], 9: ['glad()'], 10: ['request(send_msg_tlink)'], 11: ['ack()'], 12: ['request(selfie)'], 13: ['request(selfie)'], 14: ['glad()', 'take_selfie()', 'take_selfie()'], 15: ['take_selfie()'], 16: ['request(anything_else)'], 17: ['other()'], 18: ['bye()', 'bye()']}
Getting CSs for file 513324_final.csv...{0: ['greeting()'], 1: ['greeting()'], 2: ['introduce()', 'introduce()', 'request(first_time)'], 3: ['give_feedback()'], 4: ['request(primary_goal)'], 5: ['request(primary_goal)'], 6: ['give_feedback()'], 7: ['request(interest)', 'ack()'], 8: ['other()'], 9: ['request(interest)'], 10: ['request(go

Getting CSs for file 510422_final.csv...{0: ['greeting()'], 1: ['greeting()'], 2: ['introduce()'], 3: ['request(first_time)', 'request(first_time)'], 4: ['give_feedback()'], 5: ['request(primary_goal)'], 6: ['request(primary_goal)'], 7: ['give_feedback()'], 8: ['request(interest)'], 9: ['give_feedback()'], 10: ['thank()'], 11: ['request(goal)'], 12: ['request(goal)'], 13: ['inform(info)'], 14: ['request(another_reco)'], 15: ['inform(info)'], 16: ['glad()'], 17: ['request(send_msg_tlink)'], 18: ['sorry()', 'request(send_msg_tlink)'], 19: ['take_selfie()'], 20: ['take_selfie()', 'request(anything_else)'], 21: ['bye()'], 22: ['bye()']}
Getting CSs for file 1002242_final.csv...{0: ['greeting()'], 1: ['greeting()'], 2: ['introduce()', 'request(first_time)', 'glad()'], 3: ['give_feedback()'], 4: ['request(primary_goal)'], 5: ['give_feedback()', 'request(interest)'], 6: ['give_feedback()'], 7: ['request(goal)'], 8: ['request(goal)', 'inform(info)', 'sorry()'], 9: ['other()', 'ack()'], 10: ['r

In [19]:
user_timestamp_suffix = '_user_timestamps.pkl'
user_name_col = 0
user_name = 'user'
user_cs_dir = data_dir + 'user_cs/'
user_pickle_dir = user_cs_dir + 'pickle_files/'

for sid in session_ids:
    print("Getting user turn timestamps for session %s..." % (sid), end='')
    with open(transcripts_dir + sid + txt_suffix, 'r') as fr:
        rows = fr.readlines()
        line = 0
        tsec = {}
        for i, r in enumerate(rows):
            r = r.rstrip("\n")
            r = r.split(",")
            
            if r[user_name_col].lower() == user_name:
                # Picking end of turn (last column) as the time stamp for the turn
                try:
                    struct_t = time.strptime(r[-1].strip(), time_format)
                except ValueError:
                    try:
                        struct_t = time.strptime(r[-1].strip(), time_alt_format)
                    except ValueError:
                        print(r)
                    
                tsec[i] = struct_t[min_id] * min_to_sec + struct_t[sec_id]

    with open(user_pickle_dir + sid + user_timestamp_suffix, 'wb') as f:
        pickle.dump(tsec, f, pickle.HIGHEST_PROTOCOL)
    print("Done.")

Getting user turn timestamps for session 514007...Done.
Getting user turn timestamps for session 510275...Done.
Getting user turn timestamps for session 1001444...Done.
Getting user turn timestamps for session 510881...Done.
Getting user turn timestamps for session 2016938...Done.
Getting user turn timestamps for session 513033...Done.
Getting user turn timestamps for session 1001335...Done.
Getting user turn timestamps for session 514819...Done.
Getting user turn timestamps for session 1002013...Done.
Getting user turn timestamps for session 513024...Done.
Getting user turn timestamps for session 1001877...Done.
Getting user turn timestamps for session 513961...['User', ' 00:01.0', ' (inaudible) he’s in Scandinavia it could be [interesting (inaudible)']
Done.
Getting user turn timestamps for session 510918...Done.
Getting user turn timestamps for session 510688...Done.
Getting user turn timestamps for session 510500...Done.
Getting user turn timestamps for session 516417...Done.
Getti

In [20]:
all_files = os.listdir(user_cs_dir)
session_ids = []
csv_suffix = '.csv'
for file in all_files:
    if file.endswith(csv_suffix):
        session_ids.append(file.split(csv_suffix)[0])

In [21]:
none_str = 'NONE'
user_cs_suffix = '_user_cs.pkl'

def remove_all(full_list, ele_rm):
    trimmed_list = [ele for ele in full_list if ele != ele_rm]
    if trimmed_list == []:
        trimmed_list.append(none_str)
    return trimmed_list

for sid in session_ids:
    print("Getting user CSs for session %s..." % (sid), end='')
    df = pd.read_csv(user_cs_dir + sid + csv_suffix, engine='python')
    cols = df.columns.values.tolist()[1:]

    for c in cols:
        df[c].replace(1, c, inplace=True)
    ind = df.index.values.tolist()
    print(len(ind))
    cs = {i: remove_all(df.loc[i].tolist()[1:], 0) for i in ind}
    if sid == '510999':
        print(cs)
    
    with open(user_pickle_dir + sid + user_cs_suffix, 'wb') as f:
        pickle.dump(cs, f, pickle.HIGHEST_PROTOCOL)
    print("Done.")

Getting user CSs for session 2008165...16
Done.
Getting user CSs for session 1003158...28
Done.
Getting user CSs for session 1002655...22
Done.
Getting user CSs for session 515762...29
Done.
Getting user CSs for session 2005133...34
Done.
Getting user CSs for session 516135...26
Done.
Getting user CSs for session 1003016...28
Done.
Getting user CSs for session 514928...27
Done.
Getting user CSs for session 510999...25
{0: ['NONE'], 1: ['SD'], 2: ['SD'], 3: ['SD', 'HE'], 4: ['NONE'], 5: ['SD'], 6: ['NONE'], 7: ['SD', 'PR'], 8: ['NONE'], 9: ['SD', 'HE'], 10: ['NONE'], 11: ['SD'], 12: ['SD', 'PR'], 13: ['NONE'], 14: ['SD', 'PR'], 15: ['PR'], 16: ['NONE'], 17: ['NONE'], 18: ['NONE'], 19: ['NONE'], 20: ['NONE'], 21: ['NONE'], 22: ['NONE'], 23: ['PR'], 24: ['NONE']}
Done.
Getting user CSs for session 1002242...33
Done.
Getting user CSs for session 2001541...27
Done.
Getting user CSs for session 1002281...23
Done.
Getting user CSs for session 512508...36
Done.
Getting user CSs for session 514