In [1]:
import os
import pickle
import numpy as np
import json

# Absolute path of the parent of the current working directory; all data paths hang off it.
parent_path = os.path.abspath('../')
data_path = parent_path + '/data/davos/'

participants = ['agent', 'user']
# File-name suffixes of the per-session pickles: timestamps, conversational
# strategies (cs) and task intentions, in that order.
suffix_str = ['_timestamps.pkl', '_cs.pkl', '_intention.pkl']

# One directory per participant, in the same order as `participants`.
participant_paths = []
for p in participants:
    participant_paths.append(data_path + p + '_cs/')
# The user pickles sit one level deeper than the agent ones.
participant_paths[1] += 'pickle_files/'

cluster_file = data_path + 'clusters_full.pkl'
id_to_f_file = data_path + 'id_to_f_full.pkl'
last_slice_fname = data_path + 'last_slice_length_full.txt'
tot_length_fname = data_path + 'length_full.txt'
train_data_fname = data_path + 'train_data_full.pkl'

window = 2  # How far back in the past we go (number of lagged copies per input)
input_variables = ['user_cs_inp', 'agent_cs_inp', 'agent_intention_inp', 'rapp_inp']
output_variables = ['user_cs_outp', 'rapp_outp']
all_str = 'all'

# Load the pickles through context managers so the file handles are closed
# deterministically instead of whenever the garbage collector gets to them.
# NOTE(review): pickle executes arbitrary code on load; only use trusted files.
with open(cluster_file, 'rb') as pkl_file:
    clusters = pickle.load(pkl_file)
with open(id_to_f_file, 'rb') as pkl_file:
    id_to_f = pickle.load(pkl_file)

all_sessions = id_to_f.keys()

# Distinct cluster labels plus the pseudo-cluster `all_str`, which is always last.
unique_clusters = list(set(list(clusters.values())))
unique_clusters.append(all_str)

# Empty template: data[cluster][variable] = None for every output variable and
# every lagged input variable. Agent inputs use lags 0..window-1 (key suffix
# "_t-0" included); all other inputs use lags 1..window.
data = {}
for uc in unique_clusters:
    data[uc] = {}
    for ov in output_variables:
        data[uc][ov] = None
    for iv in input_variables:
        first_lag = 0 if iv in ['agent_cs_inp', 'agent_intention_inp'] else 1
        for lag in range(first_lag, first_lag + window):
            data[uc][iv + '_t-' + str(lag)] = None

print(json.dumps(data, indent=2))


def get_map(fname):
    """Read a text file of ``key value`` lines into a dict.

    Each non-blank line must contain a key followed by an integer, separated
    by any run of whitespace (spaces or tabs). Blank lines are ignored and any
    fields after the second are ignored. If a key occurs more than once, the
    last occurrence wins.

    Args:
        fname: Path of the text file to read.

    Returns:
        dict mapping each key (str) to its value (int).

    Raises:
        ValueError: If a non-blank line has no value field or the value is
            not an integer.
    """
    map_dict = {}
    with open(fname, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            # split() with no argument collapses repeated whitespace and
            # drops the trailing newline, unlike split(" ").
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 2:
                raise ValueError(
                    "%s:%d: expected 'key value', got %r" % (fname, line_no, line)
                )
            map_dict[fields[0]] = int(fields[1])
    return map_dict

# Lookup tables parsed from the two length files; both are indexed by session
# id further down when the timestamps of the last slice are filtered out.
last_slice_length = get_map(last_slice_fname)
tot_length = get_map(tot_length_fname)

{
  "0": {
    "user_cs_outp": null,
    "rapp_outp": null,
    "user_cs_inp_t-1": null,
    "user_cs_inp_t-2": null,
    "agent_cs_inp_t-0": null,
    "agent_cs_inp_t-1": null,
    "agent_intention_inp_t-0": null,
    "agent_intention_inp_t-1": null,
    "rapp_inp_t-1": null,
    "rapp_inp_t-2": null
  },
  "1": {
    "user_cs_outp": null,
    "rapp_outp": null,
    "user_cs_inp_t-1": null,
    "user_cs_inp_t-2": null,
    "agent_cs_inp_t-0": null,
    "agent_cs_inp_t-1": null,
    "agent_intention_inp_t-0": null,
    "agent_intention_inp_t-1": null,
    "rapp_inp_t-1": null,
    "rapp_inp_t-2": null
  },
  "all": {
    "user_cs_outp": null,
    "rapp_outp": null,
    "user_cs_inp_t-1": null,
    "user_cs_inp_t-2": null,
    "agent_cs_inp_t-0": null,
    "agent_cs_inp_t-1": null,
    "agent_intention_inp_t-0": null,
    "agent_intention_inp_t-1": null,
    "rapp_inp_t-1": null,
    "rapp_inp_t-2": null
  }
}


In [2]:
class CategoricalEncoder:
    """Multi-hot encoder over a fixed, ordered list of category labels."""

    def __init__(self, data):
        # Ordered label vocabulary; a label's position is its slot in the vector.
        self.data = data

    def fit(self, cs_types):
        """Encode an iterable of labels as a 0/1 vector over the vocabulary.

        The label 'QE' is treated as an alias of 'QESD'. A label that is not
        in the vocabulary raises ValueError (from ``list.index``).

        Returns a float numpy array of length ``len(self.data)`` with a 1 at
        the position of every label present in ``cs_types``.
        """
        vocabulary = self.data
        encoded = np.zeros(len(vocabulary))
        for raw_label in cs_types:
            label = 'QESD' if raw_label == 'QE' else raw_label
            encoded[vocabulary.index(label)] = 1
        return encoded

# Label vocabularies. The position of a label in its list is its index in the
# encoded vector, so the order of these lists must not change.
cs_types = {
    'agent': ['ASN', 'ACK', 'SD', 'QESD', 'PR', 'HE', 'VSN', 'NONE'],
    'user': ['SD', 'QESD', 'PR', 'HE', 'VSN', 'NONE'],
}
intention_types = {
    'agent': ['ack()', 'request(met_before)', 'take_selfie()', 'give_feedback()', 'tired()', 'request(selfie)', 'request(send_msg_tlink)', 'request(another_reco)', 'greeting()', 'request(first_time)', 'no_worries()', 'do()', 'request(interest)', 'thank()', 'bye()', 'request(goal)', 'send_msg()', 'request(feedback)', 'introduce()', 'glad()', 'sorry()', 'request(anything_else)', 'request(primary_goal)', 'other()', 'inform(info)', 'you()'],
}

# One encoder per participant for conversational strategies, plus one for the
# agent's task intentions under the key 'agent_intention'.
enc = {}
for p in participants:
    enc.update({p: CategoricalEncoder(cs_types[p])})
enc['agent_intention'] = CategoricalEncoder(intention_types['agent'])

In [3]:
def shift_in_time(mat, n):
    """Shift the rows of a 2-d matrix down by ``n``, zero-filling the top.

    The last ``n`` rows are dropped and ``n`` rows of zeros are prepended, so
    the result always has the same shape as ``mat``. If ``n`` is at least the
    number of rows, the result is all zeros (still the same shape) rather than
    growing to ``n`` rows.

    Args:
        mat: 2-d numpy array.
        n: Non-negative number of rows to shift by.

    Returns:
        ``mat`` itself when ``n == 0``; otherwise a new array of the same
        shape (the zero padding is float, so the result follows numpy's usual
        type promotion with ``mat``).

    Raises:
        ValueError: If ``n`` is negative.
    """
    assert mat.ndim == 2
    if n < 0:
        raise ValueError("n must be non-negative, got %r" % (n,))
    if n == 0:
        return mat
    rows, cols = mat.shape
    # Clamp so that shifting by more than the available rows cannot change
    # the number of rows; lagged arrays must stay aligned with the unshifted ones.
    n = min(n, rows)
    padding = np.zeros((n, cols))
    return np.concatenate((padding, mat[:rows - n, :]), axis=0)

In [4]:
# Fragments used to build the keys of the per-cluster entries in `data`.
outp_base_str = "_cs_outp"
inp_base_str = "_cs_inp_t-"
rapp_outp_str = "rapp_outp"
rapp_inp_str = "rapp_inp_t-"
agent_intention_inp_str = "agent_intention_inp_t-"

# Inclusive range of rapport values that are accepted as-is.
rapp_lower_bound, rapp_upper_bound = 2, 6
task_intention_binary = None

# flag[cluster][participant] is False until arrays for that pair have been created.
flag = {}
for clust in clusters.values():
    flag[clust] = dict.fromkeys(participants, False)

def update_data(window, data, clust, base_str, array, create_new=False, decr_window=False):
    """Store `window` lagged copies of `array` under data[clust].

    For every lag the key is ``base_str + str(lag)`` and the value is
    ``shift_in_time(array, lag)``. Lags run over 1..window, or over
    0..window-1 when `decr_window` is true.

    With `create_new` the key is printed and the entry is (re)created;
    otherwise the shifted array is concatenated row-wise onto the existing
    entry. `data` is modified in place and also returned.
    """
    first_lag = 0 if decr_window else 1
    for lag in range(first_lag, first_lag + window):
        key = base_str + str(lag)
        if not create_new:
            data[clust][key] = np.concatenate((data[clust][key], shift_in_time(array, lag)), axis=0)
            continue
        print(key)
        data[clust][key] = shift_in_time(array, lag)
    return data

# Build the per-cluster training arrays. For every session and every
# participant this loads the pickled timestamps / conversational strategies
# (and, for the agent, the task intentions), drops the entries of the last
# slice, lines the user sequence up with the agent's, encodes everything as
# 0/1 vectors and stores the outputs plus lagged inputs under data[clust].
# NOTE(review): the recorded output of this cell ends in KeyError: '2002040'
# -- presumably a session id missing from one of the lookups keyed by sid;
# TODO confirm which table lacks it.
first_time = {}
num_cs = {}

for i, sid in enumerate(all_sessions):
    print("Processing session: %s..." %(sid), end='')
    # clusters is keyed by the integer form of the session id; the other
    # lookups below (tot_length, last_slice_length, id_to_f) use sid as-is.
    clust = clusters[int(sid)]
    
    for q in participants:
        # -1 means these have not been updated yet.
        first_time[q] = -1
        num_cs[q] = -1
    
    for j, p in enumerate(participants):
        print("Processing %s..." %(p), end='')
        # NOTE(review): the handles returned by open() in this loop are never
        # closed explicitly.
        tstmp = pickle.load(open(participant_paths[j] + sid + '_' + p + suffix_str[0], 'rb'))
        t = list(tstmp.values())
        orig_len = len(t)
        # Drop time stamps for last slice: keep only those whose distance to
        # tot_length[sid] exceeds last_slice_length[sid]. The comprehension's
        # `i` is local to it and does not clobber the enumerate index above.
        t = [i for i in t if tot_length[sid] - i > last_slice_length[sid]]
        new_len = len(t)
        cs = pickle.load(open(participant_paths[j] + sid + '_' + p + suffix_str[1], 'rb'))
        if p == participants[0]:
            task_intention = pickle.load(open(participant_paths[j] + sid + '_' + p + suffix_str[2], 'rb'))
        # Number of timestamps removed by the filter above.
        diff = orig_len-new_len
        
        # Drop CSs for last slice
        # NOTE(review): slicing off the last `diff` entries assumes the removed
        # timestamps were the trailing ones -- TODO confirm ordering of the dicts.
        if diff > 0:
            cs = list(cs.values())[:-diff]
            # 'agent' is participants[0]; the other branches spell it that way.
            if p == 'agent':
                task_intention = list(task_intention.values())[:-diff]
        else:
            cs = list(cs.values())
            if p == participants[0]:
                task_intention = list(task_intention.values())
        
        if p == participants[0]:
            # This is true only for agent.
            assert len(t) == len(cs)
            assert len(t) == len(task_intention)
        
        # Raises IndexError if every timestamp of this participant was dropped.
        first_time[p] = t[0]
        num_cs[p] = len(cs)
        # NOTE(review): num_intention is not read anywhere in the visible code.
        num_intention = len(task_intention)
        
        # True only once both participants have a first timestamp for this
        # session, i.e. on the second (user) pass of this inner loop.
        time_updated = True
        for q in participants:
            if first_time[q] == -1:
                time_updated = False  
        
        if time_updated:
            if first_time[participants[0]] - first_time[participants[1]] > 0:
                # Drop the first user CS if the user spoke first!
                # Only `cs` is trimmed here; `t` keeps its first entry.
                cs.pop(0)
            num_cs[p] = len(cs)
            # The agent may have at most one more entry than the user ...
            assert num_cs[participants[0]] - num_cs[participants[1]] <= 1
            # ... in which case the user sequence is padded with a 'NONE' entry.
            if num_cs[participants[0]] - num_cs[participants[1]] == 1:
                cs.append(['NONE'])
            num_cs[p] = len(cs)
#             print(num_cs[participants[0]])
#             print(num_cs[participants[1]])
        
        # Binary representation of the conversational strategies used in the entire interaction by the 
        # given participant (user or agent): one encoded row per entry of cs.
        cs_binary = np.array([enc[p].fit(c) for c in cs])
        # id_to_f[sid] is called on the timestamp array and the result is turned
        # into a column vector -- presumably a rapport-over-time function; TODO confirm.
        rapp = id_to_f[sid](np.array(t))[:, np.newaxis]
        if p == participants[0]:
            task_intention_binary = np.array([enc[p + '_intention'].fit(ti) for ti in task_intention])
#             print(task_intention_binary)
        # Values outside [rapp_lower_bound, rapp_upper_bound] and the row
        # indices of the values inside that range.
        x = rapp[rapp < rapp_lower_bound].tolist()
        y = rapp[rapp > rapp_upper_bound].tolist()
        correct_indices = np.where((rapp >= rapp_lower_bound) & (rapp <= rapp_upper_bound))[0]
        
        # Nearest neighbour interpolation for out of bound rapport values:
        # each offending row takes the value of the in-range row whose index
        # is closest. NOTE(review): fails with IndexError if no value is in range
        # (correct_indices empty).
        if x != []:
            problem_indices = np.where(rapp < rapp_lower_bound)[0]
            nearest_indices = np.argsort(abs(problem_indices[:, np.newaxis]-correct_indices[np.newaxis, :]), axis=1)[:, 0]
            rapp[problem_indices] = rapp[correct_indices[nearest_indices]]
        if y != []:
            problem_indices = np.where(rapp > rapp_upper_bound)[0]
            nearest_indices = np.argsort(abs(problem_indices[:, np.newaxis]-correct_indices[np.newaxis, :]), axis=1)[:, 0]
            rapp[problem_indices] = rapp[correct_indices[nearest_indices]]

        # Parallel lists: key prefix and array for CS, rapport and agent intention.
        # Entries 1 and 2 are only used on the agent pass below.
        base_str = [p + inp_base_str, rapp_inp_str, agent_intention_inp_str]
        to_be_added = [cs_binary, rapp, task_intention_binary]
        # First session seen for this (cluster, participant): create the arrays.
        # Later sessions: concatenate onto them (else branch).
        if not flag[clust][p]:
            if p == 'agent':
                # Create rapp_outp.
                data[clust][rapp_outp_str] = rapp
                data = update_data(window, data, clust, base_str[1], to_be_added[1], create_new=True, decr_window=False)
                # Agent intention and agent CS are stored with lags 0..window-1.
                dw = True
                data = update_data(window, data, clust, base_str[2], to_be_added[2], create_new=True, decr_window=dw)
            if p == 'user':
                # U_0 should be as it is.
                data[clust][p + outp_base_str] = cs_binary
                # User CS inputs are stored with lags 1..window.
                dw = False
            data = update_data(window, data, clust, base_str[0], to_be_added[0], create_new=True, decr_window=dw)
            flag[clust][p] = True
        else:
            if p == 'agent':
                # Append this session's rapport to rapp_outp.
                data[clust][rapp_outp_str] = np.concatenate((data[clust][rapp_outp_str], rapp), axis=0)
                data = update_data(window, data, clust, base_str[1], to_be_added[1], create_new=False, decr_window=False)
                dw = True
                data = update_data(window, data, clust, base_str[2], to_be_added[2], create_new=False, decr_window=dw)
            if p == 'user':
                data[clust][p + outp_base_str] = np.concatenate((data[clust][p + outp_base_str], cs_binary), axis=0)
                dw = False
            data = update_data(window, data, clust, base_str[0], to_be_added[0], create_new=False, decr_window=dw)
        print("Done.", end='')
    print("Done.")
        
# Fill the aggregate entry: for each variable, stack the arrays of all real
# clusters. unique_clusters ends with all_str, so the last element is left out.
real_clusters = unique_clusters[:-1]
for k in list(data[all_str]):
    data[all_str][k] = np.vstack([data[c][k] for c in real_clusters])

# Report the shape of every stored array, cluster by cluster.
for keys, vals in data.items():
    print(keys)
    for k, v in vals.items():
        print(k, np.shape(v), sep='\n')

# Persist the complete structure for the training code.
with open(train_data_fname, 'wb') as f:
    pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

Processing session: 514928...Processing agent...rapp_inp_t-1
rapp_inp_t-2
agent_intention_inp_t-0
agent_intention_inp_t-1
agent_cs_inp_t-0
agent_cs_inp_t-1
Done.Processing user...user_cs_inp_t-1
user_cs_inp_t-2
Done.Done.
Processing session: 515151...Processing agent...rapp_inp_t-1
rapp_inp_t-2
agent_intention_inp_t-0
agent_intention_inp_t-1
agent_cs_inp_t-0
agent_cs_inp_t-1
Done.Processing user...user_cs_inp_t-1
user_cs_inp_t-2
Done.Done.
Processing session: 515233...Processing agent...Done.Processing user...Done.Done.
Processing session: 515289...Processing agent...Done.Processing user...Done.Done.
Processing session: 515762...Processing agent...Done.Processing user...Done.Done.
Processing session: 515843...Processing agent...Done.Processing user...Done.Done.
Processing session: 516135...Processing agent...Done.Processing user...Done.Done.
Processing session: 516417...Processing agent...Done.Processing user...Done.Done.
Processing session: 516517...Processing agent...Done.Processing 

KeyError: '2002040'

In [53]:
import collections
import json
import operator

# Compute, for each non-NONE user conversational strategy (SD, PR, HE, QESD, VSN), the distribution of
# the agent task intentions and agent conversational strategies recorded at the same turn index.
# This tries to answer the question: what makes users use these conversational strategies?

# Per-session scratch state, reset for every session in the loop below.
first_time = {}
num_cs = {}

# User strategies of interest (everything except 'NONE') and the three kinds
# of agent-side observations collected for each of them.
non_none_strats = ['SD', 'QESD', 'PR', 'HE', 'VSN']
precondition_classes = ['agent_task_intention', 'agent_cs', 'intention_cs']

# user_precondition_map[strategy][precondition class] starts as an empty list.
user_precondition_map = {
    strategy: {precond: [] for precond in precondition_classes}
    for strategy in non_none_strats
}

print(user_precondition_map)

# For every session, load both participants' sequences, align the user's
# conversational strategies with the agent's, and record which agent task
# intentions / strategies occur at the same index as each non-NONE user
# strategy. Results accumulate in user_precondition_map.
for i, sid in enumerate(all_sessions):
    print("Processing session: %s..." %(sid), end='')
    
    # NOTE(review): clust is only referenced by the commented-out filter below.
    clust = clusters[int(sid)]
    
    for q in participants:
        # -1 means these have not been updated yet.
        first_time[q] = -1
        num_cs[q] = -1
    
    for j, p in enumerate(participants):
        print("Processing %s..." %(p), end='')
        
        # NOTE(review): the handles returned by open() in this loop are never
        # closed explicitly.
        tstmp = pickle.load(open(participant_paths[j] + sid + '_' + p + suffix_str[0], 'rb'))
        t = list(tstmp.values())
#         orig_len = len(t)
#         # Drop time stamps for last slice
#         t = [i for i in t if tot_length[sid] - i > last_slice_length[sid]]
#         new_len = len(t)
        
        cs = pickle.load(open(participant_paths[j] + sid + '_' + p + suffix_str[1], 'rb'))
        
        if p == participants[0]:
            task_intention = pickle.load(open(participant_paths[j] + sid + '_' + p + suffix_str[2], 'rb'))
        
        # Last-slice trimming is disabled in this cell: diff is fixed at 0, so
        # the `diff > 0` branch below never runs.
        diff = 0
#         diff = orig_len - new_len
        
        # Drop CSs for last slice
        if diff > 0:
            cs = list(cs.values())[:-diff]
            if p == participants[0]:
                task_intention = list(task_intention.values())[:-diff]
                cs_agent = cs
        else:
            cs = list(cs.values())
            if p == participants[0]:
                task_intention = list(task_intention.values())
                # Keep the agent's list; `cs` is rebound to the user's data on
                # the next pass of this loop.
                cs_agent = cs
        
        if p == participants[0]:
            # This is true only for agent.
            assert len(t) == len(cs)
            assert len(t) == len(task_intention)
        
        # Raises IndexError if this participant has no timestamps.
        first_time[p] = t[0]
        num_cs[p] = len(cs)
        # NOTE(review): num_intention is not read anywhere in the visible code.
        num_intention = len(task_intention)
        
        # True only once both participants have a first timestamp for this
        # session, i.e. on the second (user) pass of this inner loop.
        time_updated = True
        for q in participants:
            if first_time[q] == -1:
                time_updated = False  
        
        if time_updated:
            if first_time[participants[0]] - first_time[participants[1]] > 0:
                # Drop the first user CS if the user spoke first!
                cs.pop(0)
            num_cs[p] = len(cs)
            # assert num_cs[participants[0]] - num_cs[participants[1]] <= 1
            # Instead of asserting, print both counts when the agent has more
            # than one extra entry.
            if num_cs[participants[0]] - num_cs[participants[1]] > 1:
                print(num_cs[participants[0]])
                print(num_cs[participants[1]])
            # Pad the user sequence with a 'NONE' entry when it is one short.
            if num_cs[participants[0]] - num_cs[participants[1]] == 1:
                cs.append(['NONE'])
            num_cs[p] = len(cs)
        
        # j == 1 is the second participant (the user): by now cs holds the
        # user's entries and cs_agent / task_intention hold the agent's.
        if j == 1:
            assert len(cs_agent) == len(cs) == len(task_intention)
            
            #if clust == 1:
            for turn, cs_by_turn in enumerate(cs):
                for actual_cs in cs_by_turn:
                    if actual_cs in non_none_strats:
                        # Record the agent's intentions and strategies found
                        # at the same index as this user strategy.
                        user_precondition_map[actual_cs]['agent_task_intention'].extend(task_intention[turn])
                        user_precondition_map[actual_cs]['agent_cs'].extend(cs_agent[turn])
                        # Pair the n-th intention with the n-th agent strategy.
                        # NOTE(review): assumes cs_agent[turn] has at least as many
                        # items as task_intention[turn]; otherwise IndexError -- TODO confirm.
                        for n_intent, intent in enumerate(task_intention[turn]):
                            user_precondition_map[actual_cs]['intention_cs'].append(task_intention[turn][n_intent] + ", " + cs_agent[turn][n_intent])

#         # Binary representation of the conversational strategies used in the entire interaction by the 
#         # given participant (user or agent)
#         cs_binary = np.array([enc[p].fit(c) for c in cs])
#         rapp = id_to_f[sid](np.array(t))[:, np.newaxis]
#         if p == participants[0]:
#             task_intention_binary = np.array([enc[p + '_intention'].fit(ti) for ti in task_intention])
# #             print(task_intention_binary)
#         x = rapp[rapp < rapp_lower_bound].tolist()
#         y = rapp[rapp > rapp_upper_bound].tolist()
#         correct_indices = np.where((rapp >= rapp_lower_bound) & (rapp <= rapp_upper_bound))[0]
        
#         # Nearest neighbour interpolation for out of bound rapport values
#         if x != []:
#             problem_indices = np.where(rapp < rapp_lower_bound)[0]
#             nearest_indices = np.argsort(abs(problem_indices[:, np.newaxis]-correct_indices[np.newaxis, :]), axis=1)[:, 0]
#             rapp[problem_indices] = rapp[correct_indices[nearest_indices]]
            # Still inside `if j == 1`: printed once per session, after the user pass.
            print('Done.')

# Replace every raw list of observations by an ordered frequency table
# (item -> count), most frequent first; ties keep first-seen order.
for cs in non_none_strats:
    per_strategy = user_precondition_map[cs]
    for precond_class in precondition_classes:
        ranked = collections.Counter(per_strategy[precond_class]).most_common()
        per_strategy[precond_class] = collections.OrderedDict(ranked)

print(json.dumps(user_precondition_map, indent=2))
# print(user_precondition_map)


{'SD': {'agent_task_intention': [], 'agent_cs': [], 'intention_cs': []}, 'QESD': {'agent_task_intention': [], 'agent_cs': [], 'intention_cs': []}, 'PR': {'agent_task_intention': [], 'agent_cs': [], 'intention_cs': []}, 'HE': {'agent_task_intention': [], 'agent_cs': [], 'intention_cs': []}, 'VSN': {'agent_task_intention': [], 'agent_cs': [], 'intention_cs': []}}
Processing session: 514928...Processing agent...Processing user...Done.
Processing session: 515151...Processing agent...Processing user...Done.
Processing session: 515233...Processing agent...Processing user...Done.
Processing session: 515289...Processing agent...Processing user...Done.
Processing session: 515762...Processing agent...Processing user...Done.
Processing session: 515843...Processing agent...Processing user...Done.
Processing session: 516135...Processing agent...Processing user...Done.
Processing session: 516417...Processing agent...Processing user...Done.
Processing session: 516517...Processing agent...Processing u