In [1]:
import import_ipynb
import numpy as np
from statistics import mean, stdev
import random
import math

random.seed(10)

from copy import deepcopy

from fetch_data import *

importing Jupyter notebook from fetch_data.ipynb




INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


In [None]:
def get_eventct_stats(data, unique_types, feature):
    """Compute the per-type mean and sample standard deviation of a count feature.

    Args:
        data: mapping of student id -> record; each record has a 'logs' list
            whose entries hold, under `feature`, a sequence indexable by the
            positions of `unique_types`.
        unique_types: sequence of type names; only its length is used here.
        feature: key of the per-log count vector (callers in this notebook
            pass 'event' and 'locn').

    Returns:
        (means, sds): two lists with one entry per type. A type with no
        samples keeps mean 0 and SD 1. A type with exactly one sample gets its
        mean but keeps SD 1, because the sample standard deviation needs at
        least two data points.
    """
    n_types = len(unique_types)
    events_cts = [[] for _ in range(n_types)]

    # Single pass over the logs; per-type sample order is the same as
    # iterating the logs once per type.
    for s_id in data:
        for log in data[s_id]['logs']:
            for event_type in range(n_types):
                events_cts[event_type].append(log[feature][event_type])

    mean_eventct = [0] * n_types
    sdv_eventct = [1] * n_types
    for event_type, counts in enumerate(events_cts):
        if len(counts) > 0:
            mean_eventct[event_type] = mean(counts)
        # statistics.stdev raises StatisticsError for fewer than two points,
        # so leave the neutral default of 1 in that case.
        if len(counts) > 1:
            sdv_eventct[event_type] = stdev(counts)

    return mean_eventct, sdv_eventct

In [None]:
def apply_zscore_norm(data, mean_eventct, sd_eventct, unique_types, feature):
    """Z-score normalise one count feature of every log, in place.

    Each value `log[feature][i]` becomes `(value - mean_eventct[i]) / sd_eventct[i]`.
    When the standard deviation for a type is zero the value is only centred
    (mean subtracted), which is what the original division-by-zero fallback
    produced.

    Args:
        data: mapping of student id -> record with a 'logs' list; the logs are
            modified in place.
        mean_eventct: per-type means, indexed like `unique_types`.
        sd_eventct: per-type standard deviations, indexed like `unique_types`.
        unique_types: sequence of type names; only its length is used here.
        feature: key of the per-log vector to normalise.

    Returns:
        The same `data` object that was passed in.
    """
    for event_type in range(len(unique_types)):
        m = mean_eventct[event_type]
        s = sd_eventct[event_type]
        # Explicit zero check instead of a bare `except:`: it also covers
        # NumPy scalars, where dividing by zero yields inf/nan rather than
        # raising, and it no longer hides unrelated errors.
        zero_sd = (s == 0)
        for s_id in data:
            for log in data[s_id]['logs']:
                centered = log[feature][event_type] - m
                log[feature][event_type] = centered if zero_sd else centered / s

    return data

In [None]:
def create_subsequences(data, seq_length=20, emb_dim=32, max_responses=7):
    """Expand each student's logs into overlapping windows of `seq_length` logs.

    One window is produced per log, ending at that log. The first window is
    left-padded with `seq_length - 1` placeholder logs; every later window is
    the previous one shifted by one position. Students with no logs are
    skipped and their id is printed.

    Args:
        data: mapping of student id -> record with 'pretest', 'posttest' and
            'logs' (list of log dicts; the first log must have 'next_goal').
        seq_length: number of logs per window.
        emb_dim: width of each placeholder response embedding row.
        max_responses: number of placeholder response embedding rows.

    Returns:
        dict of student id -> {'pretest', 'posttest', 'logs'} where 'logs' is
        the list of windows. All logs in the result are deep copies.
    """
    def _padding_log(first_next_goal):
        # Placeholder log: zero vectors sized from the notebook-level
        # vocabularies, carrying the student's first 'next_goal'.
        return {
            'event': [0] * len(events_without_movement),
            'goal': [0] * len(plot_names),
            'locn': [0] * len(locations),
            'response_emb': [[0] * emb_dim for _ in range(max_responses)],
            'prev_response': "",
            'prev_rating': -1,
            'next_goal': first_next_goal,
            'goal_number': 0,
            'dist': [0] * len(fewer_plot_names),
            'current_locn': [0] * len(locations),
            'current_event': [0] * len(events_without_movement),
            'seq_goals': [[0] * len(fewer_plot_names)
                          for _ in range(len(fewer_plot_names))],
        }

    subsequences = {}

    for s_id, record in data.items():
        student_logs = record['logs']
        if len(student_logs) == 0:
            print(s_id)
            continue

        template = _padding_log(student_logs[0]['next_goal'])
        student_entry = {
            'pretest': record['pretest'],
            'posttest': record['posttest'],
            'logs': [[deepcopy(template) for _ in range(seq_length - 1)]],
        }
        windows = student_entry['logs']

        for log in student_logs:
            # Once the newest window is full, start the next one from its
            # tail so that consecutive windows overlap by seq_length - 1 logs.
            if len(windows[-1]) >= seq_length:
                windows.append(deepcopy(windows[-1][1:seq_length]))
            windows[-1].append(deepcopy(log))

        subsequences[s_id] = student_entry

    return subsequences

In [None]:
def compute_dist(point1, point2):
    """Return the Euclidean distance between two points.

    Only the first two coordinates (index 0 and 1) of each point are used.
    """
    delta_x = point1[0] - point2[0]
    delta_y = point1[1] - point2[1]
    return math.sqrt(delta_x ** 2 + delta_y ** 2)

In [None]:
def add_location_distances(data):
    """Add goal distances and one-hot location/event vectors to every log, in place.

    For each log:
      * 'dist' is set to the distance from the log's current location to the
        location of every goal in `fewer_plot_names` (coordinates come from
        `locn_coord_map`, goal locations from `label_goal_locations`);
      * 'current_locn' (a location name) is replaced by a one-hot vector over
        `locations`;
      * 'current_event' (an event name) is replaced by a one-hot vector over
        `events_without_movement`.

    Because the name fields are overwritten with vectors, calling this twice
    on the same data will fail on the second call.

    Returns:
        The same `data` object that was passed in.
    """
    for record in data.values():
        for log in record['logs']:
            # Distances must be computed before 'current_locn' is overwritten.
            log['dist'] = [
                compute_dist(locn_coord_map[log['current_locn']],
                             locn_coord_map[label_goal_locations[goal]])
                for goal in fewer_plot_names
            ]

            locn_pos = locations.index(log['current_locn'])
            log['current_locn'] = [int(k == locn_pos) for k in range(len(locations))]

            event_pos = events_without_movement.index(log['current_event'])
            log['current_event'] = [int(k == event_pos)
                                    for k in range(len(events_without_movement))]

    return data

In [None]:
def add_seq_goals(data):
    """Attach a right-aligned one-hot matrix of previous goals to every log, in place.

    With n = len(fewer_plot_names), each log gets 'seq_goals', an n x n matrix
    of zeros in which the last rows are one-hot encodings of the log's
    'prev_goals' (oldest first, most recent goal in the last row).

    If a log has more than n previous goals, only the n most recent are
    encoded. Previously the row index went negative in that case and wrapped
    around, marking the wrong rows (or raising IndexError for longer lists).

    Returns:
        The same `data` object that was passed in.
    """
    n_goals = len(fewer_plot_names)

    for s_id in data:
        for log in data[s_id]['logs']:
            seq_goals = [[0] * n_goals for _ in range(n_goals)]
            # Guard n_goals == 0: a [-0:] slice would return the whole list.
            recent = log['prev_goals'][-n_goals:] if n_goals else []
            first_row = n_goals - len(recent)
            for i, plot in enumerate(recent):
                seq_goals[first_row + i][fewer_plot_names.index(plot)] = 1
            log['seq_goals'] = seq_goals

    return data

In [None]:
def preprocess_data(data, req_features, seq_length=20):
    """Run the full preprocessing pipeline on the 'Train', 'Val' and 'Test' splits.

    Works on a deep copy of `data`, so the caller's structure is not modified.
    Stages, each applied to all three splits before the next one starts:
      1. z-score normalise the 'event' and 'locn' count vectors, using
         statistics computed on the 'Train' split only;
      2. add goal distances and one-hot current location/event vectors;
      3. add the previous-goal matrix;
      4. cut the logs into windows of `seq_length`;
      5. extract the requested feature arrays and labels.

    Args:
        data: dict with keys 'Train', 'Val' and 'Test', each a mapping of
            student id -> record.
        req_features: list of feature names forwarded to `get_features`.
        seq_length: window length forwarded to `create_subsequences` and
            `get_features`.

    Returns:
        (formatted_data, formatted_labels, ids): three dicts keyed by split
        name holding, respectively, the three values returned by
        `get_features` for that split.
    """
    splits = ('Train', 'Val', 'Test')
    copy_data = deepcopy(data)

    # Normalisation statistics come from the training split only and are then
    # applied unchanged to every split.
    for feature, type_names in (('event', events_without_movement),
                                ('locn', locations)):
        means, sds = get_eventct_stats(copy_data['Train'], type_names, feature)
        for split in splits:
            copy_data[split] = apply_zscore_norm(copy_data[split], means, sds,
                                                 type_names, feature)

    for split in splits:
        copy_data[split] = add_location_distances(copy_data[split])

    for split in splits:
        copy_data[split] = add_seq_goals(copy_data[split])

    for split in splits:
        copy_data[split] = create_subsequences(copy_data[split], seq_length=seq_length)

    formatted_data = {}
    formatted_labels = {}
    ids = {}
    for split in splits:
        # Use the `req_features` argument; the body previously referenced an
        # undefined name `reqd_features` and so depended on a leaked global.
        formatted_data[split], formatted_labels[split], ids[split] = get_features(
            copy_data[split], req_features, seq_length=seq_length)

    return formatted_data, formatted_labels, ids

In [None]:
def get_features(data, reqd_features, seq_length=20):
    """Collect model inputs, labels and identifiers from windowed logs.

    Each entry of a student's 'logs' is a window (a sequence of `seq_length`
    log dicts). One sample is produced per window; its label and most of its
    features are read from the window's last log (index `seq_length - 1`).

    Available feature names (keys usable in `reqd_features`):
        'pretests', 'posttests'  - student scores divided by 17.
                                   (presumably the maximum score - confirm)
        'goals'                  - one-hot of the last log's 'next_goal'
        'prev_goals', 'seq_goals', 'dist', 'responses', 'prev_responses'
                                 - taken from the last log
        'seq_locnevents'         - per step: 'current_locn' followed by
                                   'current_event'
        'action_responses'       - per step: 'event', 'goal', 'response_emb'
                                   and 'locn' concatenated into one array

    Args:
        data: mapping of student id -> {'pretest', 'posttest', 'logs'}.
        reqd_features: feature names to return, in order.
        seq_length: number of logs read from each window.

    Returns:
        (X, labels, joint_ids): X is a list with one numpy array per requested
        feature; labels is the numpy array of one-hot goals; joint_ids is a
        dict with 'ids', 'goal_ids', 'prev_goals' and 'prev_responses'.
        The number of samples is printed as a side effect.
    """
    student_ids = []
    goal_ids = []
    columns = {'pretests': [], 'posttests': [], 'goals': [],
               'prev_goals': [], 'action_responses': [],
               'seq_locnevents': [], 'seq_goals': [],
               'dist': [], 'responses': [], 'prev_responses': []}
    last = seq_length - 1

    for s_id in data:
        record = data[s_id]
        for window in record['logs']:  # each window is a subsequence of actions
            target = window[last]

            student_ids.append(s_id)
            columns['pretests'].append(record['pretest'] / 17.)
            columns['posttests'].append(record['posttest'] / 17.)

            goal_pos = fewer_plot_names.index(target['next_goal'])
            columns['goals'].append([1 if k == goal_pos else 0
                                     for k in range(len(fewer_plot_names))])
            goal_ids.append(s_id + str(target['goal_number']))

            columns['prev_goals'].append(deepcopy(target['prev_goals']))
            columns['seq_goals'].append(deepcopy(target['seq_goals']))
            columns['dist'].append(deepcopy(target['dist']))
            columns['responses'].append(deepcopy(target['response_emb']))
            columns['prev_responses'].append(
                {'response': deepcopy(target['prev_response']),
                 'rating': deepcopy(target['prev_rating'])})

            locn_event_rows = []
            action_rows = []
            for s in range(seq_length):
                step = window[s]

                locn_event = deepcopy(step['current_locn'])
                locn_event.extend(deepcopy(step['current_event']))
                locn_event_rows.append(locn_event)

                action = deepcopy(step['event'])
                for key in ('goal', 'response_emb', 'locn'):
                    action.extend(deepcopy(step[key]))
                action_rows.append(np.array(action))

            columns['seq_locnevents'].append(locn_event_rows)
            columns['action_responses'].append(action_rows)

    X = [np.array(columns[feature]) for feature in reqd_features]
    labels = np.array(columns['goals'])
    joint_ids = {'ids': student_ids, 'goal_ids': goal_ids,
                 'prev_goals': columns['prev_goals'],
                 'prev_responses': columns['prev_responses']}

    print(len(labels))

    return X, labels, joint_ids

In [None]:
def get_weighted_avg(response_embs):
    """Average a sequence of embeddings, weighting later ones more heavily.

    The embedding at position i (0-based) gets weight sqrt(i + 1). The result
    has one value per dimension of the first embedding.

    Args:
        response_embs: non-empty sequence of equal-length numeric sequences.

    Returns:
        List with the weighted mean of each dimension.
    """
    weights = [math.sqrt(position + 1) for position in range(len(response_embs))]
    n_dims = len(response_embs[0])

    averaged = []
    for dim in range(n_dims):
        weighted_sum = 0
        weight_total = 0
        # Accumulate in embedding order so the floating-point result matches
        # a plain running sum.
        for weight, emb in zip(weights, response_embs):
            weighted_sum += emb[dim] * weight
            weight_total += weight
        averaged.append(weighted_sum / weight_total)

    return averaged