# Trajectory synthesis from trained textgenrnn models

These trajectories are specific to Cambridge MA.
I.e., they produce data for users with homes in Cambridge, MA census tracts.

In [0]:
!pip install --user textgenrnn

from datetime import datetime
import json
import os

from google.colab import files

Some quick hacking below in order to use the textgenrnn module's internal functions:

In [0]:
# In order to use the internal functions, must clone and rename local version 
# of the module
!git clone https://github.com/minimaxir/textgenrnn.git

In [0]:
!ls

In [0]:
!mv textgenrnn clonedtextgenrnn

In [0]:
from clonedtextgenrnn.textgenrnn import utils

In [0]:
# Verify this function can be used: evaluating the bare attribute raises an
# AttributeError if the cloned module's `utils` does not expose `synthesize`.
utils.synthesize

-- module import/hacking done --

In [0]:
import textgenrnn

def get_model_generator(model_name):
    """Build a textgenrnn generator from the saved files of a trained model.

    The weights, vocab and config files are looked up in the current working
    directory as ``<model_name>_weights.hdf5``, ``<model_name>_vocab.json``
    and ``<model_name>_config.json``.

    Args:
        model_name: Name of the trained model; used both as the file-name
            stem and as the generator's ``name``.

    Returns:
        The object returned by ``textgenrnn.textgenrnn(...)``.
    """
    artifact_paths = {
        'weights_path': './{}_weights.hdf5'.format(model_name),
        'vocab_path': './{}_vocab.json'.format(model_name),
        'config_path': './{}_config.json'.format(model_name),
    }
    return textgenrnn.textgenrnn(name=model_name, **artifact_paths)


In [0]:
# Sampling temperatures for which an output file is generated per model.
generate_temperatures = [0.8, 0.9, 1.0]


def get_output_filename(model_name, temperature):
    """Return the name of the output text file for a model/temperature pair.

    Args:
        model_name: Name of the model the sequences are generated with.
        temperature: Sampling temperature used for the generation.

    Returns:
        A file name embedding both the model name and the temperature.
    """
    filename_template = 'generated-cambridge-{}-temperature:{}.txt'
    return filename_template.format(model_name, temperature)


In [0]:

# Read in the mapping of (home, work) label pairs -> count
def get_prefixes_to_counts_dict(fname):
    """Load the prefixes-to-counts mapping stored as JSON in `fname`.

    Args:
        fname: Path of the JSON file to read.

    Returns:
        The decoded JSON content (expected to be a dict of prefix -> count).
    """
    with open(fname) as json_file:
        loaded = json.load(json_file)
    return loaded

  
# Number of whitespace-separated tokens a generated sequence must have.
seq_length = 122


def filter_to_seq_length(sequences):
    """Keep only the sequences with exactly `seq_length` tokens.

    Tokens are counted by splitting each sequence on whitespace.

    Args:
        sequences: Iterable of sequence strings.

    Returns:
        A list with the sequences of the required length, in input order.
    """
    kept = []
    for candidate in sequences:
        token_count = len(candidate.split())
        if token_count == seq_length:
            kept.append(candidate)
    return kept



In [0]:

def generate_sequences(generator, temperature, prefix, make_num):
    """Generate `make_num` sequences of the required length for one prefix.

    Current problem: the synthesis does not always return sequences of the
    desired length (TODO: fork and hack on textgenrnn code to fix this).
    Workaround for now: keep requesting batches, discard the sequences of the
    wrong length, and repeat until enough have been collected.

    Args:
        generator: Model generator handed to ``utils.synthesize``.
        temperature: Sampling temperature for the generation.
        prefix: Text every generated sequence is seeded with.
        make_num: Number of sequences to return.

    Returns:
        A list of at most `make_num` sequences that passed
        ``filter_to_seq_length``.
    """
    accepted = []
    while True:
        shortfall = make_num - len(accepted)
        if shortfall <= 0:
            break
        # Ask for twice the number still missing, since some of the
        # candidates get filtered out below.
        candidates = utils.synthesize(
            [generator],
            n=shortfall * 2,
            prefix=prefix,
            temperature=[temperature],
            return_as_list=True,
            max_gen_length=seq_length + 1,
            # NOTE(review): 'hack' is presumably a token that never occurs,
            # so generation is not stopped early - confirm.
            stop_tokens=['hack'],
        )
        accepted.extend(filter_to_seq_length(candidates))
    return accepted[:make_num]



In [0]:
import time

# Generate the sequences!
# Generate 2 synthetic sequences for every real sequence: each prefix's real
# count is multiplied by count_multiplier to get the number to generate.
count_multiplier = 2

# NOTE: Generation is done for Cambridge-specific data.
# Only using prefixes where the home label is a Cambridge GEOID
input_trajectories_prefixes_to_counts_filename = './relabeled_cambridge_trajectories_1_workweek_prefixes_to_counts.json'
prefixes_to_counts_dict = get_prefixes_to_counts_dict(input_trajectories_prefixes_to_counts_filename)

# For debugging...
# NOTE(review): generate_for_model assigns its own local `sequences`, so this
# module-level name is not updated by calling it.
sequences = None

# generate with a variety of temperatures, for given model
def generate_for_model(model_name):
    """Generate, save and download trajectories for one trained model.

    For every temperature in `generate_temperatures`, sequences are generated
    for each prefix in `prefixes_to_counts_dict` (its count multiplied by
    `count_multiplier`), written one per line to the file named by
    `get_output_filename`, and that file is then downloaded through Colab.

    Args:
        model_name: Name of the trained model whose saved files are loaded.
    """
    generator = get_model_generator(model_name)
    planned_outputs = [get_output_filename(model_name, t) for t in generate_temperatures]
    print('\nwill generate trajectories for temperatures %s and output to files %s\n' % (generate_temperatures, planned_outputs))
    for temperature in generate_temperatures:
        output_fname = get_output_filename(model_name, temperature)
        print('%s : generating trajectories and saving to file: %s' % (datetime.now(), output_fname))
        sequences = []
        for i, (prefix_labels, count) in enumerate(prefixes_to_counts_dict.items()):
            # Progress report every 100 prefixes.
            if i % 100 == 0:
                print('%s : %s : generated %s sequences...' % (datetime.now(), i, len(sequences)))

            # Add an extra space so that the work prefix label has proper end
            # and model continues to next label
            prefix = '%s ' % prefix_labels
            sequences.extend(
                generate_sequences(generator, temperature, prefix, make_num=count * count_multiplier))
        print('writing sequences to file', output_fname)
        with open(output_fname, 'w') as f:
            f.writelines('{}\n'.format(seq) for seq in sequences)
        print('wrote to file')
        # Make sure the file is finished being written - annoying colab issue
        time.sleep(120)
        files.download(output_fname)
        print('downloaded file')

In [0]:
# Run the generation for the model whose name encodes dropout 0.2 and
# 50-dimensional embeddings.
model_name = 'trajectories-rnn_bidirectional:False-max_len:24-rnn_layers:2-rnn_size:128-dropout:0.2-dim_embeddings:50'
generate_for_model(model_name)

In [0]:
# Manually download the temperature 1.0 output file of the dropout 0.2 /
# 50-dim embeddings model (presumably a retry of the automatic download).
files.download('generated-cambridge-trajectories-rnn_bidirectional:False-max_len:24-rnn_layers:2-rnn_size:128-dropout:0.2-dim_embeddings:50-temperature:1.0.txt')

In [0]:
!ls

In [0]:
# Run the generation for the model whose name encodes dropout 0.1 and
# 100-dimensional embeddings.
model_name = 'trajectories-rnn_bidirectional:False-max_len:24-rnn_layers:2-rnn_size:128-dropout:0.1-dim_embeddings:100'
generate_for_model(model_name)

In [0]:
# Run the generation for the model whose name encodes dropout 0.1 and
# 50-dimensional embeddings.
model_name = 'trajectories-rnn_bidirectional:False-max_len:24-rnn_layers:2-rnn_size:128-dropout:0.1-dim_embeddings:50'
generate_for_model(model_name)
