# Load Data

In [182]:
import os, sys
import numpy as np
import pandas as pd
import tensorflow as tf
import datetime
from importlib import reload

import nltk
from nltk.corpus import stopwords
from w266_common import utils, vocabulary, tf_embed_viz, patched_numpy_io
from collections import defaultdict, Counter

from pathlib import Path

In [4]:
tv_show = "friends"

In [14]:
nltk.download('punkt')
nltk.download('stopwords')

root_path = Path().resolve().joinpath("..")
show_data_path = root_path.joinpath("scrape", "data", tv_show, "parsed")
embeddings_path = root_path.joinpath("embeddings", "newscrawl.300d.W.pos.vectors.gz")
embeddings_url = "https://www.dropbox.com/s/kguufyc2xcdi8yk/lexvec.enwiki%2Bnewscrawl.300d.W.pos.vectors.gz?dl=1"

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
print(embeddings_path)

if not os.path.isfile(embeddings_path):
    print("downloading embeddings...")
    urllib.request.urlretrieve(embeddings_url, embeddings_path)

# !gunzip {str(embeddings_path)} 

/home/jovyan/work/analysis/../embeddings/newscrawl.300d.W.pos.vectors.gz


## Load the training data

In [204]:
dialog_datas = []

for filename in os.listdir(show_data_path):
    dialog_data = pd.read_csv(show_data_path.joinpath(filename), header=None, names=("speaker", "utterance"))
    dialog_data["episode"] = filename.split(".")[0]
    dialog_datas.append(dialog_data)
    
all_dialog_data = pd.concat(dialog_datas).dropna().reset_index(drop=True)

In [205]:
print(all_dialog_data.count())
all_dialog_data.head()

speaker      55025
utterance    55025
episode      55025
dtype: int64


Unnamed: 0,speaker,utterance,episode
0,Phoebe,"Oh, hey Joey.",1012
1,Joey,"Uh, hey.",1012
2,Phoebe,"Listen, I need to ask you something. Ok, you k...",1012
3,Joey,Yeah.,1012
4,Phoebe,"Yeah. Well, uhm... listen he was supposed to g...",1012


In [218]:
character_counts = Counter()
for character in all_dialog_data.speaker:
    character_counts[character] += 1
    
top_characters = character_counts.most_common(6)

char_id_to_word = dict(enumerate([w for w, c in top_characters]))
char_word_to_id = {v:k for k,v in char_id_to_word.items()}

major_dialog_data = all_dialog_data[all_dialog_data.speaker.isin(char_word_to_id.keys())]

utterance_tokenized = [word_tokenize(sentence) for sentence in major_dialog_data.utterance]
vocab = vocabulary.Vocabulary(utils.canonicalize_word(w) for w in utils.flatten(utterance_tokenized))

In [236]:
utterances_index = [vocab.words_to_ids(words) for words in utterance_tokenized]

In [268]:
utterances_length = np.array([len(words) for words in utterances_index])

print(max(utterances_length))

def create_one_hot_char(x):
    speaker_index = np.zeros(len(char_word_to_id))
    speaker_index[char_word_to_id[x]] = 1
    return speaker_index




#speaker_index = np.array([create_one_hot_char(speaker) for speaker in major_dialog_data.speaker])
speaker_index = np.array([char_word_to_id[speaker] for speaker in major_dialog_data.speaker])

298


In [263]:
max_len = 40

utterances_index_nparray = np.zeros((len(utterances_index), max_len), dtype=np.int32)
utterances_length = np.zeros([len(utterances_index)], dtype=np.int32)

for i, row in enumerate(utterances_index):
    cpy_len = min(len(row), max_len)
    utterances_index_nparray[i,:cpy_len] = row[:cpy_len]
    utterances_length[i] = cpy_len

In [269]:
print(utterances_index_nparray.shape)
print(utterances_length.shape)
print(speaker_index.shape)

(46079, 40)
(46079,)
(46079,)


In [271]:
import models.neuralbow as neuralbow
reload(neuralbow)

<module 'models.neuralbow' from '/home/jovyan/work/analysis/models/neuralbow.py'>

In [272]:
# Specify model hyperparameters as used by model_fn
model_params = dict(V=vocab.size,
                    embed_dim=50,
                    hidden_dims=[25],
                    num_classes=6,
                    encoder_type='bow',
                    lr=0.1,
                    optimizer='adagrad',
                    beta=0.01)

checkpoint_dir = "/tmp/tf_bow_sst_" + datetime.datetime.now().strftime("%Y%m%d-%H%M")
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)
# Write vocabulary to file, so TensorBoard can label embeddings.
# creates checkpoint_dir/projector_config.pbtxt and checkpoint_dir/metadata.tsv
# ds.vocab.write_projector_config(checkpoint_dir, "Encoder/Embedding_Layer/W_embed")

model = tf.estimator.Estimator(model_fn=neuralbow.classifier_model_fn, 
                               params=model_params,
                               model_dir=checkpoint_dir)
print("")
print("To view training (once it starts), run:\n")
print("    tensorboard --logdir='{:s}' --port 6006".format(checkpoint_dir))
print("\nThen in your browser, open: http://localhost:6006")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tf_bow_sst_20181122-0402', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f87c88a1a58>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

To view training (once it starts), run:

    tensorboard --logdir='/tmp/tf_bow_sst_20181122-0402' --port 6006

Then in your bro

In [273]:
# Training params, just used in this cell for the input_fn-s
train_params = dict(batch_size=32, total_epochs=20, eval_every=2)
assert(train_params['total_epochs'] % train_params['eval_every'] == 0)

# Construct and train the model, saving checkpoints to the directory above.
# Input function for training set batches
# Do 'eval_every' epochs at once, followed by evaluating on the dev set.
# NOTE: use patch_numpy_io.numpy_input_fn instead of tf.estimator.inputs.numpy_input_fn
train_input_fn = patched_numpy_io.numpy_input_fn(
    x={"ids": utterances_index_nparray,
       "ns": utterances_length},
    y=speaker_index,
    batch_size=train_params['batch_size'], 
    num_epochs=train_params['eval_every'], shuffle=True, seed=42
)

# Input function for dev set batches. As above, but:
# - Don't randomize order
# - Iterate exactly once (one epoch)
dev_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"ids": utterances_index_nparray,
       "ns": utterances_length},
    y=speaker_index,
    batch_size=128,
    num_epochs=1,
    shuffle=False)

for _ in range(train_params['total_epochs'] // train_params['eval_every']):
    # Train for a few epochs, then evaluate on dev
    model.train(input_fn=train_input_fn)
    eval_metrics = model.evaluate(input_fn=dev_input_fn, name="dev")

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tf_bow_sst_20181122-0402/model.ckpt.
INFO:tensorflow:loss = 2.76837, step = 1
INFO:tensorflow:global_step/sec: 132.576
INFO:tensorflow:loss = 2.41577, step = 101 (0.744 sec)
INFO:tensorflow:global_step/sec: 225.187
INFO:tensorflow:loss = 2.22552, step = 201 (0.443 sec)
INFO:tensorflow:global_step/sec: 215.155
INFO:tensorflow:loss = 2.08667, step = 301 (0.465 sec)
INFO:tensorflow:global_step/sec: 203.154
INFO:tensorflow:loss = 1.98211, step = 401 (0.494 sec)
INFO:tensorflow:global_step/sec: 206.679
INFO:tensorflow:loss = 1.96327, step = 501 (0.482 sec)
INFO:tensorflow:global_step/sec: 202.621
INFO:tensorflow:loss = 1.90147

In [275]:

test_input_fn = tf.estimator.inputs.numpy_input_fn(
                    x={"ids": utterances_index_nparray, "ns": utterances_length}, y=speaker_index,
                    batch_size=128, num_epochs=1, shuffle=False
                )  # replace with an input_fn, similar to dev_input_fn

eval_metrics = model.evaluate(input_fn=test_input_fn, name="test")  # replace with result of model.evaluate(...)

print("Accuracy on test set: {:.02%}".format(eval_metrics['accuracy']))
eval_metrics

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-11-22-04:08:34
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tf_bow_sst_20181122-0402/model.ckpt-28800
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-11-22-04:08:36
INFO:tensorflow:Saving dict for global step 28800: accuracy = 0.246815, cross_entropy_loss = 1.69052, global_step = 28800, loss = 1.76861
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 28800: /tmp/tf_bow_sst_20181122-0402/model.ckpt-28800
Accuracy on test set: 24.68%


{'accuracy': 0.24681525,
 'cross_entropy_loss': 1.6905236,
 'loss': 1.7686096,
 'global_step': 28800}