In [1]:
from __future__ import print_function

import os
import time
import numpy as np
import tensorflow as tf
import pandas as pd
from collections import defaultdict

from sklearn.metrics import roc_auc_score, accuracy_score
import nltk

from correct_text import train, decode, create_model, DefaultPTBConfig, DefaultMovieDialogConfig
from text_correcter_data_readers import PTBDataReader, MovieDialogReader

%matplotlib inline

In [2]:
# Root directory of the dialog corpus. Set the TEXTCORRECTER_DATA_DIR
# environment variable to point somewhere else; the fallback is the original
# hard-coded location, so existing setups behave exactly as before.
root_data_path = os.environ.get(
    "TEXTCORRECTER_DATA_DIR",
    "/Users/atpaino/data/textcorrecter/dialog_corpus")
# Train / validation / test splits of the cleaned corpus.
train_path = os.path.join(root_data_path, "cleaned_dialog_train.txt")
val_path = os.path.join(root_data_path, "cleaned_dialog_val.txt")
test_path = os.path.join(root_data_path, "cleaned_dialog_test.txt")
# Location handed to train() and create_model() for the model's files.
model_path = os.path.join(root_data_path, "dialog_correcter_model")
# Configuration object passed to both the data readers and the model.
config = DefaultMovieDialogConfig()

## Train

In [3]:
# Data reader built from the shared config and the training split; only these
# two arguments are passed here, so every other reader option keeps its default.
data_reader = MovieDialogReader(config, train_path)

In [4]:
# Start training. The recorded log below reports training perplexity every 100
# global steps, followed by an eval perplexity for each bucket.
train(data_reader, train_path, val_path, model_path)

Reading data; train = /Users/atpaino/data/textcorrecter/dialog_corpus/cleaned_dialog_train.txt, test = /Users/atpaino/data/textcorrecter/dialog_corpus/cleaned_dialog_val.txt
Creating 2 layers of 512 units.
Created model with fresh parameters.
Training bucket sizes: [226666, 98064, 56724, 80504]
Total train size: 461958.0
global step 100 learning rate 0.5000 step-time 4.61 perplexity 2143.50
  eval: bucket 0 perplexity 220.79
  eval: bucket 1 perplexity 340.07
  eval: bucket 2 perplexity 390.61
  eval: bucket 3 perplexity 604.10
global step 200 learning rate 0.5000 step-time 4.27 perplexity 194.30
  eval: bucket 0 perplexity 69.65
  eval: bucket 1 perplexity 144.68
  eval: bucket 2 perplexity 218.88
  eval: bucket 3 perplexity 315.74
global step 300 learning rate 0.5000 step-time 4.65 perplexity 94.97
  eval: bucket 0 perplexity 30.62
  eval: bucket 1 perplexity 97.79
  eval: bucket 2 perplexity 158.51
  eval: bucket 3 perplexity 225.04
global step 400 learning rate 0.5000 step-time 4.3

## Decode sentences

In [3]:
# Open an interactive TensorFlow session and build the model inside it.
# Per this cell's recorded output, parameters are read from a checkpoint
# located under model_path.
# NOTE(review): the positional True presumably selects a decode/forward-only
# mode -- confirm against create_model's signature.
sess = tf.InteractiveSession()
model = create_model(sess, True, model_path, config=config)

Reading model parameters from /Users/atpaino/data/textcorrecter/dialog_corpus/dialog_correcter_model/translate.ckpt-20000


In [4]:
# Rebuild the reader for decoding/evaluation with explicit dropout_prob and
# replacement_prob values of 0.9 and a single dataset copy.
# NOTE(review): the exact semantics of these options live in MovieDialogReader
# -- confirm there before changing them.
data_reader = MovieDialogReader(config, train_path, dropout_prob=0.9, replacement_prob=0.9, dataset_copies=1)

In [19]:
# Test a single sample sentence.
# decode_sentence() already calls next() on the generator returned by decode()
# and hands back the decoded result, so it must not be wrapped in another
# next() here.
# NOTE: decode_sentence is defined in a later cell of this notebook; run that
# cell before this one.
decoded = decode_sentence(sess, model, data_reader, "you have girlfriend")

Input: you have girlfriend
Output: you have a girlfriend



In [40]:
# Decode another sample through the decode_str shortcut; the result is kept in
# a throwaway variable.
blah = decode_str("do you have book")

Input: do you have book
Output: do you have a book



In [20]:
def decode_str(s):
    """Decode a single sentence given as one whitespace-separated string.

    Convenience wrapper around decode_sentence() that supplies the notebook's
    global ``sess``, ``model`` and ``data_reader``.

    :param s: the sentence to decode.
    :return: whatever decode_sentence() returns for ``s``.
    """
    # decode_sentence() already calls next() on the decode() generator, so its
    # result is returned unchanged; a second next() would fail because that
    # result is not an iterator.
    return decode_sentence(sess, model, data_reader, s)

In [9]:

def decode_sentence(sess, model, data_reader, sentence, verbose=True):
    """Decode one sentence string; meant for an InteractiveSession in an IPython notebook."""
    tokens = sentence.split()
    # decode() is handed a list of token lists, so wrap the single sentence,
    # then pull the one result out of the returned iterator.
    decodings = decode(sess, model, data_reader, [tokens], verbose)
    return next(decodings)

In [None]:
# The evaluation below uses a data reader built with dropout_prob=0.9 and replacement_prob=0.9.

In [17]:
# Build corpus and decode hypotheses, grouped by the bucket each sample falls in.
baseline_hypotheses = defaultdict(list)  # The model's input
model_hypotheses = defaultdict(list)  # The actual model's predictions
targets = defaultdict(list)

for source, target in data_reader.read_samples_by_string(test_path):

    # Use the first bucket whose first entry is strictly larger than the
    # source length; samples that fit no bucket are skipped.
    bucket_id = next(
        (i for i, bucket in enumerate(model.buckets) if len(source) < bucket[0]),
        None)
    if bucket_id is None:
        continue

    model_hypotheses[bucket_id].append(next(decode(sess, model, data_reader, [source], verbose=False)))

    # Replace out of vocab words with "UNK" in the baseline hypothesis to make it a little fairer.
    baseline_hypothesis = [word if word in data_reader.token_to_id else MovieDialogReader.UNKNOWN_TOKEN
                           for word in source]
    baseline_hypotheses[bucket_id].append(baseline_hypothesis)

    # nltk.translate.bleu_score.corpus_bleu expects a list of one or more
    # reference translations per sample, so we wrap the target list in another
    # list here.
    targets[bucket_id].append([target])

In [18]:
# Report corpus-level BLEU per bucket, once for the unmodified input (baseline)
# and once for the model's output, both scored against the same references.
for bucket_id, references in targets.items():
    baseline_bleu_score = nltk.translate.bleu_score.corpus_bleu(
        references, baseline_hypotheses[bucket_id])
    model_bleu_score = nltk.translate.bleu_score.corpus_bleu(
        references, model_hypotheses[bucket_id])
    bucket = model.buckets[bucket_id]
    print("Bucket {}: {}".format(bucket_id, bucket))
    print("\tBaseline BLEU = {}\n\tModel BLEU = {}".format(baseline_bleu_score, model_bleu_score))

Bucket 0: (10, 10)
	Baseline BLEU = 0.671113372091
	Model BLEU = 0.662576018437
Bucket 1: (15, 15)
	Baseline BLEU = 0.724487231051
	Model BLEU = 0.695375017816
Bucket 2: (20, 20)
	Baseline BLEU = 0.750923521318
	Model BLEU = 0.72081720851
Bucket 3: (40, 40)
	Baseline BLEU = 0.77119960248
	Model BLEU = 0.712245139018
