In [3]:
from __future__ import print_function

import os
import time
import numpy as np
import tensorflow as tf
import pandas as pd
from collections import defaultdict

from sklearn.metrics import roc_auc_score, accuracy_score
import nltk

from correct_text import train, decode, create_model, DefaultConfig, DefaultMovieDialogConfig
from text_correcter_data_readers import PTBDataReader, MovieDialogReader

%matplotlib inline

In [4]:
# Root directory of the dialog corpus. Set the TEXT_CORRECTER_DATA_DIR environment variable
# to point the notebook at a different location; when it is unset, the path below is used.
root_data_path = os.environ.get("TEXT_CORRECTER_DATA_DIR",
                                "/Users/atpaino/data/textcorrecter/dialog_corpus")
# Cleaned movie-dialog splits (the PTB alternatives would be "ptb.train.txt" / "ptb.valid.txt").
train_path = os.path.join(root_data_path, "cleaned_dialog_train.txt")
val_path = os.path.join(root_data_path, "cleaned_dialog_val.txt")
test_path = os.path.join(root_data_path, "cleaned_dialog_test.txt")
# Directory holding the model checkpoints (the "Reading model parameters from ..." log
# lines further down point into it).
model_path = os.path.join(root_data_path, "dialog_correcter_model")
config = DefaultMovieDialogConfig()

## Train

In [5]:
# Reader over the training corpus. No dropout/replacement arguments are passed here, so
# whatever defaults MovieDialogReader defines are in effect.
data_reader = MovieDialogReader(config, train_path)

In [6]:
# Run training with the train and validation splits, using model_path as the checkpoint
# location (the log below shows parameters being restored from translate.ckpt-9100 there).
# NOTE(review): the captured output ends in KeyboardInterrupt, i.e. this run did not finish.
train(data_reader, train_path, val_path, model_path)

Reading data; train = /Users/atpaino/data/textcorrecter/dialog_corpus/cleaned_dialog_train.txt, test = /Users/atpaino/data/textcorrecter/dialog_corpus/cleaned_dialog_val.txt
Creating 2 layers of 512 units.
Reading model parameters from /Users/atpaino/data/textcorrecter/dialog_corpus/dialog_correcter_model/translate.ckpt-9100
Training bucket sizes: [226666, 98064, 56724, 80504]
Total train size: 461958.0
global step 9200 learning rate 0.4522 step-time 24.66 perplexity 1.29
  eval: bucket 0 perplexity 1.21
  eval: bucket 1 perplexity 1.31
  eval: bucket 2 perplexity 1.44
  eval: bucket 3 perplexity 1.61
global step 9300 learning rate 0.4522 step-time 4.73 perplexity 1.31
  eval: bucket 0 perplexity 1.25
  eval: bucket 1 perplexity 1.25
  eval: bucket 2 perplexity 1.30
  eval: bucket 3 perplexity 1.61
global step 9400 learning rate 0.4522 step-time 4.42 perplexity 1.27
  eval: bucket 0 perplexity 1.21
  eval: bucket 1 perplexity 1.23
  eval: bucket 2 perplexity 1.34
  eval: bucket 3 perpl

KeyboardInterrupt: 

## Decode sentences

In [3]:
# Open an interactive TensorFlow session and build the model in it; the output below shows
# the parameters being read from a checkpoint under model_path.
# NOTE(review): the positional `True` presumably selects forward-only (inference) mode —
# confirm against create_model in correct_text.
sess = tf.InteractiveSession()
model = create_model(sess, True, model_path, config=config)

Reading model parameters from /Users/atpaino/data/textcorrecter/dialog_corpus/dialog_correcter_model/translate.ckpt-6700


In [4]:
# Rebuild the reader with dropout and replacement probabilities of 0.9 and a single copy
# of the dataset.
# NOTE(review): the exact effect of these options is defined in text_correcter_data_readers — confirm.
data_reader = MovieDialogReader(config, train_path, dropout_prob=0.9, replacement_prob=0.9, dataset_copies=1)

In [5]:
def dec(s, verbose=True):
    """Decode a single sentence with the notebook's session, model and data reader.

    Args:
        s: The sentence to decode; it is passed to `decode` as a one-element list.
        verbose: Forwarded unchanged to `decode`.

    Returns:
        Whatever `decode` returns for the one-sentence batch (callers in this notebook
        take element 0 of it).
    """
    single_sentence_batch = [s]
    return decode(sess, model, data_reader, single_sentence_batch, verbose=verbose)

In [8]:
# Smoke test: decode a single sentence and keep the result in `decoded`.
decoded = decode(sess, model, data_reader,
                 ["you have girlfriend"])

Input: you have girlfriend
Output: you have a



In [3]:
# Attempt to build a reader directly over "movie_lines.txt" instead of the cleaned splits.
# NOTE(review): this cell fails with the UnicodeDecodeError shown below (byte 0x97 cannot be
# decoded as ASCII); the file presumably needs to be decoded/cleaned first — confirm.
data_reader = MovieDialogReader(config, os.path.join(root_data_path, "movie_lines.txt"))

UnicodeDecodeError: 'ascii' codec can't decode byte 0x97 in position 11: ordinal not in range(128)

In [None]:
# Dropout and replacement rates of 0.9

In [16]:
# Build the per-bucket corpora needed for BLEU scoring and decode the model's hypotheses.
# Only a small prefix of the test set is scored here so the cell finishes quickly.
baseline_hypotheses = defaultdict(list)  # The model's input
model_hypotheses = defaultdict(list)  # The actual model's predictions
targets = defaultdict(list)  # Reference sentences, each wrapped in a one-element list
sample_limit = 100  # The loop stops as soon as more than this many samples have been scored
num_scored = 0  # Samples scored so far (sentences that fit no bucket are not counted)

for source, target in data_reader.read_words(test_path):

    # Use the first bucket whose first element is strictly greater than the source length;
    # a sentence that fits no bucket is skipped.
    bucket_id = next((i for i, bucket in enumerate(model.buckets) if len(source) < bucket[0]), None)
    if bucket_id is None:
        continue

    model_hypotheses[bucket_id].append(dec(" ".join(source), verbose=False)[0])

    # Replace out-of-vocabulary words with the reader's unknown token in the baseline
    # hypothesis to make the comparison a little fairer.
    baseline_hypothesis = [word if word in data_reader.word_to_id else MovieDialogReader.UNKNOWN_TOKEN
                           for word in source]
    baseline_hypotheses[bucket_id].append(baseline_hypothesis)

    # nltk's corpus_bleu expects a list of one or more reference translations per sample,
    # so we wrap the target list in another list here.
    targets[bucket_id].append([target])

    num_scored += 1
    if num_scored > sample_limit:
        break

In [17]:
# Report corpus-level BLEU per bucket for the baseline (the model's input) and for the
# model's output. Buckets are visited in ascending id order so the report does not depend
# on dictionary iteration order.
for bucket_id in sorted(targets):
    bucket_references = targets[bucket_id]
    baseline_bleu_score = nltk.translate.bleu_score.corpus_bleu(bucket_references, baseline_hypotheses[bucket_id])
    model_bleu_score = nltk.translate.bleu_score.corpus_bleu(bucket_references, model_hypotheses[bucket_id])
    print("Bucket {}: {}".format(bucket_id, model.buckets[bucket_id]))
    print("\tBaseline BLEU = {}\n\tModel BLEU = {}".format(baseline_bleu_score, model_bleu_score))

Bucket 0: (10, 10)
	Baseline BLEU = 0.67347412776
	Model BLEU = 0.59654088672
Bucket 1: (15, 15)
	Baseline BLEU = 0.725270294806
	Model BLEU = 0.619899301746
Bucket 2: (20, 20)
	Baseline BLEU = 0.685227195136
	Model BLEU = 0.318128969106
Bucket 3: (40, 40)
	Baseline BLEU = 0.735841140618
	Model BLEU = 0.280921503555


In [None]:
# Default dropout/replacement rates of 0.25

In [7]:
# Assemble, per bucket, the three parallel corpora that BLEU scoring needs, decoding
# every test sample that fits a bucket.
baseline_hypotheses = defaultdict(list)  # The model's input
model_hypotheses = defaultdict(list)  # The actual model's predictions
targets = defaultdict(list)  # Reference sentences

for source, target in data_reader.read_words(test_path):

    # First bucket whose first element is strictly larger than the source length, if any.
    bucket_id = next(
        (idx for idx, bucket in enumerate(model.buckets) if bucket[0] > len(source)),
        None)
    if bucket_id is None:
        # No bucket can hold this sentence, so it is left out of the evaluation.
        continue

    decoded_batch = dec(" ".join(source), verbose=False)
    model_hypotheses[bucket_id].append(decoded_batch[0])

    # The baseline is the source itself, with out-of-vocabulary words swapped for the
    # reader's unknown token to make the comparison a little fairer.
    baseline_tokens = []
    for token in source:
        if token in data_reader.word_to_id:
            baseline_tokens.append(token)
        else:
            baseline_tokens.append(MovieDialogReader.UNKNOWN_TOKEN)
    baseline_hypotheses[bucket_id].append(baseline_tokens)

    # nltk's corpus_bleu expects a list of one or more reference translations per sample,
    # hence the extra list around the target.
    targets[bucket_id].append([target])

In [8]:
# Report corpus-level BLEU per bucket for the baseline (the model's input) and for the
# model's output. Buckets are visited in ascending id order so the report does not depend
# on dictionary iteration order.
for bucket_id in sorted(targets):
    bucket_references = targets[bucket_id]
    baseline_bleu_score = nltk.translate.bleu_score.corpus_bleu(bucket_references, baseline_hypotheses[bucket_id])
    model_bleu_score = nltk.translate.bleu_score.corpus_bleu(bucket_references, model_hypotheses[bucket_id])
    print("Bucket {}: {}".format(bucket_id, model.buckets[bucket_id]))
    print("\tBaseline BLEU = {}\n\tModel BLEU = {}".format(baseline_bleu_score, model_bleu_score))

Bucket 0: (10, 10)
	Baseline BLEU = 0.729638310326
	Model BLEU = 0.695741131167
Bucket 1: (15, 15)
	Baseline BLEU = 0.769472470187
	Model BLEU = 0.678890611587
Bucket 2: (20, 20)
	Baseline BLEU = 0.798002323422
	Model BLEU = 0.621063999468
Bucket 3: (40, 40)
	Baseline BLEU = 0.822407479385
	Model BLEU = 0.386759652473
