In [2]:
from __future__ import print_function

import os
import time
import numpy as np
import tensorflow as tf
import pandas as pd
from collections import defaultdict

from sklearn.metrics import roc_auc_score, accuracy_score
import nltk

from correct_text import train, decode, decode_sentence, evaluate_accuracy, create_model,\
    get_corrective_tokens, DefaultPTBConfig, DefaultMovieDialogConfig
from text_correcter_data_readers import PTBDataReader, MovieDialogReader

%matplotlib inline

In [3]:
# Root directory holding the cleaned dialog corpus files and the model directory.
# Set the TEXTCORRECTER_DATA_DIR environment variable to point the notebook at
# another location; when it is unset or empty, the original machine-specific
# path is used so existing setups behave exactly as before.
root_data_path = (os.environ.get("TEXTCORRECTER_DATA_DIR")
                  or "/Users/atpaino/data/textcorrecter/dialog_corpus")
# Train / validation / test split files, all resolved under root_data_path.
train_path = os.path.join(root_data_path, "cleaned_dialog_train.txt")
val_path = os.path.join(root_data_path, "cleaned_dialog_val.txt")
test_path = os.path.join(root_data_path, "cleaned_dialog_test.txt")
# Directory from which model checkpoints are restored (see the
# "Reading model parameters from ..." lines in the recorded outputs).
model_path = os.path.join(root_data_path, "dialog_correcter_model")
# Configuration object shared by the rest of the notebook: it is passed to both
# MovieDialogReader constructions and to create_model below.
config = DefaultMovieDialogConfig()

## Train

In [3]:
# Reader over the training file, built with the reader's default settings.
# This binding is used only for training; the "Decode sentences" section
# rebinds data_reader with explicit dropout/replacement arguments.
data_reader = MovieDialogReader(config, train_path)

In [4]:
# Train using train_path and report evaluation perplexity on val_path (the
# recorded log labels val_path as "test"). The recorded run below restored
# checkpoint step 15000 from model_path, logged per-bucket eval perplexity every
# 100 steps, and was stopped by hand (KeyboardInterrupt).
# NOTE(review): train() did not return on its own in the recorded run — confirm
# whether it has a step limit before relying on "Restart & Run All".
train(data_reader, train_path, val_path, model_path)

Reading data; train = /Users/atpaino/data/textcorrecter/dialog_corpus/cleaned_dialog_train.txt, test = /Users/atpaino/data/textcorrecter/dialog_corpus/cleaned_dialog_val.txt
Creating 2 layers of 512 units.
Reading model parameters from /Users/atpaino/data/textcorrecter/dialog_corpus/dialog_correcter_model/translate.ckpt-15000
Training bucket sizes: [226666, 98064, 56724, 80504]
Total train size: 461958.0
global step 15100 learning rate 0.4049 step-time 4.43 perplexity 1.05
  eval: bucket 0 perplexity 1.04
  eval: bucket 1 perplexity 1.04
  eval: bucket 2 perplexity 1.12
  eval: bucket 3 perplexity 1.67
global step 15200 learning rate 0.4049 step-time 5.28 perplexity 1.12
  eval: bucket 0 perplexity 1.02
  eval: bucket 1 perplexity 1.13
  eval: bucket 2 perplexity 1.15
  eval: bucket 3 perplexity 1.16
global step 15300 learning rate 0.4049 step-time 4.65 perplexity 1.06
  eval: bucket 0 perplexity 1.07
  eval: bucket 1 perplexity 1.05
  eval: bucket 2 perplexity 1.05
  eval: bucket 3 pe

KeyboardInterrupt: 

## Decode sentences

In [4]:
# Rebind data_reader for decoding/evaluation, this time with explicit
# dropout_prob, replacement_prob and dataset_copies arguments.
# NOTE(review): presumably these control how the reader perturbs sentences and
# how many passes over the data it yields — confirm against MovieDialogReader.
data_reader = MovieDialogReader(config, train_path, dropout_prob=0.25, replacement_prob=0.25, dataset_copies=1)

In [5]:
# Derived from the training file via the reader; passed to every
# decode_sentence call and to evaluate_accuracy below.
# NOTE(review): presumably the tokens the decoder may introduce as corrections —
# confirm against get_corrective_tokens.
corrective_tokens = get_corrective_tokens(data_reader, train_path)

In [6]:
# Open a TensorFlow session and build the model from the checkpoint under
# model_path. The recorded output shows step 31300 being restored, which is
# later than the last step (15300) in the recorded training log above, so the
# two sections' outputs come from different runs.
# The session is not closed in any of the cells visible here.
sess = tf.InteractiveSession()
# NOTE(review): the positional True is presumably a forward-only / inference
# flag — confirm against create_model's signature and prefer a keyword argument.
model = create_model(sess, True, model_path, config=config)

Reading model parameters from /Users/atpaino/data/textcorrecter/dialog_corpus/dialog_correcter_model/translate.ckpt-31300


In [7]:
# Test a sample from the test dataset.
# With the default verbosity the call prints the "Input:" / "Output:" pair seen
# below; here the model inserts the missing article ("a girlfriend").
decoded = decode_sentence(sess, model, data_reader, "you must have girlfriend", corrective_tokens=corrective_tokens)

Input: you must have girlfriend
Output: you must have a girlfriend



In [8]:
# Known failure case: the input is already correct, yet the recorded output
# inserts a spurious "a" ("revolutionary a new thing"). The same sentence shows
# up again in the evaluation error listing further down.
decoded = decode_sentence(sess, model, data_reader,
                          "did n't you say that they 're going to develop this revolutionary new thing ...",
                          corrective_tokens=corrective_tokens)

Input: did n't you say that they 're going to develop this revolutionary new thing ...
Output: did n't you say that they 're going to develop this revolutionary a new thing ...



In [9]:
# verbose=False: no "Input:" / "Output:" lines appear in the recorded output;
# the returned token list is shown as the cell result. Here the name "kvothe"
# comes back unchanged and "the" is inserted before "market".
decode_sentence(sess, model, data_reader, "kvothe went to market", corrective_tokens=corrective_tokens, verbose=False)

['kvothe', 'went', 'to', 'the', 'market']

In [10]:
# Made-up words: in the recorded output both nonsense tokens are returned
# verbatim while the missing "the" is still inserted.
decode_sentence(sess, model, data_reader, "blablahblah and bladdddd went to market", corrective_tokens=corrective_tokens,
                verbose=False)

['blablahblah', 'and', 'bladdddd', 'went', 'to', 'the', 'market']

In [13]:
# Missing-article example: the recorded result inserts "a" before "book".
decode_sentence(sess, model, data_reader, "do you have book", corrective_tokens=corrective_tokens, verbose=False)

['do', 'you', 'have', 'a', 'book']

In [12]:
# Word-confusion example: the recorded result replaces "then" with "than".
decode_sentence(sess, model, data_reader, "she did better then him", corrective_tokens=corrective_tokens, verbose=False)

['she', 'did', 'better', 'than', 'him']

In [14]:
# Evaluate on the held-out test file, capped by max_samples=1000. The call
# prints baseline vs. model BLEU and accuracy per bucket and returns `errors`,
# which the next cell iterates as (decoding, target) token-sequence pairs.
# In the recorded output the model beats the baseline on accuracy in every
# bucket, but its BLEU for bucket 2 (0.8884) is below the baseline (0.8989).
# NOTE(review): confirm what the "baseline" is (presumably the unmodified input).
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path, max_samples=1000)

Bucket 0: (10, 10)
	Baseline BLEU = 0.8282
	Model BLEU = 0.8444
	Baseline Accuracy: 0.9041
	Model Accuracy: 0.9355
Bucket 1: (15, 15)
	Baseline BLEU = 0.8794
	Model BLEU = 0.8953
	Baseline Accuracy: 0.7711
	Model Accuracy: 0.8434
Bucket 2: (20, 20)
	Baseline BLEU = 0.8989
	Model BLEU = 0.8884
	Baseline Accuracy: 0.7619
	Model Accuracy: 0.7810
Bucket 3: (40, 40)
	Baseline BLEU = 0.8977
	Model BLEU = 0.9109
	Baseline Accuracy: 0.6000
	Model Accuracy: 0.6480


In [15]:
# Print every (decoding, target) pair returned by evaluate_accuracy, one pair
# per paragraph, so the model's output can be compared with the reference.
for decoding, target in errors:
    decoding_line = "Decoding: " + " ".join(decoding)
    print(decoding_line)
    # The trailing newline leaves a blank line between consecutive pairs.
    target_line = "Target:   " + " ".join(target) + "\n"
    print(target_line)

Decoding: you beg for mercy in a second .
Target:   you 'll beg for mercy in a second .

Decoding: i 'm dying for a shower . you could use the one too . and we 'd better check that bandage .
Target:   i 'm dying for a shower . you could use one too . and we 'd better check that bandage .

Decoding: whatever ... they 've become hotshot computer guys so they get a job to build el computer grande ... skynet ... for the government . right ?
Target:   whatever ... they become the hotshot computer guys so they get the job to build el computer grande ... skynet ... for the government . right ?

Decoding: did n't you say that they 're going to develop this revolutionary a new thing ...
Target:   did n't you say that they 're going to develop this revolutionary new thing ...

Decoding: bag some z ?
Target:   bag some z 's ?

Decoding: sleep . it 'll be a light soon .
Target:   sleep . it 'll be light soon .

Decoding: well , at least i know what to name him . i do n't suppose you 'd know who fa