In [1]:
from __future__ import print_function

import os
import time
import numpy as np
import tensorflow as tf
import pandas as pd
from collections import defaultdict

from sklearn.metrics import roc_auc_score, accuracy_score
import nltk

from correct_text import train, decode, decode_sentence, evaluate_accuracy, create_model,\
    get_corrective_tokens, DefaultPTBConfig, DefaultMovieDialogConfig
from text_correcter_data_readers import PTBDataReader, MovieDialogReader

%matplotlib inline

In [2]:
# Root directory holding the movie-dialog corpus files and the model checkpoint
# directory. It can be overridden with the TEXTCORRECTER_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original location is used, so existing setups behave as before.
root_data_path = os.environ.get(
    "TEXTCORRECTER_DATA_DIR",
    "/Users/atpaino/data/textcorrecter/dialog_corpus")

# Input files, all resolved relative to the data root.
train_path = os.path.join(root_data_path, "movie_lines.txt")
val_path = os.path.join(root_data_path, "cleaned_dialog_val.txt")
test_path = os.path.join(root_data_path, "cleaned_dialog_test.txt")

# Directory handed to train() / create_model() for the model.
model_path = os.path.join(root_data_path, "dialog_correcter_model_testnltk")

# Configuration object shared by the data reader and the model below.
config = DefaultMovieDialogConfig()

## Train

In [3]:
# Build the reader for the training file using the shared config. This binding
# is replaced in the "Decode sentences" section, which constructs a new reader
# with explicit keyword settings.
data_reader = MovieDialogReader(config, train_path)

In [4]:
# Launch training with the training and validation files, using model_path for
# the model. NOTE(review): the saved output ends in KeyboardInterrupt, so this
# run was stopped by hand and did not complete; expect a long-running cell.
train(data_reader, train_path, val_path, model_path)

Reading data; train = /Users/atpaino/data/textcorrecter/dialog_corpus/movie_lines.txt, test = /Users/atpaino/data/textcorrecter/dialog_corpus/cleaned_dialog_val.txt


KeyboardInterrupt: 

## Decode sentences

In [3]:
# Recreate the reader for decoding/evaluation with explicit settings.
# NOTE(review): dropout_prob / replacement_prob presumably control how often
# tokens are dropped or replaced when generating noisy inputs, and
# dataset_copies how many perturbed passes are produced — confirm against
# text_correcter_data_readers.MovieDialogReader.
data_reader = MovieDialogReader(config, train_path, dropout_prob=0.25, replacement_prob=0.25, dataset_copies=1)

In [5]:
# Derive the corrective tokens from the training file through the reader. The
# result is passed to every decode_sentence / evaluate_accuracy call below.
corrective_tokens = get_corrective_tokens(data_reader, train_path)

In [6]:
# Persist corrective_tokens next to the corpus files.
import pickle

# Pickle output is a byte stream, so the file must be opened in binary mode:
# text mode ("w") raises TypeError on Python 3 and can corrupt the data through
# newline translation on Windows under Python 2.
with open(os.path.join(root_data_path, "corrective_tokens.pickle"), "wb") as f:
    pickle.dump(corrective_tokens, f)

In [8]:
# Persist the reader's token_to_id mapping next to the corpus files.
import pickle

# Pickle output is a byte stream, so the file must be opened in binary mode:
# text mode ("w") raises TypeError on Python 3 and can corrupt the data through
# newline translation on Windows under Python 2.
with open(os.path.join(root_data_path, "token_to_id.pickle"), "wb") as f:
    pickle.dump(data_reader.token_to_id, f)

In [5]:
# Open an interactive TensorFlow session; it stays open for the rest of the
# notebook and is never explicitly closed here.
sess = tf.InteractiveSession()
# Build the model in that session from model_path using the shared config.
# NOTE(review): the positional True is presumably a forward-only (inference)
# flag — confirm against correct_text.create_model. The saved output reports
# restoring from ".../dialog_correcter_model/translate.ckpt-41900", which does
# not match the current model_path ("..._testnltk"), so that output is likely
# from an earlier run.
model = create_model(sess, True, model_path, config=config)

Reading model parameters from /Users/atpaino/data/textcorrecter/dialog_corpus/dialog_correcter_model/translate.ckpt-41900


In [7]:
# Test a sample from the test dataset.
# With the default verbosity the call prints "Input:" / "Output:" lines; the
# saved output shows the missing article being inserted ("a girlfriend").
decoded = decode_sentence(sess, model, data_reader, "you must have girlfriend", corrective_tokens=corrective_tokens)

Input: you must have girlfriend
Output: you must have a girlfriend



In [7]:
# Show the value returned by the preceding decode_sentence call.
# NOTE(review): the saved output is a NameError, meaning this cell was last run
# in a kernel where `decoded` had not been assigned (execution counts in this
# notebook are out of order). Re-run the cells top to bottom to refresh it.
decoded

NameError: name 'decoded' is not defined

In [6]:
# Decode a longer sentence that should need no correction; the saved output
# here is identical to the input. The same sentence also appears in the error
# listing at the end of the notebook, where a different run inserted a
# spurious "a" before "new".
decoded = decode_sentence(sess, model, data_reader,
                          "did n't you say that they 're going to develop this revolutionary new thing ...",
                          corrective_tokens=corrective_tokens)

Input: did n't you say that they 're going to develop this revolutionary new thing ...
Output: did n't you say that they 're going to develop this revolutionary new thing ...



In [9]:
# Spot check with an unusual name as the subject. With verbose=False the saved
# output shows only the returned token list; here "the" is inserted before
# "market" and the name is kept unchanged.
decode_sentence(sess, model, data_reader, "kvothe went to market", corrective_tokens=corrective_tokens, verbose=False)

['kvothe', 'went', 'to', 'the', 'market']

In [10]:
# Spot check with made-up words: the saved output keeps both nonsense tokens
# verbatim and still inserts "the" before "market".
decode_sentence(sess, model, data_reader, "blablahblah and bladdddd went to market", corrective_tokens=corrective_tokens,
                verbose=False)

['blablahblah', 'and', 'bladdddd', 'went', 'to', 'the', 'market']

In [11]:
# Spot check for a missing indefinite article in a question; the saved output
# inserts "a" before "book".
decode_sentence(sess, model, data_reader, "do you have book", corrective_tokens=corrective_tokens, verbose=False)

['do', 'you', 'have', 'a', 'book']

In [10]:
# Spot check for then/than confusion; the saved output replaces "then" with
# "than".
decode_sentence(sess, model, data_reader, "the cardinals did better then the cubs", corrective_tokens=corrective_tokens, verbose=False)

['the', 'cardinals', 'did', 'better', 'than', 'the', 'cubs']

In [6]:
# 4 layers, 40k steps
# Evaluate on the test file. Per the saved output, the call prints baseline and
# model BLEU / accuracy for each bucket; the returned value is kept in `errors`,
# which is overwritten by every evaluation cell. The trailing comment is a
# disabled max_samples=1000 argument for a quicker, partial run.
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

Bucket 0: (10, 10)
	Baseline BLEU = 0.8354
	Model BLEU = 0.8492
	Baseline Accuracy: 0.9090
	Model Accuracy: 0.9354
Bucket 1: (15, 15)
	Baseline BLEU = 0.8826
	Model BLEU = 0.8595
	Baseline Accuracy: 0.8055
	Model Accuracy: 0.8149
Bucket 2: (20, 20)
	Baseline BLEU = 0.8880
	Model BLEU = 0.8216
	Baseline Accuracy: 0.7301
	Model Accuracy: 0.6689
Bucket 3: (40, 40)
	Baseline BLEU = 0.9097
	Model BLEU = 0.6357
	Baseline Accuracy: 0.5981
	Model Accuracy: 0.2283


In [9]:
# 4 layers, 30k steps
# Same evaluation call as the other runs; only the loaded checkpoint differs
# (per the label above). `errors` is overwritten. The trailing comment is a
# disabled max_samples=1000 argument for a quicker, partial run.
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

Bucket 0: (10, 10)
	Baseline BLEU = 0.8368
	Model BLEU = 0.8425
	Baseline Accuracy: 0.9110
	Model Accuracy: 0.9303
Bucket 1: (15, 15)
	Baseline BLEU = 0.8818
	Model BLEU = 0.8459
	Baseline Accuracy: 0.8063
	Model Accuracy: 0.8014
Bucket 2: (20, 20)
	Baseline BLEU = 0.8891
	Model BLEU = 0.7986
	Baseline Accuracy: 0.7309
	Model Accuracy: 0.6281
Bucket 3: (40, 40)
	Baseline BLEU = 0.9099
	Model BLEU = 0.5997
	Baseline Accuracy: 0.6007
	Model Accuracy: 0.1607


In [13]:
# 4 layers, 20k steps
# Same evaluation call as the other runs; only the loaded checkpoint differs
# (per the label above). `errors` is overwritten. The trailing comment is a
# disabled max_samples=1000 argument for a quicker, partial run.
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

Bucket 0: (10, 10)
	Baseline BLEU = 0.8330
	Model BLEU = 0.8335
	Baseline Accuracy: 0.9067
	Model Accuracy: 0.9218
Bucket 1: (15, 15)
	Baseline BLEU = 0.8772
	Model BLEU = 0.8100
	Baseline Accuracy: 0.7980
	Model Accuracy: 0.7437
Bucket 2: (20, 20)
	Baseline BLEU = 0.8898
	Model BLEU = 0.7636
	Baseline Accuracy: 0.7366
	Model Accuracy: 0.5370
Bucket 3: (40, 40)
	Baseline BLEU = 0.9098
	Model BLEU = 0.5387
	Baseline Accuracy: 0.6041
	Model Accuracy: 0.1117


In [16]:
# NOTE(review): unlike the runs above, this one has no label recording which
# model configuration / checkpoint produced it. Its saved output is the only
# one where the model beats the baseline accuracy in every bucket, so the
# configuration should be written down here.
# `errors` is overwritten; the trailing comment is a disabled max_samples=1000
# argument for a quicker, partial run.
errors = evaluate_accuracy(sess, model, data_reader, corrective_tokens, test_path)#, max_samples=1000)

Bucket 0: (10, 10)
	Baseline BLEU = 0.8341
	Model BLEU = 0.8516
	Baseline Accuracy: 0.9083
	Model Accuracy: 0.9384
Bucket 1: (15, 15)
	Baseline BLEU = 0.8850
	Model BLEU = 0.8860
	Baseline Accuracy: 0.8156
	Model Accuracy: 0.8491
Bucket 2: (20, 20)
	Baseline BLEU = 0.8876
	Model BLEU = 0.8880
	Baseline Accuracy: 0.7291
	Model Accuracy: 0.7817
Bucket 3: (40, 40)
	Baseline BLEU = 0.9099
	Model BLEU = 0.9045
	Baseline Accuracy: 0.6073
	Model Accuracy: 0.6425


In [15]:
# Walk the (decoding, target) token-sequence pairs held in `errors` and print
# each pair as two aligned lines followed by a blank separator line.
for decoding, target in errors:
    # print() joins its arguments with a single space, which supplies the gap
    # after each label.
    print("Decoding:", " ".join(decoding))
    # Two spaces after the colon plus the separator keep the text aligned with
    # the line above; the doubled newline leaves an empty line between pairs.
    print("Target:  ", " ".join(target), end="\n\n")

Decoding: you beg for mercy in a second .
Target:   you 'll beg for mercy in a second .

Decoding: i 'm dying for a shower . you could use the one too . and we 'd better check that bandage .
Target:   i 'm dying for a shower . you could use one too . and we 'd better check that bandage .

Decoding: whatever ... they 've become hotshot computer guys so they get a job to build el computer grande ... skynet ... for the government . right ?
Target:   whatever ... they become the hotshot computer guys so they get the job to build el computer grande ... skynet ... for the government . right ?

Decoding: did n't you say that they 're going to develop this revolutionary a new thing ...
Target:   did n't you say that they 're going to develop this revolutionary new thing ...

Decoding: bag some z ?
Target:   bag some z 's ?

Decoding: sleep . it 'll be a light soon .
Target:   sleep . it 'll be light soon .

Decoding: well , at least i know what to name him . i do n't suppose you 'd know who fa