In [11]:
from __future__ import print_function

import os
import time
import numpy as np
import tensorflow as tf
import pandas as pd
from collections import defaultdict

from sklearn.metrics import roc_auc_score, accuracy_score
import nltk

from correct_text import train, decode, create_model, DefaultConfig, DefaultMovieDialogConfig
from text_correcter_model import PTBDataReader, MovieDialogReader

%matplotlib inline

In [2]:
# Corpus root: holds the cleaned dialog splits and the model checkpoint directory.
root_data_path = "/Users/atpaino/data/textcorrecter/dialog_corpus"

# Resolve every artifact against the corpus root in a single pass.
train_path, val_path, test_path, model_path = (
    os.path.join(root_data_path, leaf)
    for leaf in (
        "cleaned_dialog_train.txt",
        "cleaned_dialog_val.txt",
        "cleaned_dialog_test.txt",
        "dialog_correcter_model",
    )
)

# Shared configuration object handed to both the data reader and the model below.
config = DefaultConfig()

## Train

In [3]:
# Build the movie-dialog reader from the shared config and the training split.
# This same instance is reused further down for training, decoding, vocabulary
# inspection (word_counts / word_to_id) and for reading the test split.
data_reader = MovieDialogReader(config, train_path)

In [None]:
# Kick off training on the train split, with the validation split for the
# periodic per-bucket perplexity reports seen in the log output.
# NOTE(review): presumably writes checkpoints under model_path (a checkpoint is
# later loaded from that directory) — confirm against train()'s implementation.
train(data_reader, train_path, val_path, model_path)

Reading PTB data; train = /Users/atpaino/data/textcorrecter/dialog_corpus/cleaned_dialog_train.txt, test = /Users/atpaino/data/textcorrecter/dialog_corpus/cleaned_dialog_val.txt
Creating 2 layers of 512 units.
Created model with fresh parameters.
Training bucket sizes: [113333, 49032, 28362, 40252]
Total train size: 230979.0
global step 100 learning rate 0.5000 step-time 8.62 perplexity 1746.50
  eval: bucket 0 perplexity 59.71
  eval: bucket 1 perplexity 234.62
  eval: bucket 2 perplexity 265.53
  eval: bucket 3 perplexity 308.15
global step 200 learning rate 0.5000 step-time 3.87 perplexity 92.27
  eval: bucket 0 perplexity 39.02
  eval: bucket 1 perplexity 175.45
  eval: bucket 2 perplexity 184.19
  eval: bucket 3 perplexity 222.94
global step 300 learning rate 0.5000 step-time 4.37 perplexity 78.92
  eval: bucket 0 perplexity 15.84
  eval: bucket 1 perplexity 79.16
  eval: bucket 2 perplexity 112.48
  eval: bucket 3 perplexity 144.01
global step 400 learning rate 0.5000 step-time 3

## Decode sentences

In [4]:
# Open an interactive TensorFlow session and build the model inside it, pointing
# create_model at the checkpoint directory used for training.
# NOTE(review): the positional True is presumably a forward-only / inference
# flag — confirm against create_model's signature.
sess = tf.InteractiveSession()
model = create_model(sess, True, model_path, config=config)

Reading model parameters from /Users/atpaino/data/textcorrecter/dialog_corpus/dialog_correcter_model/translate.ckpt-18900


In [5]:
def dec(s, verbose=True):
    """Run a single sentence through the correction model.

    Wraps ``s`` in a one-element list and forwards it to ``decode`` together
    with the notebook-level ``sess``, ``model`` and ``data_reader``.

    Args:
        s: The sentence to decode.
        verbose: Passed through to ``decode`` unchanged.

    Returns:
        Whatever ``decode`` returns for the one-sentence list.
    """
    single_sentence_batch = [s]
    return decode(sess, model, data_reader, single_sentence_batch, verbose=verbose)

In [7]:
# Spot-check the model on one sentence that is missing an article.
sample_sentences = ["you must have girlfriend"]
decoded = decode(sess, model, data_reader, sample_sentences)

Input: you must have girlfriend
Output: you must have a girlfriend



In [4]:
# Display the 100 highest-count entries of the reader's word_counts as
# (token, count) pairs, to get a feel for the vocabulary.
data_reader.word_counts.most_common(100)

[('.', 273361),
 (',', 134636),
 ('you', 118454),
 ('i', 113224),
 ('?', 87731),
 ('the', 78659),
 ('to', 63891),
 ('a', 56174),
 ("'s", 53300),
 ('it', 52322),
 ("n't", 43814),
 ('...', 40836),
 ('do', 37389),
 ('that', 37061),
 ('and', 36616),
 ('of', 31184),
 ('!', 30120),
 ('what', 29827),
 ('in', 27239),
 ('me', 25549),
 ('is', 24971),
 ('we', 23379),
 ('--', 22434),
 ('he', 21586),
 ('this', 19597),
 ('for', 18709),
 ('have', 18202),
 ("'m", 18166),
 ("'re", 17478),
 ('know', 17341),
 ('was', 16955),
 ('your', 16710),
 ('my', 16560),
 ('not', 15725),
 ('be', 15485),
 ('on', 15470),
 ('no', 15220),
 ('are', 14038),
 ('with', 13742),
 ('but', 13648),
 ('they', 13433),
 ('just', 12635),
 ('all', 12204),
 ('like', 11844),
 ("'ll", 11798),
 ('did', 11560),
 ('there', 11543),
 ('get', 11314),
 ('about', 11179),
 ('so', 10791),
 ('out', 10648),
 ('got', 10635),
 ('here', 10524),
 ('if', 10478),
 ('she', 9736),
 ('him', 9568),
 ('can', 9401),
 ('up', 9334),
 ('how', 9116),
 ('want', 8890

In [12]:
# Walk the test split once and collect, per bucket: the raw input (baseline),
# the model's decoded output, and the reference target.
baseline_hypotheses = defaultdict(list)  # The model's input
model_hypotheses = defaultdict(list)  # The actual model's predictions
targets = defaultdict(list)

for source, target in data_reader.read_words(test_path):
    # Pick the first bucket whose source-side size strictly exceeds the sentence
    # length; pairs that fit no bucket are left out of the evaluation.
    bucket_id = next(
        (idx for idx, sizes in enumerate(model.buckets) if sizes[0] > len(source)),
        None)
    if bucket_id is None:
        continue

    # Decode quietly; dec() returns one result per input sentence, so take the first.
    prediction = dec(" ".join(source), verbose=False)[0]
    model_hypotheses[bucket_id].append(prediction)

    # For a slightly fairer baseline, swap out-of-vocabulary words in the input
    # for the reader's unknown token before scoring it.
    known_words = data_reader.word_to_id
    baseline_hypothesis = [word if word in known_words else MovieDialogReader.UNKNOWN_TOKEN
                           for word in source]
    baseline_hypotheses[bucket_id].append(baseline_hypothesis)

    # nltk.translate.bleu_score.corpus_bleu expects a list of one or more
    # reference translations per sample, hence the extra wrapping list.
    targets[bucket_id].append([target])

In [15]:
# Report corpus-level BLEU per bucket for the model's input (baseline) and for
# the model's decoded output, both scored against the same references.
corpus_bleu = nltk.translate.bleu_score.corpus_bleu

# sorted() keeps the report in ascending bucket order no matter which order the
# buckets were first encountered in (dict iteration order is not relied upon).
for bucket_id in sorted(targets):
    references = targets[bucket_id]
    baseline_bleu_score = corpus_bleu(references, baseline_hypotheses[bucket_id])
    model_bleu_score = corpus_bleu(references, model_hypotheses[bucket_id])
    print("Bucket {}: {}".format(bucket_id, model.buckets[bucket_id]))
    print("\tBaseline BLEU = {}\n\tModel BLEU = {}".format(baseline_bleu_score, model_bleu_score))

Bucket 0: (10, 10)
	Baseline BLEU = 0.747396876291
	Model BLEU = 0.690203743792
Bucket 1: (15, 15)
	Baseline BLEU = 0.778185053349
	Model BLEU = 0.653820987905
Bucket 2: (20, 20)
	Baseline BLEU = 0.806628147208
	Model BLEU = 0.576479857871
Bucket 3: (40, 40)
	Baseline BLEU = 0.832271735442
	Model BLEU = 0.339813197828
