In [1]:
import tensorflow as tf
import numpy as np
import argparse
from datetime import datetime

from data_loader import DataGenerator, ParaphraseDataGenerator

from trainer import ParaphraseDetectionTrainer
from preprocessor import DynamicPreprocessor
from utils.dirs import create_dirs
from utils.logger import SummaryWriter
from utils.config import load_config, save_config
from models.base import get_model
from utils.utils import JamoProcessor

In [2]:
import editdistance

In [3]:
class Config:
    def __init__(self):
        self.mode = "train"
        self.name = "start"
        self.config = ""
        self.train_dir = "/home/angrypark/paraphrase_detection/data/train/"
        self.val_dir = "/home/angrypark/paraphrase_detection/data/small/val.txt"
        
        self.pretrained_embed_dir = "/media/scatter/scatterdisk/pretrained_embedding/fasttext.sent_piece_100K.256D"
        self.checkpoint_dir = "/media/scatter/scatterdisk/paraphrase_detection/runs/"
        
        self.model = "DeepParaphrase"
        self.sent_piece_model = "/media/scatter/scatterdisk/tokenizer/sent_piece.100K.model"
        self.normalizer = "DummyNormalizer"
        self.tokenizer = "DummyTokenizer"
        
        self.vocab_size = 90000
        self.vocab_list = "/media/scatter/scatterdisk/pretrained_embedding/vocab_list.sent_piece_100K.txt"
        self.embed_dim = 256
        
        self.learning_rate = 1e-3
        self.min_length = 1
        self.max_length = 20
        
        self.dropout_keep_prob = 0.75
        
        self.num_epochs = 300
        self.batch_size = 256
        self.evaluate_every = 10000
        self.save_every = 10000
        self.max_to_keep = 4
        
        self.shuffle = True
        self.extra_features_dim = 2

config = Config()

In [4]:
config = create_dirs(config)

In [5]:
device_config = tf.ConfigProto()
device_config.gpu_options.allow_growth = True
sess = tf.Session(config=device_config)

In [6]:
preprocessor = DynamicPreprocessor(config)

In [7]:
data = ParaphraseDataGenerator(preprocessor, config)
summary_writer = SummaryWriter(sess, config)

Pre-trained embedding loaded. Number of OOV : 5272 / 90000


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/reply_matching_model/runs/delstm_1024_nsrandom4_lr1e-3/best_loss/best_loss.ckpt


In [8]:
trainer = ParaphraseDetectionTrainer(sess, preprocessor, data, config, summary_writer)

In [9]:
trainer.train()

[32m[23:05:26][INFO] Building train graph... [0m


Pre-trained embedding loaded. Number of OOV : 5272 / 90000


[32m[23:05:32][INFO] Loading checkpoint from /media/scatter/scatterdisk/paraphrase_detection/runs/start/ [0m
[31m[23:05:32][ERROR] No checkpoint found in /media/scatter/scatterdisk/paraphrase_detection/runs/start/ [0m
[32m[23:05:32][INFO] Building val graph... [0m


Pre-trained embedding loaded. Number of OOV : 5272 / 90000


100%|██████████| 1172/1172 [01:32<00:00, 12.70it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:04<00:00, 15.87it/s]
100%|██████████| 1172/1172 [01:30<00:00, 12.98it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:04<00:00, 15.90it/s]
100%|██████████| 1172/1172 [01:31<00:00, 12.82it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:04<00:00, 15.44it/s]
100%|██████████| 1172/1172 [01:31<00:00, 12.82it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:05<00:00, 14.93it/s]
100%|██████████| 1172/1172 [01:30<00:00, 12.95it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:04<00:00, 15.41it/s]
100%|██████████| 1172/1172 [01:32<00:00, 12.72it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:04<00:00, 15.52it/s]
100%|██████████| 1172/1172 [01:31<00:00, 12.83it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:04<00:00, 16.08it/s]
100%|██████████| 1172/1172 [01:30<00:00, 12.99it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:05<00:00, 14.51it/s]
 53%|█████▎    | 622/1172 [00:57<00:50, 10.88it/s]

INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt



  0%|          | 0/76 [00:00<?, ?it/s][A
  1%|▏         | 1/76 [00:01<01:32,  1.23s/it][A
  5%|▌         | 4/76 [00:01<00:24,  2.92it/s][A
  8%|▊         | 6/76 [00:01<00:17,  4.06it/s][A
 11%|█         | 8/76 [00:01<00:13,  5.03it/s][A
 14%|█▍        | 11/76 [00:01<00:10,  6.26it/s][A
 18%|█▊        | 14/76 [00:01<00:08,  7.48it/s][A
 22%|██▏       | 17/76 [00:02<00:06,  8.44it/s][A
 25%|██▌       | 19/76 [00:02<00:06,  8.95it/s][A
 29%|██▉       | 22/76 [00:02<00:05,  9.82it/s][A
 33%|███▎      | 25/76 [00:02<00:04, 10.46it/s][A
 37%|███▋      | 28/76 [00:02<00:04, 11.01it/s][A
 41%|████      | 31/76 [00:02<00:03, 11.44it/s][A
 45%|████▍     | 34/76 [00:02<00:03, 12.00it/s][A
 49%|████▊     | 37/76 [00:02<00:03, 12.49it/s][A
 53%|█████▎    | 40/76 [00:03<00:02, 12.79it/s][A
 57%|█████▋    | 43/76 [00:03<00:02, 13.26it/s][A
 61%|██████    | 46/76 [00:03<00:02, 13.63it/s][A
 64%|██████▍   | 49/76 [00:03<00:01, 13.88it/s][A
 68%|██████▊   | 52/76 [00:03<00:01, 14.04i

INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:04<00:00, 16.56it/s]
100%|██████████| 1172/1172 [01:30<00:00, 12.93it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:04<00:00, 15.80it/s]
100%|██████████| 1172/1172 [01:31<00:00, 12.79it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:05<00:00, 14.98it/s]
100%|██████████| 1172/1172 [01:31<00:00, 12.88it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:04<00:00, 16.47it/s]
100%|██████████| 1172/1172 [01:32<00:00, 12.72it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start/model.ckpt


100%|██████████| 76/76 [00:04<00:00, 15.79it/s]
  0%|          | 0/1172 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [10]:
import os

In [11]:
os.environ['CUDA_VISIBLE_DEVICES']="1"

In [13]:
infer_model, infer_sess = trainer.build_graph(name="train")

[32m[23:28:09][INFO] Building train graph... [0m


Pre-trained embedding loaded. Number of OOV : 5272 / 90000


[32m[23:28:16][INFO] Loading checkpoint from /media/scatter/scatterdisk/paraphrase_detection/runs/start/ [0m
[31m[23:28:16][ERROR] No checkpoint found in /media/scatter/scatterdisk/paraphrase_detection/runs/start/ [0m


In [None]:
infer_model.load(infer_sess, model_dir = "/media/scatter/scatterdisk/paraphrase_detection/runs/start/best_loss/best_loss.ckpt")

In [38]:
val_iterator = data.get_val_iterator(config.batch_size)

In [39]:
score = list()
while True:
    try:
        batch_A, batch_B, batch_sentence_diff, batch_extra_features, labels = next(val_iterator)
        feed_dict = {infer_model.input_A: batch_A,
                 infer_model.input_B: batch_B,
                 infer_model.sentence_vector_diff: batch_sentence_diff,
                 infer_model.extra_features: batch_extra_features,
                 infer_model.dropout_keep_prob: 1,
                 infer_model.labels: [int(l) for l in labels]
                }
        predictions = infer_sess.run([infer_model.predictions], feed_dict=feed_dict)
        score.append(np.mean(np.equal(predictions, np.expand_dims([int(l) for l in labels], -1))))
    except:
        print(np.mean(score))
        break

0.5878539121240601


In [24]:
feed_dict = {infer_model.input_A: batch_A,
             infer_model.input_B: batch_B,
             infer_model.sentence_vector_diff: batch_sentence_diff,
             infer_model.extra_features: batch_extra_features,
             infer_model.dropout_keep_prob: 1,
             infer_model.labels: [int(l) for l in labels]
            }

In [25]:
_, loss, score = infer_graph.run([infer_model.train_step, infer_model.loss, infer_model.score],
                                     feed_dict=feed_dict)

In [28]:
probs = infer_graph.run([infer_model.probs],
                                     feed_dict=feed_dict)

In [30]:
prediction = infer_graph.run([infer_model.predictions], feed_dict=feed_dict)

In [33]:
labels = [int(l) for l in labels]

In [36]:
labels = np.expand_dims(labels, -1)

In [38]:
np.mean(np.equal(prediction, labels))

0.85546875