In [1]:
import tensorflow as tf
import numpy as np
import argparse
from datetime import datetime

from data_loader import DataGenerator, ParaphraseDataGenerator

from trainer import ParaphraseDetectionTrainer
from preprocessor import DynamicPreprocessor
from utils.dirs import create_dirs
from utils.logger import SummaryWriter
from utils.config import load_config, save_config
from models.base import get_model
from utils.utils import JamoProcessor

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES']="1"

In [3]:
import editdistance

In [4]:
class Config:
    def __init__(self):
        self.mode = "train"
        self.name = "start3"
        self.config = ""
        self.train_dir = "/home/angrypark/paraphrase_detection/data/train/"
        self.val_dir = "/home/angrypark/paraphrase_detection/data/small/test.txt"
        
        self.pretrained_embed_dir = "/media/scatter/scatterdisk/pretrained_embedding/fasttext.sent_piece_100K.256D"
        self.checkpoint_dir = "/media/scatter/scatterdisk/paraphrase_detection/runs/"
        
        self.model = "DeepParaphrase"
        self.sent_piece_model = "/media/scatter/scatterdisk/tokenizer/sent_piece.100K.model"
        self.normalizer = "DummyNormalizer"
        self.tokenizer = "DummyTokenizer"
        
        self.vocab_size = 90000
        self.vocab_list = "/media/scatter/scatterdisk/pretrained_embedding/vocab_list.sent_piece_100K.txt"
        self.embed_dim = 256
        
        self.learning_rate = 1e-3
        self.min_length = 1
        self.max_length = 20
        
        self.dropout_keep_prob = 0.8
        
        self.num_epochs = 300
        self.batch_size = 256
        self.evaluate_every = 10000
        self.save_every = 10000
        self.max_to_keep = 4
        
        self.shuffle = True
        self.extra_features_dim = 2

config = Config()

In [5]:
config = create_dirs(config)

In [6]:
device_config = tf.ConfigProto()
device_config.gpu_options.allow_growth = True
sess = tf.Session(config=device_config)

In [7]:
preprocessor = DynamicPreprocessor(config)

In [8]:
data = ParaphraseDataGenerator(preprocessor, config)
summary_writer = SummaryWriter(sess, config)

Pre-trained embedding loaded. Number of OOV : 5272 / 90000


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/reply_matching_model/runs/delstm_1024_nsrandom4_lr1e-3/best_loss/best_loss.ckpt


In [9]:
trainer = ParaphraseDetectionTrainer(sess, preprocessor, data, config, summary_writer)

In [10]:
trainer.train()

[32m[20:18:10][INFO] Building train graph... [0m


Pre-trained embedding loaded. Number of OOV : 5272 / 90000


[32m[20:18:16][INFO] Loading checkpoint from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/ [0m


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


[32m[20:18:16][INFO] Building val graph... [0m


Pre-trained embedding loaded. Number of OOV : 5272 / 90000


100%|██████████| 1248/1248 [01:39<00:00, 12.52it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


100%|██████████| 60/60 [00:03<00:00, 16.07it/s]
100%|██████████| 1248/1248 [01:37<00:00, 12.80it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


100%|██████████| 60/60 [00:03<00:00, 16.25it/s]
100%|██████████| 1248/1248 [01:38<00:00, 12.64it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


100%|██████████| 60/60 [00:03<00:00, 15.80it/s]
100%|██████████| 1248/1248 [01:38<00:00, 12.70it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


100%|██████████| 60/60 [00:03<00:00, 16.41it/s]
  1%|          | 13/1248 [00:21<34:28,  1.67s/it]

INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt



  0%|          | 0/60 [00:00<?, ?it/s][A
  2%|▏         | 1/60 [00:00<00:47,  1.24it/s][A
  5%|▌         | 3/60 [00:00<00:17,  3.28it/s][A
  8%|▊         | 5/60 [00:01<00:11,  4.92it/s][A
 13%|█▎        | 8/60 [00:01<00:07,  6.84it/s][A
 18%|█▊        | 11/60 [00:01<00:05,  8.41it/s][A
 23%|██▎       | 14/60 [00:01<00:04,  9.54it/s][A
 28%|██▊       | 17/60 [00:01<00:04, 10.62it/s][A
 33%|███▎      | 20/60 [00:01<00:03, 11.60it/s][A
 38%|███▊      | 23/60 [00:01<00:03, 12.26it/s][A
 43%|████▎     | 26/60 [00:02<00:02, 12.96it/s][A
 48%|████▊     | 29/60 [00:02<00:02, 13.49it/s][A
 53%|█████▎    | 32/60 [00:02<00:01, 14.05it/s][A
 58%|█████▊    | 35/60 [00:02<00:01, 14.41it/s][A
 63%|██████▎   | 38/60 [00:02<00:01, 14.88it/s][A
 68%|██████▊   | 41/60 [00:02<00:01, 15.28it/s][A
 73%|███████▎  | 44/60 [00:02<00:01, 15.53it/s][A
 78%|███████▊  | 47/60 [00:02<00:00, 15.78it/s][A
 83%|████████▎ | 50/60 [00:03<00:00, 16.07it/s][A
 88%|████████▊ | 53/60 [00:03<00:00, 16.36i

INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


100%|██████████| 60/60 [00:03<00:00, 15.73it/s]
100%|██████████| 1248/1248 [01:37<00:00, 12.74it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


100%|██████████| 60/60 [00:03<00:00, 15.38it/s]
100%|██████████| 1248/1248 [01:37<00:00, 12.76it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


100%|██████████| 60/60 [00:03<00:00, 16.16it/s]
100%|██████████| 1248/1248 [01:38<00:00, 12.69it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


100%|██████████| 60/60 [00:03<00:00, 16.60it/s]
100%|██████████| 1248/1248 [01:38<00:00, 12.66it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


100%|██████████| 60/60 [00:03<00:00, 15.33it/s]
100%|██████████| 1248/1248 [01:42<00:00, 12.20it/s]


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


100%|██████████| 60/60 [00:04<00:00, 14.93it/s]
  3%|▎         | 39/1248 [00:23<12:15,  1.64it/s]

KeyboardInterrupt: 

In [11]:
infer_model, infer_sess = trainer.build_graph(name="train")

[32m[20:36:51][INFO] Building train graph... [0m


Pre-trained embedding loaded. Number of OOV : 5272 / 90000


[32m[20:36:57][INFO] Loading checkpoint from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/ [0m


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/model.ckpt


In [12]:
infer_model.load(infer_sess, model_dir = "/media/scatter/scatterdisk/paraphrase_detection/runs/start3/best_loss/best_loss.ckpt")

INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/paraphrase_detection/runs/start3/best_loss/best_loss.ckpt


  3%|▎         | 39/1248 [00:40<20:40,  1.03s/it]

In [16]:
from collections import namedtuple
from text.tokenizers import SentencePieceTokenizer

In [17]:
Config = namedtuple("config", ["sent_piece_model"])
config = Config("/media/scatter/scatterdisk/tokenizer/sent_piece.100K.model")
processor = JamoProcessor()
tokenizer = SentencePieceTokenizer(config)

In [24]:
def infer(A, B):
    probs = list()
    A = [" ".join(tokenizer.tokenize(a)) for a in A]
    B = [" ".join(tokenizer.tokenize(b)) for b in B]
    extracted_features = data.feature_extractor.extract_features(A, B)
    A, A_lengths = zip(*[data.preprocessor.preprocess(a) for a in A])
    B, B_lengths = zip(*[data.preprocessor.preprocess(b) for b in B])
    
    length = len(A)
    batch_size = 512
    num_batches = (length-1)//batch_size + 1
    
    for batch_num in range(num_batches):
        start = batch_num * batch_size
        end = min([length, (batch_num+1) * batch_size])
        batch_A, batch_B = A[start:end], B[start:end]
        batch_A_lengths, batch_B_lengths = A_lengths[start:end], B_lengths[start:end]
        batch_sentence_diff = data.get_sentence_diff(data.pretrained_model,
                                                    data.pretrained_sess,
                                                    batch_A, 
                                                    batch_B,
                                                    batch_A_lengths,
                                                    batch_B_lengths)
        batch_extra_features = extracted_features[start:end]
        
        feed_dict = {infer_model.input_A: batch_A, 
                     infer_model.input_B: batch_B, 
                     infer_model.sentence_vector_diff: batch_sentence_diff,
                     infer_model.extra_features: batch_extra_features,
                     infer_model.dropout_keep_prob: 1}
        batch_probs = infer_sess.run(infer_model.probs, feed_dict=feed_dict)
        probs += [p[0] for p in batch_probs]
    return probs

In [20]:
with open("../data/test_queries.txt", "r") as f:
    _, test_queries = zip(*[line.strip().split("\t") for line in f])

with open("../data/test_replies.txt", "r") as f:
    _, test_replies = zip(*[line.strip().split("\t") for line in f])

In [25]:
with open("../data/small/test.txt", "r") as f:
    A, B, labels = zip(*[line.split("\t") for line in f])
    labels = [1 if l=="1" else 0 for l in labels]

In [26]:
probs = infer(A, B)

In [28]:
predictions = [p>0.5 for p in probs]

In [30]:
np.mean([p==y for p, y in zip(predictions, labels)])

0.6563367252543941

In [31]:
from tqdm import tqdm

In [32]:
deep_result = dict()
for query in tqdm(test_queries):
    A, B = [query]*200, test_replies
    probs = infer(A, B)
    deep_result[query] = [(reply, score) for reply, score in zip(test_replies, probs)]


  0%|          | 0/600 [00:00<?, ?it/s][A
  0%|          | 2/600 [00:00<00:37, 16.11it/s][A
  1%|          | 4/600 [00:00<00:36, 16.33it/s][A
  1%|          | 6/600 [00:00<00:37, 16.02it/s][A
  1%|▏         | 8/600 [00:00<00:38, 15.39it/s][A
  2%|▏         | 10/600 [00:00<00:36, 15.95it/s][A
  2%|▏         | 12/600 [00:00<00:36, 16.21it/s][A
  2%|▏         | 14/600 [00:00<00:36, 16.17it/s][A
  3%|▎         | 16/600 [00:01<00:37, 15.65it/s][A
  3%|▎         | 18/600 [00:01<00:38, 15.16it/s][A
  3%|▎         | 20/600 [00:01<00:39, 14.78it/s][A
  4%|▎         | 22/600 [00:01<00:39, 14.46it/s][A
  4%|▍         | 24/600 [00:01<00:39, 14.60it/s][A
  4%|▍         | 26/600 [00:01<00:39, 14.54it/s][A
  5%|▍         | 28/600 [00:01<00:39, 14.62it/s][A
  5%|▌         | 30/600 [00:02<00:38, 14.76it/s][A
  5%|▌         | 32/600 [00:02<00:38, 14.66it/s][A
  6%|▌         | 34/600 [00:02<00:38, 14.78it/s][A
  6%|▌         | 36/600 [00:02<00:38, 14.82it/s][A
  6%|▋         | 38/600 

In [33]:
import pickle

In [None]:
pickle.dump(deep_result, open(""))