# 이름만 바꿔서 실행하면 됩니다

In [1]:
from gensim.models import FastText

In [2]:
# Load a pre-trained FastText model from disk (syllable-level, 128-D judging by the file name).
# NOTE(review): `ft` is not referenced by any other cell visible in this notebook — confirm it is still needed.
ft = FastText.load("/media/scatter/scatterdisk/pretrained_embedding/fasttext.syllable.128D")

In [2]:
import tensorflow as tf
import numpy as np
import argparse
from datetime import datetime
import os

from data_loader import DataGenerator

from trainer import MatchingModelTrainer
from preprocessor import DynamicPreprocessor
from utils.dirs import create_dirs
from utils.logger import SummaryWriter
from utils.config import load_config, save_config
from models.base import get_model
from utils.utils import JamoProcessor

In [3]:
# Name of the training run to evaluate; it is substituted into the run directory path below.
NAME = "delstm_1024_nsrandom4_lr1e-3"
# Tokenizer name written into the inference config in place of the value saved with the run.
TOKENIZER = "SentencePieceTokenizer"

In [4]:
# Directory of the selected run; NAME fills the placeholder and the trailing slash is relied on
# by the plain string concatenations below.
base_dir = "/media/scatter/scatterdisk/reply_matching_model/runs/{}/".format(NAME)
# Despite the `_dir` suffix, this is the path of the run's config *file*.
config_dir = base_dir + "config.json"
# Alternative checkpoint (kept for reference, currently disabled):
# best_model_dir = base_dir + "best_loss/best_loss.ckpt"
# Checkpoint prefix that is restored into the inference session.
best_model_dir = base_dir + "model.ckpt"

In [5]:
# Preprocessor built from the run's saved configuration, unchanged.
model_config = load_config(config_dir)
preprocessor = DynamicPreprocessor(model_config)
preprocessor.build_preprocessor()

# Second, independent copy of the same configuration whose tokenizer settings are
# overridden before the inference-time preprocessor is built from it.
infer_config = load_config(config_dir)
infer_config.tokenizer = TOKENIZER
infer_config.soynlp_scores = "/media/scatter/scatterdisk/tokenizer/soynlp_scores.sol.100M.txt"
infer_preprocessor = DynamicPreprocessor(infer_config)
infer_preprocessor.build_preprocessor()

In [6]:
# Set the `add_echo` flag on the model config before the model is constructed.
# NOTE(review): the effect of this flag lives in the model code, which is not visible here — confirm.
model_config.add_echo = True

In [7]:
# Build the model inside a dedicated tf.Graph and open a session bound to that graph.
graph = tf.Graph()
tf_config = tf.ConfigProto()
# Ask TensorFlow to allocate GPU memory incrementally rather than up front.
tf_config.gpu_options.allow_growth = True

with graph.as_default():
    # Look up the model class by the name stored in the config, then instantiate it.
    Model = get_model(model_config.model)
    data = DataGenerator(preprocessor, model_config)
    infer_model = Model(data, model_config)
    infer_sess = tf.Session(config=tf_config, graph=graph)
    # Run the global and local variable initializers before the checkpoint is loaded.
    infer_sess.run(tf.global_variables_initializer())
    infer_sess.run(tf.local_variables_initializer())

# Load the checkpoint at `best_model_dir` into the session.
infer_model.load(infer_sess, model_dir=best_model_dir)

Pre-trained embedding loaded. Number of OOV : 5272 / 90000


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/reply_matching_model/runs/delstm_1024_nsrandom4_lr1e-3/model.ckpt


In [8]:
def load_test_data(preprocessor, base_dir="/home/angrypark/reply_matching_model/data/"):
    """Read the test queries, candidate replies and labels, and preprocess the text.

    Three line-aligned files are read from ``base_dir``:

    * ``test_queries.txt`` - one query per line.
    * ``test_replies.txt`` - tab-separated candidate replies for the query on the same line.
    * ``test_labels.txt``  - tab-separated integer labels, one per candidate reply.

    Args:
        preprocessor: Object whose ``preprocess(text)`` returns an
            ``(indexed_tokens, length)`` pair.
        base_dir: Directory containing the three files. Defaults to the
            location the notebook has always used.

    Returns:
        A 5-tuple ``(test_queries, test_replies, test_queries_lengths,
        test_replies_lengths, test_labels)`` where the query items are tuples
        (one entry per query), the reply items are lists holding one tuple per
        query, and ``test_labels`` is a list of lists of ints.

    Raises:
        ValueError: If ``test_queries.txt`` contains no lines.
    """
    def read_lines(filename):
        # The data is Korean text; decode explicitly as UTF-8 instead of
        # relying on the platform's default encoding.
        with open(os.path.join(base_dir, filename), "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    test_queries = read_lines("test_queries.txt")
    replies_set = [line.split("\t") for line in read_lines("test_replies.txt")]
    test_labels = [[int(y) for y in line.split("\t")]
                   for line in read_lines("test_labels.txt")]

    if not test_queries:
        # zip(*[]) below would otherwise fail with an opaque unpacking error.
        raise ValueError("no test queries found in {}".format(
            os.path.join(base_dir, "test_queries.txt")))

    test_queries, test_queries_lengths = zip(*[preprocessor.preprocess(query)
                                               for query in test_queries])
    test_replies = list()
    test_replies_lengths = list()
    for replies in replies_set:
        r, l = zip(*[preprocessor.preprocess(reply) for reply in replies])
        test_replies.append(r)
        test_replies_lengths.append(l)
    return test_queries, test_replies, test_queries_lengths, test_replies_lengths, test_labels

In [9]:
def test(model, sess, preprocessor):
    """Score every (query, candidate reply) pair of the test set with ``model``.

    The test data is loaded via ``load_test_data(preprocessor)``. Each query is
    repeated once per candidate so queries and replies can be fed to the model
    as two flat, aligned batches; the resulting scores are folded back into
    one row per query.

    Returns:
        ``(test_labels, probs)`` where ``probs`` is a nested list with one row
        per query and one score per candidate reply.
    """
    (queries, replies, queries_lengths,
     replies_lengths, test_labels) = load_test_data(preprocessor)

    # np.shape only yields three dimensions when the replies form a regular array.
    row, col, _unused = np.shape(replies)

    # Repeat each query (and its length) `col` times, in query order.
    flat_queries = [query for query in queries for _rep in range(col)]
    flat_queries_lengths = [length for length in queries_lengths for _rep in range(col)]
    # Collapse the per-query reply groups into one flat sequence.
    flat_replies = [reply for group in replies for reply in group]
    flat_replies_lengths = [length for group in replies_lengths for length in group]

    feed_dict = {
        model.input_queries: flat_queries,
        model.input_replies: flat_replies,
        model.queries_lengths: flat_queries_lengths,
        model.replies_lengths: flat_replies_lengths,
        model.dropout_keep_prob: 1,
    }
    scores = model.infer(sess, feed_dict=feed_dict)
    return test_labels, np.reshape(scores, [row, col]).tolist()

In [10]:
# Score the test set with the restored model, using the inference-time preprocessor.
y_true, y_prob = test(infer_model, infer_sess, infer_preprocessor)

In [11]:
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score

In [12]:
def evaluate_metrics(y_true, y_prob, k=5, ndcg_k=10):
    """Compute ranking and classification metrics for scored candidate lists.

    Args:
        y_true: List of label lists, one inner list per query. A label of 0 is
            treated as "not relevant", any non-zero label as "relevant"; the
            raw label values are used as gains for NDCG.
        y_prob: List of score lists aligned element-wise with ``y_true``.
        k: Cutoff for precision@k.
        ndcg_k: Cutoff for NDCG. Defaults to 10, the value that used to be
            hard-coded.

    Returns:
        Dict with the keys ``"precision_at_<k>"``, ``"mrr"``, ``"ndcg"``,
        ``"threshold"`` (the score threshold with the best F-measure on the
        flattened data, rounded to two decimals) and ``"f1_score"`` (F1 at that
        rounded threshold).
    """
    def get_rank(y_true, y_prob):
        # For every query, order its labels by descending predicted score.
        rs = list()
        for y_t, y_p in zip(y_true, y_prob):
            ranked = sorted(zip(y_t, y_p), key=lambda pair: pair[1], reverse=True)
            rs.append([t for t, _ in ranked])
        return rs

    def get_precision_at_k(rs, k):
        # Fraction of relevant (non-zero) labels among the top k, averaged over queries.
        rs = [(np.asarray(r)[:k] != 0) for r in rs]
        return np.mean([np.mean(r) for r in rs])

    def mean_reciprocal_rank(rs):
        # 1 / rank of the first relevant item; 0 when a query has none.
        rs = (np.asarray(r).nonzero()[0] for r in rs)
        return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])

    def dcg_at_k(r, k):
        # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
        # dtype is the supported equivalent.
        r = np.asarray(r, dtype=float)[:k]
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))

    def ndcg_at_k(r, k):
        # Normalise by the DCG of the ideal (label-sorted) ordering.
        dcg_max = dcg_at_k(sorted(r, reverse=True), k)
        if not dcg_max:
            return 0.
        return dcg_at_k(r, k) / dcg_max

    def mean_ndcg_at_k(rs, k):
        return np.mean([ndcg_at_k(r, k) for r in rs])

    def flatten(list_of_lists):
        return [y for x in list_of_lists for y in x]

    def get_best_threshold(y_true, y_prob):
        # Sweep the precision/recall curve and keep the threshold with the highest F-measure.
        y_true_binary = [y != 0 for y in flatten(y_true)]
        precision, recall, thresholds = precision_recall_curve(y_true_binary, flatten(y_prob))
        best_f_measure = 0
        best_threshold = 0
        # `thresholds` is one element shorter than precision/recall, so zip
        # stops before the final curve point.
        for p, r, t in zip(precision, recall, thresholds):
            if (p + r) == 0:
                continue
            f_measure = 2 * p * r / (p + r)
            if f_measure > best_f_measure:
                best_f_measure = f_measure
                best_threshold = t
        return np.round(best_threshold, 2)

    def get_f1_score(y_true, y_prob, threshold):
        return f1_score([y != 0 for y in flatten(y_true)],
                        [int(y >= threshold) for y in flatten(y_prob)])

    rs = get_rank(y_true, y_prob)
    threshold = get_best_threshold(y_true, y_prob)
    f_measure = get_f1_score(y_true, y_prob, threshold)

    return {"precision_at_{}".format(k): get_precision_at_k(rs, k),
            "mrr": mean_reciprocal_rank(rs),
            "ndcg": mean_ndcg_at_k(rs, ndcg_k),
            "threshold": threshold,
            "f1_score": f_measure}

In [13]:
import editdistance

In [14]:
# Summary record for this run: identifying config fields plus the epoch/step counters
# read from the restored model, followed by the evaluation metrics.
result = {"name": model_config.name, 
          "model": model_config.model, 
          "negative_sampling": model_config.negative_sampling, 
          "num_negative_samples": model_config.num_negative_samples, 
          "epoch": infer_model.cur_epoch_tensor.eval(infer_sess),
          "step": infer_model.global_step_tensor.eval(infer_sess)}
# Merge in the metric dict (precision@k, mrr, ndcg, threshold, f1_score).
result.update(evaluate_metrics(y_true, y_prob))
# Bare expression so the notebook displays the dict.
result

{'epoch': 4,
 'f1_score': 0.5118012422360249,
 'model': 'DualEncoderLSTM',
 'mrr': 0.7244558384409869,
 'name': 'delstm_1024_nsrandom4_lr1e-3',
 'ndcg': 0.7488139150267936,
 'negative_sampling': 'random',
 'num_negative_samples': 4,
 'precision_at_5': 0.42772277227722777,
 'step': 3759999,
 'threshold': 0.56}

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# Scratch check: list() over a 2-D array yields one 1-D array per row.
list(np.array([[1,2], [2,3]]))

[array([1, 2]), array([2, 3])]

In [39]:
import pandas as pd

In [41]:
%%time
with open("/home/angrypark/paraphrase_detection/data/small/train.txt", "r") as f:
    train_set = {"sentence_A": list(), 
                 "sentence_B": list(), 
                 "ab_prob": list(), 
                 "ba_prob": list(), 
                 "semantic_sim": list(), 
                 "label": list()}
    batch = list()
    for line in f:
        batch.append(line)
        if len(batch) % 512 == 0:
            A, B, labels = zip(*[line.strip().split("\t") for line in batch])
            indexed_A, A_lengths = zip(*[infer_preprocessor.preprocess(a) for a in A])
            indexed_B, B_lengths = zip(*[infer_preprocessor.preprocess(b) for b in B])
            feed_dict = {infer_model.input_queries: indexed_A,
                 infer_model.input_replies: indexed_B,
                 infer_model.queries_lengths: A_lengths,
                 infer_model.replies_lengths: B_lengths,
                 infer_model.dropout_keep_prob: 1, 
                 }
            A_sentence_vectors, AB_probs = infer_sess.run([infer_model.encoding_queries, 
                                                           infer_model.positive_probs], 
                                                          feed_dict=feed_dict)
            
            feed_dict = {infer_model.input_queries: indexed_B,
                 infer_model.input_replies: indexed_A,
                 infer_model.queries_lengths: B_lengths,
                 infer_model.replies_lengths: A_lengths,
                 infer_model.dropout_keep_prob: 1,
                 }
            B_sentence_vectors, BA_probs = infer_sess.run([infer_model.encoding_queries, 
                                                           infer_model.positive_probs], 
                                                          feed_dict=feed_dict)
            semantic_sim = [cosine_similarity([a_vector], [b_vector])[0][0] for a_vector, b_vector in zip(list(A_sentence_vectors), list(B_sentence_vectors))]
            
            train_set["sentence_A"] += A
            train_set["sentence_B"] += B
            train_set["ab_prob"] += [p[0] for p in list(AB_probs)]
            train_set["ba_prob"] += [p[0] for p in list(BA_probs)]
            train_set["semantic_sim"] += semantic_sim
            train_set["label"] += labels
            
            batch = list()
    A, B, labels = zip(*[line.strip().split("\t") for line in batch])
    indexed_A, A_lengths = zip(*[infer_preprocessor.preprocess(a) for a in A])
    indexed_B, B_lengths = zip(*[infer_preprocessor.preprocess(b) for b in B])
    feed_dict = {infer_model.input_queries: indexed_A,
         infer_model.input_replies: indexed_B,
         infer_model.queries_lengths: A_lengths,
         infer_model.replies_lengths: B_lengths,
         infer_model.dropout_keep_prob: 1, 
         }
    A_sentence_vectors, AB_probs = infer_sess.run([infer_model.encoding_queries, 
                                                   infer_model.positive_probs], 
                                                  feed_dict=feed_dict)

    feed_dict = {infer_model.input_queries: indexed_B,
         infer_model.input_replies: indexed_A,
         infer_model.queries_lengths: B_lengths,
         infer_model.replies_lengths: A_lengths,
         infer_model.dropout_keep_prob: 1,
         }
    B_sentence_vectors, BA_probs = infer_sess.run([infer_model.encoding_queries, 
                                                   infer_model.positive_probs], 
                                                  feed_dict=feed_dict)
    semantic_sim = [cosine_similarity([a_vector], [b_vector])[0][0] for a_vector, b_vector in zip(list(A_sentence_vectors), list(B_sentence_vectors))]

    train_set["sentence_A"] += A
    train_set["sentence_B"] += B
    train_set["ab_prob"] += [p[0] for p in list(AB_probs)]
    train_set["ba_prob"] += [p[0] for p in list(BA_probs)]
    train_set["semantic_sim"] += semantic_sim
    train_set["label"] += labels

CPU times: user 2min 35s, sys: 28.8 s, total: 3min 4s
Wall time: 1min 51s


In [46]:
# `pickle` is not imported by any other cell, so import it here to keep the
# cell runnable on a fresh kernel.
import pickle

# Persist the extracted features; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("../paraphrase_detection/data/train_set.pkl", "wb") as f:
    pickle.dump(train_set, f)

In [79]:
# Candidate replies, one per line, with surrounding whitespace removed.
with open("/home/angrypark/reply_matching_model/data/reply_set.txt", "r") as f:
    reply_set = list(map(str.strip, f))

In [80]:
# Ad-hoc query that every candidate reply in `reply_set` is scored against.
query = "나 오늘 술약속 있다"

# preprocess() returns an (indexed tokens, length) pair; zip(*...) splits the
# per-reply pairs into two parallel tuples.
indexed_query, query_length = infer_preprocessor.preprocess(query)
indexed_replies, replies_lengths = zip(*[infer_preprocessor.preprocess(reply) for reply in reply_set])

# The single query (and its length) is repeated once per candidate so that
# each row pairs the query with one reply.
feed_dict = {infer_model.input_queries: [indexed_query]*len(reply_set),
             infer_model.input_replies: indexed_replies,
             infer_model.queries_lengths: [query_length]*len(reply_set),
             infer_model.replies_lengths: replies_lengths,
             infer_model.dropout_keep_prob: 1}

In [81]:
# One score per candidate reply: take element 0 of every row the model returns.
probs = [row[0] for row in infer_model.infer(infer_sess, feed_dict=feed_dict)]

# Show the five candidates with the highest score, best first.
print(sorted(zip(reply_set, probs), key=lambda pair: pair[1], reverse=True)[:5])

[('약속 취소됐어요?', 0.99920183), ('저랑 약속해요', 0.994364), ('저녁에 술 한 잔 할까요?', 0.97491246), ('오늘 회식이에요?', 0.9738028), ('요즘 회식은 주로 목요일에 하지 않아요?', 0.9341068)]


In [15]:
import os
import json
# Parent directory that holds one sub-directory per training run.
# NOTE(review): this rebinds `base_dir`, which earlier pointed at a single run's directory.
base_dir = "/media/scatter/scatterdisk/reply_matching_model/runs/"

In [17]:
# Print the recorded `best_step` of every run that has a readable config.json.
for name in os.listdir(base_dir):
    config_dir = os.path.join(base_dir, name, "config.json")
    try:
        # Context manager so the file handle is closed after each config is read.
        with open(config_dir, "r") as f:
            d = json.load(f)
        print("{:30s} : {:6}".format(name, d["best_step"]))
    except (OSError, ValueError, KeyError, TypeError):
        # Best-effort listing: skip entries with no config file, invalid JSON,
        # or no usable "best_step". Unlike a bare `except`, this no longer
        # swallows KeyboardInterrupt or unrelated programming errors.
        continue

sentpiece100K_ns4_lr1e-3       : 1480000
delstm_nshard4_lr3e-4          : 40000 
start_2                        : 3080000
delstm_1024_nsrandom4_lr1e-3   : 1780000
sentpiece50K_ns4_lr1e-3        : 480000
delstm_nsrandom4echo_lr1e-3    : 1600000
detcn_nsrandom4_lr1e-3         : 1800000
delstm_nsrandom9_lr1e-3        : 20000 
sentpiece100K_ns1_lr1e-3       : 1480000
debug_embedding                : 129   
start_3                        : 3500000
soynlp_ns4_lr1e-3              : 400000
soynlp_ns1_lr1e-3              : 340000
sentpiece50K_ns1_lr1e-3        : 480000


In [5]:
# Display the config dict currently bound to `d` (assigned in the run-listing loop).
# NOTE(review): the execution counters are out of order, so which run this shows depends on
# the order the cells were actually executed — confirm before relying on it.
d

{'batch_size': 512,
 'best_epoch': '5',
 'best_loss': '0.36251',
 'best_step': '3080000',
 'checkpoint_dir': '/media/scatter/scatterdisk/reply_matching_model/runs/start_2/',
 'config': '',
 'dropout_keep_prob': 0.9,
 'embed_dim': 256,
 'evaluate_every': 20000,
 'gpu': 'a',
 'learning_rate': 0.0001,
 'lstm_dim': 512,
 'max_length': 20,
 'max_to_keep': 5,
 'min_length': 1,
 'mode': 'train',
 'model': 'DualEncoderLSTM',
 'name': 'start_2',
 'negative_sampling': 'random',
 'normalizer': 'DummyNormalizer',
 'num_epochs': 20,
 'num_negative_samples': 1,
 'pretrained_embed_dir': '/media/scatter/scatterdisk/pretrained_embedding/fasttext.sent_piece_50K.256D',
 'save_every': 10000,
 'sent_piece_model': '/media/scatter/scatterdisk/tokenizer/sent_piece.50K.model',
 'shuffle': True,
 'tokenizer': 'DummyTokenizer',
 'train_dir': '/media/scatter/scatterdisk/reply_matching_model/sol.tokenized.sent_piece_50K/',
 'val_dir': '/media/scatter/scatterdisk/reply_matching_model/sol.tokenized.sent_piece_50K/so

In [14]:
import json

In [30]:
# Path of one specific run's config file.
# NOTE(review): despite its name, `base_dir` now holds a file path, and no visible cell uses
# it afterwards — confirm whether this cell is still needed.
base_dir = "/media/scatter/scatterdisk/reply_matching_model/runs/soynlp_ns4_lr1e-3/config.json"

In [3]:
import tensorflow as tf