# 이름만 바꿔서 실행하면 됩니다

In [8]:
import tensorflow as tf
import numpy as np
import argparse
from datetime import datetime
import os

from data_loader import DataGenerator

from trainer import MatchingModelTrainer
from preprocessor import DynamicPreprocessor
from utils.dirs import create_dirs
from utils.logger import SummaryWriter
from utils.config import load_config, save_config
from models.base import get_model
from utils.utils import JamoProcessor

In [9]:
# Name of the run to evaluate; it is substituted into the runs directory path
# to locate that run's config and checkpoint.
NAME = "soynlp_ns4_lr1e-3"
# Tokenizer name written into the inference config's `tokenizer` attribute.
TOKENIZER = "SoyNLPTokenizer"

In [10]:
# Locate the selected run's artefacts: its saved config and best-loss checkpoint.
base_dir = "/media/scatter/scatterdisk/reply_matching_model/runs/{}/".format(NAME)
config_dir = os.path.join(base_dir, "config.json")
best_model_dir = os.path.join(base_dir, "best_loss", "best_loss.ckpt")

In [11]:
# Preprocessor built from the run's config exactly as saved.
model_config = load_config(config_dir)
preprocessor = DynamicPreprocessor(model_config)
preprocessor.build_preprocessor()

# Second preprocessor built from a fresh copy of the same config, with the
# `tokenizer` and `soynlp_scores` attributes overridden.
infer_config = load_config(config_dir)
infer_config.tokenizer = TOKENIZER
infer_config.soynlp_scores = "/media/scatter/scatterdisk/tokenizer/soynlp_scores.sol.100M.txt"
infer_preprocessor = DynamicPreprocessor(infer_config)
infer_preprocessor.build_preprocessor()

In [12]:
# Build the model in a freshly created graph and open a session bound to it.
graph = tf.Graph()
tf_config = tf.ConfigProto()
# Let TensorFlow grow its GPU memory allocation on demand instead of
# claiming the whole device at start-up.
tf_config.gpu_options.allow_growth = True

with graph.as_default():
    # Resolve the model class named in the config and instantiate it with the
    # preprocessor built from the unmodified config.
    Model = get_model(model_config.model)
    infer_model = Model(preprocessor, model_config)
    infer_sess = tf.Session(config=tf_config, graph=graph)
    # Initialise global and local variables before the checkpoint is loaded.
    infer_sess.run(tf.global_variables_initializer())
    infer_sess.run(tf.local_variables_initializer())

# Load the best-loss checkpoint of the selected run into the session.
infer_model.load(infer_sess, model_dir=best_model_dir)

Pre-trained embedding loaded. Number of OOV : 21 / 100000


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/reply_matching_model/runs/soynlp_ns4_lr1e-3/best_loss/best_loss.ckpt


In [13]:
# Call the model's `save` with the session and the target path
# `<checkpoint_dir>/model.ckpt`, where `checkpoint_dir` comes from the run's config.
# NOTE(review): presumably this re-exports the restored best-loss weights as
# `model.ckpt` - confirm against the model's `save` implementation.
infer_model.save(infer_sess,
                 os.path.join(model_config.checkpoint_dir, "model.ckpt"))

In [8]:
def load_test_data(preprocessor, data_dir="/home/angrypark/reply_matching_model/data/"):
    """Load the test queries, candidate replies and labels, and preprocess the text.

    Three line-aligned files are read from ``data_dir``:

    * ``test_queries.txt``  - one query per line.
    * ``test_replies.txt``  - tab-separated candidate replies per line.
    * ``test_labels.txt``   - tab-separated integer labels per line.

    Each line is stripped of surrounding whitespace before it is split.

    Args:
        preprocessor: object whose ``preprocess(text)`` returns a 2-item pair,
            unpacked here as ``(processed_text, length)``.
        data_dir: directory containing the three files. Defaults to the
            location that was previously hard-coded.

    Returns:
        A 5-tuple ``(test_queries, test_replies, test_queries_lengths,
        test_replies_lengths, test_labels)`` where

        * ``test_queries`` / ``test_queries_lengths`` are tuples with one
          entry per query,
        * ``test_replies`` / ``test_replies_lengths`` are lists with one tuple
          per query (one entry per candidate reply),
        * ``test_labels`` is a list of integer lists.

    Raises:
        ValueError: if the three files do not contain the same number of
            lines, or a label cannot be parsed as an integer.
    """
    def read_lines(filename):
        # Decode explicitly as UTF-8 so that reading the (Korean) text does
        # not depend on the machine's locale.
        with open(os.path.join(data_dir, filename), "r", encoding="utf-8") as f:
            return [line.strip() for line in f]

    raw_queries = read_lines("test_queries.txt")
    replies_set = [line.split("\t") for line in read_lines("test_replies.txt")]
    test_labels = [[int(y) for y in line.split("\t")]
                   for line in read_lines("test_labels.txt")]

    # The files are aligned by line number; a mismatch would silently pair
    # queries with the wrong replies or labels.
    if not (len(raw_queries) == len(replies_set) == len(test_labels)):
        raise ValueError(
            "Test files are not line-aligned: {} queries, {} reply lines, {} label lines".format(
                len(raw_queries), len(replies_set), len(test_labels)))

    if raw_queries:
        test_queries, test_queries_lengths = zip(*[preprocessor.preprocess(query)
                                                   for query in raw_queries])
    else:
        # zip(*[]) yields nothing to unpack, so handle empty input explicitly.
        test_queries, test_queries_lengths = (), ()

    test_replies = list()
    test_replies_lengths = list()
    for replies in replies_set:
        processed, lengths = zip(*[preprocessor.preprocess(reply) for reply in replies])
        test_replies.append(processed)
        test_replies_lengths.append(lengths)
    return test_queries, test_replies, test_queries_lengths, test_replies_lengths, test_labels

In [9]:
def test(model, sess, preprocessor):
    """Score every (query, candidate reply) pair of the test set with ``model``.

    The test set is loaded through ``load_test_data(preprocessor)``. Each query
    is repeated once per candidate reply so that queries and replies can be
    fed to the model as flat, aligned batches.

    Args:
        model: object exposing the placeholders ``input_queries``,
            ``input_replies``, ``queries_lengths``, ``replies_lengths``,
            ``dropout_keep_prob`` and an ``infer(sess, feed_dict=...)`` method.
        sess: session handed through to ``model.infer``.
        preprocessor: forwarded to ``load_test_data``.

    Returns:
        ``(test_labels, probs)`` - the labels as loaded, and the model output
        reshaped to ``[num_queries, num_candidates]`` and converted to a
        nested list.
    """
    (queries, replies, queries_lengths,
     replies_lengths, labels) = load_test_data(preprocessor)

    # The replies must form a regular 3-D block; only the first two
    # dimensions (queries x candidates per query) are needed here.
    num_queries, num_candidates, _unused = np.shape(replies)

    # Repeat each query (and its length) once per candidate reply.
    flat_queries = [query for query in queries for _ in range(num_candidates)]
    flat_queries_lengths = [length for length in queries_lengths
                            for _ in range(num_candidates)]
    # Flatten the per-query reply groups in the same order.
    flat_replies = [reply for group in replies for reply in group]
    flat_replies_lengths = [length for group in replies_lengths for length in group]

    # Keep-probability 1 means no dropout is applied while scoring.
    feed_dict = {
        model.input_queries: flat_queries,
        model.input_replies: flat_replies,
        model.queries_lengths: flat_queries_lengths,
        model.replies_lengths: flat_replies_lengths,
        model.dropout_keep_prob: 1,
    }
    scores = model.infer(sess, feed_dict=feed_dict)
    score_matrix = np.reshape(scores, [num_queries, num_candidates])
    return labels, score_matrix.tolist()

In [10]:
# Score the whole test set with the restored model.
# NOTE(review): this passes `preprocessor` (built from the unmodified config),
# whereas the single-query inference further down uses `infer_preprocessor`
# (tokenizer overridden) - confirm which one the test files expect.
y_true, y_prob = test(infer_model, infer_sess, preprocessor)

In [11]:
from sklearn.metrics import precision_recall_curve, f1_score, average_precision_score

In [12]:
def evaluate_metrics(y_true, y_prob, k=5, ndcg_k=10):
    """Compute ranking and classification metrics for scored candidate lists.

    Args:
        y_true: list of per-query label lists. A label of 0 is treated as
            non-relevant and any non-zero label as relevant; the raw label
            values are used as gains for NDCG.
        y_prob: list of per-query score lists, aligned with ``y_true``.
        k: cut-off used for precision@k.
        ndcg_k: cut-off used for NDCG. Defaults to 10, the value that was
            previously hard-coded.

    Returns:
        dict with the keys ``precision_at_<k>``, ``mrr``, ``ndcg``,
        ``threshold`` (the F-measure-maximising score threshold rounded to two
        decimals) and ``f1_score`` (F1 computed at that rounded threshold).
    """
    def get_rank(y_true, y_prob):
        """Return each query's labels reordered by descending score."""
        rs = list()
        for y_t, y_p in zip(y_true, y_prob):
            # sorted() is stable, so tied scores keep their original order.
            ranked = sorted(zip(y_t, y_p), key=lambda x: x[1], reverse=True)
            rs.append([t for t, _ in ranked])
        return rs

    def get_precision_at_k(rs, k):
        """Mean, over queries, of the share of relevant labels in the top k."""
        rs = [(np.asarray(r)[:k] != 0) for r in rs]
        return np.mean([np.mean(r) for r in rs])

    def mean_reciprocal_rank(rs):
        """Mean of 1/rank of the first relevant label (0 if there is none)."""
        rs = (np.asarray(r).nonzero()[0] for r in rs)
        return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])

    def dcg_at_k(r, k):
        """Discounted cumulative gain of the first k labels: sum(gain_i / log2(i + 1))."""
        # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is
        # the equivalent that works on both old and new NumPy versions.
        r = np.asarray(r, dtype=float)[:k]
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))

    def ndcg_at_k(r, k):
        """DCG normalised by the DCG of the ideal (descending-label) ordering."""
        dcg_max = dcg_at_k(sorted(r, reverse=True), k)
        if not dcg_max:
            # No relevant label at all: define NDCG as 0 instead of dividing by 0.
            return 0.
        return dcg_at_k(r, k) / dcg_max

    def mean_ndcg_at_k(rs, k):
        """Mean NDCG@k over all queries."""
        return np.mean([ndcg_at_k(r, k) for r in rs])

    def flatten(list_of_lists):
        """Concatenate the inner lists into one flat list."""
        return [y for x in list_of_lists for y in x]

    def get_best_threshold(y_true, y_prob):
        """Return the score threshold with the highest F-measure, rounded to 2 decimals."""
        y_true_binary = [y != 0 for y in flatten(y_true)]
        precision, recall, thresholds = precision_recall_curve(y_true_binary, flatten(y_prob))
        best_f_measure = 0
        best_threshold = 0
        # zip() stops at the shortest input, i.e. at the end of `thresholds`.
        for p, r, t in zip(precision, recall, thresholds):
            if (p + r) == 0:
                continue
            f_measure = 2 * p * r / (p + r)
            if f_measure > best_f_measure:
                best_f_measure = f_measure
                best_threshold = t
        return np.round(best_threshold, 2)

    def get_f1_score(y_true, y_prob, threshold):
        """F1 of the binarised labels against scores thresholded at ``threshold``."""
        return f1_score([y != 0 for y in flatten(y_true)],
                        [int(y >= threshold) for y in flatten(y_prob)])

    rs = get_rank(y_true, y_prob)
    threshold = get_best_threshold(y_true, y_prob)
    f_measure = get_f1_score(y_true, y_prob, threshold)

    return {"precision_at_{}".format(k): get_precision_at_k(rs, k),
            "mrr": mean_reciprocal_rank(rs),
            "ndcg": mean_ndcg_at_k(rs, ndcg_k),
            "threshold": threshold,
            "f1_score": f_measure}

In [13]:
# Collect the run's identifying settings from the config, then append the
# evaluation metrics; the bare `result` at the end displays the merged dict.
result = {
    key: getattr(model_config, key)
    for key in ("name", "model", "negative_sampling",
                "num_negative_samples", "best_epoch", "best_step")
}
result.update(evaluate_metrics(y_true, y_prob))
result

{'best_epoch': '1',
 'best_step': '80000',
 'f1_score': 0.4714887102946805,
 'model': 'DualEncoderLSTM',
 'mrr': 0.6511217193147887,
 'name': 'soynlp_ns1_lr1e-3',
 'ndcg': 0.6870581862944968,
 'negative_sampling': 'random',
 'num_negative_samples': 1,
 'precision_at_5': 0.3465346534653465,
 'threshold': 0.01}

In [11]:
# Candidate replies to rank against a query: one per line, with surrounding
# whitespace stripped. The file is decoded explicitly as UTF-8 so that reading
# the Korean text does not depend on the machine's locale.
with open("/home/angrypark/reply_matching_model/data/reply_set.txt", "r", encoding="utf-8") as f:
    reply_set = [line.strip() for line in f]

In [12]:
query = "나 오늘 술약속 있다"

# Preprocess the query once and every candidate reply in the reply set.
indexed_query, query_length = infer_preprocessor.preprocess(query)
indexed_replies, replies_lengths = zip(*map(infer_preprocessor.preprocess, reply_set))

# Pair the single query with each candidate by repeating it once per reply;
# keep-probability 1 means no dropout is applied.
feed_dict = {
    infer_model.input_queries: [indexed_query for _ in reply_set],
    infer_model.input_replies: indexed_replies,
    infer_model.queries_lengths: [query_length for _ in reply_set],
    infer_model.replies_lengths: replies_lengths,
    infer_model.dropout_keep_prob: 1,
}

In [13]:
# Run the model and keep the first element of every output row as that
# candidate's score.
probs = [row[0] for row in infer_model.infer(infer_sess, feed_dict=feed_dict)]

# Show the five highest-scoring (reply, score) pairs.
print(sorted(zip(reply_set, probs), key=lambda pair: pair[1], reverse=True)[:5])

[('약속 취소됐어요?', 0.99697006), ('저랑 약속해요', 0.99523675), ('전 술 잘 안먹어요', 0.99372214), ('오늘 같은 날은 소주를 마시고 싶네요', 0.9936784), ('술 잘먹는 편이에요?', 0.9933994)]


In [3]:
import os
import json
# Parent directory whose entries are the individual training runs.
# NOTE: this rebinds `base_dir`, which earlier in the notebook pointed at a
# single run's directory.
base_dir = "/media/scatter/scatterdisk/reply_matching_model/runs/"

In [11]:
# Print the `best_step` recorded in the config.json of every entry in the runs directory.
for name in os.listdir(base_dir):
    config_dir = os.path.join(base_dir, name, "config.json")
    # Open through a context manager so each config file is closed right
    # after parsing instead of leaking one handle per run.
    with open(config_dir, "r") as f:
        d = json.load(f)
    print("{:30s} : {:6}".format(name, d["best_step"]))

start_2                        : 3080000
start                          : 35000 
sentpiece50K_ns4_lr1e-3        : 360000
start_3                        : 3500000
soynlp_ns4_lr1e-3              : 180000
soynlp_ns1_lr1e-3              : 80000 
sentpiece50K_ns1_lr1e-3        : 360000


In [5]:
# Display the config dict currently bound to `d` (assigned inside the
# run-listing loop, so it reflects whichever config was loaded last).
d

{'batch_size': 512,
 'best_epoch': '5',
 'best_loss': '0.36251',
 'best_step': '3080000',
 'checkpoint_dir': '/media/scatter/scatterdisk/reply_matching_model/runs/start_2/',
 'config': '',
 'dropout_keep_prob': 0.9,
 'embed_dim': 256,
 'evaluate_every': 20000,
 'gpu': 'a',
 'learning_rate': 0.0001,
 'lstm_dim': 512,
 'max_length': 20,
 'max_to_keep': 5,
 'min_length': 1,
 'mode': 'train',
 'model': 'DualEncoderLSTM',
 'name': 'start_2',
 'negative_sampling': 'random',
 'normalizer': 'DummyNormalizer',
 'num_epochs': 20,
 'num_negative_samples': 1,
 'pretrained_embed_dir': '/media/scatter/scatterdisk/pretrained_embedding/fasttext.sent_piece_50K.256D',
 'save_every': 10000,
 'sent_piece_model': '/media/scatter/scatterdisk/tokenizer/sent_piece.50K.model',
 'shuffle': True,
 'tokenizer': 'DummyTokenizer',
 'train_dir': '/media/scatter/scatterdisk/reply_matching_model/sol.tokenized.sent_piece_50K/',
 'val_dir': '/media/scatter/scatterdisk/reply_matching_model/sol.tokenized.sent_piece_50K/so

In [14]:
import json

In [30]:
# NOTE: despite its name, `base_dir` is rebound here to the path of a single
# run's config.json file rather than to a directory.
base_dir = "/media/scatter/scatterdisk/reply_matching_model/runs/soynlp_ns4_lr1e-3/config.json"