In [1]:
import os
import sys
import tensorflow as tf
import numpy as np
import argparse
from datetime import datetime

from dataset import Dataset
from trainer import MatchingModelTrainer
from preprocessor import Preprocessor
from utils.dirs import create_dirs
from utils.logger import SummaryWriter
from utils.config import load_config, save_config
from models.base import get_model
from utils.utils import JamoProcessor

# Wall-clock time captured when the notebook starts.
# NOTE(review): no cell shown in this notebook reads `now` -- confirm it is still needed.
now = datetime.now()

In [2]:
# Expose only GPU index 1 to CUDA for this process; this runs before the
# tf.Session is created in a later cell.
# NOTE(review): hard-coded device index -- adjust for the machine in use.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
class Config:
    """Hyper-parameters and file locations for one reply-matching debug run.

    Every setting is a plain instance attribute. ``Config()`` yields exactly
    the defaults assigned in ``__init__``; keyword arguments replace individual
    defaults, e.g. ``Config(batch_size=64, name="smoke")``, so an experiment can
    vary a setting without editing this class.

    Args:
        **overrides: ``setting_name=value`` pairs replacing the defaults.

    Raises:
        TypeError: if a keyword does not name an existing setting, so a typo
            fails here instead of surfacing later as a missing attribute.
    """

    def __init__(self, **overrides):
        # --- run identity and file locations ---
        self.mode = "train"
        self.name = "debug1000"
        self.config = ""
        self.train_dir = "/media/scatter/scatterdisk/reply_matching_model/sol.tokenized.sent_piece_100K/"
        self.val_dir = "/media/scatter/scatterdisk/reply_matching_model/sol.tokenized.sent_piece_100K/sol.validation.txt"
        self.pretrained_embed_dir = "/media/scatter/scatterdisk/pretrained_embedding/fasttext.sent_piece_100K.256D"
        self.checkpoint_dir = "/home/angrypark/"
        self.model = "DualEncoderLSTMCNN"
        # NOTE(review): this path says 50K while the data, embedding and vocab
        # paths say 100K -- confirm the mismatch is intended.
        self.sent_piece_model = "/media/scatter/scatterdisk/tokenizer/sent_piece.50K.model"
        self.soynlp_scores = "/media/scatter/scatterdisk/tokenizer/soynlp_scores.sol.100M.txt"
        self.normalizer = "DummyNormalizer"
        self.tokenizer = "DummyTokenizer"
        self.vocab_size = 90000
        self.vocab_list = "/media/scatter/scatterdisk/pretrained_embedding/vocab_list.sent_piece_100K.txt"

        # --- embedding, optimisation and sequence-length settings ---
        self.embed_dim = 256
        self.learning_rate = 1e-1
        self.min_length = 1
        self.max_length = 20
        self.embed_dropout_keep_prob = 0.9
        self.lstm_dropout_keep_prob = 0.9

        # --- encoder size and negative sampling ---
        self.lstm_dim = 512
        self.negative_sampling = "random"
        self.num_negative_samples = 4
        self.add_echo = False

        # --- training-loop schedule ---
        self.batch_size = 512
        self.num_epochs = 300
        self.evaluate_every = 100000
        self.save_every = 1000000

        self.max_to_keep = 1
        self.shuffle = True

        # --- CNN / dense head ---
        self.filter_sizes = "2,3"
        self.num_filters = 64
        self.num_hidden = 128
        self.hidden_dropout_keep_prob = 0.9
        self.dense_dropout_keep_prob = 0.9

        # --- loss ---
        self.weak_supervision = False
        self.hinge_loss = 0.3

        # Apply caller overrides last; only names that already exist as
        # defaults are accepted so misspelled options are caught immediately.
        unknown = sorted(set(overrides) - set(vars(self)))
        if unknown:
            raise TypeError("Unknown Config option(s): " + ", ".join(unknown))
        for key, value in overrides.items():
            setattr(self, key, value)


config = Config()

In [4]:
# create_dirs receives the config and its return value becomes the new `config`.
config = create_dirs(config)

# Session whose GPU options have allow_growth switched on.
gpu_options = tf.GPUOptions(allow_growth=True)
device_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=device_config)

In [5]:
# Build the preprocessor from the run configuration; the same object is
# handed to both Dataset and MatchingModelTrainer in the next cell.
preprocessor = Preprocessor(config)

In [6]:
# Positional arguments for Dataset, kept in the order the call passes them.
dataset_args = (
    preprocessor,
    config.train_dir,
    config.val_dir,
    config.min_length,
    config.max_length,
    config.num_negative_samples,
    config.batch_size,
    config.shuffle,
    config.num_epochs,
)
data = Dataset(*dataset_args, debug=False)

# The summary writer and the trainer share the session created earlier.
summary_writer = SummaryWriter(sess, config)
trainer = MatchingModelTrainer(sess, preprocessor, data, config, summary_writer)

In [7]:
# Override the dataset sizes for a quick debug run. One named constant
# replaces the literal that was repeated three times, and the step count is
# derived from the size actually set on `data` so the two cannot drift apart.
DEBUG_NUM_EXAMPLES = 10000
data.train_size = DEBUG_NUM_EXAMPLES
data.val_size = DEBUG_NUM_EXAMPLES
# Ceiling division: batches needed to cover train_size examples.
trainer.num_steps_per_epoch = (data.train_size - 1) // config.batch_size + 1

In [12]:
# Build the graph through the trainer. This rebinds `sess` to the session the
# trainer returns, and `model` is what the later inspection cells read from.
# NOTE(review): the recorded output shows an AttributeError for `hinge_loss`,
# although Config assigns hinge_loss -- the output looks stale. The execution
# count (12) is also out of order; re-run from a fresh kernel.
model, sess = trainer.build_graph()

[32m[14:39:56][INFO] Building train graph... [0m


Pre-trained embedding loaded. Number of OOV : 5272 / 90000


AttributeError: 'Config' object has no attribute 'hinge_loss'

In [8]:
# Start the training loop. In the recorded run it was interrupted
# (KeyboardInterrupt appears at the end of the output).
trainer.train()

[32m[14:46:23][INFO] Building train graph... [0m


Pre-trained embedding loaded. Number of OOV : 5272 / 90000


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
[32m[14:46:33][INFO] Loading checkpoint from /home/angrypark/debug1000/ [0m
[31m[14:46:33][ERROR] No checkpoint found in /home/angrypark/debug1000/ [0m
[32m[14:46:33][INFO] Building val graph... [0m


Pre-trained embedding loaded. Number of OOV : 5272 / 90000


100%|██████████| 20/20 [00:08<00:00,  2.42it/s]


INFO:tensorflow:Restoring parameters from /home/angrypark/debug1000/model.ckpt


100%|██████████| 20/20 [00:03<00:00,  5.68it/s]
100%|██████████| 20/20 [00:05<00:00,  3.78it/s]


INFO:tensorflow:Restoring parameters from /home/angrypark/debug1000/model.ckpt


100%|██████████| 20/20 [00:03<00:00,  5.80it/s]
100%|██████████| 20/20 [00:05<00:00,  3.96it/s]


INFO:tensorflow:Restoring parameters from /home/angrypark/debug1000/model.ckpt


100%|██████████| 20/20 [00:03<00:00,  5.79it/s]
100%|██████████| 20/20 [00:05<00:00,  3.89it/s]


INFO:tensorflow:Restoring parameters from /home/angrypark/debug1000/model.ckpt


  0%|          | 0/20 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
# Base feed: every keep probability is fed as 1 (presumably disabling dropout
# for inspection -- confirm against the model), plus the negative-sample count
# taken from the config.
feed_dict = {
    model.lstm_dropout_keep_prob: 1,
    model.num_negative_samples: config.num_negative_samples,
    model.embed_dropout_keep_prob: 1,
    model.dense_dropout_keep_prob: 1,
}

if config.weak_supervision:
    # Run the inference model first, then feed its outputs to `model`.
    infer_fetches = [
        trainer.infer_model.input_queries,
        trainer.infer_model.input_replies,
        trainer.infer_model.queries_lengths,
        trainer.infer_model.replies_lengths,
        trainer.infer_model.distances,
    ]
    infer_feed = {
        trainer.infer_model.dropout_keep_prob: 1,
        trainer.infer_model.add_echo: False,
    }
    (input_queries,
     input_replies,
     query_lengths,
     reply_lengths,
     weak_distances) = trainer.infer_sess.run(infer_fetches, feed_dict=infer_feed)

    feed_dict[model.input_queries] = input_queries
    feed_dict[model.input_replies] = input_replies
    feed_dict[model.query_lengths] = query_lengths
    feed_dict[model.reply_lengths] = reply_lengths
    feed_dict[model.weak_distances] = weak_distances

In [9]:
# Attribute names on `model` to evaluate, all fetched in one sess.run call.
fetch_names = [
    "queries_embedded",
    "replies_embedded",
    "queries_encoded",
    "replies_encoded",
    "positive_inputs",
    "negative_inputs",
    "negative_queries_indices",
    "negative_replies_indices",
    "logits",
    "labels",
]
fetched_arrays = sess.run([getattr(model, name) for name in fetch_names],
                          feed_dict=feed_dict)

# Unpack in the same order as fetch_names.
(queries_embedded,
 replies_embedded,
 queries_encoded,
 replies_encoded,
 positive_inputs,
 negative_inputs,
 negative_queries_indices,
 negative_replies_indices,
 logits,
 labels) = fetched_arrays

In [10]:
# Recorded shape (512, 20, 256) matches batch_size, max_length and embed_dim from Config.
queries_embedded.shape

(512, 20, 256)

In [11]:
# Recorded shape (512, 1024): one row per example in the batch (batch_size = 512).
positive_inputs.shape

(512, 1024)

In [12]:
# NOTE(review): duplicate of the preceding cell (same expression, same output) -- candidate for removal.
positive_inputs.shape

(512, 1024)

In [13]:
# Recorded shape (2045, 1024): three rows short of batch_size * num_negative_samples = 2048,
# so some queries received fewer than 4 negatives. NOTE(review): confirm the cause in the sampler.
negative_inputs.shape

(2045, 1024)

In [14]:
# Query index of each negative pair. In the recorded output most queries repeat
# 4 times (num_negative_samples = 4), but index 18 appears only 3 times.
negative_queries_indices[:100]

array([ 0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,  4,
        4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,
        8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12,
       12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16,
       17, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21,
       21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25])

In [15]:
# Reply index paired with each entry of negative_queries_indices; in the recorded
# output the reply indices are ascending within each query's group.
negative_replies_indices[:100]

array([207, 244, 453, 458, 169, 207, 294, 323, 171, 271, 294, 297, 164,
       213, 428, 466,  20, 146, 192, 361,  62,  86, 181, 258,  25, 184,
       291, 464, 133, 138, 167, 216,   0,  48, 239, 471, 306, 334, 468,
       478,  50,  94, 387, 487, 186, 323, 457, 503,  18,  36, 222, 316,
       180, 239, 394, 490,  18,  59, 203, 331, 162, 234, 287, 454,  24,
       134, 497, 509, 165, 205, 281, 439, 172, 193, 247, 164, 305, 322,
       475, 255, 273, 395, 471,  37, 119, 130, 178,  82, 162, 417, 466,
        32, 157, 182, 307,  26,  35,  67, 176, 146])

In [16]:
# First negative-pair input vector (1024 values per the shape shown earlier).
negative_inputs[0]

array([-1.12742082,  6.61410342, -1.17185286, ..., -0.10520501,
        0.0991332 ,  0.07647195])

In [19]:
# Counts element-wise matches between negative_inputs[0] and the concatenation of
# queries_transformed[0] with replies_encoded[207]. Indices 0 and 207 are the first
# entries of negative_queries_indices / negative_replies_indices in the recorded
# outputs; a result of 1024 means every element matches.
# NOTE(review): `queries_transformed` is not assigned in any cell visible here (the
# sess.run cell does not fetch it), so a fresh kernel would likely raise NameError.
# Confirm where it comes from and fetch it in the same sess.run call as negative_inputs.
sum(np.concatenate([queries_transformed[0], replies_encoded[207]]) == negative_inputs[0])

1024

In [23]:
# NOTE(review): repeats the earlier negative_inputs.shape cell with the same output -- candidate for removal.
negative_inputs.shape

(2045, 1024)

In [25]:
# Recorded length 2557 = 512 positive rows + 2045 negative rows.
logits.shape

(2557,)

In [26]:
# Same length as logits (2557 in the recorded output): one label per logit.
labels.shape

(2557,)

In [27]:
# The recorded output starts with 1s and ends with 0s -- presumably positives
# first, then negatives; verify against the model's label construction.
labels

array([1., 1., 1., ..., 0., 0., 0.])