# 3
- statistical machine learning 성능 측정

In [1]:
import sys
sys.path.append("/home/angrypark/paraphrase_detection/code/")

from utils.utils import JamoProcessor
from text.tokenizers import SentencePieceTokenizer

class Config:
    """Settings object handed to ``SentencePieceTokenizer``."""

    def __init__(self):
        # Path of the SentencePiece model file the tokenizer is configured with.
        model_path = "/media/scatter/scatterdisk/tokenizer/sent_piece.100K.model"
        self.sent_piece_model = model_path
config = Config()

processor = JamoProcessor()
tokenizer = SentencePieceTokenizer(config)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

### TFIDF Vectorizer 학습

In [45]:
# def my_tokenizer(raw, pos=["Noun","Alpha","Verb","Number"], stopword=[]):
#     return [processor.word_to_jamo for word in tokenizer.tokenize(raw)]
def my_tokenizer(raw, pos=["Noun", "Alpha", "Verb", "Number"], stopword=[]):
    """Split ``raw`` into a list of tokens using the notebook-level ``tokenizer``.

    ``pos`` and ``stopword`` are accepted but are not used by this function.
    """
    pieces = tokenizer.tokenize(raw)
    return list(pieces)

In [47]:
# Word-level TF-IDF model whose token splitting is delegated to my_tokenizer.
# The n-gram range stays at the library default; the override below is disabled.
vectorizer = TfidfVectorizer(analyzer="word", 
                             tokenizer=my_tokenizer, 
                             #ngram_range=(3, 6)
                            )

In [54]:
# Collect both sentences of every pair from all three splits; this list is the
# corpus the TF-IDF vectorizer is fitted on.
# NOTE(review): validation and test sentences are part of the fitting corpus,
# so the IDF weights see evaluation text -- confirm this is intended.
train_lines = list()
for split_path in ("../data/small/train.txt",
                   "../data/small/val.txt",
                   "../data/small/test.txt"):
    # Explicit encoding: the default depends on the machine's locale.
    # Assumes the split files are stored as UTF-8.
    with open(split_path, "r", encoding="utf-8") as f:
        for line in f:
            # Only the first two tab-separated fields of each line are kept.
            splits = line.strip().split("\t")
            train_lines += splits[:2]

In [56]:
vectorizer.fit(train_lines)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function my_tokenizer at 0x7f89aefa4620>, use_idf=True,
        vocabulary=None)

In [57]:
a = vectorizer.transform(["안녕하세요 만나서 반갑습니다"])

In [58]:
import numpy as np

In [59]:
np.array(a.todense()).shape

(1, 7102)

### 이걸 바탕으로 dataset 구성

In [70]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [63]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [65]:
import pickle

In [78]:
def _read_pickled_frame(path):
    """Unpickle the object stored at ``path`` and wrap it in a DataFrame.

    The file handle is closed as soon as the object has been read.
    NOTE: pickle can execute arbitrary code while loading; only use trusted files.
    """
    with open(path, "rb") as handle:
        return pd.DataFrame(pickle.load(handle))


train_set = _read_pickled_frame("../data/train_set.pkl")
val_set = _read_pickled_frame("../data/val_set.pkl")
test_set = _read_pickled_frame("../data/test_set.pkl")

In [84]:
from tqdm import tqdm

In [85]:
# TF-IDF cosine similarity for every (sentence_A, sentence_B) pair of the training set.
tfidf_sim = list()
for sent_a, sent_b in tqdm(train_set[["sentence_A", "sentence_B"]].values):
    pair_sim = cosine_similarity(vectorizer.transform([sent_a]), vectorizer.transform([sent_b]))
    tfidf_sim.append(pair_sim[0][0])
# Assign the list itself so values are placed by position; wrapping it in a
# DataFrame would align on index labels and leave NaN wherever labels differ.
train_set["tfidf_sim"] = tfidf_sim

100%|██████████| 300000/300000 [05:55<00:00, 844.46it/s]


In [86]:
# TF-IDF cosine similarity for every (sentence_A, sentence_B) pair of the validation set.
tfidf_sim = list()
for sent_a, sent_b in tqdm(val_set[["sentence_A", "sentence_B"]].values):
    pair_sim = cosine_similarity(vectorizer.transform([sent_a]), vectorizer.transform([sent_b]))
    tfidf_sim.append(pair_sim[0][0])
# Assign the list itself so values are placed by position; wrapping it in a
# DataFrame would align on index labels and leave NaN wherever labels differ.
val_set["tfidf_sim"] = tfidf_sim

100%|██████████| 19312/19312 [00:23<00:00, 828.13it/s]


In [87]:
# TF-IDF cosine similarity for every (sentence_A, sentence_B) pair of the test set.
tfidf_sim = list()
for sent_a, sent_b in tqdm(test_set[["sentence_A", "sentence_B"]].values):
    pair_sim = cosine_similarity(vectorizer.transform([sent_a]), vectorizer.transform([sent_b]))
    tfidf_sim.append(pair_sim[0][0])
# Assign the list itself so values are placed by position; wrapping it in a
# DataFrame would align on index labels and leave NaN wherever labels differ.
test_set["tfidf_sim"] = tfidf_sim

100%|██████████| 15134/15134 [00:18<00:00, 832.49it/s]


In [90]:
from difflib import SequenceMatcher

In [94]:
import editdistance

In [95]:
def proper_edit_distance(a_jamos, b_jamos):
    """Return the edit distance between two sequences, scaled by the longer length.

    Args:
        a_jamos: first sequence (this notebook passes ``word_to_jamo`` strings).
        b_jamos: second sequence.

    Returns:
        ``editdistance.eval(a_jamos, b_jamos)`` divided by the length of the
        longer input, or ``0.0`` when both inputs are empty.
    """
    long_length = max(len(a_jamos), len(b_jamos))
    if long_length == 0:
        # Two empty inputs are identical; dividing would raise ZeroDivisionError.
        return 0.0
    return editdistance.eval(a_jamos, b_jamos) / long_length

def substring(a_jamos, b_jamos):
    """Return the longest common contiguous block length over the longer length.

    Args:
        a_jamos: first sequence (this notebook passes ``word_to_jamo`` strings).
        b_jamos: second sequence.

    Returns:
        Size of the longest matching block found by ``SequenceMatcher`` divided
        by the length of the longer input. Two empty inputs yield ``1.0``, the
        same convention ``SequenceMatcher.ratio()`` uses for empty sequences.
    """
    long_length = max(len(a_jamos), len(b_jamos))
    if long_length == 0:
        # Avoid ZeroDivisionError: both inputs are empty, hence identical.
        return 1.0
    matcher = SequenceMatcher(None, a_jamos, b_jamos)
    match = matcher.find_longest_match(0, len(a_jamos), 0, len(b_jamos))
    return match.size / long_length

In [96]:
# Add the two string-level features to each split (the frames are modified in place).
for dataset in [train_set, val_set, test_set]:
    edit_distance = list()
    substring_ratio = list()
    for sent_a, sent_b in tqdm(dataset[["sentence_A", "sentence_B"]].values):
        # Both features are computed on the word_to_jamo output with "_" removed.
        a_jamos = processor.word_to_jamo(sent_a).replace("_", "")
        b_jamos = processor.word_to_jamo(sent_b).replace("_", "")
        edit_distance.append(proper_edit_distance(a_jamos, b_jamos))
        substring_ratio.append(substring(a_jamos, b_jamos))
    # Plain lists are assigned by position; a DataFrame value would be aligned
    # on index labels and produce NaN wherever the labels differ.
    dataset["edit_distance"] = edit_distance
    dataset["substring_ratio"] = substring_ratio

100%|██████████| 300000/300000 [00:21<00:00, 14219.88it/s]
100%|██████████| 19312/19312 [00:01<00:00, 13521.53it/s]
100%|██████████| 15134/15134 [00:01<00:00, 14107.43it/s]


In [98]:
features = ["ab_prob", "ba_prob", "semantic_sim", "tfidf_sim", "edit_distance", "substring_ratio"]

# Feature matrices: the same column selection applied to each split.
X_train, X_val, X_test = (split[features] for split in (train_set, val_set, test_set))

# Targets: 1 where the stored label is the string "1", otherwise 0.
y_train, y_val, y_test = (
    split["label"].map(lambda x: 1 if x=="1" else 0)
    for split in (train_set, val_set, test_set)
)

In [102]:
from xgboost import XGBClassifier

In [104]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [105]:
xgb.score(X_val, y_val)

0.7230737365368682

In [106]:
xgb.score(X_test, y_test)

0.7140874851328135

In [107]:
from catboost import CatBoostClassifier

In [112]:
np.shape(y_train.values)

(300000,)

In [115]:
cb = CatBoostClassifier()
cb.fit(X_train, y_train)

Learning rate set to 0.084168
0:	learn: 0.5944124	total: 385ms	remaining: 6m 24s
1:	learn: 0.5271129	total: 446ms	remaining: 3m 42s
2:	learn: 0.4831925	total: 501ms	remaining: 2m 46s
3:	learn: 0.4508307	total: 548ms	remaining: 2m 16s
4:	learn: 0.4260066	total: 580ms	remaining: 1m 55s
5:	learn: 0.4035120	total: 914ms	remaining: 2m 31s
6:	learn: 0.3918514	total: 978ms	remaining: 2m 18s
7:	learn: 0.3769020	total: 1.03s	remaining: 2m 7s
8:	learn: 0.3659169	total: 1.07s	remaining: 1m 57s
9:	learn: 0.3570714	total: 1.13s	remaining: 1m 51s
10:	learn: 0.3502800	total: 1.19s	remaining: 1m 47s
11:	learn: 0.3469513	total: 1.23s	remaining: 1m 41s
12:	learn: 0.3424987	total: 1.26s	remaining: 1m 35s
13:	learn: 0.3397654	total: 1.33s	remaining: 1m 33s
14:	learn: 0.3376895	total: 1.38s	remaining: 1m 30s
15:	learn: 0.3340342	total: 1.42s	remaining: 1m 27s
16:	learn: 0.3325286	total: 1.45s	remaining: 1m 24s
17:	learn: 0.3314208	total: 1.49s	remaining: 1m 21s
18:	learn: 0.3305647	total: 1.55s	remaining: 

<catboost.core.CatBoostClassifier at 0x7f89405e9748>

In [116]:
cb.score(X_val, y_val)

0.7237986743993372

In [118]:
cb.score(X_test, y_test)

0.7160037002775208

In [120]:
X_train.columns

Index(['ab_prob', 'ba_prob', 'semantic_sim', 'tfidf_sim', 'edit_distance',
       'substring_ratio'],
      dtype='object')

In [119]:
# Read the fitted importances through the public attribute rather than the
# underscore-prefixed internal one, which is not part of the documented API.
cb.feature_importances_

[14.130268975629814,
 15.46273840689112,
 31.852532383216158,
 11.861477085049998,
 8.788012591116289,
 17.904970558096615]

In [1]:
import tensorflow as tf
import numpy as np
import os
import sys
sys.path.append("/home/angrypark/korean-text-matching-tf")

In [2]:
from data_loader import DataGenerator
from trainer import MatchingModelTrainer
from preprocessor import DynamicPreprocessor
from utils.dirs import create_dirs
from utils.logger import SummaryWriter
from utils.config import load_config, save_config
from models.base import get_model
from utils.utils import JamoProcessor
from text.tokenizers import SentencePieceTokenizer

In [3]:
import pickle
from collections import namedtuple
from difflib import SequenceMatcher

import editdistance
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Lightweight settings object with a single field, passed to SentencePieceTokenizer below.
Config = namedtuple("config", ["sent_piece_model"])
config = Config("/media/scatter/scatterdisk/tokenizer/sent_piece.100K.model")
# Notebook-level helpers used by the tokenizer functions defined in the next cell.
processor = JamoProcessor()
tokenizer = SentencePieceTokenizer(config)

In [4]:
def my_word_tokenizer(raw, pos=["Noun", "Alpha", "Verb", "Number"], stopword=[]):
    """Split ``raw`` into a list of tokens using the notebook-level ``tokenizer``.

    ``pos`` and ``stopword`` are accepted but are not used by this function.
    """
    pieces = tokenizer.tokenize(raw)
    return list(pieces)

def my_char_tokenizer(raw, pos=["Noun", "Alpha", "Verb", "Number"], stopword=[]):
    """Tokenize ``raw`` and pass every token through ``processor.word_to_jamo``.

    ``pos`` and ``stopword`` are accepted but are not used by this function.
    """
    jamo_tokens = []
    for piece in tokenizer.tokenize(raw):
        jamo_tokens.append(processor.word_to_jamo(piece))
    return jamo_tokens

def proper_edit_distance(a_jamos, b_jamos):
    """Return the edit distance between two sequences, scaled by the longer length.

    Args:
        a_jamos: first sequence (this notebook passes ``word_to_jamo`` strings).
        b_jamos: second sequence.

    Returns:
        ``editdistance.eval(a_jamos, b_jamos)`` divided by the length of the
        longer input, or ``0.0`` when both inputs are empty.
    """
    long_length = max(len(a_jamos), len(b_jamos))
    if long_length == 0:
        # Two empty inputs are identical; dividing would raise ZeroDivisionError.
        return 0.0
    return editdistance.eval(a_jamos, b_jamos) / long_length

def substring(a_jamos, b_jamos):
    """Return the longest common contiguous block length over the longer length.

    Args:
        a_jamos: first sequence (this notebook passes ``word_to_jamo`` strings).
        b_jamos: second sequence.

    Returns:
        Size of the longest matching block found by ``SequenceMatcher`` divided
        by the length of the longer input. Two empty inputs yield ``1.0``, the
        same convention ``SequenceMatcher.ratio()`` uses for empty sequences.
    """
    long_length = max(len(a_jamos), len(b_jamos))
    if long_length == 0:
        # Avoid ZeroDivisionError: both inputs are empty, hence identical.
        return 1.0
    matcher = SequenceMatcher(None, a_jamos, b_jamos)
    match = matcher.find_longest_match(0, len(a_jamos), 0, len(b_jamos))
    return match.size / long_length

In [5]:
class FeatureExtractor:
    """Builds the feature dictionary for pairs of sentences.

    Features come from three sources: a restored TensorFlow matching model
    (pair probabilities in both directions plus the cosine similarity of the
    encoded sentences), two pickled TF-IDF vectorizers (word and char), and
    string metrics computed on the ``word_to_jamo`` output of each sentence.
    """

    def __init__(self):
        """Restore the matching model and load the pickled TF-IDF vectorizers."""
        self.infer_model, self.infer_sess = self._load_pretrained_model()
        # NOTE: pickle can execute arbitrary code while loading; these dumps must be trusted.
        self.tfidf_char_vectorizer = self._load_pickle("../dump/tfidf_char_vectorizer.pkl")
        self.tfidf_word_vectorizer = self._load_pickle("../dump/tfidf_word_vectorizer.pkl")
        self.processor = JamoProcessor()
        self.tokenizer = SentencePieceTokenizer(config)

    @staticmethod
    def _load_pickle(path):
        """Unpickle the object stored at ``path`` and close the file afterwards."""
        with open(path, "rb") as handle:
            return pickle.load(handle)

    def _load_pretrained_model(self):
        """Build the model graph and restore the best-loss checkpoint.

        Side effect: stores the inference preprocessor on
        ``self.infer_preprocessor`` (used by ``_batch_infer``).

        Returns:
            Tuple ``(infer_model, infer_sess)``.
        """
        base_dir = "/media/scatter/scatterdisk/reply_matching_model/runs/delstm_1024_nsrandom4_lr1e-3/"
        config_dir = base_dir + "config.json"
        best_model_dir = base_dir + "best_loss/best_loss.ckpt"

        # Preprocessor handed to the DataGenerator the model is built from.
        model_config = load_config(config_dir)
        model_config.add_echo = False
        preprocessor = DynamicPreprocessor(model_config)
        preprocessor.build_preprocessor()

        # Separate preprocessor, with tokenizer settings overridden, used at inference time.
        infer_config = load_config(config_dir)
        setattr(infer_config, "tokenizer", "SentencePieceTokenizer")
        setattr(infer_config, "soynlp_scores", "/media/scatter/scatterdisk/tokenizer/soynlp_scores.sol.100M.txt")
        infer_preprocessor = DynamicPreprocessor(infer_config)
        infer_preprocessor.build_preprocessor()

        graph = tf.Graph()
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True

        with graph.as_default():
            Model = get_model(model_config.model)
            data = DataGenerator(preprocessor, model_config)
            infer_model = Model(data, model_config)
            infer_sess = tf.Session(config=tf_config, graph=graph)
            infer_sess.run(tf.global_variables_initializer())
            infer_sess.run(tf.local_variables_initializer())

        infer_model.load(infer_sess, model_dir=best_model_dir)
        self.infer_preprocessor = infer_preprocessor
        return infer_model, infer_sess

    def _run_direction(self, queries, queries_lengths, replies, replies_lengths):
        """Run the model once and return ``[encoding_queries, positive_probs]``."""
        feed_dict = {self.infer_model.input_queries: queries,
                     self.infer_model.input_replies: replies,
                     self.infer_model.queries_lengths: queries_lengths,
                     self.infer_model.replies_lengths: replies_lengths,
                     self.infer_model.dropout_keep_prob: 1,
                     }
        return self.infer_sess.run([self.infer_model.encoding_queries,
                                    self.infer_model.positive_probs],
                                   feed_dict=feed_dict)

    def _batch_infer(self, batch_A, batch_B):
        """Score one batch of pairs with the matching model in both directions.

        Returns:
            Tuple ``(ab_probs, ba_probs, semantic_sim)`` of lists: the first
            element of each ``positive_probs`` row with A as query, the same
            with B as query, and the cosine similarity between the query
            encodings of A and B.
        """
        indexed_A, A_lengths = zip(*[self.infer_preprocessor.preprocess(a) for a in batch_A])
        indexed_B, B_lengths = zip(*[self.infer_preprocessor.preprocess(b) for b in batch_B])

        # A as query / B as reply, then the roles swapped.
        A_sentence_vectors, AB_probs = self._run_direction(indexed_A, A_lengths, indexed_B, B_lengths)
        B_sentence_vectors, BA_probs = self._run_direction(indexed_B, B_lengths, indexed_A, A_lengths)

        semantic_sim = [cosine_similarity([a_vector], [b_vector])[0][0] for a_vector, b_vector
                        in zip(list(A_sentence_vectors), list(B_sentence_vectors))]
        return [p[0] for p in AB_probs], [p[0] for p in BA_probs], semantic_sim

    def _semantic_features(self, A, B, batch_size=512):
        """Run ``_batch_infer`` over consecutive slices of at most ``batch_size`` pairs."""
        length = len(A)
        result = {"ab_probs": list(), "ba_probs": list(), "semantic_sim": list()}
        for start in range(0, length, batch_size):
            end = min(start + batch_size, length)
            ab_probs, ba_probs, semantic_sim = self._batch_infer(A[start:end], B[start:end])
            result["ab_probs"] += list(ab_probs)
            result["ba_probs"] += list(ba_probs)
            result["semantic_sim"] += semantic_sim
        return result

    @staticmethod
    def _pairwise_tfidf_sim(vectorizer, A, B):
        """Cosine similarity of the TF-IDF vectors of each ``(a, b)`` pair."""
        return [cosine_similarity(vectorizer.transform([a]), vectorizer.transform([b]))[0][0]
                for a, b in zip(A, B)]

    def _string_features(self, A, B):
        """Scaled edit distance and longest-common-substring ratio for each pair."""
        edit_distance = list()
        substring_ratio = list()
        for a, b in zip(A, B):
            a_jamos = self.processor.word_to_jamo(a).replace("_", "")
            b_jamos = self.processor.word_to_jamo(b).replace("_", "")
            edit_distance.append(proper_edit_distance(a_jamos, b_jamos))
            substring_ratio.append(substring(a_jamos, b_jamos))
        return {"edit_distance": edit_distance,
                "substring_ratio": substring_ratio}

    def extract_features(self, sentences_A, sentences_B):
        """Compute every feature for the aligned pairs ``(sentences_A[i], sentences_B[i])``.

        Args:
            sentences_A: sequence of sentences (must support ``len`` and slicing).
            sentences_B: sequence of the same length as ``sentences_A``.

        Returns:
            Dict mapping feature name to a list with one value per pair. Keys are
            inserted in the order ``ab_probs``, ``ba_probs``, ``semantic_sim``,
            ``tfidf_word_sim``, ``tfidf_char_sim``, ``edit_distance``,
            ``substring_ratio``.

        Raises:
            ValueError: if the two inputs differ in length.
        """
        if len(sentences_A) != len(sentences_B):
            # zip() would silently truncate some features while the model is fed
            # mismatched batches, so reject the input up front.
            raise ValueError(
                "sentences_A and sentences_B must have the same length "
                "(got {} and {})".format(len(sentences_A), len(sentences_B))
            )

        # Insertion order matters: it becomes the column order of any DataFrame
        # built from this dict.
        extracted_features = dict()
        extracted_features.update(self._semantic_features(sentences_A, sentences_B, batch_size=512))
        extracted_features.update(
            {"tfidf_word_sim": self._pairwise_tfidf_sim(self.tfidf_word_vectorizer, sentences_A, sentences_B)})
        extracted_features.update(
            {"tfidf_char_sim": self._pairwise_tfidf_sim(self.tfidf_char_vectorizer, sentences_A, sentences_B)})
        extracted_features.update(self._string_features(sentences_A, sentences_B))

        return extracted_features

In [6]:
feature_extractor = FeatureExtractor()

Pre-trained embedding loaded. Number of OOV : 5272 / 90000


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Restoring parameters from /media/scatter/scatterdisk/reply_matching_model/runs/delstm_1024_nsrandom4_lr1e-3/best_loss/best_loss.ckpt


In [31]:
def _read_labeled_pairs(path):
    """Read a tab-separated split file into parallel tuples, one per field.

    The callers below unpack the result into three names
    (sentence A, sentence B, label).
    """
    # Explicit encoding: the default depends on the machine's locale.
    # Assumes the split files are stored as UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        return zip(*[line.strip().split("\t") for line in f])


train_A, train_B, train_labels = _read_labeled_pairs("../data/small/train.txt")
val_A, val_B, val_labels = _read_labeled_pairs("../data/small/val.txt")
test_A, test_B, test_labels = _read_labeled_pairs("../data/small/test.txt")

In [32]:
%%time
train_set = feature_extractor.extract_features(train_A, train_B)

CPU times: user 16min 16s, sys: 31.4 s, total: 16min 47s
Wall time: 15min 31s


In [33]:
val_set = feature_extractor.extract_features(val_A, val_B)

In [34]:
test_set = feature_extractor.extract_features(test_A, test_B)

In [37]:
# NOTE: pickle can execute arbitrary code while loading; the dump must be trusted.
with open("../dump/tfidf_char_vectorizer.pkl", "rb") as handle:
    tfidf_char_vectorizer = pickle.load(handle)

In [38]:
# Char-vectorizer TF-IDF cosine similarity for each training pair.
char_sim = [
    cosine_similarity(tfidf_char_vectorizer.transform([sent_a]),
                      tfidf_char_vectorizer.transform([sent_b]))[0][0]
    for sent_a, sent_b in zip(train_A, train_B)
]
train_set["tfidf_char_sim"] = pd.DataFrame(char_sim)

In [39]:
# Char-vectorizer TF-IDF cosine similarity for each validation pair.
char_sim = [
    cosine_similarity(tfidf_char_vectorizer.transform([sent_a]),
                      tfidf_char_vectorizer.transform([sent_b]))[0][0]
    for sent_a, sent_b in zip(val_A, val_B)
]
val_set["tfidf_char_sim"] = pd.DataFrame(char_sim)

In [40]:
# Char-vectorizer TF-IDF cosine similarity for each test pair.
char_sim = [
    cosine_similarity(tfidf_char_vectorizer.transform([sent_a]),
                      tfidf_char_vectorizer.transform([sent_b]))[0][0]
    for sent_a, sent_b in zip(test_A, test_B)
]
test_set["tfidf_char_sim"] = pd.DataFrame(char_sim)

In [45]:
# Flatten each stored similarity column to a plain list of floats.
# np.asarray + ravel accepts the single-column DataFrame assigned in the cells
# above as well as an (n, 1) array or an already-flat list; iterating the
# DataFrame directly would walk its column labels instead of its rows.
for feature_set in (train_set, val_set, test_set):
    feature_set["tfidf_char_sim"] = (
        np.asarray(feature_set["tfidf_char_sim"], dtype=float).ravel().tolist()
    )

In [48]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [62]:
# Final training data: the training and validation splits stacked row-wise.
# ignore_index=True renumbers the rows so X_train and y_train do not carry
# duplicate index labels inherited from the two sources.
X_train = pd.concat([pd.DataFrame(train_set), pd.DataFrame(val_set)], ignore_index=True)
y_train = pd.concat([pd.Series(list(train_labels)), pd.Series(list(val_labels))], ignore_index=True)

In [65]:
y_train = y_train.map(lambda x: 1 if x=="1" else 0)

In [66]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [67]:
cb = CatBoostClassifier()
cb.fit(X_train, y_train)

Learning rate set to 0.085668
0:	learn: 0.6006461	total: 108ms	remaining: 1m 47s
1:	learn: 0.5403487	total: 151ms	remaining: 1m 15s
2:	learn: 0.4947036	total: 188ms	remaining: 1m 2s
3:	learn: 0.4627213	total: 251ms	remaining: 1m 2s
4:	learn: 0.4422271	total: 298ms	remaining: 59.4s
5:	learn: 0.4208292	total: 361ms	remaining: 59.9s
6:	learn: 0.4008429	total: 396ms	remaining: 56.2s
7:	learn: 0.3913386	total: 455ms	remaining: 56.4s
8:	learn: 0.3788648	total: 517ms	remaining: 56.9s
9:	learn: 0.3729280	total: 569ms	remaining: 56.4s
10:	learn: 0.3678085	total: 617ms	remaining: 55.4s
11:	learn: 0.3640233	total: 680ms	remaining: 56s
12:	learn: 0.3567042	total: 737ms	remaining: 56s
13:	learn: 0.3538322	total: 777ms	remaining: 54.7s
14:	learn: 0.3487607	total: 815ms	remaining: 53.5s
15:	learn: 0.3466389	total: 886ms	remaining: 54.5s
16:	learn: 0.3440457	total: 931ms	remaining: 53.8s
17:	learn: 0.3428826	total: 972ms	remaining: 53s
18:	learn: 0.3398027	total: 1.01s	remaining: 52.3s
19:	learn: 0.33

<catboost.core.CatBoostClassifier at 0x7faf48e15ba8>

In [68]:
# Persist both fitted classifiers; each with-block flushes and closes its file.
with open("../models/xgb.pkl", "wb") as handle:
    pickle.dump(xgb, handle)
with open("../models/cb.pkl", "wb") as handle:
    pickle.dump(cb, handle)

In [69]:
y_test = pd.Series(list(test_labels)).map(lambda x: 1 if x=="1" else 0)
xgb.score(pd.DataFrame(test_set), y_test)

  if diff:


0.7232721025505484

In [70]:
cb.score(pd.DataFrame(test_set), y_test)

0.7166644641205233

## Generate submission dataset

In [8]:
with open("../data/test_queries.txt", "r") as f:
    _, test_queries = zip(*[line.strip().split("\t") for line in f])

with open("../data/test_replies.txt", "r") as f:
    _, test_replies = zip(*[line.strip().split("\t") for line in f])

In [9]:
len(test_replies)

200

저장 형식 : {query: [(reply, score), (reply, score), ...]}

In [13]:
from tqdm import tqdm
import pickle
import pandas as pd

In [7]:
# Reload the fitted classifiers, closing each file once it has been read.
# NOTE: pickle can execute arbitrary code while loading; the dumps must be trusted.
with open("../models/xgb.pkl", "rb") as handle:
    xgb = pickle.load(handle)
with open("../models/cb.pkl", "rb") as handle:
    cb = pickle.load(handle)

In [14]:
# Score every candidate reply against every query with the XGBoost model.
# Result layout: {query: [(reply, score), ...]}.
xgb_result = dict()
for query in tqdm(test_queries):
    # Repeat the query once per reply instead of relying on a hard-coded count.
    A, B = [query] * len(test_replies), test_replies
    extracted_features = feature_extractor.extract_features(A, B)
    # Second predict_proba column, i.e. the same value the original read as p[1].
    probs = xgb.predict_proba(pd.DataFrame(extracted_features))[:, 1].tolist()
    xgb_result[query] = list(zip(test_replies, probs))

100%|██████████| 600/600 [06:16<00:00,  1.59it/s]


In [15]:
# Score every candidate reply against every query with the CatBoost model.
# Result layout: {query: [(reply, score), ...]}.
cb_result = dict()
for query in tqdm(test_queries):
    # Repeat the query once per reply instead of relying on a hard-coded count.
    A, B = [query] * len(test_replies), test_replies
    extracted_features = feature_extractor.extract_features(A, B)
    # Second predict_proba column, i.e. the same value the original read as p[1].
    probs = cb.predict_proba(pd.DataFrame(extracted_features))[:, 1].tolist()
    cb_result[query] = list(zip(test_replies, probs))

100%|██████████| 600/600 [06:02<00:00,  1.66it/s]


In [16]:
# Persist both result dictionaries; each with-block flushes and closes its file.
with open("../dump/xgb_result.pkl", "wb") as handle:
    pickle.dump(xgb_result, handle)
with open("../dump/cb_result.pkl", "wb") as handle:
    pickle.dump(cb_result, handle)