# 1. Load scripts and queries

In [1]:
import sys
sys.path.append('/home/hs/mudbox/ml/')
from pingpong.utils import get_ingredient_factory
# Obtain the project's ingredient factory (imported from pingpong.utils above).
factory = get_ingredient_factory()
# Set the ('corpora', 'preprocessed') config property to False before the
# pipelines are built. NOTE(review): presumably this tells the factory the
# corpora are raw text -- confirm against the pingpong config docs.
factory.config.add_property('corpora', 'preprocessed', False)
# Combine the preprocess pipeline and the tokenized pipeline with `+`; the
# resulting module-level `pipeline` is what SentenceLoader._load calls
# `.run(sent)` on for every loaded sentence.
pipeline = factory.get_preprocess_pipeline() + factory.get_tokenized_pipeline()

Using TensorFlow backend.


In [2]:
class SentenceLoader:
    """Load the query and script CSV files into id <-> sentence lookups.

    Each file is expected to have one header line followed by rows of the
    form ``<integer id>,<sentence>``. Every sentence is passed through the
    module-level ``pipeline`` before being stored.
    """

    # Prefix prepended (by plain string concatenation) to the CSV file names.
    base_dir = ''

    def __init__(self):
        # NOTE(review): `Preprocessor` is not defined or imported in the visible
        # part of this file, and none of the methods below use the attribute;
        # the class is only used through its class/static methods. Confirm
        # whether this constructor is still needed.
        self.preprocessor = Preprocessor()

    @staticmethod
    def _load(filepath):
        """Parse one ``id,sentence`` CSV file.

        Args:
            filepath: Path of the CSV file; its first line is skipped as a header.

        Returns:
            A ``(sent2id, id2sent)`` tuple: a dict mapping each processed
            sentence to the integer id from its row, and a list of processed
            sentences in file order.

        Raises:
            ValueError: If the id field of a row is not an integer literal.
        """
        sent2id = {}
        id2sent = []
        with open(filepath, 'r') as f:
            next(f, None)  # skip the header line (no-op on an empty file)
            for line in f:
                try:
                    # Split on the FIRST comma only: a sentence that itself
                    # contains commas must not be dropped, otherwise the
                    # positional `id2sent` list drifts out of step with the ids.
                    idx, sent = line.strip().split(',', 1)
                except ValueError:
                    # No comma at all (e.g. a blank line): nothing to load.
                    continue

                sent = pipeline.run(sent)

                # assumes pipeline.run returns a hashable value -- it is used
                # as a dict key here; TODO confirm.
                sent2id[sent] = int(idx)
                id2sent.append(sent)

        return sent2id, id2sent

    @classmethod
    def load(cls):
        """Load ``query_id.csv`` and ``script_id.csv`` from ``base_dir``.

        Returns:
            ``(query2id, id2query, script2id, id2script)``.
        """
        query2id, id2query = cls._load(filepath=cls.base_dir + 'query_id.csv')
        script2id, id2script = cls._load(filepath=cls.base_dir + 'script_id.csv')
        return query2id, id2query, script2id, id2script

In [3]:
# Load both CSV files once; the four lookups are reused by the inference cells below.
query2id, id2query, script2id, id2script = SentenceLoader.load()

In [4]:
# Sanity check: each dict should be the same size as its list; a smaller dict
# would mean two rows produced the same processed sentence.
len(query2id), len(script2id), len(id2query), len(id2script)

(600, 200, 600, 200)

# 2. TF-IDF Model

In [14]:
import sys
sys.path.append('/home/hs/mudbox/ml/')
from math import log10, sqrt
from playground.hs_projects.resembla.feature_extraction.sentence_feature_extractor import TermVectorizer

In [15]:
class Sim:
    """Cosine-similarity helpers for sparse vectors stored as ``{key: weight}`` dicts."""

    def __init__(self):
        pass

    @staticmethod
    def union(d1, d2):
        """Partition the keys of two dicts.

        Returns:
            A tuple of three sets: keys only in ``d1``, keys in both, and
            keys only in ``d2``.
        """
        u1 = set(d1.keys())
        u2 = set(d2.keys())
        return u1 - u2, u1 & u2, u2 - u1

    @staticmethod
    def norm(d):
        """Return the Euclidean (L2) norm of the dict's values."""
        return sqrt(sum(i ** 2 for i in d.values()))

    @classmethod
    def dict_similarity(cls, d1, d2):
        """Return the cosine similarity of two sparse vectors.

        Args:
            d1: First vector as a ``{key: weight}`` dict.
            d2: Second vector as a ``{key: weight}`` dict.

        Returns:
            The dot product over the shared keys divided by the product of
            the two norms, or ``0.0`` when either vector is empty or all-zero.
        """
        denominator = cls.norm(d1) * cls.norm(d2)
        if denominator == 0:
            # An empty or all-zero vector has no direction; report "no
            # similarity" instead of raising ZeroDivisionError.
            return 0.0

        _, intersection, _ = cls.union(d1, d2)
        # Only keys present in both vectors contribute to the dot product.
        dot_product = sum(d1[word] * d2[word] for word in intersection)
        return dot_product / denominator

In [32]:
class TfidfModel:
    """Sentence-similarity model: TF-IDF vectors compared with cosine similarity.

    Exposes the interface the inference code relies on: ``score`` for raw
    sentence pairs, or ``preprocess`` followed by ``compare``.
    """

    # Document-frequency table used when no explicit path is supplied.
    DEFAULT_DF_PATH = '/media/scatter/projects/sandbox/project/AIChatbot/25000pingpong/pre_identified_corpus/document_frequency.txt'

    def __init__(self, df_path=None):
        """Build the vectorizer from an IDF table.

        Args:
            df_path: Optional path to a document-frequency file; when omitted
                the default table is used.
        """
        # Call get_idf() with no argument in the default case so subclasses
        # that override it with the original zero-argument signature keep working.
        idf = self.get_idf() if df_path is None else self.get_idf(df_path)
        self.vectorizer = TermVectorizer(idf)

    @staticmethod
    def get_idf(df_path=None):
        """Read a document-frequency file and convert it to IDF weights.

        The first line holds the total number of documents; every following
        non-blank line is ``<term>\\t<document frequency>``.

        Args:
            df_path: Path of the file to read; defaults to ``DEFAULT_DF_PATH``.

        Returns:
            A dict mapping each term to ``log10(num_documents / frequency)``.

        Raises:
            ValueError: If a non-blank line is not exactly two tab-separated
                fields, or a count is not an integer literal.
            ZeroDivisionError: If a term's frequency is 0.
        """
        if df_path is None:
            df_path = TfidfModel.DEFAULT_DF_PATH

        idf = {}
        with open(df_path, 'r') as f:
            num_documents = int(f.readline().strip())
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines carry no entry; skip them rather than
                    # failing the two-field unpacking below.
                    continue
                term, freq = line.split('\t')
                idf[term] = log10(num_documents / int(freq))
        return idf

    def preprocess(self, sent):
        """Vectorize a sentence and return the ``'tfidf'`` entry of the result.

        NOTE(review): presumably a ``{term: weight}`` dict, since ``compare``
        passes it to ``Sim.dict_similarity`` -- confirm against TermVectorizer.
        """
        return self.vectorizer.vectorize(sent)['tfidf']

    def compare(self, pre_sent0, pre_sent1):
        """Return the similarity of two already-preprocessed sentences."""
        return Sim.dict_similarity(pre_sent0, pre_sent1)

    def score(self, sent0, sent1):
        """Preprocess both raw sentences and return their similarity."""
        sent0 = self.preprocess(sent0)
        sent1 = self.preprocess(sent1)
        return self.compare(sent0, sent1)

# 3. Inference
- 만든 모델을 이용하여 inference합니다.
- model의 interface는 다음과 같다고 가정합니다.  
    - pair단위로 inference  
        - score: query와 script pair의 score를 계산합니다.
    - preprocess 후 사용
        - preprocess: sentence를 preprocess 합니다.
        - compare: preprocessed_query와 preprocessed_script의 score를 계산합니다.

In [41]:
import numpy as np
import pandas as pd
from time import time

In [71]:
class Inference:
    """Pick the best-scoring script for every query and write the result to a file.

    The model passed to the ``infer_*`` methods must provide ``score`` (raw
    pair), or ``preprocess`` plus ``compare`` (preprocessed pair).
    """

    def __init__(self, query2id, id2query, script2id, id2script):
        """Store the four lookups produced by the sentence loader."""
        # NOTE(review): the two *2id dicts are stored but never read in this
        # class; results are reported as list positions, not looked-up ids.
        self.query2id, self.id2query = query2id, id2query
        self.script2id, self.id2script = script2id, id2script

    @staticmethod
    def _report(template, durations):
        """Print mean and standard deviation of ``durations`` (seconds) in microseconds."""
        durations = np.array(durations)
        mean_us = np.mean(durations) * 1e6
        std_us = np.std(durations) * 1e6
        print(template % (mean_us, std_us))

    def infer_pbp(self, model, filepath, verbose=0):
        """Compare every query with every script pair by pair via ``model.score``.

        Prints per-call timing, optionally prints the chosen pairs, and saves
        the position of the best script for each query to ``filepath``.
        """
        score_rows = []
        elapsed = []
        for query in self.id2query:
            row = []
            for script in self.id2script:
                began = time()
                row.append(model.score(query, script))
                elapsed.append(time() - began)
            score_rows.append(row)

        # Position (within id2script) of the highest score on each query row.
        best = np.argmax(np.array(score_rows), axis=1)

        self._report('%.4f ± %.4f μs per comparison', elapsed)

        if verbose:
            print(self._verbose(best))

        self._save(filepath, best)

    def infer_preprocessed(self, model, filepath, verbose=0):
        """Preprocess all queries and scripts once, then compare every pair.

        Prints preprocessing and comparison timing separately, optionally
        prints the chosen pairs, and saves the best script position for each
        query to ``filepath``.
        """

        def _timed_preprocess(sentences):
            # Preprocess each sentence, recording how long each call took.
            outputs, durations = [], []
            for sentence in sentences:
                began = time()
                outputs.append(model.preprocess(sentence))
                durations.append(time() - began)
            return outputs, durations

        prepared_queries, query_durations = _timed_preprocess(self.id2query)
        prepared_scripts, script_durations = _timed_preprocess(self.id2script)

        score_rows = []
        compare_durations = []
        for query in prepared_queries:
            row = []
            for script in prepared_scripts:
                began = time()
                row.append(model.compare(query, script))
                compare_durations.append(time() - began)
            score_rows.append(row)
        best = np.argmax(np.array(score_rows), axis=1)

        self._report('%.4f ± %.4f μs per preprocessing',
                     query_durations + script_durations)
        self._report('%.4f ± %.4f μs per comparison', compare_durations)

        if verbose:
            print(self._verbose(best))

        self._save(filepath, best)

    @staticmethod
    def _save(filepath, results):
        """Write a header line, then one ``<row number>\\t,<result>`` line per entry."""
        with open(filepath, 'w') as out:
            out.write('id_query,id_script\n')
            # NOTE(review): the tab before the comma is kept exactly as in the
            # original output format -- confirm whether it is intentional.
            out.writelines('%d\t,%d\n' % pair for pair in enumerate(results))

    def _verbose(self, results):
        """Return a pandas Series of ``(query, chosen script)`` tuples."""
        pairs = [
            (query, self.id2script[chosen])
            for query, chosen in zip(self.id2query, results)
        ]
        return pd.Series(pairs)

In [72]:
# Wire the loaded sentences into the inference harness and build the TF-IDF
# baseline (constructing TfidfModel reads the document-frequency file).
inferrer = Inference(query2id, id2query, script2id, id2script)
tfidf_model = TfidfModel()

In [73]:
# Preprocess-once variant: each sentence is vectorized a single time, then all pairs are compared.
inferrer.infer_preprocessed(tfidf_model, 'submission_baseline.csv', verbose=0)

11.5415 ± 3.8397 μs per preprocessing
4.6463 ± 1.5086 μs per comparison


In [74]:
# Pair-by-pair variant: model.score vectorizes both sentences on every call.
# Writes to the same file name as the preprocess-once run, so it overwrites that output.
inferrer.infer_pbp(tfidf_model, 'submission_baseline.csv', verbose=0)

25.1546 ± 4.0037 μs per comparison
