# 7. Model Exploration

- Based on Quora Winners
- Based on DeepParaphrase paper

---
## Based on Quora Winners 
- 그냥 이것만 보자 https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle
- 회사에서 구현할 것! Word Mover's Distance : https://markroxor.github.io/gensim/static/notebooks/WMD_tutorial.html
- http://blog.kaggle.com/2016/07/21/approaching-almost-any-machine-learning-problem-abhishek-thakur/

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
sys.path.append("/home/angrypark/korean-text-matching-tf/")

from utils.utils import JamoProcessor
from text.tokenizers import SentencePieceTokenizer

class Config:
    """Minimal configuration object handed to ``SentencePieceTokenizer``.

    Args:
        sent_piece_model: Path to the SentencePiece model file. The default is
            the path this notebook originally hard-coded, so ``Config()``
            behaves exactly as before; pass a different path to run the
            notebook on another machine.
    """

    def __init__(self, sent_piece_model="/media/scatter/scatterdisk/tokenizer/sent_piece.100K.model"):
        self.sent_piece_model = sent_piece_model


config = Config()

# Shared helper instances used by the tokenizer functions in this cell.
# JamoProcessor is imported from utils.utils; its word_to_jamo method is what
# my_char_tokenizer calls on every token.
processor = JamoProcessor()
# Built from `config`, which carries the sent_piece_model path; presumably the
# tokenizer loads that model file on construction — TODO confirm.
tokenizer = SentencePieceTokenizer(config)
def my_word_tokenizer(raw, pos=["Noun", "Alpha", "Verb", "Number"], stopword=[]):
    """Tokenize ``raw`` with the module-level SentencePiece ``tokenizer``.

    Args:
        raw: Text to tokenize.
        pos: Accepted but not used by this function.
        stopword: Accepted but not used by this function.

    Returns:
        list: The items yielded by ``tokenizer.tokenize(raw)``, in order.
    """
    return list(tokenizer.tokenize(raw))

def my_char_tokenizer(raw, pos=["Noun", "Alpha", "Verb", "Number"], stopword=[]):
    """Tokenize ``raw`` and convert every token with ``processor.word_to_jamo``.

    Args:
        raw: Text to tokenize.
        pos: Accepted but not used by this function.
        stopword: Accepted but not used by this function.

    Returns:
        list: One ``processor.word_to_jamo(token)`` result per token produced
        by ``tokenizer.tokenize(raw)``, in the same order.
    """
    jamo_tokens = []
    for piece in tokenizer.tokenize(raw):
        jamo_tokens.append(processor.word_to_jamo(piece))
    return jamo_tokens

---
### TFIDF Vectorizer + Naive Bayes

In [8]:
import pickle

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Load the already-fitted word-level TF-IDF vectorizer from disk. A context
# manager is used so the file handle is closed as soon as loading finishes.
# NOTE(security): pickle.load can execute arbitrary code; only load dumps that
# were produced by this project.
# NOTE(review): presumably the dump references the tokenizer function(s)
# defined in the first cell, so that cell has to run before this one — confirm.
with open("../dump/tfidf_word_vectorizer_big.pkl", "rb") as vectorizer_file:
    tfidf_word_vectorizer = pickle.load(vectorizer_file)

In [9]:
%%time
# Read the tab-separated training pairs: sentence A, sentence B, label.
# The encoding is explicit so decoding does not depend on the machine's
# locale (assumes the corpus is UTF-8 — TODO confirm).
with open("../data/small/train.txt", "r", encoding="utf-8") as f:
    train_A, train_B, train_labels = zip(*[line.strip().split("\t") for line in f])

# Labels are stored as text: "1" becomes 1, anything else becomes 0.
train_labels = [int(label == "1") for label in train_labels]

# Vectorize both sentences and lay them side by side (A features, then B
# features). Stacking while still sparse and densifying once yields the same
# array as concatenating two dense matrices, without holding both dense
# intermediates and the concatenated copy in memory at the same time.
train_set = sparse.hstack(
    [tfidf_word_vectorizer.transform(train_A), tfidf_word_vectorizer.transform(train_B)],
    format="csr",
).toarray()

CPU times: user 12.4 s, sys: 5min 28s, total: 5min 40s
Wall time: 5min 40s


In [11]:
%%time
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyper-parameters, trained on the
# side-by-side TF-IDF features of both sentences.
nb = MultinomialNB()
nb.fit(X=train_set, y=train_labels)

CPU times: user 21.2 s, sys: 452 ms, total: 21.6 s
Wall time: 21.6 s


In [12]:
# Score the fitted classifier on the same data it was trained on.
nb.score(X=train_set, y=train_labels)

0.8218587247646129

In [14]:
%%time
# Read the tab-separated validation pairs: sentence A, sentence B, label.
# The encoding is explicit so decoding does not depend on the machine's
# locale (assumes the corpus is UTF-8 — TODO confirm).
with open("../data/small/val.txt", "r", encoding="utf-8") as f:
    val_A, val_B, val_labels = zip(*[line.strip().split("\t") for line in f])

# Labels are stored as text: "1" becomes 1, anything else becomes 0.
val_labels = [int(label == "1") for label in val_labels]

# Vectorize both sentences and lay them side by side (A features, then B
# features). Stacking while still sparse and densifying once yields the same
# array as concatenating two dense matrices, with a lower peak memory use.
val_set = sparse.hstack(
    [tfidf_word_vectorizer.transform(val_A), tfidf_word_vectorizer.transform(val_B)],
    format="csr",
).toarray()

CPU times: user 884 ms, sys: 440 ms, total: 1.32 s
Wall time: 1.32 s


In [15]:
# Score the fitted classifier on the held-out validation split.
nb.score(X=val_set, y=val_labels)

0.6231817719458854

In [16]:
%%time
# Read the tab-separated test pairs: sentence A, sentence B, label.
# The encoding is explicit so decoding does not depend on the machine's
# locale (assumes the corpus is UTF-8 — TODO confirm).
with open("../data/small/test.txt", "r", encoding="utf-8") as f:
    test_A, test_B, test_labels = zip(*[line.strip().split("\t") for line in f])

# Labels are stored as text: "1" becomes 1, anything else becomes 0.
test_labels = [int(label == "1") for label in test_labels]

# Vectorize both sentences and lay them side by side (A features, then B
# features). Stacking while still sparse and densifying once yields the same
# array as concatenating two dense matrices, with a lower peak memory use.
test_set = sparse.hstack(
    [tfidf_word_vectorizer.transform(test_A), tfidf_word_vectorizer.transform(test_B)],
    format="csr",
).toarray()

CPU times: user 720 ms, sys: 332 ms, total: 1.05 s
Wall time: 1.05 s


In [17]:
# Score the fitted classifier on the held-out test split.
nb.score(X=test_set, y=test_labels)

0.6565486954275381

Validation/test accuracy가 약 0.62/0.66으로, Naive Bayes 예측값은 feature로 쓸만함. 학습된 모델을 `../models/nb.pkl`에 저장.

In [18]:
# Persist the fitted Naive Bayes model. The context manager guarantees the
# file is flushed and closed, even if pickling fails part-way through.
with open("../models/nb.pkl", "wb") as model_file:
    pickle.dump(nb, model_file)

---

### TFIDF Vectorizer + SVD + SVM

In [1]:
import numpy as np
from sklearn.decomposition import TruncatedSVD

In [22]:
%%time
# Reduce the TF-IDF features to 150 components.
# random_state is pinned because TruncatedSVD's default solver is randomized;
# without a seed the fitted components differ from run to run.
svd = TruncatedSVD(n_components=150, random_state=42)
svd.fit(train_set)

CPU times: user 52min 57s, sys: 25min 31s, total: 1h 18min 28s
Wall time: 9min 20s


In [23]:
# Project every split with the SVD that was fitted on the training split.
train_set_svd, val_set_svd, test_set_svd = (
    svd.transform(split) for split in (train_set, val_set, test_set)
)

In [24]:
# Cache the dense training TF-IDF matrix (the pre-SVD features) as .npy.
np.save(file="../data/small/train_tfidf_vec.npy", arr=train_set)

In [25]:
# Cache the dense validation and test TF-IDF matrices (pre-SVD) as .npy.
np.save(file="../data/small/val_tfidf_vec.npy", arr=val_set)
np.save(file="../data/small/test_tfidf_vec.npy", arr=test_set)

In [27]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training SVD features only; the validation and test
# splits are transformed later with these same statistics.
scaler = StandardScaler()
scaler.fit(X=train_set_svd)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [28]:
# Apply the scaler fitted on the training split to all three splits.
train_set_svd_scaled, val_set_svd_scaled, test_set_svd_scaled = map(
    scaler.transform, (train_set_svd, val_set_svd, test_set_svd)
)

In [None]:
%%time
from sklearn.svm import SVC

# SVC on the scaled SVD features with probability estimates enabled.
# random_state is pinned because probability=True makes the fit depend on
# random data shuffling; without a seed the probabilities vary between runs.
svm = SVC(C=1.0, probability=True, random_state=42)
svm.fit(train_set_svd_scaled, train_labels)

In [None]:
# NOTE(review): unexecuted cell holding only the literal 1; it looks like a
# leftover scratch cell and can probably be removed — confirm.
1