In [1]:
from test_ifidf import get_tf_title
import os
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from gensim.models import word2vec
import gensim
import pickle
import logging
import numpy
import pandas as pd
import statistics
from check_kind import all_tf_nums

In [2]:
# Absolute paths of the three dataset splits (train / test / validation),
# resolved against the current working directory.
train_file, test_file, valid_file = map(
    os.path.abspath,
    (
        './liar_dataset/train.tsv',
        './liar_dataset/test.tsv',
        './liar_dataset/valid.tsv',
    ),
)

In [3]:
# --- Dataset info -----------------------------------------------------------
# `infos` is loaded (or built) independently of the embedding model, so the
# cell no longer crashes when a model file exists but 'infos.pickle' does not.
# NOTE: pickle.load can execute arbitrary code; only load caches that were
# produced by this notebook.
if os.path.exists('infos.pickle'):
    with open('infos.pickle', mode='rb') as f:
        infos = pickle.load(f)
else:
    # NOTE(review): this comprehension flattens whatever get_tf_title yields
    # for each file into one list, so infos[0] / infos[1] are the first two
    # items yielded for train_file only -- confirm that is the intended layout.
    infos = [info for filename in [train_file, test_file, valid_file] for info in get_tf_title(filename)]
    with open('infos.pickle', mode='wb') as f:
        pickle.dump(infos, f)

# --- Word-embedding model ---------------------------------------------------
# Preference order: GoogleNews vectors, then a previously trained local model,
# and only as a last resort train a new Word2Vec model on the quotes.
if os.path.exists('GoogleNews-vectors-negative300.bin'):
    model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
elif os.path.exists("./wiki.model"):
    model = word2vec.Word2Vec.load("wiki.model")
else:
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    quotes = infos[1]
    try:
        # Explicit UTF-8 so the temporary corpus does not depend on the
        # platform's default text encoding.
        with open('quotes.txt', 'w', encoding='utf-8') as f:
            for quote in quotes:
                f.write(quote + "\n")

        sentences = word2vec.Text8Corpus('quotes.txt')
        model = word2vec.Word2Vec(sentences, size=500, min_count=1, window=15)
    finally:
        # The corpus file is only a scratch file; remove it even if training fails.
        if os.path.exists('quotes.txt'):
            os.remove('quotes.txt')
    model.save("./wiki.model")

In [4]:
# https://medium.com/eureka-engineering/pairs%E3%81%AE%E3%82%B3%E3%83%9F%E3%83%A5%E3%83%8B%E3%83%86%E3%82%A3%E3%82%92word2vec%E3%81%A8svm%E3%81%A7%E5%88%86%E9%A1%9E%E3%81%97%E3%81%A6%E3%81%BF%E3%81%9F-48f4099f0ffc
def text_to_vec(words, model):
    """Sum the embedding vectors of every word in ``words`` that ``model`` knows.

    Args:
        words: iterable of tokens to look up.
        model: object supporting ``model[word]`` that returns a numpy vector;
            it is expected to raise ``KeyError`` for unknown words.

    Returns:
        The element-wise sum of the vectors that were found (same shape and
        dtype as the first one), or ``None`` when no word could be looked up.
    """
    word_vecs = []
    for word in words:
        try:
            word_vecs.append(model[word])
        except KeyError:
            # Out-of-vocabulary word: skip it. Any other exception is a real
            # error and is allowed to propagate instead of being swallowed.
            continue
    if not word_vecs:
        return None
    # Accumulate sequentially, starting from zeros of the vectors' own dtype.
    text_vec = numpy.zeros(word_vecs[0].shape, dtype=word_vecs[0].dtype)
    for word_vec in word_vecs:
        text_vec = text_vec + word_vec
    return text_vec

In [5]:
def normalize(vec):
    """Scale ``vec`` to unit Euclidean length.

    Args:
        vec: numeric array-like to normalize.

    Returns:
        ``vec`` divided by its norm as computed by ``numpy.linalg.norm``. A
        vector whose norm is zero is returned as an unchanged copy, because
        dividing by zero would fill the result with NaN/inf values.
    """
    norm = numpy.linalg.norm(vec)
    if norm == 0:
        return numpy.array(vec)
    return vec / norm

In [6]:
# Load the cached feature vectors if present; otherwise compute one
# unit-length vector per quote, dump each to ./vec/, and cache the full list.
# NOTE: pickle.load can execute arbitrary code; only load caches that were
# produced by this notebook.
if os.path.exists('./features.pickle'):
    with open('features.pickle', mode='rb') as f:
        features = pickle.load(f)
else:
    # numpy.savetxt does not create missing directories.
    os.makedirs('./vec', exist_ok=True)
    quotes = infos[1]
    features = []
    for quote in quotes:
        quote_words = quote.rstrip().split(' ')
        # File name: first 10 characters of the quote with spaces and "/" removed.
        # Quotes sharing that prefix overwrite each other's file.
        quote_name = ''.join(quote_words).replace("/", "")[:10]
        quote_vec = text_to_vec(quote_words, model)
        if quote_vec is None:
            # No word of this quote is in the model's vocabulary. Use a zero
            # vector so `features` stays aligned one-to-one with the quotes
            # (and therefore with the labels) instead of crashing in normalize.
            quote_vec = numpy.zeros(model.vector_size)
        else:
            quote_vec = normalize(quote_vec)
        if quote_name:
            # An empty name would make savetxt target the directory itself.
            numpy.savetxt('./vec/' + quote_name, quote_vec)
        features.append(quote_vec)
    with open('features.pickle', mode='wb') as f:
        pickle.dump(features, f)

In [7]:
%%time
# トレーニングデータから分類器を作成 (SVM)
estimator = SVC(C=1.0)
scoring = {
    "p": "precision",
    "r": "recall",
    "f": "f1"
}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_validate(estimator, features, infos[0], cv=skf, scoring=scoring, return_train_score=True,)
with open('scores.pickle', mode='wb') as f:
    pickle.dump(scores, f)

CPU times: user 7min 13s, sys: 1.08 s, total: 7min 14s
Wall time: 7min 25s


In [8]:
# Baseline metrics derived from the two totals returned by all_tf_nums over all
# three dataset files (presumably the counts of true and false statements --
# verify against check_kind).
all_true, all_false = all_tf_nums([train_file, test_file, valid_file])
total_count = all_true + all_false
# Precision: share of `all_true` in the total.
base_p = all_true / total_count
# Recall: all_true / all_true, i.e. 1.0 whenever all_true is non-zero.
base_r = all_true / all_true
# F1 is the harmonic mean of precision and recall.
base_f = statistics.harmonic_mean((base_p, base_r))

In [9]:
# Comparison table: one row for the baseline, one for the cross-validated SVM
# (mean of the per-fold test scores).
# NOTE(review): the recorded output shows a mean recall of exactly 1.0 for the
# proposed method, which would be consistent with the classifier predicting the
# positive class for every sample -- worth confirming before trusting the F1.
baseline_row = [base_p, base_r, base_f]
proposed_row = [
    scores["test_p"].mean(),
    scores["test_r"].mean(),
    scores["test_f"].mean(),
]
result = pd.DataFrame(
    data=[baseline_row, proposed_row],
    index=["Baseline", "Proposed method"],
    columns=["Precision", "Recall", "F1 score"],
)

In [11]:
# Render the Baseline vs. Proposed-method comparison table as rich notebook output.
display(result)

Unnamed: 0,Precision,Recall,F1 score
Baseline,0.557736,1.0,0.716085
Proposed method,0.723439,1.0,0.83953
