In [1]:
%matplotlib inline

In [11]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
# Resolve both LIAR dataset splits to absolute paths (relative to the current
# working directory at the time this cell runs).
train_file, test_file = map(
    os.path.abspath,
    ('./liar_dataset/train.tsv', './liar_dataset/test.tsv'),
)

In [4]:
def get_tf_title(train_file):
    """Read a tab-separated file and return binary labels plus statement texts.

    For every non-blank row, column 1 (0-based) is read as the raw label and
    column 2 as the statement text. The raw labels "false" and "pants-fire"
    map to ``False``; every other label maps to ``True``.

    Args:
        train_file: Path of the TSV file to read (any split, despite the name).

    Returns:
        A two-element list ``[tfs, titles]`` where ``tfs`` is a list of bools
        and ``titles`` is the list of statement strings, in file order.

    Raises:
        IndexError: If a non-blank row has fewer than three tab-separated
            columns. The message includes the file path and line number.
    """
    false_labels = {"false", "pants-fire"}
    tfs, titles = [], []
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the dataset files are UTF-8 -- confirm.
    with open(train_file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Drop the line terminator so it cannot leak into the last column.
            row = line.rstrip("\r\n")
            if not row.strip():
                # Blank (or whitespace-only) lines carry no record; skip them.
                continue
            cols = row.split("\t")
            if len(cols) < 3:
                raise IndexError(
                    "%s:%d: expected at least 3 tab-separated columns, got %d"
                    % (train_file, line_no, len(cols))
                )
            tfs.append(cols[1] not in false_labels)
            titles.append(cols[2])
    return [tfs, titles]

In [46]:
# Parse each split; every result is a two-element list [labels, statements].
train_infos = get_tf_title(train_file)
test_infos = get_tf_title(test_file)

# Concatenate column by column, training rows first and test rows after them.
infos = [train_col + test_col for train_col, test_col in zip(train_infos, test_infos)]

In [45]:
# http://moritamori.hatenablog.com/entry/tfidf_vectorizer
def make_tfidf_vec(infos, dense=True):
    """Turn the statements in ``infos`` into TF-IDF feature vectors.

    Args:
        infos: Two-element sequence ``[labels, documents]``; ``documents`` is
            the list of texts to vectorize and ``labels`` is returned as-is.
        dense: When True (the default, matching the previous behaviour) the
            feature matrix is converted to a dense array with ``toarray()``.
            Pass False to keep the matrix returned by ``fit_transform``
            unconverted, which avoids materialising a
            (documents x vocabulary) dense array.

    Returns:
        A tuple ``(info_x, info_y)``: the TF-IDF feature matrix (one row per
        document) and the labels taken from ``infos[0]``.
    """
    info_y, docs = infos[0], infos[1]
    # NOTE(review): max_df is an *integer* here, which scikit-learn interprets
    # as an absolute document count -- terms occurring in more than 10
    # documents are dropped from the vocabulary. Confirm this is intended
    # (a proportion would be written as a float in [0.0, 1.0]).
    vec = TfidfVectorizer(
        max_df=10,
        ngram_range=(1, 1),
        sublinear_tf=True,
        norm='l2',
        stop_words='english',
    )
    term_doc = vec.fit_transform(docs)
    info_x = term_doc.toarray() if dense else term_doc
    return info_x, info_y

In [49]:
# Vectorize the combined corpus, then cut it back into the two splits.
# Training rows come first in `infos`, so the training row count is the boundary.
# NOTE(review): the vectorizer is fitted on train + test statements together,
# so test-set vocabulary/IDF statistics influence the training features.
info_x, info_y = make_tfidf_vec(infos)
train_x, test_x = info_x[:len(train_infos[0])], info_x[len(train_infos[0]):]
train_y, test_y = info_y[:len(train_infos[1])], info_y[len(train_infos[1]):]

(1283, 10684)

In [52]:
%%time
# Reference: https://qiita.com/kazuki_hayakawa/items/18b7017da9a6f73eba77
# Create a support-vector classifier that uses a linear kernel.
model = SVC(random_state=None, kernel='linear')

# Train the model on the training features and labels with fit().
model.fit(X=train_x, y=train_y)

CPU times: user 18min 52s, sys: 5.72 s, total: 18min 58s
Wall time: 20min 12s


In [53]:
%%time
# Predict on the data the model was fitted on and report the accuracy
# (the printed label reads "accuracy on the training data").
pred_train = model.predict(X=train_x)
accuracy_train = accuracy_score(y_true=train_y, y_pred=pred_train)
print("トレーニングデータに対する正解率： %.2f" % (accuracy_train,))

トレーニングデータに対する正解率： 0.80
CPU times: user 18min 2s, sys: 4.86 s, total: 18min 7s
Wall time: 19min 21s


In [54]:
%%time
# Predict on the held-out test rows and report the accuracy
# (the printed label reads "accuracy on the test data").
pred_test = model.predict(X=test_x)
accuracy_test = accuracy_score(y_true=test_y, y_pred=pred_test)
print('テストデータに対する正解率： %.2f' % (accuracy_test,))

テストデータに対する正解率： 0.72
CPU times: user 2min 12s, sys: 625 ms, total: 2min 13s
Wall time: 2min 21s
