In [2]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_validate
import pandas as pd

In [3]:
# Absolute paths of the train/test TSV splits under ./liar_dataset,
# resolved against the current working directory.
train_file, test_file = (
    os.path.abspath(relative_path)
    for relative_path in ('./liar_dataset/train.tsv', './liar_dataset/test.tsv')
)

In [4]:
def get_truefalse_info(filename):
    """Read a tab-separated file into ``[is_true, title]`` rows.

    Column 1 (0-based) of each line is the raw label and column 2 is the
    text. The labels ``"false"`` and ``"pants-fire"`` map to ``False``;
    every other label maps to ``True``.

    Args:
        filename: Path of the TSV file to read (decoded as UTF-8).

    Returns:
        A list of ``[bool, str]`` pairs, one per non-blank line, in file
        order.

    Raises:
        IndexError: If a non-blank line has fewer than three columns.
    """
    false_labels = {"false", "pants-fire"}
    infos = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # Drop the line terminator so it cannot end up inside the title
            # when the title happens to be the last column.
            line = line.rstrip("\n")
            if not line.strip():
                # Blank lines carry no record; skip instead of raising.
                continue
            cols = line.split("\t")
            infos.append([cols[1] not in false_labels, cols[2]])
    return infos

In [5]:
def _labels_and_titles(rows):
    """Transpose ``[label, title]`` rows into ``[labels, titles]`` lists."""
    return [[row[0] for row in rows], [row[1] for row in rows]]


# get_truefalse_info is the loader this notebook defines; it yields one
# [label, title] row per record, so each split is transposed into the
# [labels, titles] layout that the concatenation below and make_tfidf_vec
# index into.
train_infos = _labels_and_titles(get_truefalse_info(train_file))
test_infos = _labels_and_titles(get_truefalse_info(test_file))
# Pool both splits: infos[0] = all labels, infos[1] = all titles.
infos = [train_infos[0] + test_infos[0], train_infos[1] + test_infos[1]]

In [6]:
# http://moritamori.hatenablog.com/entry/tfidf_vectorizer
def make_tfidf_vec(infos, max_df=10, dense=True):
    """Turn ``[labels, documents]`` into a TF-IDF feature matrix plus labels.

    The vectorizer is fitted on every document passed in, so the vocabulary
    and IDF weights are shared by any later split of the returned matrix.

    Args:
        infos: Two-element sequence; ``infos[0]`` holds the labels and
            ``infos[1]`` the raw text documents, index-aligned.
        max_df: Forwarded to ``TfidfVectorizer``. An int is an absolute
            document count, so the default of 10 drops every term that
            occurs in more than 10 documents.
        dense: When True (default, the original behaviour) the matrix is
            materialised with ``toarray()``. Pass False to keep the SciPy
            sparse matrix, which avoids allocating n_docs x n_terms floats
            and is accepted directly by scikit-learn estimators such as SVC.

    Returns:
        Tuple ``(info_x, info_y)``: the TF-IDF matrix (one row per document)
        and ``infos[0]`` unchanged.
    """
    vec = TfidfVectorizer(max_df=max_df, ngram_range=(1, 1), sublinear_tf=True, norm='l2', stop_words='english')
    term_doc = vec.fit_transform(infos[1])
    info_x = term_doc.toarray() if dense else term_doc
    return info_x, infos[0]

In [7]:
# Build the TF-IDF feature matrix (info_x) and the matching labels (info_y)
# from infos.
info_x, info_y = make_tfidf_vec(infos)

In [None]:
%%time
# https://hayataka2049.hatenablog.jp/entry/2018/03/31/184557
# 線形SVMのインスタンスを生成
model = SVC(C=1.0)
scoring = {
    "p": "precision",
    "r": "recall",
    "f":"f1"
}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_validate(model, info_x, info_y, cv=skf, scoring=scoring, return_train_score=True,)

In [None]:
# Mean held-out score per metric across the cross-validation folds,
# displayed as a two-column table.
pd.DataFrame(
    [
        [label, scores[key].mean()]
        for label, key in (
            ("Precision", "test_p"),
            ("Recall", "test_r"),
            ("F score", "test_f"),
        )
    ],
    columns=("name", "score"),
)