This notebook is inspired by [Text Classification With Word2Vec](http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/);
the original benchmarking code is available [here](https://github.com/nadbordrozd/blog_stuff/blob/master/classification_w2v/benchmarking.ipynb).

Configuration

In [1]:
# How many subjects to take: only the TOP_SUBJECTS subjects with the most
# articles are kept, and each kept subject becomes one class label.
TOP_SUBJECTS = 15

In [2]:
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

Load data, pick top subjects and prepare `X` and `y`

In [3]:
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# open a pickle file that comes from a trusted source.
with open('preprocessed_arxiv.pickle', 'rb') as pickle_file:
    articles_by_subject = pickle.load(pickle_file)


def _article_count(subject):
    """Return how many articles are stored under `subject`."""
    return len(articles_by_subject[subject])


# Order the subject keys from most to fewest articles (ties keep their
# original iteration order), then keep the first TOP_SUBJECTS of them.
ranked_subjects = sorted(articles_by_subject, key=_article_count, reverse=True)
subjects = ranked_subjects[:TOP_SUBJECTS]

# Prepare X and y
def tokenize(article):
    """Build the normalized text of one article.

    The title and abstract are joined with a space, lowercased, and every run
    of whitespace is collapsed to a single space (leading/trailing whitespace
    is dropped).

    Despite the name, the result is a single string, not a list of tokens.

    Parameters
    ----------
    article : mapping with string values under 'title' and 'abstract'.

    Returns
    -------
    str
        The lowercase, whitespace-normalized "title abstract" text.
    """
    combined = article['title'] + ' ' + article['abstract']
    words = combined.lower().split()
    return " ".join(words)

# Walk the selected subjects once and pair each article's normalized text with
# its subject, keeping the order of `subjects` and of the articles within each.
labelled_texts = [
    (tokenize(article), subj)
    for subj in subjects
    for article in articles_by_subject[subj]
]

# X holds the texts, y the subject label at the same position.
X = np.array([text for text, _ in labelled_texts])
y = np.array([label for _, label in labelled_texts])
print("total examples %d" % len(y))

total examples 9042


Prepare classification models

In [4]:
# Every model is a two-step pipeline: a bag-of-words vectorizer (raw counts or
# TF-IDF weights) followed by a classifier.

# --- Multinomial naive Bayes ---
mult_nb = Pipeline(steps=[
    ("count_vectorizer", CountVectorizer()),
    ("multinomial nb", MultinomialNB()),
])
mult_nb_tfidf = Pipeline(steps=[
    ("tfidf_vectorizer", TfidfVectorizer()),
    ("multinomial nb", MultinomialNB()),
])

# --- Bernoulli naive Bayes ---
# NOTE(review): BernoulliNB binarizes its input by default, so the TF-IDF
# variant presumably sees the same presence/absence features as the count
# variant -- confirm whether bern_nb_tfidf adds anything over bern_nb.
bern_nb = Pipeline(steps=[
    ("count_vectorizer", CountVectorizer()),
    ("bernoulli nb", BernoulliNB()),
])
bern_nb_tfidf = Pipeline(steps=[
    ("tfidf_vectorizer", TfidfVectorizer()),
    ("bernoulli nb", BernoulliNB()),
])

# --- Support vector classifier with a linear kernel ---
svc = Pipeline(steps=[
    ("count_vectorizer", CountVectorizer(analyzer='word')),
    ("linear svc", SVC(kernel="linear")),
])
svc_tfidf = Pipeline(steps=[
    ("tfidf_vectorizer", TfidfVectorizer(analyzer='word')),
    ("linear svc", SVC(kernel="linear")),
])

In [6]:
# (label, pipeline) pairs in the order they are evaluated.
all_models = [
    ("mult_nb", mult_nb), ("mult_nb_tfidf", mult_nb_tfidf),
    ("bern_nb", bern_nb), ("bern_nb_tfidf", bern_nb_tfidf),
    ("svc", svc), ("svc_tfidf", svc_tfidf),
]

# Collected (label, mean cross-validation score) pairs, one per model.
scores = []

for label, pipeline in all_models:
    print("Model {}".format(label))

    # Five-fold cross-validation on the full data set; n_jobs=-1 runs the
    # folds in parallel and verbose=2 logs the timing of each fold.
    fold_scores = cross_val_score(pipeline, X, y, cv=5, verbose=2, n_jobs=-1)
    mean_score = fold_scores.mean()
    print("Score {}".format(mean_score))

    scores.append((label, mean_score))

Model mult_nb
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   2.9s
[CV]  ................................................................
[CV] ................................................. , total=   2.9s
[CV] ................................................. , total=   3.0s
[CV] ................................................. , total=   3.0s
[CV] ................................................. , total=   1.8s
Score 0.7052161211178045
Model mult_nb_tfidf


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.3s finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   3.1s
[CV]  ................................................................
[CV] ................................................. , total=   3.2s
[CV] ................................................. , total=   3.1s
[CV] ................................................. , total=   3.0s
[CV] ................................................. , total=   1.7s
Score 0.6390770303705934
Model bern_nb


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.3s finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   3.2s
[CV]  ................................................................
[CV] ................................................. , total=   3.2s
[CV] ................................................. , total=   3.2s
[CV] ................................................. , total=   3.1s
[CV] ................................................. , total=   1.8s
Score 0.6231627141839039
Model bern_nb_tfidf


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.7s finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   3.6s
[CV]  ................................................................
[CV] ................................................. , total=   3.7s
[CV] ................................................. , total=   3.7s
[CV] ................................................. , total=   3.6s
[CV] ................................................. , total=   1.8s
Score 0.6231627141839039
Model svc


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.0s finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=  46.1s
[CV]  ................................................................
[CV] ................................................. , total=  48.1s
[CV] ................................................. , total=  48.7s
[CV] ................................................. , total=  48.7s
[CV] ................................................. , total=  31.6s
Score 0.6144033251159782
Model svc_tfidf


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total= 1.2min
[CV]  ................................................................
[CV] ................................................. , total= 1.3min
[CV] ................................................. , total= 1.2min
[CV] ................................................. , total= 1.3min
[CV] ................................................. , total=  48.5s
Score 0.6826532394001967


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.0min finished


In [16]:
# Leaderboard: best mean cross-validation score first. Sorting on the negated
# score keeps tied models in the order they were evaluated.
ranking = sorted(scores, key=lambda entry: -entry[1])

for model_name, mean_score in ranking:
    print("%20s %.4f" % (model_name, mean_score))

             mult_nb 0.7052
           svc_tfidf 0.6827
       mult_nb_tfidf 0.6391
             bern_nb 0.6232
       bern_nb_tfidf 0.6232
                 svc 0.6144
