Configuration

In [9]:
# Number of subjects (ranked by article count) kept for classification
TOP_SUBJECTS = 15

# Words removed from each article's text; matching is done on lower-cased,
# whitespace-separated words, so entries must be lower-case
STOP_LIST = {
    'for', 'a', 'an', 'of', 'the', 'and', 'to',
    'in', 'on', '-', 'with', 'i.', 'we',
}

In [17]:
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit

Load data, pick top subjects and prepare `X` and `y`

In [25]:
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# open a pickle file that you produced yourself or otherwise trust.
with open('preprocessed_arxiv.pickle', 'rb') as pickle_file:
    articles_by_subject = pickle.load(pickle_file)


def _article_count(subject):
    """Return how many articles are stored under `subject`."""
    return len(articles_by_subject[subject])


# Rank subjects from most to fewest articles and keep the first TOP_SUBJECTS
subjects_by_size = sorted(articles_by_subject, key=_article_count, reverse=True)
subjects = subjects_by_size[:TOP_SUBJECTS]

# Prepare X and y
def tokenize(article, stop_words=None):
    """Build the text of one article with stop words removed.

    The title and abstract are joined with a space, lower-cased, split on
    whitespace, and every word found in `stop_words` is dropped.

    Parameters
    ----------
    article : mapping
        Must provide string values under the 'title' and 'abstract' keys.
    stop_words : collection of str, optional
        Lower-case words to drop. When omitted (None), the notebook-level
        STOP_LIST is used, which is the original behaviour.

    Returns
    -------
    str
        The remaining words joined by single spaces (an empty string when
        nothing remains).
    """
    if stop_words is None:
        stop_words = STOP_LIST

    text = article['title'] + ' ' + article['abstract']

    # Words are compared exactly as split on whitespace, so a stop word with
    # punctuation attached (e.g. "the,") is NOT removed.
    return " ".join(word for word in text.lower().split() if word not in stop_words)

# One (text, label) pair per article, visiting the subjects in ranked order
labelled_texts = [
    (tokenize(article), subj)
    for subj in subjects
    for article in articles_by_subject[subj]
]

# X holds the preprocessed article texts, y the matching subject labels
X = np.array([text for text, _ in labelled_texts])
y = np.array([label for _, label in labelled_texts])
print("total examples %d" % len(y))

total examples 9042


Prepare classification models

In [26]:
def _linear_svc_pipeline(vectorizer_step, vectorizer):
    """Chain `vectorizer` (registered as `vectorizer_step`) with a linear-kernel SVC."""
    return Pipeline([
        (vectorizer_step, vectorizer),
        ("linear svc", SVC(kernel="linear")),
    ])


# Same classifier on two text representations: CountVectorizer vs. TfidfVectorizer
svc = _linear_svc_pipeline("count_vectorizer", CountVectorizer(analyzer='word'))
svc_tfidf = _linear_svc_pipeline("tfidf_vectorizer", TfidfVectorizer(analyzer='word'))

In [27]:
# (display name, estimator) pairs to evaluate, in the order they are reported
all_models = [("svc", svc), ("svc_tfidf", svc_tfidf)]

for model_name, estimator in all_models:
    print("Model {}".format(model_name))

    # 5-fold cross-validation run on all available cores (n_jobs=-1);
    # the reported score is the mean over the folds
    fold_scores = cross_val_score(estimator, X, y, cv=5, verbose=2, n_jobs=-1)
    score = fold_scores.mean()
    print("Score {}".format(score))

Model svc
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=  42.8s
[CV]  ................................................................
[CV] ................................................. , total=  45.0s
[CV] ................................................. , total=  44.9s
[CV] ................................................. , total=  45.4s
[CV] ................................................. , total=  29.8s
Score 0.6134120247211878
Model svc_tfidf


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.2min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total= 1.2min
[CV]  ................................................................
[CV] ................................................. , total= 1.2min
[CV] ................................................. , total= 1.2min
[CV] ................................................. , total= 1.2min
[CV] ................................................. , total=  46.6s
Score 0.681544227969687


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.0min finished
