In [1]:
# classification with scikit learn
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification

from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# NOTE(review): `h` does not appear to be used by the cells visible here;
# kept in case a later cell still relies on it.
h = .02  # step size in the mesh

# Display names, index-aligned with `classifiers` (the two lists are zipped
# together when the models are evaluated).
names = ["Linear SVM", "Neural Net", "Naive Bayes"]

classifiers = [
    SVC(kernel="linear", C=0.025),
    # The MLP initialises its weights randomly; pin the seed so that
    # re-running the notebook reproduces the same "Neural Net" scores.
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    GaussianNB()]

In [2]:
def load_data(fin):
    """Load a labelled feature file in ``label<TAB>features`` format.

    Every non-blank line holds a label, a tab character, and then
    whitespace-separated integer feature values. Blank lines (for example
    a trailing empty line at the end of the file) are skipped.

    Parameters
    ----------
    fin : str or path-like
        Path of the file to read.

    Returns
    -------
    X : list of list of int
        One feature vector per data line, in file order.
    y : list of str
        The label of each data line, in the same order as ``X``.

    Raises
    ------
    ValueError
        If a non-blank line has no tab separator or contains a feature
        that is not an integer. The message names the file and line.
    """
    X = []
    y = []
    with open(fin) as f:
        for lineno, line in enumerate(f, start=1):
            line = line.rstrip('\n')
            # A blank line is not a sample; skip it instead of failing on
            # the two-way unpacking below.
            if not line.strip():
                continue
            try:
                # Split on the first tab only, so a stray tab inside the
                # feature list is treated as ordinary whitespace.
                label, feats = line.split('\t', 1)
                row = [int(i) for i in feats.split()]
            except ValueError as err:
                raise ValueError(
                    '{}:{}: malformed line ({})'.format(fin, lineno, err)
                ) from err
            X.append(row)
            y.append(label)
    return X, y

In [3]:
# Display name -> TSV file for each n-gram feature set, in evaluation order.
dataset_files = (
    ('ngram-128', '../data/ngram_128.tsv'),
    ('ngram-200', '../data/ngram_200.tsv'),
    ('ngram-300', '../data/ngram_300.tsv'),
)

# List of (display name, (X, y)) pairs, loaded eagerly.
datasets = [(label, load_data(path)) for label, path in dataset_files]

In [4]:
# Evaluate every classifier on every dataset using a hold-out split.
for ds_name, (X, y) in datasets:
    print('-'*80)
    print(ds_name)
    # Split into training (80%) and test (20%) parts; the fixed seed keeps
    # the split identical across runs. No feature scaling is applied.
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.2, random_state=42)

    # Use a distinct loop variable so the classifier label does not
    # overwrite the dataset label.
    for clf_name, clf in zip(names, classifiers):
        print(clf_name)
        clf.fit(X_train, y_train)

        # Predict once and derive all metrics from the same predictions,
        # rather than predicting a second time just to get the accuracy.
        y_pred = clf.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print(score)

        prf = precision_recall_fscore_support(y_test, y_pred, average='macro')
        print('precision', 'recall', 'f1')
        # Show only the three values named in the header; the fourth entry
        # (support) is None for averaged scores.
        print(prf[:3])
        print()

--------------------------------------------------------------------------------
ngram-128
Linear SVM
0.5238095238095238
precision recall f1
(0.6987654320987655, 0.5946275946275946, 0.5155828639699607, None)

Neural Net
0.5714285714285714
precision recall f1
(0.5565323565323566, 0.5427350427350427, 0.5343915343915344, None)

Naive Bayes
0.6428571428571429
precision recall f1
(0.6813765182186234, 0.7032967032967031, 0.6357235325692274, None)

--------------------------------------------------------------------------------
ngram-200
Linear SVM
0.5476190476190477
precision recall f1
(0.7136752136752137, 0.6202686202686203, 0.5473551050301274, None)

Neural Net
0.6428571428571429
precision recall f1
(0.611489898989899, 0.6098901098901099, 0.6010973379394433, None)

Naive Bayes
0.7142857142857143
precision recall f1
(0.7555555555555555, 0.7606837606837606, 0.7068117068117069, None)

--------------------------------------------------------------------------------
ngram-300
Linear SVM
0.59523

### References

[scikit-learn: Classifier comparison](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html)