In [1]:
import sys
import csv

from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

csv.field_size_limit(sys.maxsize)

In [None]:
def get_data(t='test', data_dir='data-vermeer'):
    """Load one split of the dataset from a comma-separated file.

    Reads ``{data_dir}/{t}.csv``, discards the first record (treated as a
    header) and collects the first two columns of every remaining record.

    Parameters
    ----------
    t : str
        Base name of the CSV file to read, without the ``.csv`` extension.
    data_dir : str
        Directory that contains the CSV file.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(text, label)`` where ``text`` holds column 0 and ``label`` holds
        column 1 of each data record, in file order. Both lists are empty
        when the file has no data records.

    Raises
    ------
    OSError
        If the file cannot be opened.
    IndexError
        If a non-blank record has fewer than two columns.
    """
    text = []
    label = []

    # newline='' hands raw line endings to the csv module, which is required
    # for newlines embedded in quoted fields to be parsed correctly.
    # NOTE(review): the platform default encoding is used, as before —
    # confirm whether the data files need an explicit encoding.
    with open(f'{data_dir}/{t}.csv', newline='') as fi:
        reader = csv.reader(fi, delimiter=',')
        # Skip the header through the reader (not the raw file) so a quoted
        # multi-line header is consumed as a single record; the default keeps
        # an empty file from raising StopIteration.
        next(reader, None)
        for row in reader:
            # Blank lines come back from the reader as empty lists.
            if not row:
                continue
            text.append(row[0])
            label.append(row[1])

    return text, label

In [2]:
# Read the two dataset splits: the 'test' file first, then the 'train' file.
# Each call returns a (texts, labels) pair.
X_test, y_test = get_data(t='test')
X_train, y_train = get_data(t='train')

In [13]:
# Every vectorizer below is built with the same document-frequency bounds
# (per the scikit-learn docs: an int min_df is an absolute document count,
# a float max_df is a proportion of documents).
vectorizer_kwargs = {'min_df': 5, 'max_df': .5}

# Each entry: (label printed before the report, vectorizer, classifier).
configurations = [
    ('NB with Count', CountVectorizer(**vectorizer_kwargs), MultinomialNB()),
    ('NB with TfIdf', TfidfVectorizer(**vectorizer_kwargs), MultinomialNB()),
    ('LogReg with Count', CountVectorizer(**vectorizer_kwargs),
     LogisticRegression(solver='liblinear')),
    ('LogReg with TfIdf', TfidfVectorizer(**vectorizer_kwargs),
     LogisticRegression(solver='liblinear')),
    ('SVM with Count - rbf kernel', CountVectorizer(**vectorizer_kwargs),
     SVC(kernel='rbf')),
    ('SVM with Count - linear kernel', CountVectorizer(**vectorizer_kwargs),
     SVC(kernel='linear')),
    ('SVM with Tfidf - rbf kernel', TfidfVectorizer(**vectorizer_kwargs),
     SVC(kernel='rbf')),
    ('SVM with Tfidf - linear kernel', TfidfVectorizer(**vectorizer_kwargs),
     SVC(kernel='linear')),
]

for description, vectorizer, classifier in configurations:
    print(description)
    # The vectorizer is fitted on the training texts only; the test texts are
    # transformed with that already-fitted vectorizer.
    X_tr = vectorizer.fit_transform(X_train)
    X_te = vectorizer.transform(X_test)
    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = classifier.fit(X_tr, y_train).predict(X_te)
    # One newline ends the report, two more reproduce the blank separator
    # that a separate print('\n') would emit.
    print(metrics.classification_report(y_test, y_pred), end='\n\n\n')

NB with Count
               precision    recall  f1-score   support

     business       0.48      0.76      0.59       101
entertainment       0.95      0.73      0.83       400
        other       0.52      0.58      0.55        73
     politics       0.71      0.85      0.78       124

     accuracy                           0.74       698
    macro avg       0.66      0.73      0.68       698
 weighted avg       0.79      0.74      0.75       698



NB with TfIdf
               precision    recall  f1-score   support

     business       0.80      0.16      0.26       101
entertainment       0.67      0.99      0.80       400
        other       1.00      0.11      0.20        73
     politics       0.90      0.53      0.67       124

     accuracy                           0.70       698
    macro avg       0.84      0.45      0.48       698
 weighted avg       0.76      0.70      0.64       698



LogReg with Count
               precision    recall  f1-score   support

     bus