In [7]:
from operator import itemgetter

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# restore objects and unpack them into variables
%store -r object_keep
df_bbc, list_categories, X, y, X_train, X_test, y_train, y_test = itemgetter('df_bbc',
                                                                             'list_categories',
                                                                             'X',
                                                                             'y',
                                                                             'X_train',
                                                                             'X_test',
                                                                             'y_train',
                                                                             'y_test')(object_keep)

## Linear Support Vector Machine
[Linear SVMs](https://scikit-learn.org/stable/modules/svm.html#svm) are widely considered to be among the best text classification algorithms.

In [5]:
# Text-classification pipeline: token counts -> TF-IDF weights -> linear
# classifier trained with stochastic gradient descent.
model_sgd = Pipeline(
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf',
            SGDClassifier(
                loss = 'hinge',       # hinge loss makes this a linear SVM
                penalty = 'l2',
                alpha = 1e-3,
                random_state = 42,    # fixed seed so the fit is repeatable
                max_iter = 5,
                tol = None,           # no tolerance-based stopping criterion
            ),
        ),
    ]
)
# Left as the final expression so the notebook displays the fitted pipeline.
model_sgd.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [8]:
%%time

y_pred = model_sgd.predict(X_test)

print('Accuracy: {:.3f}'.format(accuracy_score(y_pred, y_test)))
print(classification_report(y_test, y_pred, target_names = list_categories))

Accuracy: 0.984
               precision    recall  f1-score   support

entertainment       0.97      0.97      0.97       144
     business       0.99      0.98      0.99       110
        sport       0.98      0.97      0.97       129
     politics       0.99      1.00      0.99       164
         tech       1.00      0.99      1.00       121

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668

CPU times: user 153 ms, sys: 4.7 ms, total: 157 ms
Wall time: 160 ms


> **We have improved on our Naive Bayes model by 1.4 percentage points - a result that looks very difficult to beat!**

In [9]:
object_keep = {'df_bbc': df_bbc,
               'list_categories': list_categories,
               'X': X,
               'y': y,
               'X_train': X_train,
               'X_test': X_test,
               'y_train': y_train,
               'y_test': y_test}
%store object_keep

Stored 'object_keep' (dict)
