In [1]:
from operator import itemgetter

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# restore objects and unpack them into variables
%store -r object_keep
df_bbc, list_categories, X, y, X_train, X_test, y_train, y_test = itemgetter('df_bbc',
                                                                             'list_categories',
                                                                             'X',
                                                                             'y',
                                                                             'X_train',
                                                                             'X_test',
                                                                             'y_train',
                                                                             'y_test')(object_keep)

## Logistic Regression
Logistic regression is a simple, easy-to-understand classification algorithm; it can be easily generalised to multiple classes/categories via multinomial logistic regression.

In [5]:
# Text pipeline: raw token counts -> TF-IDF weighting -> logistic regression.
# `multi_class` is deliberately not passed: it is deprecated since scikit-learn 1.5,
# and with the default lbfgs solver a target with more than two classes is
# already fitted as a multinomial (softmax) model.
# C is the inverse regularisation strength, so C = 1e5 leaves the penalty very weak.
model_logreg = Pipeline(steps = [('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf', LogisticRegression(n_jobs = 1, C = 1e5, max_iter = 200))])
# Last expression of the cell: fit on the training split and display the fitted pipeline.
model_logreg.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 LogisticRegression(C=100000.0, max_iter=200,
                                    multi_class='multinomial', n_jobs=1))])

In [6]:
%%time

y_pred = model_logreg.predict(X_test)

print('Accuracy: {:.3f}'.format(accuracy_score(y_pred, y_test)))
print(classification_report(y_test, y_pred, target_names = list_categories))

Accuracy: 0.984
               precision    recall  f1-score   support

entertainment       0.96      0.98      0.97       144
     business       0.99      0.98      0.99       110
        sport       0.98      0.96      0.97       129
     politics       0.99      1.00      0.99       164
         tech       1.00      0.99      1.00       121

     accuracy                           0.98       668
    macro avg       0.98      0.98      0.98       668
 weighted avg       0.98      0.98      0.98       668

CPU times: user 162 ms, sys: 6.33 ms, total: 168 ms
Wall time: 168 ms


> **Interestingly, a simple logistic regression performs comparably to the Linear SVM from the previous section, which is widely considered best-in-class. The differences lie in the scores for the individual categories.**

In [7]:
object_keep = {'df_bbc': df_bbc,
               'list_categories': list_categories,
               'X': X,
               'y': y,
               'X_train': X_train,
               'X_test': X_test,
               'y_train': y_train,
               'y_test': y_test}
%store object_keep

Stored 'object_keep' (dict)
