## Support Vector Machine for tagging GOV.UK

### Load requirements and data

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.externals import joblib
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score



In [4]:
# Double check we are in the right environment!
!which python3
!python3 --version

/Users/matthewupson/.pyenv/shims/python3
Python 3.4.6


In [5]:
# Becuase the file is large it uses the low_memory option 
# to read the file in chunks. The result of this is that it
# it infers different column types in the different chunks,
# solve this either by reading the file in a oner with low_memory=False,
# or be explicit about the column types by setting them with dtype = {'column':type}

labelled_level2 = pd.read_csv(
    '../../../data/labelled_level2.csv',
    low_memory=False
)

In [4]:
labelled_level2.shape

(173560, 21)

In [5]:
# Collapse down World

labelled_level2.loc[labelled_level2['level1taxon'] == 'World', 'level2taxon'] = 'world_level1'

In [6]:
# There are are 210 unique taxons remaining in level2taxon

labelled_level2['level2taxon'].nunique()

210

In [7]:
# Drop cases where there are multiple taxons applied to a content item.
# This is the simplest solution to the multiple class problem

print('Before:', labelled_level2.shape)
labelled_level2.drop_duplicates(subset=['content_id'], inplace=True, keep='first')
print('After: ', labelled_level2.shape)

Before: (173560, 21)
After:  (114048, 21)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    labelled_level2['combined_text'], labelled_level2['level2taxon'], test_size = 0.2, random_state=1337)

In [9]:
"""Support Vector Machine (SVM) classifier"""
#svm_clf = Pipeline([('vect', CountVectorizer()),
#                    ('tfidf', TfidfTransformer()),
#                    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter = 5, random_state=42)),
#])


from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.svm import LinearSVC

# Score on the training set was:0.7925764732068796
svc_clf = Pipeline([
                             ('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('select_perc', SelectPercentile(score_func=f_classif, percentile=2)),
                             ('svc', LinearSVC(C=10.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.01))
                             ])


In [10]:
svc_clf.fit(X_train, y_train)

  f = msb / msw


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.01,
     verbose=0))])

In [11]:
""" Predict the test dataset using SVM"""
predicted_svm = svc_clf.predict(X_test)
print('SVM correct prediction: {:4.2f}'.format(np.mean(predicted_svm == y_test)))

SVM correct prediction: 0.67


In [12]:
print(classification_report(y_test, predicted_svm))


                                                                        precision    recall  f1-score   support

                                         Administrative justice reform       0.33      1.00      0.50         2
                                     Adoption, fostering and surrogacy       0.71      0.62      0.67        16
                                                 Armed Forces Covenant       0.00      0.00      0.00         1
                                                          Armed forces       0.00      0.00      0.00         3
                           Armed forces and Ministry of Defence reform       0.00      0.00      0.00         4
                         Armed forces support for activities in the UK       0.00      0.00      0.00         1
                                                      Arts and culture       0.66      0.77      0.71       361
                                        Assessing environmental impact       0.00      0.00      0.00  

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## Metadata classifier

In [13]:
labelled_level2.columns

Index(['base_path', 'content_id', 'description', 'details', 'document_type',
       'first_published_at', 'locale', 'primary_publishing_organisation',
       'publishing_app', 'title', 'document_type_gp', 'body', 'combined_text',
       'taxon_id', 'taxon_base_path', 'taxon_name', 'level1taxon',
       'level2taxon', 'level3taxon', 'level4taxon', 'level5taxon'],
      dtype='object')

In [14]:
metadata = labelled_level2.filter(['document_type', 'primary_publishing organisation', 'publishing_app', 'document_type_gp', 'locale'], axis=1)

In [15]:
metadata.columns

Index(['document_type', 'publishing_app', 'document_type_gp', 'locale'], dtype='object')

In [17]:
document_type_encoder = LabelEncoder()
publishing_app = LabelEncoder()
document_type_gp_encoder = LabelEncoder()
locale_encoder = LabelEncoder()

metadata['document_type'] = document_type_encoder.fit_transform(metadata['document_type'])
metadata['publishing_app'] = document_type_encoder.fit_transform(metadata['publishing_app'])
metadata['document_type_gp'] = document_type_encoder.fit_transform(metadata['document_type_gp'])
metadata['locale'] = document_type_encoder.fit_transform(metadata['locale'])

X_train, X_test, y_train, y_test = train_test_split(
    metadata, labelled_level2['level2taxon'], test_size = 0.2, random_state=1337)

In [18]:
ada_clf = AdaBoostClassifier()

ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [19]:
predicted_ada = ada_clf.predict(X_test)

In [21]:
print('Adaboost correct prediction: {:4.2f}'.format(np.mean(predicted_ada == y_test)))

Adaboost correct prediction: 0.13


In [22]:
print(classification_report(y_test, predicted_ada))

                                                                        precision    recall  f1-score   support

                                         Administrative justice reform       0.00      0.00      0.00         2
                                     Adoption, fostering and surrogacy       0.00      0.00      0.00        16
                                                 Armed Forces Covenant       0.00      0.00      0.00         1
                                                          Armed forces       0.00      0.00      0.00         3
                           Armed forces and Ministry of Defence reform       0.00      0.00      0.00         4
                         Armed forces support for activities in the UK       0.00      0.00      0.00         1
                                                      Arts and culture       0.00      0.00      0.00       361
                                        Assessing environmental impact       0.00      0.00      0.00  

  'precision', 'predicted', average, warn_for)


In [23]:
from sklearn.ensemble import RandomForestClassifier

rfo_clf = RandomForestClassifier()
rfo_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
predicted_rfo = rfo_clf.predict(X_test)
print('Adaboost correct prediction: {:4.2f}'.format(np.mean(predicted_rfo == y_test)))

Adaboost correct prediction: 0.19


In [25]:
print(classification_report(y_test, predicted_ada))

                                                                        precision    recall  f1-score   support

                                         Administrative justice reform       0.00      0.00      0.00         2
                                     Adoption, fostering and surrogacy       0.00      0.00      0.00        16
                                                 Armed Forces Covenant       0.00      0.00      0.00         1
                                                          Armed forces       0.00      0.00      0.00         3
                           Armed forces and Ministry of Defence reform       0.00      0.00      0.00         4
                         Armed forces support for activities in the UK       0.00      0.00      0.00         1
                                                      Arts and culture       0.00      0.00      0.00       361
                                        Assessing environmental impact       0.00      0.00      0.00  

  'precision', 'predicted', average, warn_for)


In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
predicted_rfo = rfo_clf.predict_proba(X_test)
predicted_ada = ada_clf.predict_proba(X_test)

In [28]:
blended = pd.DataFrame(np.c_[
    predicted_rfo, predicted_ada
])

In [29]:
blended_model = LogisticRegression(random_state=1337)
blended_model.fit(blended, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1337, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
blended_pred = blended_model.predict(blended)
class_report = classification_report(y_test, blended_pred)
print(class_report)
blended_model.get_params()

                                                                        precision    recall  f1-score   support

                                         Administrative justice reform       0.00      0.00      0.00         2
                                     Adoption, fostering and surrogacy       0.00      0.00      0.00        16
                                                 Armed Forces Covenant       0.00      0.00      0.00         1
                                                          Armed forces       0.00      0.00      0.00         3
                           Armed forces and Ministry of Defence reform       0.00      0.00      0.00         4
                         Armed forces support for activities in the UK       0.00      0.00      0.00         1
                                                      Arts and culture       0.00      0.00      0.00       361
                                        Assessing environmental impact       0.00      0.00      0.00  

  'precision', 'predicted', average, warn_for)


{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': 1337,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [33]:
# Double check this with CV

scores = cross_val_score(blended_model, X_test, y_test,
                        scoring='accuracy',cv=10)

print(round(scores.mean(),3))
print(round(scores.std(),3))
#72



0.125
0.006
