## Support Vector Machine for tagging GOV.UK

### Load requirements and data

In [26]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.externals import joblib
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder

In [18]:
!which python3

/Users/matthewupson/.pyenv/shims/python3


In [3]:
%time content = pd.read_csv('../../data/clean_content.csv.gz', compression='gzip')

CPU times: user 45.4 s, sys: 5.63 s, total: 51 s
Wall time: 51.9 s


In [4]:
%time taxons = pd.read_csv('../../data/clean_taxons.csv')

CPU times: user 25.6 ms, sys: 9.53 ms, total: 35.1 ms
Wall time: 36.7 ms


##  ------------- This data prep step should move to clean_content.py -------------

Remove taxons that are legacy (Imported), World, Corporate information.

In [5]:
taxons = taxons[['base_path','content_id','taxon_name','level1taxon','level2taxon','level3taxon','level4taxon']]

taxons['level1taxon'] = taxons['level1taxon'].astype('str')

# Drop taxons that start with Imported (i.e. legacy taxons at the top level)

taxons = taxons[~taxons['level1taxon'].str.startswith("Imported")]
taxons = taxons[~taxons['taxon_name'].str.startswith("Imported")]

print("Taxons shape after deleting imported top taxons: {}".format(taxons.shape))

taxons = taxons[taxons.level1taxon != 'World']
taxons = taxons[taxons.taxon_name != 'World']

print("Taxons shape after deleting 'World' top taxons: {}".format(taxons.shape))

taxons = taxons[taxons.level1taxon != 'Corporate information']
taxons = taxons[taxons.taxon_name != 'Corporate information']
print("Taxons shape after deleting 'corporate information' top taxons: {}".format(taxons.shape))

# Convert nans to None

taxons['level1taxon'] = taxons['level1taxon'].where(taxons['level1taxon'] != 'nan', None)
taxons['level2taxon'] = taxons['level2taxon'].where(~taxons['level2taxon'].isnull(), None)

print("Taxons shape after converting nans to Nones: {}".format(taxons.shape))

Taxons shape after deleting imported top taxons: (4530, 7)
Taxons shape after deleting 'World' top taxons: (2028, 7)
Taxons shape after deleting 'corporate information' top taxons: (2027, 7)
Taxons shape after converting nans to Nones: (2027, 7)


Combine the taxons with the content 

In [6]:
%time content_taxons = pd.merge(left=content, right=taxons, left_on='taxon_id', right_on='content_id', how='outer', indicator=True)

assert content_taxons.shape == (335720, 23)
content_taxons.shape

CPU times: user 973 ms, sys: 124 ms, total: 1.1 s
Wall time: 1.11 s


(335720, 23)

##  --------------------------------------------------

In [7]:
content_taxons.dropna(subset = ['level2taxon'], inplace=True)
print('This leaves us with {} pre-classified rows.'.format(content_taxons.shape[0]))

This leaves us with 68967 pre-classified rows.


There are likely to be lots of content items that have more than one tag. Check here and remove for now:

__TODO: devise a way to deal with multiple tags applied to each content item.__

In [8]:
# Identify where duplicates exist on content_id and count

dupes = content_taxons['content_id_x'].value_counts().to_frame('dupes')
dupes = dupes.groupby('dupes').size().to_frame('count')

# Add index as a column

dupes.reset_index(level=0, inplace=True)
dupes


Unnamed: 0,dupes,count
0,1,43212
1,2,8871
2,3,1614
3,4,447
4,5,79
5,6,43
6,7,8
7,8,10
8,9,2
9,10,5


In [9]:
multiple_tags = sum(dupes.loc[dupes['dupes'] > 1, 'count'])
single_tags = sum(dupes.loc[dupes['dupes'] == 1, 'count'])

print('Stripping multiply applied tags to one will '
      'leave a total of {} tagged content items to train on'
      .format(multiple_tags + single_tags))

Stripping multiply applied tags to one will leave a total of 54330 tagged content items to train on


In [10]:
print('Before deduplication that are {} items.'.format(content_taxons.shape))
      
content_taxons.drop_duplicates(subset = ['content_id_x'], inplace=True)
      
print('After deduplication that are {} items.'.format(content_taxons.shape))


Before deduplication that are (68967, 23) items.
After deduplication that are (54331, 23) items.


In [11]:
content_taxons['level2taxoncat'] = content_taxons['level2taxon'].astype('category')

format our text samples and labels into tensors that can be fed into a neural network. To do this, we will rely on Keras utilities keras.preprocessing.text.Tokenizer and keras.preprocessing.sequence.pad_sequences.

In [12]:
content_taxons.drop(335627, axis=0, inplace=True)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    content_taxons['combined_text'], content_taxons['level2taxoncat'], test_size = 0.2, random_state=1337)

In [14]:
""" Support Vector Machine (SVM) classifier"""
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter = 5, random_state=42)),
])

In [31]:
svm_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [16]:
#joblib.dump(svm_clf, "svm_allgovuk.pkl", compress=9)

In [32]:
""" Predict the test dataset using SVM"""
predicted_svm = svm_clf.predict(X_test)
print('SVM correct prediction: {:4.2f}'.format(np.mean(predicted_svm == y_test)))

SVM correct prediction: 0.77


In [33]:
print(classification_report(y_test, predicted_svm))

                                                                        precision    recall  f1-score   support

                                     Adoption, fostering and surrogacy       0.88      0.64      0.74        11
                                                      Arts and culture       0.86      0.81      0.83       136
                                                                Asylum       0.67      0.40      0.50         5
                                                              Aviation       0.91      0.78      0.84        50
                                                     Brexit and the EU       1.00      0.20      0.33         5
                                            British nationals overseas       0.78      0.88      0.83        43
                                               Business and enterprise       0.77      0.76      0.76       570
                                          Business and the environment       0.85      0.50      0.63  

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


## Metadata classifier

In [23]:
content.columns

Index(['Unnamed: 0', 'base_path', 'content_id', 'description', 'details',
       'document_type', 'first_published_at', 'locale',
       'primary_publishing_organisation', 'publishing_app', 'title', 'body',
       'combined_text', 'variable', 'taxon_id'],
      dtype='object')

In [29]:
content_taxons.head()

Unnamed: 0.1,Unnamed: 0,base_path_x,content_id_x,description,details,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,...,taxon_id,base_path_y,content_id_y,taxon_name,level1taxon,level2taxon,level3taxon,level4taxon,_merge,level2taxoncat
5,1.0,/student-finance-register-login,e57daef4-5eb5-431c-b0ad-14119ab0355f,your student finance online account - check pa...,{'will_continue_on': 'the Student Finance Engl...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,...,64aa6eec-48b5-481d-9131-9c8b6326eea1,/education/student-grants-bursaries-scholarships,64aa6eec-48b5-481d-9131-9c8b6326eea1,"Student grants, bursaries and scholarships","Education, training and skills",Funding and finance for students,,,both,Funding and finance for students
6,32.0,/student-finance,d38bafd3-2c46-4be2-b50b-50c2ba7d30ed,student finance - student loans or student gra...,{'external_related_links': [{'title': 'Student...,guide,2016-02-29T09:24:10.000+00:00,en,,publisher,...,64aa6eec-48b5-481d-9131-9c8b6326eea1,/education/student-grants-bursaries-scholarships,64aa6eec-48b5-481d-9131-9c8b6326eea1,"Student grants, bursaries and scholarships","Education, training and skills",Funding and finance for students,,,both,Funding and finance for students
7,35.0,/contact-student-finance-england,d490be5f-1998-4f20-ab52-d3dd5db7fa71,contact details for student finance england - ...,"{'body': '\n<div class=""summary"">\n<p>Contact ...",answer,2016-02-29T09:24:10.000+00:00,en,,publisher,...,64aa6eec-48b5-481d-9131-9c8b6326eea1,/education/student-grants-bursaries-scholarships,64aa6eec-48b5-481d-9131-9c8b6326eea1,"Student grants, bursaries and scholarships","Education, training and skills",Funding and finance for students,,,both,Funding and finance for students
8,129.0,/student-finance-calculator,434b6eb5-33c8-4300-aba3-f5ead58600b8,student finance calculator - get a quick estim...,"{'start_button_text': 'Start now', 'transactio...",transaction,2016-02-29T09:24:10.000+00:00,en,,smartanswers,...,64aa6eec-48b5-481d-9131-9c8b6326eea1,/education/student-grants-bursaries-scholarships,64aa6eec-48b5-481d-9131-9c8b6326eea1,"Student grants, bursaries and scholarships","Education, training and skills",Funding and finance for students,,,both,Funding and finance for students
9,133.0,/apply-online-for-student-finance,83155b50-418e-437c-9389-cf0e1302900f,apply online for student finance - online appl...,{'will_continue_on': 'the Student Finance Engl...,transaction,2016-02-29T09:24:10.000+00:00,en,,publisher,...,64aa6eec-48b5-481d-9131-9c8b6326eea1,/education/student-grants-bursaries-scholarships,64aa6eec-48b5-481d-9131-9c8b6326eea1,"Student grants, bursaries and scholarships","Education, training and skills",Funding and finance for students,,,both,Funding and finance for students


In [25]:
metadata = content_taxons.filter(['document_type', 'primary_publishing organisation', 'publishing_app','description','title'], axis=1)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    metadata, content_taxons['level2taxoncat'], test_size = 0.2, random_state=1337)

""" Support Vector Machine (SVM) classifier"""
metadata_text_clf = Pipeline(
    [('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter = 5, random_state=42)),
])

metadata_text_clf.fit(content[['description','title']])


ValueError: bad input shape ()