## Convolutional NN to classify all govuk content to level2 taxons

Based on:
https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

### Load requirements and data

In [5]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder

In [6]:
# Shell escape: print the path of the `python3` executable that the shell resolves first on PATH.
!which python3

/Users/matthewupson/.pyenv/shims/python3


In [7]:
%time content = pd.read_csv('../../data/clean_content.csv.gz', compression='gzip')

CPU times: user 1min 5s, sys: 8.83 s, total: 1min 14s
Wall time: 1min 59s


In [8]:
%time taxons = pd.read_csv('../../data/clean_taxons.csv')

CPU times: user 30.7 ms, sys: 7.09 ms, total: 37.8 ms
Wall time: 46 ms


##  ------------- This data prep step should move to clean_content.py -------------

Remove taxons that are legacy (Imported), World, Corporate information.

In [9]:
# Keep only the identifying columns and the four taxon levels. The explicit
# .copy() gives an independent frame, so assigning columns below does not
# trigger pandas' SettingWithCopyWarning.
taxons = taxons[['base_path','content_id','taxon_name','level1taxon','level2taxon','level3taxon','level4taxon']].copy()

# Cast to str so .str.startswith works on every row; missing values become
# the string 'nan' and are turned back into None at the end of this cell.
taxons['level1taxon'] = taxons['level1taxon'].astype('str')

# Drop taxons that start with Imported (i.e. legacy taxons at the top level).
# na=False keeps rows whose taxon_name is missing instead of failing on `~`.
is_imported = (
    taxons['level1taxon'].str.startswith("Imported")
    | taxons['taxon_name'].str.startswith("Imported", na=False)
)
taxons = taxons[~is_imported]

print("Taxons shape after deleting imported top taxons: {}".format(taxons.shape))

# Drop the 'World' top-level taxon itself and everything filed under it.
is_world = (taxons['level1taxon'] == 'World') | (taxons['taxon_name'] == 'World')
taxons = taxons[~is_world]

print("Taxons shape after deleting 'World' top taxons: {}".format(taxons.shape))

# Same for 'Corporate information'. Copy once more because the column
# assignments below write into the filtered frame.
is_corporate = (
    (taxons['level1taxon'] == 'Corporate information')
    | (taxons['taxon_name'] == 'Corporate information')
)
taxons = taxons[~is_corporate].copy()
print("Taxons shape after deleting 'corporate information' top taxons: {}".format(taxons.shape))

# Convert nans to None
# level1taxon holds the string 'nan' (from the str cast above); level2taxon
# still holds real missing values, hence the two different conditions.
taxons['level1taxon'] = taxons['level1taxon'].where(taxons['level1taxon'] != 'nan', None)
taxons['level2taxon'] = taxons['level2taxon'].where(~taxons['level2taxon'].isnull(), None)

print("Taxons shape after converting nans to Nones: {}".format(taxons.shape))

Taxons shape after deleting imported top taxons: (4530, 7)
Taxons shape after deleting 'World' top taxons: (2028, 7)
Taxons shape after deleting 'corporate information' top taxons: (2027, 7)
Taxons shape after converting nans to Nones: (2027, 7)


Combine the taxons with the content 

In [11]:
%time content_taxons = pd.merge(left=content, right=taxons, left_on='taxon_id', right_on='content_id', how='outer', indicator=True)

assert content_taxons.shape == (335720, 23)
content_taxons.shape

CPU times: user 1.44 s, sys: 379 ms, total: 1.82 s
Wall time: 3.18 s


(335720, 23)

##  --------------------------------------------------

In [12]:
# Only rows that carry a level-2 taxon can be used as labelled examples.
content_taxons.dropna(subset=['level2taxon'], inplace=True)
n_labelled = content_taxons.shape[0]
print('This leaves us with {} pre-classified rows.'.format(n_labelled))

This leaves us with 68967 pre-classified rows.


There are likely to be lots of content items that have more than one tag. Check here and remove for now:

__TODO: devise a way to deal with multiple tags applied to each content item.__

In [13]:
# Identify where duplicates exist on content_id and count

dupes = content_taxons['content_id_x'].value_counts().to_frame('dupes')
dupes = dupes.groupby('dupes').size().to_frame('count')

# Add index as a column

dupes.reset_index(level=0, inplace=True)
dupes


Unnamed: 0,dupes,count
0,1,43212
1,2,8871
2,3,1614
3,4,447
4,5,79
5,6,43
6,7,8
7,8,10
8,9,2
9,10,5


In [14]:
# Number of content items tagged more than once, and exactly once.
tagged_many = dupes['dupes'] > 1
tagged_once = dupes['dupes'] == 1
multiple_tags = dupes.loc[tagged_many, 'count'].sum()
single_tags = dupes.loc[tagged_once, 'count'].sum()

summary = ('Stripping multiply applied tags to one will '
           'leave a total of {} tagged content items to train on')
print(summary.format(multiple_tags + single_tags))

Stripping multiply applied tags to one will leave a total of 54330 tagged content items to train on


In [15]:
# NOTE(review): both messages print the whole (rows, columns) shape tuple,
# not just the row count.
before_msg = 'Before deduplication that are {} items.'
after_msg = 'After deduplication that are {} items.'

print(before_msg.format(content_taxons.shape))

# Keep a single row per content item (pandas keeps the first occurrence by default).
content_taxons.drop_duplicates(subset=['content_id_x'], inplace=True)

print(after_msg.format(content_taxons.shape))


Before deduplication that are (68967, 23) items.
After deduplication that are (54331, 23) items.


In [20]:
# Store the level-2 taxon as a pandas categorical; this column is the label used for training.
level2_labels = content_taxons['level2taxon']
content_taxons['level2taxoncat'] = level2_labels.astype('category')

Format our text samples and labels into tensors that can be fed into a neural network. To do this, we will rely on the Keras utilities `keras.preprocessing.text.Tokenizer` and `keras.preprocessing.sequence.pad_sequences`.

In [35]:
# Remove the row with index label 335627 before building the train/test split.
# errors='ignore' makes the cell safe to re-run: without it a second run raises
# KeyError because the label is already gone.
# NOTE(review): the label is hard-coded with no explanation. Presumably it is
# the one row with no content attached that survives the outer merge and the
# de-duplication — confirm, then replace with a condition on the data.
content_taxons.drop(335627, axis=0, inplace=True, errors='ignore')

In [36]:
# Hold out a third of the labelled items for evaluation; the fixed seed keeps the split reproducible.
documents = content_taxons['combined_text']
labels = content_taxons['level2taxoncat']

X_train, X_test, y_train, y_test = train_test_split(
    documents,
    labels,
    test_size=0.33,
    random_state=1337,
)

In [37]:
# Support Vector Machine (SVM) classifier:
# token counts -> tf-idf weighting -> linear model trained with SGD on hinge loss.
svm_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)),
]
svm_clf = Pipeline(svm_steps)

In [39]:
# Fit every pipeline step (vectoriser, tf-idf, classifier) on the training split.
# The fitted pipeline is the cell's last expression, so its repr is displayed.
svm_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [40]:
# Saving the fitted pipeline is disabled; uncomment the line below to write it to disk.
# NOTE(review): the output recorded under this cell names 'svm_transport_toptaxons.pkl',
# not the file below, so that output comes from an earlier version of the cell.
#joblib.dump(svm_clf, "svm_allgovuk.pkl", compress=9)

['svm_transport_toptaxons.pkl']

In [42]:
""" Predict the test dataset using SVM"""
predicted_svm = svm_clf.predict(X_test)
print('SVM correct prediction: {:4.2f}'.format(np.mean(predicted_svm == y_test)))
print(metrics.classification_report(y_test, predicted_svm))
#print(metrics.confusion_matrix(y_test, predicted_svm))

SVM correct prediction: 0.78
                                                                        precision    recall  f1-score   support

                                     Adoption, fostering and surrogacy       0.79      0.52      0.63        21
                                                      Arts and culture       0.88      0.83      0.85       219
                                                                Asylum       0.75      0.38      0.50         8
                                                              Aviation       0.94      0.79      0.86        75
                                                     Brexit and the EU       0.00      0.00      0.00         6
                                            British nationals overseas       0.83      0.92      0.87        88
                                               Business and enterprise       0.79      0.76      0.77       940
                                          Business and the environment    

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
