# Automated algorithm selection using TPOT for GOVUK tagging

### Load requirements and data

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from tpot import TPOTClassifier


In [2]:
# Shell escape: show which python3 executable is first on PATH for this kernel's shell.
!which python3

/Users/matthewupson/.pyenv/shims/python3


In [3]:
# Load the gzip-compressed content table; %time reports how long the read takes.
%time content = pd.read_csv('../../data/clean_content.csv.gz', compression='gzip')

CPU times: user 45.5 s, sys: 3.53 s, total: 49 s
Wall time: 49.2 s


In [4]:
# Load the taxon table; %time reports how long the read takes.
%time taxons = pd.read_csv('../../data/clean_taxons.csv')

CPU times: user 17.9 ms, sys: 4.36 ms, total: 22.3 ms
Wall time: 24.4 ms


##  ------------- This data prep step should move to clean_content.py -------------

Remove taxons that are legacy (Imported), World, Corporate information.

In [5]:
# Keep only the columns needed downstream. `.copy()` makes this an independent
# frame so the column assignments below do not trigger SettingWithCopyWarning
# (they would otherwise write into a slice of the originally loaded frame).
taxon_columns = ['base_path', 'content_id', 'taxon_name',
                 'level1taxon', 'level2taxon', 'level3taxon', 'level4taxon']
taxons = taxons[taxon_columns].copy()

# Cast to str so the string comparisons below work on every row;
# missing values become the literal string 'nan' (undone further down).
taxons['level1taxon'] = taxons['level1taxon'].astype('str')

# Drop taxons that start with Imported (i.e. legacy taxons at the top level).
# na=False treats a missing name as "does not start with Imported" instead of
# producing NaN in the mask, which `~` cannot negate.
taxons = taxons[~taxons['level1taxon'].str.startswith("Imported", na=False)]
taxons = taxons[~taxons['taxon_name'].str.startswith("Imported", na=False)]

print("Taxons shape after deleting imported top taxons: {}".format(taxons.shape))

# Drop the 'World' branch: rows whose top-level taxon or own name is 'World'.
taxons = taxons[taxons.level1taxon != 'World']
taxons = taxons[taxons.taxon_name != 'World']

print("Taxons shape after deleting 'World' top taxons: {}".format(taxons.shape))

# Drop the 'Corporate information' branch in the same way.
taxons = taxons[taxons.level1taxon != 'Corporate information']
taxons = taxons[taxons.taxon_name != 'Corporate information']
print("Taxons shape after deleting 'corporate information' top taxons: {}".format(taxons.shape))

# Convert nans to None.
# level1taxon was cast to str above, so its missing values are the string 'nan';
# level2taxon still holds real NaN values.
taxons['level1taxon'] = taxons['level1taxon'].where(taxons['level1taxon'] != 'nan', None)
taxons['level2taxon'] = taxons['level2taxon'].where(taxons['level2taxon'].notnull(), None)

print("Taxons shape after converting nans to Nones: {}".format(taxons.shape))

Taxons shape after deleting imported top taxons: (4530, 7)
Taxons shape after deleting 'World' top taxons: (2028, 7)
Taxons shape after deleting 'corporate information' top taxons: (2027, 7)
Taxons shape after converting nans to Nones: (2027, 7)


Combine the taxons with the content 

In [6]:
# Outer merge keeps unmatched rows from both frames; indicator=True adds a `_merge`
# column recording whether each row came from the left frame, the right frame, or both.
%time content_taxons = pd.merge(left=content, right=taxons, left_on='taxon_id', right_on='content_id', how='outer', indicator=True)

# Sanity check pinned to the data snapshot this notebook was developed against;
# it will fail (by design) if the input files change.
assert content_taxons.shape == (335720, 23)
content_taxons.shape

CPU times: user 961 ms, sys: 124 ms, total: 1.08 s
Wall time: 1.08 s


(335720, 23)

##  --------------------------------------------------

In [7]:
# Rows without a level-2 taxon have no label to train on, so discard them.
content_taxons = content_taxons.dropna(subset=['level2taxon'])
print('This leaves us with {} pre-classified rows.'.format(len(content_taxons)))

This leaves us with 68967 pre-classified rows.


There are likely to be lots of content items that have more than one tag. Count them here and, for now, keep only the first tag for each content item:

__TODO: devise a way to deal with multiple tags applied to each content item.__

In [8]:
# Identify where duplicates exist on content_id and count

# Distribution of tags per content item:
#   1. count how many rows (tags) each content_id_x has,
#   2. count how many content items share each of those tag counts,
#   3. turn the tag count (the index) back into an ordinary column.
dupes = (
    content_taxons['content_id_x']
    .value_counts()
    .to_frame('dupes')
    .groupby('dupes')
    .size()
    .to_frame('count')
    .reset_index(level=0)
)
dupes


Unnamed: 0,dupes,count
0,1,43212
1,2,8871
2,3,1614
3,4,447
4,5,79
5,6,43
6,7,8
7,8,10
8,9,2
9,10,5


In [9]:
# Number of content items carrying more than one tag / exactly one tag.
multiple_tags = dupes.loc[dupes['dupes'] > 1, 'count'].sum()
single_tags = dupes.loc[dupes['dupes'] == 1, 'count'].sum()

# After keeping a single tag per item, every distinct content item remains.
print(
    'Stripping multiply applied tags to one will '
    'leave a total of {} tagged content items to train on'.format(
        multiple_tags + single_tags
    )
)

Stripping multiply applied tags to one will leave a total of 54330 tagged content items to train on


In [10]:
# Report the frame shape before and after keeping one row per content item.
print('Before deduplication that are {} items.'.format(content_taxons.shape))

# drop_duplicates keeps the first row it encounters for each content_id_x.
content_taxons = content_taxons.drop_duplicates(subset=['content_id_x'])

print('After deduplication that are {} items.'.format(content_taxons.shape))


Before deduplication that are (68967, 23) items.
After deduplication that are (54331, 23) items.


In [11]:
# Add the level-2 taxon as a categorical column; it is used as the target label below.
content_taxons = content_taxons.assign(
    level2taxoncat=content_taxons['level2taxon'].astype('category')
)

In [12]:
# Drop rows that have no content item attached (null content_id_x).
# The outer merge keeps taxons that matched no content, and one such row survives
# the de-duplication above: value_counts() (which ignores nulls) found 54330 ids
# while drop_duplicates left 54331 rows.
# This replaces a hard-coded `drop(335627)`, which raised KeyError when the cell
# was re-run or the input data changed.
# NOTE(review): assumes label 335627 was that null-content row — confirm.
content_taxons = content_taxons.dropna(subset=['content_id_x'])

In [13]:
# Text features: raw token counts followed by TF-IDF re-weighting.
# NOTE(review): the pipeline is fitted on every row before the train/test split
# below, so the vocabulary and IDF weights also see the held-out rows.
nlp_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])
X = nlp_pipeline.fit_transform(content_taxons['combined_text'])

In [14]:
# X is deliberately left as a sparse matrix: the TPOT run below uses config_dict="TPOT sparse".
# X = X.todense()

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    content_taxons['level2taxoncat'],
    test_size=0.2,
    random_state=1337,
)

In [21]:
# Directory for TPOT's pipeline cache and periodic checkpoints.
# It is resolved relative to the notebook (the same '../../' root used for the data
# files) instead of a hard-coded absolute home-directory path, and can be overridden
# with the TPOT_CHECKPOINT_DIR environment variable.
# NOTE(review): assumes '../../checkpoints' is the same folder the absolute path
# pointed to — confirm against the repository layout.
checkpoint_dir = os.environ.get(
    'TPOT_CHECKPOINT_DIR', os.path.join('..', '..', 'checkpoints'))
os.makedirs(checkpoint_dir, exist_ok=True)

tpot = TPOTClassifier(
    generations=5,
    population_size=50,
    verbosity=3,
    # The feature matrix is sparse, so restrict TPOT to sparse-capable operators.
    config_dict="TPOT sparse",
    memory=checkpoint_dir,
    periodic_checkpoint_folder=checkpoint_dir,
    # warm_start lets a later fit() call continue from the previous population.
    warm_start=True,
    # NOTE(review): the recorded run below ended without an optimized pipeline;
    # presumably this time budget is too small for this data set — confirm.
    max_time_mins=10,
    # Same seed as the train/test split, so the search is as repeatable as TPOT allows.
    random_state=1337,
    n_jobs=-1
)

12 operators have been imported by TPOT.




In [22]:
# Run the TPOT search on the training split.
# NOTE(review): the recorded output shows this run ending with
# "A pipeline has not yet been optimized", i.e. no fitted pipeline was produced.
tpot.fit(X_train, y_train)

                                                                   



TPOT closed prematurely. Will use the current best pipeline.




RuntimeError: A pipeline has not yet been optimized. Please call fit() first.

In [23]:
# Write the best pipeline found to a Python script.
# NOTE(review): in the recorded run this raised RuntimeError because no pipeline had been optimized.
tpot.export('tpot_pipeline.py')

RuntimeError: A pipeline has not yet been optimized. Please call fit() first.

In [18]:
# Score the fitted pipeline on the held-out split.
# NOTE(review): this cell's execution count (18) is lower than the TPOT cells above
# (21-23), so the recorded output comes from an earlier run; re-run after a successful fit.
print(tpot.score(X_test, y_test))

Optimization Progress:   0%|          | 0/2 [00:00<?, ?pipeline/s]

12 operators have been imported by TPOT.


                                                                           

Skipped pipeline #1 due to time out. Continuing to the next pipeline.




RuntimeError: There was an error in the TPOT optimization process. This could be because the data was not formatted properly, or because data for a regression problem was provided to the TPOTClassifier object. Please make sure you passed the data to TPOT correctly.