In [1]:
# import gensim
import collections
import random


import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances_chunked, pairwise_distances


from tqdm import tnrange, tqdm_notebook


from itertools import compress

import time


### Set-up and get data

As this notebook uses data derived from the [govuk-taxonomy-supervised-learning repo](https://github.com/alphagov/govuk-taxonomy-supervised-learning), we clone that. Given you are likely to perform this for a given date, we suggest you create a dir with the date of the format `dd-mm-yy` within the data folder of the aforementioned repo and point to the data folder therein as the DATADIR. 

For example:

```
/Users/adalovelace/Documents/govuk-taxonomy-supervised-learning/data/11-02-19
```

In [2]:
DATADIR = os.getenv("DATADIR")


In [3]:
DATADIR


'/Users/ellieking/Documents/govuk-taxonomy-supervised-learning/data/2019-04-15'

Import vectors with same index as labelled. This may require some modification to match your filename.

In [4]:
embedded_sentences = np.load('embedded_clean_content'+os.path.basename(DATADIR)+'.npy')


The Google Universal Sentence Encoder model takes a word, sentence or a paragraph as input and outputs a 512-dimensional vector.

In [5]:
# rows, cols
embedded_sentences.shape


(242861, 512)

In [6]:
content = pd.read_csv(
    os.path.join(DATADIR, 'clean_content.csv'),
    low_memory=False)


In [7]:
content.shape

(242861, 11)

How many unique pieces of content are there on GOV.UK? 

In [8]:
content.content_id.nunique()


242861

## Get brexit subset of content

In [9]:
text_file = open('brexit_url_prefixes.txt', 'r')
brexit_url_prefixes = text_file.read().split(',')
text_file.close()

In [10]:
len(brexit_url_prefixes)

1998

### there are some prefixes which are urls to subsections of guides
For example, this set of prefixes would only be represented once in content

In [11]:
[prefix for prefix in brexit_url_prefixes if prefix.startswith('/settled-status-eu-citizens-families')]

['/settled-status-eu-citizens-families/applying-for-settled-status',
 '/settled-status-eu-citizens-families',
 '/settled-status-eu-citizens-families/eligibility',
 '/settled-status-eu-citizens-families/what-youll-need-to-apply',
 '/settled-status-eu-citizens-families/what-settled-and-presettled-status-means',
 '/settled-status-eu-citizens-families/after-youve-applied',
 '/settled-status-eu-citizens-families/if-you-have-permanent-residence-or-indefinite-leave-to-remain',
 '/settled-status-eu-citizens-families/not-eu-eea-swiss-citizen',
 '/settled-status-eu-citizens-families/apply-settled-status-for-child',
 '/settled-status-eu-citizens-families/settled-status-less-than-5-years',
 '/settled-status-eu-citizens-families/print',
 '/settled-status-eu-citizens-families/when-to-apply',
 '/settled-status-eu-citizens-families#brexit',
 '/settled-status-eu-citizens-families/settled-status-if-youre-under-21',
 '/settled-status-eu-citizens-families/',
 '/settled-status-eu-citizens-families/what-you

In [12]:
len([prefix for prefix in brexit_url_prefixes if prefix.startswith('/settled-status-eu-citizens-families')])

19


the base_path in content will only have the first part of this brexit_url. So to find these in content 

- First find which prefixes have duplicate first parts
e.g, 
'/settled-status-eu-citizens-families'
- Then truncate them to root path 
- Add truncated root paths to lookup list (brexit_url_prefixes)

In [13]:
def splitall(path):
    """split the prefix url into parts separated by /
    returns a list containing each string part of path"""
    allparts = []
    while 1:
        parts = os.path.split(path)
        if parts[0] == path:  # sentinel for absolute paths
            allparts.insert(0, parts[0])
            break
        elif parts[1] == path: # sentinel for relative paths
            allparts.insert(0, parts[1])
            break
        else:
            path = parts[0]
            allparts.insert(0, parts[1])
    return allparts

In [14]:
# capture the first two parts into a list if there are more than 2 (there are a handful that are just 2) e.g., '/staying-uk-eu-citizen'
first_2parts = []
for path in brexit_url_prefixes:
    parts = splitall(path)
    if len(parts) >2:
        first_2parts.append(parts[0]+parts[1])


In [15]:
len(first_2parts)

1984

In [16]:
#find repeated items of the same truncated root path to identify the stem to search for in content
seen = {}
dupes = []

for x in first_2parts:
    if x not in seen:
        seen[x] = 1
    else:
        if seen[x] == 1:
            dupes.append(x)
        seen[x] += 1

In [17]:
#these are the repeated stems. But some of these should be there. So pluck the guides out manually
dupes

['/settled-status-eu-citizens-families',
 '/government',
 '/guidance',
 '/driving-abroad',
 '/world',
 '/eu-withdrawal-act-2018-statutory-instruments',
 '/print',
 '/prepare-eu-exit',
 '/visit-europe-brexit']

In [18]:
#add the stems to the lookup list so they can be matched to content
brexit_url_prefixes = brexit_url_prefixes + ['/settled-status-eu-citizens-families',
 '/driving-abroad',
 '/eu-withdrawal-act-2018-statutory-instruments',
 '/prepare-eu-exit',
 '/visit-europe-brexit']

In [19]:
#create a brexit flag as a pandas col
content['brexit'] = np.where(content['base_path'].str.startswith(tuple(brexit_url_prefixes)), 1, 0)

In [20]:
#there are fewer than len(brexit_url_preifx) because the list has repeated content items in it
content.brexit.value_counts()

0    241386
1      1475
Name: brexit, dtype: int64

In [21]:
embedded_sentences_brexit = embedded_sentences[content.brexit==1]

## Get guidance and publications subset

In [23]:
content.groupby(['document_type']).size()

document_type
aaib_report                                8387
answer                                      743
asylum_support_decision                      74
authored_article                            361
business_finance_support_scheme             180
calendar                                      2
case_study                                 1716
closed_consultation                         709
cma_case                                   1928
consultation_outcome                       3372
correspondence                             4028
countryside_stewardship_grant               244
decision                                   4629
detailed_guide                             5719
document_collection                        4278
drug_safety_update                          546
employment_appeal_tribunal_decision        1115
employment_tribunal_decision              27318
esi_fund                                   1112
export_health_certificate                  1435
fatality_notice           

These are the publication formats. From here: https://docs.google.com/spreadsheets/d/1AWsPFXUhJzqCBR0r0iag1UZ1bavneu8ZBABZzKFZ3XY/edit#gid=0

This is for MVP. If doing this again, extract schema_name from content store and identify publications where schemea_name=publication

publication: corporate report
publication: correspondence
publication: foi release
publication: form
publication: guidance
publication: impact assessment
publication: independant report
publication: international treaty
publication: map
publication: notice
publication: policy paper
publication: promotional material
publication: regulation
publication: research and analysis
publication: statutory guidance
publication: transparency data
    
    content[content['document_type']=='publications']

In [28]:
publications = ["corporate report", 
                "correspondence",
                "foi release",
                "form",
                "guidance",
                "impact assessment",
                "independant report",
                "international treaty",
                "map",
                "notice",
                "policy paper",
                "promotional material",
                "regulation",
                "research and analysis",
                "statutory guidance",
                "transparency data"
               ]

In [34]:
content.shape

(242861, 12)

In [40]:
content[(content.brexit==1) & (content['document_type'].isin(publications))].shape

(269, 12)

In [42]:
content[(content.brexit==1) & (content['document_type'].isin(publications))]

Unnamed: 0,base_path,content_id,description,document_type,first_published_at,locale,primary_publishing_organisation,publishing_app,title,body,combined_text,brexit
1034,/government/publications/eu-exit-no-deal-prepa...,ec7c51f8-b462-422c-85a7-2f625e598c5c,advice to further education and apprenticeship...,guidance,2019-01-31T13:01:00.000+00:00,en,Department for Education,whitehall,eu exit: no deal preparations for further educ...,preparations that organisations and businesses...,eu exit: no deal preparations for further educ...,1
3220,/government/publications/trading-gas-with-the-...,2bc4c469-c897-45c7-acde-fd2d13c0bf64,how cross-border gas trading with the eu will ...,guidance,2018-10-12T12:00:00.000+00:00,en,"Department for Business, Energy & Industrial S...",whitehall,trading gas with the eu if there's no brexit deal,if the uk leaves the eu on 31 october 2019 wit...,trading gas with the eu if there's no brexit d...,1
4203,/government/publications/exceeding-permit-limi...,6a2e0886-8acb-4945-9ee8-43eaf541ca74,holding additional radioactive materials and m...,guidance,2019-04-01T15:36:00.000+00:00,en,Environment Agency,whitehall,exceeding permit limits when you obtain additi...,eu exit may result in restricted supplies of r...,exceeding permit limits when you obtain additi...,1
5014,/government/publications/costa-amendment-lette...,6c6b6d5d-c450-4afb-8be3-ad3d57e178b9,secretary of state steve barclay and european ...,correspondence,2019-03-05T13:33:00.000+00:00,en,Department for Exiting the European Union,whitehall,costa amendment: correspondence with the eu in...,in his letter the secretary of state for exiti...,costa amendment: correspondence with the eu in...,1
5438,/government/publications/cmas-role-in-mergers-...,9f4a1967-d9b2-46c7-8b97-a132c397b4c4,role of the cma in mergers if there's 'no deal...,guidance,2018-10-30T12:53:00.000+00:00,en,Competition and Markets Authority,whitehall,merger cases if there's no brexit deal,how the cma will handle mergers in a ‘no deal’...,merger cases if there's no brexit deal role of...,1
6536,/government/publications/using-and-trading-in-...,947be292-6e11-4701-b6bd-8c69fb73967d,how businesses dealing with fluorinated gases ...,guidance,2018-09-13T12:00:00.000+00:00,en,"Department for Environment, Food & Rural Affairs",whitehall,using and trading in fluorinated gases and ozo...,if the uk leaves the eu in march 2019 without ...,using and trading in fluorinated gases and ozo...,1
7794,/government/publications/approval-of-premises-...,31e6684e-5f70-43aa-b5b6-f1a02b0c5c0d,get approval for inspection of plant health co...,form,2018-12-19T12:00:00.000+00:00,en,"Department for Environment, Food & Rural Affairs",whitehall,approval of premises as place of first arrival...,read the guide on approval and operation of pl...,approval of premises as place of first arrival...,1
8221,/government/publications/broadcasting-and-vide...,3cc5136a-ca14-45a5-8513-3a9dd3aa74f2,how the rules for broadcasters and providers o...,guidance,2018-09-13T12:00:00.000+00:00,en,"Department for Digital, Culture, Media & Sport",whitehall,broadcasting and video on demand if there’s no...,if the uk leaves the eu with no deal find out ...,broadcasting and video on demand if there’s no...,1
8372,/government/publications/chancellor-letter-to-...,9388ef57-2729-4782-8879-ee3516fb1a60,a letter from philip hammond responding to nic...,correspondence,2018-08-23T14:00:12.000+00:00,en,HM Treasury,whitehall,chancellor letter to chair of treasury committ...,the chancellor of the exchequer philip hammond...,chancellor letter to chair of treasury committ...,1
8742,/government/publications/how-medicines-medical...,fa4cccc1-7688-470a-97e7-9be00bcf2eab,continuing human medicine device and clinical ...,guidance,2018-08-23T11:00:00.000+00:00,en,Department of Health and Social Care,whitehall,how medicines medical devices and clinical tri...,if the uk leaves the eu ion 12 april 2019 with...,how medicines medical devices and clinical tri...,1


In [35]:
publications_and_guidance = content[content['document_type'].isin(publications)]

In [43]:
embedded_sentences_brexit_publications = embedded_sentences[(content.brexit==1) & (content['document_type'].isin(publications))]

## Get similarity scores, brexit content by all GOV.UK content

In [44]:
def get_top_20_links(D_chunk, start):
    """return only the top 20 (including self) related link indices and distance metric values
    according to distance metric"""
    top_k_indices = np.argpartition(D_chunk, range(20))[:, :20]

    return top_k_indices, D_chunk[:, top_k_indices]

brexit_generator = pairwise_distances_chunked(
    X=embedded_sentences_brexit_publications,
    Y=embedded_sentences,
    reduce_func=get_top_20_links,
    working_memory=0,
    metric='cosine',
    n_jobs=-1)


In [45]:
brexit_url = []
close_content_urls = []
urls = pd.DataFrame(columns=['brexit_url', 'close_content_urls', 'cosine_sims'])
for i, (indices, values) in enumerate(tqdm_notebook(brexit_generator)):

    brexit_url = content.iat[indices[0][0],
                                     0]  #basepath is first column
    close_content_urls = [content.iat[i, 0] for i in indices[0]]
    #     cosine_sims = pd.Series(values)
    i_urls = pd.DataFrame({
        'brexit_url': brexit_url,
        'close_content_urls': close_content_urls,
        'cosine_sims': values.reshape(20)
    })
    urls = urls.append(i_urls, ignore_index=True)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  (working_memory, np.ceil(row_bytes * 2 ** -20)))





### Save out list

In [47]:
urls.to_csv("brexit_publications_potential_dupes.csv", index=False)