In [1]:
# import gensim
import collections
import random


import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances_chunked, pairwise_distances


from tqdm import tnrange, tqdm_notebook


from itertools import compress

import time


### Set-up and get data

As this notebook uses data derived from the [govuk-taxonomy-supervised-learning repo](https://github.com/alphagov/govuk-taxonomy-supervised-learning), we clone that. Given you are likely to perform this for a given date, we suggest you create a dir with the date of the format `dd-mm-yy` within the data folder of the aforementioned repo and point to the data folder therein as the DATADIR. 

For example:

```
/Users/adalovelace/Documents/govuk-taxonomy-supervised-learning/data/11-02-19
```

In [2]:
# Directory holding the dated data export; read from the DATADIR environment
# variable (None when the variable is not set).
DATADIR = os.environ.get("DATADIR")


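Load the sentence-embedding vectors. Row `i` of this array must correspond to row `i` of `clean_content.csv` (loaded below), because the same row mask is later applied to both. You may need to modify the filename to match yours.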

In [5]:
# The embeddings file is named after the dated data directory, e.g.
# 'embedded_clean_content11-02-19.npy' for DATADIR='.../data/11-02-19'.
if not DATADIR:
    raise ValueError("DATADIR environment variable is not set; it must point to the dated data folder")
# normpath drops any trailing slash; without it basename('.../11-02-19/') is ''
# and the wrong filename would be built.
data_date = os.path.basename(os.path.normpath(DATADIR))
embedded_sentences = np.load('embedded_clean_content' + data_date + '.npy')


The Google Universal Sentence Encoder model takes a word, sentence or a paragraph as input and outputs a 512-dimensional vector.

In [6]:
# (rows, cols) of the embedding matrix — sanity check of what was loaded
embedded_sentences.shape


(242861, 512)

In [7]:
# Load the cleaned content table from the dated data directory.
content_csv_path = os.path.join(DATADIR, 'clean_content.csv')
content = pd.read_csv(content_csv_path, low_memory=False)


In [8]:
# (rows, cols) of the content table; the row count should equal the number of
# embedding rows, since the two are later filtered with the same mask
content.shape

(242861, 11)

New variable, called level, to categorise taxon by level in the tree

How many unique pieces of content are there on GOV.UK? 

In [11]:
# Number of distinct content ids in the table
content['content_id'].nunique()


242861

## Get brexit subset of content

In [142]:
# Read the comma-separated list of brexit URL prefixes.
# The context manager guarantees the file is closed even if the read fails.
with open('brexit_url_prefixes.txt', 'r') as text_file:
    raw_prefixes = text_file.read().split(',')

# Strip surrounding whitespace (e.g. a trailing newline on the last entry would
# stop that prefix from ever matching) and drop empty entries (an empty prefix
# would match every base_path in a startswith lookup).
brexit_url_prefixes = [prefix.strip() for prefix in raw_prefixes if prefix.strip()]

In [143]:
# number of brexit URL prefixes read from the file
len(brexit_url_prefixes)

1998

### Some prefixes are URLs of subsections of guides
For example, the following set of prefixes is represented only once in `content`:

In [150]:
# Show every prefix that belongs to the settled-status guide
settled_status_stem = '/settled-status-eu-citizens-families'
[url for url in brexit_url_prefixes if url.startswith(settled_status_stem)]

['/settled-status-eu-citizens-families/applying-for-settled-status',
 '/settled-status-eu-citizens-families',
 '/settled-status-eu-citizens-families/eligibility',
 '/settled-status-eu-citizens-families/what-youll-need-to-apply',
 '/settled-status-eu-citizens-families/what-settled-and-presettled-status-means',
 '/settled-status-eu-citizens-families/after-youve-applied',
 '/settled-status-eu-citizens-families/if-you-have-permanent-residence-or-indefinite-leave-to-remain',
 '/settled-status-eu-citizens-families/not-eu-eea-swiss-citizen',
 '/settled-status-eu-citizens-families/apply-settled-status-for-child',
 '/settled-status-eu-citizens-families/settled-status-less-than-5-years',
 '/settled-status-eu-citizens-families/print',
 '/settled-status-eu-citizens-families/when-to-apply',
 '/settled-status-eu-citizens-families#brexit',
 '/settled-status-eu-citizens-families/settled-status-if-youre-under-21',
 '/settled-status-eu-citizens-families/',
 '/settled-status-eu-citizens-families/what-you

In [153]:
# Count the prefixes that belong to the settled-status guide
sum(1 for url in brexit_url_prefixes if url.startswith('/settled-status-eu-citizens-families'))

20


The `base_path` in `content` only contains the first part (the root path) of these brexit URLs, so to find these guides in `content`:

- First find which prefixes share the same first part, e.g. `/settled-status-eu-citizens-families`
- Then truncate them to the root path
- Add the truncated root paths to the lookup list (`brexit_url_prefixes`)

In [151]:
def splitall(path):
    """Break a URL prefix / path into its components.

    The path is peeled apart from the right with os.path.split until nothing
    more can be removed. For an absolute path the first element is the root
    ('/'); for a relative path it is the first name.

    Returns a list of the string components, in left-to-right order.
    """
    pieces = []  # collected right-to-left, reversed before returning
    while True:
        head, tail = os.path.split(path)
        if head == path:
            # nothing left to strip: we reached the root of an absolute path
            pieces.append(head)
            break
        if tail == path:
            # nothing left to strip: we reached the first name of a relative path
            pieces.append(tail)
            break
        pieces.append(tail)
        path = head
    pieces.reverse()
    return pieces

In [122]:
# For every prefix with more than 2 components, keep the first two components
# joined together (for '/guide/section' that is '/' + 'guide' = '/guide').
# Prefixes with only 2 components, e.g. '/staying-uk-eu-citizen', are skipped.
first_2parts = [
    segments[0] + segments[1]
    for segments in map(splitall, brexit_url_prefixes)
    if len(segments) > 2
]


In [123]:
# number of prefixes that had more than 2 path components
len(first_2parts)

1984

In [140]:
# Find truncated root paths that occur more than once; these are the candidate
# stems to search for in content.
seen = {}   # stem -> number of times encountered so far
dupes = []  # stems seen at least twice, in the order their 2nd occurrence appears

for stem in first_2parts:
    occurrences = seen.get(stem, 0) + 1
    seen[stem] = occurrences
    if occurrences == 2:
        # record each repeated stem exactly once
        dupes.append(stem)

In [141]:
# These are the repeated stems. Some of them (e.g. '/government', '/guidance')
# are generic sections rather than guides, so the guide stems are picked out
# manually from this list.
dupes

['/settled-status-eu-citizens-families',
 '/government',
 '/guidance',
 '/driving-abroad',
 '/world',
 '/eu-withdrawal-act-2018-statutory-instruments',
 '/print',
 '/prepare-eu-exit',
 '/visit-europe-brexit']

In [144]:
# Add the manually selected guide stems to the lookup list so the guides can be
# matched to content by their root path.
guide_stems = [
    '/settled-status-eu-citizens-families',
    '/driving-abroad',
    '/eu-withdrawal-act-2018-statutory-instruments',
    '/prepare-eu-exit',
    '/visit-europe-brexit',
]
# Only append stems that are not already present, so re-running this cell does
# not keep growing the list with duplicates.
brexit_url_prefixes = brexit_url_prefixes + [
    stem for stem in guide_stems if stem not in brexit_url_prefixes
]

In [146]:
# Create a brexit flag column: 1 when base_path starts with any brexit URL
# prefix, otherwise 0.
# na=False: a missing base_path would otherwise yield NaN, which np.where treats
# as truthy and would wrongly flag that row as brexit content.
is_brexit_path = content['base_path'].str.startswith(tuple(brexit_url_prefixes), na=False)
content['brexit'] = np.where(is_brexit_path, 1, 0)

In [147]:
# Fewer rows are flagged than len(brexit_url_prefixes) because several prefixes
# in the list point at the same content item.
content['brexit'].value_counts()

0    241386
1      1475
Name: brexit, dtype: int64

In [156]:
# Keep only the embedding rows whose content row is flagged as brexit
# (relies on embeddings and content sharing the same row order).
is_brexit = (content['brexit'] == 1).values
embedded_sentences_brexit = embedded_sentences[is_brexit]

## Get similarity scores, brexit content by all GOV.UK content

In [159]:
def get_top_20_links(D_chunk, start, k=20):
    """Reduce a chunk of a distance matrix to each row's k nearest columns.

    Parameters
    ----------
    D_chunk : 2-D array, one row per query item and one column per candidate,
        holding distances (smaller = closer).
    start : int
        Offset of the chunk's first row in the full matrix. Unused here, but it
        is part of the (D_chunk, start) signature a reduce_func is called with.
    k : int, default 20
        Number of nearest candidates to keep per row (the query item itself is
        included when it is among the candidates). Capped at the number of
        columns so small matrices do not raise.

    Returns
    -------
    (top_k_indices, top_k_values) : two arrays of shape (n_rows, k). Row r of
        top_k_indices holds the column indices of the k smallest distances of
        row r, in ascending distance order; top_k_values holds those distances.
    """
    k = min(k, D_chunk.shape[1])
    # Passing every position 0..k-1 as kth makes the first k entries fully
    # sorted, not just partitioned.
    top_k_indices = np.argpartition(D_chunk, np.arange(k), axis=1)[:, :k]

    # Pair each row with its own indices. Plain D_chunk[:, top_k_indices] would
    # apply every row's indices to every row and return shape (n, n, k).
    row_selector = np.arange(D_chunk.shape[0])[:, None]
    top_k_values = D_chunk[row_selector, top_k_indices]

    return top_k_indices, top_k_values

# Lazily compute cosine distances from every brexit item (rows of X) to every
# content item (rows of Y); each chunk of the distance matrix is reduced by
# get_top_20_links before it is yielded.
# NOTE(review): working_memory=0 presumably forces the smallest possible chunk
# (one brexit row per chunk) — confirm against the scikit-learn docs. Larger
# chunks would be faster, but downstream code must then handle multi-row chunks.
# NOTE: this is a one-shot generator; re-create it before iterating a second time.
brexit_generator = pairwise_distances_chunked(
    X=embedded_sentences_brexit,
    Y=embedded_sentences,
    reduce_func=get_top_20_links,
    working_memory=0,
    metric='cosine',  # values are cosine *distances*: 0 means identical direction
    n_jobs=-1)


In [163]:
# Build a long table with one row per (brexit page, nearby page) pair.
URL_COLUMNS = ['brexit_url', 'close_content_urls', 'cosine_sims']

# Row positions in `content` of the brexit items, in the same order as the rows
# of the brexit embedding matrix. The source page is looked up by this position
# rather than "the nearest neighbour", because another page with an identical
# embedding (a true duplicate) can rank ahead of the page itself.
brexit_positions = np.flatnonzero((content['brexit'] == 1).values)
# Look base_path up by name instead of relying on it being column 0.
base_paths = content['base_path'].values

url_frames = []
row_offset = 0  # index into brexit_positions of the next row to be consumed
for indices, values in tqdm_notebook(brexit_generator):
    indices = np.asarray(indices)
    # Normalise values to one row of distances per row of indices.
    values = np.asarray(values).reshape(indices.shape)
    for neighbour_idx, neighbour_dist in zip(indices, values):
        url_frames.append(pd.DataFrame({
            'brexit_url': base_paths[brexit_positions[row_offset]],
            'close_content_urls': base_paths[neighbour_idx],
            # NOTE(review): despite the column name these are cosine distances
            # (0 = identical); the name is kept so the saved CSV is unchanged.
            'cosine_sims': neighbour_dist,
        }))
        row_offset += 1

# An exhausted generator (e.g. this cell was run twice) would silently give an
# empty table, so fail loudly instead.
assert row_offset == len(brexit_positions), (
    "expected %d brexit rows but the generator produced %d; "
    "re-create brexit_generator before re-running this cell"
    % (len(brexit_positions), row_offset))

# Concatenate once at the end: appending inside the loop copies the whole table
# every iteration, and DataFrame.append no longer exists in pandas 2.
if url_frames:
    urls = pd.concat(url_frames, ignore_index=True)[URL_COLUMNS]
else:
    urls = pd.DataFrame(columns=URL_COLUMNS)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)





### Save out list

In [164]:
# Write the (brexit page, nearby page, distance) table to disk without the index
output_path = "brexit_potential_dupes.csv"
urls.to_csv(output_path, index=False)