# Model Inference

## 1. Installs and Imports

In [4]:
# !pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 5.4 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 38.5 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.5 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 36.8 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 55.4 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 315 kB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp

In [5]:
import os
import random
import transformers
import pandas as pd
import pandas as pd
import numpy as np
from datetime import date
from google.colab import drive
from seqeval.metrics import accuracy_score
from IPython.display import display, HTML
from collections import defaultdict, Counter, OrderedDict
from datasets import load_dataset, load_metric, load_from_disk, ClassLabel, Sequence
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification

#inference
import torch
from transformers import pipeline

print(transformers.__version__)

4.14.1


In [6]:
system = "COLAB" #["AWS", "COLAB"]

In [7]:
# Configure data/model locations for the selected runtime (`system`).
if system=="AWS":
    # s3fs and sagemaker are used only by this branch and are not part of the
    # notebook's import cell, so bring them into scope here; without these
    # imports the branch fails with NameError.
    import s3fs
    import sagemaker

    fs = s3fs.S3FileSystem()
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f's3://{s3_bucket}/model-data/govner-data'
    for f in fs.ls(DATA_DIR):
        print(f)
    # NOTE(review): MODEL_DIR and RESULTS_DIR are only assigned in the COLAB
    # branch below; cells that read MODEL_DIR will fail on AWS until they are
    # defined here as well.

    # Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
    # sagemaker session bucket -> used for uploading data, models and logs
    # sagemaker will automatically create this bucket if it does not exist
    sess = sagemaker.Session()
    sagemaker_session_bucket = s3_bucket
    # This fallback never fires while the bucket name is hard-coded above; it
    # is kept so the default bucket is used if `s3_bucket` is ever set to None.
    if sagemaker_session_bucket is None and sess is not None:
        # set to default bucket if a bucket name is not given
        sagemaker_session_bucket = sess.default_bucket()

    role = sagemaker.get_execution_role()
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sess.default_bucket()}")
    print(f"sagemaker session region: {sess.boto_region_name}")
elif system=="COLAB":
    drive.mount("/content/gdrive")
    # All project folders live under one shared-drive directory; build it once
    # so the three paths cannot drift apart.
    project_root = os.path.join("/content/gdrive/Shared drives/", "GOV.UK teams/2020-2021/Data labs/content-metadata-2021")
    DATA_DIR = os.path.join(project_root, "Data")
    MODEL_DIR = os.path.join(project_root, "Models")
    RESULTS_DIR = os.path.join(MODEL_DIR, "Metrics")
else:
    # Fail loudly instead of leaving DATA_DIR/MODEL_DIR undefined for later cells.
    raise ValueError(f"Unknown system {system!r}; expected 'AWS' or 'COLAB'")

Mounted at /content/gdrive


In [8]:
print("Data Folder: {}".format(DATA_DIR))
print(os.listdir(DATA_DIR)[:3])
print("Model Folder: {}".format(MODEL_DIR))
print(os.listdir(MODEL_DIR)[:3])

Data Folder: /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data
['label_map_09062020_more_ents.json', 'label_map_12062020_more_ents.json', 'line_by_line_NER_data_sampled_09062020_more_ents.csv']
Model Folder: /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models
['distilbert-base-uncased-finetuned-ner-conll2003', 'distilbert-base-uncased-finetuned-ner-govuk', 'distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-08-12-2021']


## Load Model for Inference

Load the fine-tuned model from a local checkpoint directory.


In [9]:
# Pieces of the fine-tuned checkpoint's folder name; they are combined with
# MODEL_DIR to build OUTPUT_PATH.
model_name = "distilbert-base-uncased"
task = "ner"
dataset_name = "govuk"
req_date = "13-12-2021"  # date tag of the training run (presumably dd-mm-yyyy)
dataset_type = 'FULL'
chkpoint = 'checkpoint-73500'

In [10]:
OUTPUT_PATH = f"{MODEL_DIR}/{model_name}-finetuned-{task}-{dataset_name}-{dataset_type}-{req_date}/{chkpoint}"
OUTPUT_PATH

'/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-FULL-13-12-2021/checkpoint-73500'

In [11]:
os.listdir(OUTPUT_PATH)

['config.json',
 'pytorch_model.bin',
 'tokenizer_config.json',
 'special_tokens_map.json',
 'vocab.txt',
 'tokenizer.json',
 'training_args.bin',
 'optimizer.pt',
 'scheduler.pt',
 'trainer_state.json',
 'rng_state.pth']

## Load model and tokeniser

In [12]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

In [13]:
model = AutoModelForTokenClassification.from_pretrained(OUTPUT_PATH)

In [14]:
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_PATH)

In [2]:
sequence = "You must be at least 17 years old to have a drivers licence " \
"failure to provide this certificate will mean imprisonment in the UK and barring from countries like EU and US"

In [16]:
inputs = tokenizer(sequence, return_tensors="pt")

In [17]:
inputs = tokenizer(sequence, return_tensors="pt")

In [18]:
inputs

{'input_ids': tensor([[  101,  2017,  2442,  2022,  2012,  2560,  2459,  2086,  2214,  2000,
          2031,  1037,  6853, 11172,  4945,  2000,  3073,  2023,  8196,  2097,
          2812, 10219,  1999,  1996,  2866,  1998, 19820,  2075,  2013,  3032,
          2066,  7327,  1998,  2149,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [19]:
tokens = inputs.tokens()

In [20]:
inputs.word_ids()

[None,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 None]

In [21]:
tokens

['[CLS]',
 'you',
 'must',
 'be',
 'at',
 'least',
 '17',
 'years',
 'old',
 'to',
 'have',
 'a',
 'drivers',
 'licence',
 'failure',
 'to',
 'provide',
 'this',
 'certificate',
 'will',
 'mean',
 'imprisonment',
 'in',
 'the',
 'uk',
 'and',
 'barr',
 '##ing',
 'from',
 'countries',
 'like',
 'eu',
 'and',
 'us',
 '[SEP]']

In [22]:
# Inference only: disable gradient tracking so the forward pass does not build
# an autograd graph that is never used.
with torch.no_grad():
  outputs = model(**inputs).logits

In [23]:
predictions = torch.argmax(outputs, dim=2)

In [24]:
predictions

tensor([[ 0,  0,  0,  0,  0,  0, 12, 12, 12,  0,  0,  0, 10,  5,  0,  0,  0,  0,
          5,  0,  0,  0,  0,  6,  6,  0,  0,  0,  0,  6,  0,  6,  0,  6,  0]])

In [29]:
# Print each token next to the label name of its predicted class id.
label_names = model.config.id2label
predicted_ids = predictions[0].numpy()
for token, prediction in zip(tokens, predicted_ids):
  print((token, label_names[prediction]))

('[CLS]', 'O')
('you', 'O')
('must', 'O')
('be', 'O')
('at', 'O')
('least', 'O')
('17', 'I-STATE')
('years', 'I-STATE')
('old', 'I-STATE')
('to', 'O')
('have', 'O')
('a', 'O')
('drivers', 'I-PER')
('licence', 'I-FORM')
('failure', 'O')
('to', 'O')
('provide', 'O')
('this', 'O')
('certificate', 'I-FORM')
('will', 'O')
('mean', 'O')
('imprisonment', 'O')
('in', 'O')
('the', 'I-LOC')
('uk', 'I-LOC')
('and', 'O')
('barr', 'O')
('##ing', 'O')
('from', 'O')
('countries', 'I-LOC')
('like', 'O')
('eu', 'I-LOC')
('and', 'O')
('us', 'I-LOC')
('[SEP]', 'O')


## Using Huggingface Pipelines

In [30]:
# Replace this with your own checkpoint
token_classifier = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

In [31]:
print(sequence)
print(len(sequence))

You must be at least 17 years old to have a drivers licence failure to provide this certificate will mean imprisonment in the UK and barring from countries like EU and US
170


In [32]:
result = token_classifier(sequence)

In [33]:
result

[{'end': 33,
  'entity_group': 'STATE',
  'score': 0.6916776,
  'start': 21,
  'word': '17 years old'},
 {'end': 51,
  'entity_group': 'PER',
  'score': 0.99765354,
  'start': 44,
  'word': 'drivers'},
 {'end': 59,
  'entity_group': 'FORM',
  'score': 0.9795044,
  'start': 52,
  'word': 'licence'},
 {'end': 95,
  'entity_group': 'FORM',
  'score': 0.98986244,
  'start': 84,
  'word': 'certificate'},
 {'end': 128,
  'entity_group': 'LOC',
  'score': 0.9997448,
  'start': 122,
  'word': 'the uk'},
 {'end': 155,
  'entity_group': 'LOC',
  'score': 0.99967575,
  'start': 146,
  'word': 'countries'},
 {'end': 163,
  'entity_group': 'LOC',
  'score': 0.9998914,
  'start': 161,
  'word': 'eu'},
 {'end': 170,
  'entity_group': 'LOC',
  'score': 0.99533564,
  'start': 168,
  'word': 'us'}]

## Visualise Entities

In [3]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(sequence)
displacy.render(doc, style='ent', jupyter=True, options={'distance': 90})

In [4]:
text = "My name is John Smith and I live in Paris"
entities = [
    ("Employee", 11, 21),  # John Smith
    ("Location", 36, 41),  # Paris
]

In [5]:
import spacy

def display_entities(text, entities):
  """Render `text` with the given entity spans highlighted by displaCy.

  Args:
    text: The raw string the character offsets refer to.
    entities: Iterable of `(label, start_char, end_char)` items, indexed
      positionally (label at 0, start at 1, end at 2).

  Spans whose offsets do not line up with token boundaries of the blank
  English tokenizer cannot be turned into a spaCy span; they are skipped with
  a warning instead of aborting the whole rendering.
  """
  import warnings

  nlp = spacy.blank("en")
  doc = nlp(text)
  ents = []
  for ee in entities:
    label, start, end = ee[0], ee[1], ee[2]
    span = doc.char_span(start, end, label=label)
    if span is None:
      # char_span returns None for misaligned offsets, and a None entry
      # makes the `doc.ents` assignment below fail.
      warnings.warn(
          f"Skipping entity {label!r} at [{start}, {end}) "
          f"({text[start:end]!r}): offsets do not align with token boundaries"
      )
      continue
    ents.append(span)
  doc.ents = ents
  displacy.render(doc, style='ent', jupyter=True, options={'distance': 90})

def tokenise_and_display(text):
  """Run the module-level `token_classifier` on `text` and render its entities.

  Each prediction's `entity_group`, `start` and `end` are repackaged as a
  `(label, start, end)` tuple and handed to `display_entities`.
  """
  predictions = token_classifier(text)
  spans = []
  for pred in predictions:
    spans.append((pred['entity_group'], pred['start'], pred['end']))
  display_entities(text, entities=spans)

In [6]:
display_entities(text, entities)

In [38]:
result

[{'end': 33,
  'entity_group': 'STATE',
  'score': 0.6916776,
  'start': 21,
  'word': '17 years old'},
 {'end': 51,
  'entity_group': 'PER',
  'score': 0.99765354,
  'start': 44,
  'word': 'drivers'},
 {'end': 59,
  'entity_group': 'FORM',
  'score': 0.9795044,
  'start': 52,
  'word': 'licence'},
 {'end': 95,
  'entity_group': 'FORM',
  'score': 0.98986244,
  'start': 84,
  'word': 'certificate'},
 {'end': 128,
  'entity_group': 'LOC',
  'score': 0.9997448,
  'start': 122,
  'word': 'the uk'},
 {'end': 155,
  'entity_group': 'LOC',
  'score': 0.99967575,
  'start': 146,
  'word': 'countries'},
 {'end': 163,
  'entity_group': 'LOC',
  'score': 0.9998914,
  'start': 161,
  'word': 'eu'},
 {'end': 170,
  'entity_group': 'LOC',
  'score': 0.99533564,
  'start': 168,
  'word': 'us'}]

In [39]:
res_ents = [(i['entity_group'], i['start'], i['end']) for i in result]
res_ents

[('STATE', 21, 33),
 ('PER', 44, 51),
 ('FORM', 52, 59),
 ('FORM', 84, 95),
 ('LOC', 122, 128),
 ('LOC', 146, 155),
 ('LOC', 161, 163),
 ('LOC', 168, 170)]

In [40]:
display_entities(sequence, res_ents)

In [41]:
tokenise_and_display(sequence)

## Test With GOV.UK Pages

Now, we want to test how the model performs on inference tasks for NER on a selection of pages from GOV.UK.

The pages are:
*   Coronavirus guidance
  * https://www.gov.uk/guidance/covid-19-coronavirus-restrictions-what-you-can-and-cannot-do#what-has-changed

* Visitor Visa
  * Marriage visitor visa - https://www.gov.uk/marriage-visa
  * Marriage visitor visa eligibility - https://www.gov.uk/marriage-visa/eligibility
  * Marriage visitor visa documents you’ll need - https://www.gov.uk/marriage-visa/documents-you-will-need
  * Marriage visitor visa apply from outside the UK - https://www.gov.uk/marriage-visa/apply

* Study in the UK
  * https://www.gov.uk/student-visa


### Approach

1. Get content of pages into local notebook
2. Locate units of the content - title, sub-heading, main body
3. For each unit, split into sentences
4. Run each sentence through the model

#### 1. Get content of pages into local notebook

Download preprocessed content store data from AWS.

In [43]:
content_path = os.path.join(DATA_DIR, 'govuk_content/preprocessed_content_store_141221.csv')
content_path

'/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/govuk_content/preprocessed_content_store_141221.csv'

In [108]:
govuk_content = pd.read_csv(content_path, sep='\t', nrows=100, encoding='utf-8', compression='gzip')

In [109]:
# List the base_path of every loaded row, each followed by a blank line.
for base_path in govuk_content['base_path']:
  print(base_path, end="\n\n")

/employment-tribunal-decisions/mrs-c-harris-v-worldaware-ltd-2206826-2018

/government/news/fully-vaccinated-arrivals-from-france-to-england-must-continue-to-quarantine

/government/publications/vehicle-airbags-safety-guidance/vehicle-airbags-safety-guidance

/research-for-development-outputs/id21-natural-resources-highlights-6-water

/government/statistics/announcements/construction-output-in-great-britain-april-2022

/government/publications/english-housing-survey-2019-to-2020-questionnaire-and-physical-survey-form

/government/news/new-digital-resource-for-charity-trustees-launched

/employment-tribunal-decisions/mrs-k-todd-v-london-borough-of-harrow-3335085-2018

/guidance/land-compensation-manual-section-4-disturbance/practice-note-4-2-disturbance-payments-for-persons-without-compensatable-interests

/employment-tribunal-decisions/miss-t-hutchison-v-fife-council-104081-2008

/government/publications/patent-journal-special-notices-6636

/government/statistics/uk-consumer-price-infl

#### 2. Locate units of the content

In [117]:
# base_path values of the GOV.UK pages to pull out of the content store.
paths = ['/student-visa',
         '/marriage-visa',
         # NOTE(review): the next three look like parts of the /marriage-visa
         # guide rather than standalone pages, so they may never match a
         # base_path on their own - confirm against the content store.
         '/marriage-visa/eligibility',
         '/marriage-visa/documents-you-will-need',
         '/marriage-visa/apply',
         # The '#what-has-changed' fragment was dropped: an in-page anchor is
         # not part of a base_path, so the entry could never match with it.
         '/guidance/covid-19-coronavirus-restrictions-what-you-can-and-cannot-do',
         ]

In [118]:
all_govuk_content = pd.read_csv(content_path, sep='\t', encoding='utf-8', compression='gzip', chunksize=10000, iterator=True)

In [119]:
# Stream the content store chunk by chunk and keep only the rows whose
# base_path is listed in `paths`.
# NOTE: `all_govuk_content` is a one-shot chunk reader; recreate it before
# re-running this cell, otherwise the loop sees no chunks.
matched_chunks = []
count = 0
for count, chunk in enumerate(all_govuk_content, start=1):
  print(count)
  rows = chunk[chunk['base_path'].isin(paths)]
  if rows.shape[0] > 0:
    print(rows['base_path'])
    matched_chunks.append(rows)

# Concatenate once at the end: DataFrame.append is deprecated (removed in
# pandas 2.0) and re-copies the accumulated frame on every call.
df = pd.concat(matched_chunks) if matched_chunks else pd.DataFrame()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
340932    /coronavirus
Name: base_path, dtype: object
36
37
38
39
40
41
42
43
44
45
46
47
460877    /marriage-visa
Name: base_path, dtype: object
48
49
482087    /student-visa
Name: base_path, dtype: object
50
51
52
53
54
55
56
57
58
59
60
61


In [120]:
df

Unnamed: 0,base_path,content_id,title,description,publishing_app,document_type,details,text,organisations,taxons,step_by_steps,details_parts,first_published_at,public_updated_at,updated_at,finder,facet_values,facet_groups,has_brexit_no_deal_notice,withdrawn,withdrawn_at,withdrawn_explanation
340932,/coronavirus,774cee22-d896-44c1-a611-e3109cce8eae,Coronavirus (COVID-19): guidance and support,"Find information on coronavirus, including gui...",collections-publisher,coronavirus_landing_page,{},,{},[[{'title': 'Rules and restrictions during cor...,,,2020-03-20 14:29:42.000,2021-07-19 06:51:15.000,2021-12-10 15:47:46.112,,,,False,False,,
460877,/marriage-visa,b589f602-7427-4ac0-b220-ee35f711548b,Marriage Visitor visa,Apply for a visa to visit the UK if you want t...,publisher,guide,"{'parts': [{'body': [{'content': ""You must app...",You must apply for a Marriage Visitor visa if:...,{'organisations': [('04148522-b0c1-4137-b687-5...,"[[{'title': 'Family visas', 'content_id': 'd61...",,"[{'title': 'Overview', 'slug': 'overview'}, {'...",2014-02-05 15:41:42.000,2015-02-05 17:08:15.000,2021-11-26 14:54:16.695,,,,False,False,,
482087,/student-visa,c0a2a4d9-8d28-449f-8ccb-4c2b7f6e9c0f,Student visa,Apply for a Student visa to study in the UK if...,publisher,guide,"{'parts': [{'body': [{'content': ""You can appl...",You can apply for a Student visa to study in t...,{'organisations': [('04148522-b0c1-4137-b687-5...,"[[{'title': 'Student visas', 'content_id': '51...",,"[{'title': 'Overview', 'slug': 'overview'}, {'...",2014-01-29 16:10:58.000,2014-11-28 13:38:19.000,2021-12-09 16:19:24.879,,,,False,False,,


In [121]:
df.iloc[0,:]

base_path                                                         /coronavirus
content_id                                774cee22-d896-44c1-a611-e3109cce8eae
title                             Coronavirus (COVID-19): guidance and support
description                  Find information on coronavirus, including gui...
publishing_app                                           collections-publisher
document_type                                         coronavirus_landing_page
details                                                                     {}
text                                                                       NaN
organisations                                                               {}
taxons                       [[{'title': 'Rules and restrictions during cor...
step_by_steps                                                              NaN
details_parts                                                              NaN
first_published_at                                  

In [125]:
df.iloc[1,:]['details']

'{\'parts\': [{\'body\': [{\'content\': "You must apply for a Marriage Visitor visa if:\\r\\n\\r\\n+ you want to get married or register a civil partnership in the UK\\r\\n+ you want to give notice of a marriage or civil partnership in UK\\r\\n+ you\'re not planning to stay or settle in the UK after your marriage or civil partnership\\r\\n+ you meet the other [eligibility requirements](/marriage-visa/eligibility)\\r\\n\\r\\nYou do not need a Marriage Visitor visa to [convert your civil partnership into a marriage](/convert-civil-partnership) - you can apply for a [Standard Visitor visa](/standard-visitor-visa).\\r\\n\\r\\nYou also do not need a Marriage Visitor visa if one of the following is true:\\r\\n\\r\\n- you have settled or pre-settled status under the EU Settlement Scheme\\r\\n- you have applied to the EU Settlement Scheme, and have not got a decision yet\\r\\n- you’re an Irish citizen\\r\\n\\r\\n^You cannot apply if you [qualify for British citizenship](/check-british-citizen)

### Scraping Approach

In [170]:
import requests
from bs4 import BeautifulSoup

# Fetch the Student visa guide and collect the text of every govspeak block.
URL = "https://www.gov.uk/student-visa"
# Without a timeout requests can wait indefinitely on a stalled connection.
page = requests.get(URL, timeout=30)
# Stop on 4xx/5xx responses instead of silently parsing an error page.
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

# find_all is the current name of the legacy findAll alias.
body = soup.find_all(attrs={"class":"gem-c-govspeak"})
sent_list = [x.text for x in body]

In [172]:
sent_list[0].split('\n')

['', 'You can apply for a Student visa to study in the UK if you’re 16 or over and you:', '', 'have been offered a place on a course by a licensed student sponsor', '', 'have enough money to support yourself and pay for your course - the amount will vary depending on your circumstances', 'can speak, read, write and understand English', '', 'have consent from your parents if you’re 16 or 17 - you’ll need evidence of this when you apply', '', '', 'If you’re 16 or 17 and you want to study at an independent school in the UK, you may be eligible for a Child Student visa instead.', '', 'This visa has replaced the Tier 4 (General) student visa.', 'If you or your family are from the EU, Switzerland, Norway, Iceland or Liechtenstein', 'If you or your family member started living in the UK by 31 December 2020, you may be able to apply to the free EU Settlement Scheme.', 'The deadline to apply was 30 June 2021 for most people. You can still apply if either:', '', '', 'you have a later deadline - 

In [186]:
import requests
from bs4 import BeautifulSoup

# Fetch the coronavirus restrictions guidance and split each govspeak block
# into its non-empty lines.
URL = "https://www.gov.uk/guidance/covid-19-coronavirus-restrictions-what-you-can-and-cannot-do"
# Without a timeout requests can wait indefinitely on a stalled connection.
page = requests.get(URL, timeout=30)
# Stop on 4xx/5xx responses instead of silently parsing an error page.
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

# find_all is the current name of the legacy findAll alias.
body = soup.find_all(attrs={"class":"gem-c-govspeak"})
sents = [i.text.split('\n') for i in body]
# filter(None, ...) drops the empty strings left by consecutive newlines.
sents_clean = [list(filter(None, i)) for i in sents]

In [187]:
for s in sents_clean[0]:
  print(s)

What has changed
The government has announced that England is moving to Plan B in response to the risks of the Omicron variant.
This means:
Face coverings are required by law in most indoor settings.
From 13 December office workers who can work from home should do so.
From 15 December, certain venues and events will be required by law to check that all visitors aged 18 years or over are fully vaccinated, have proof of a negative test in the last 48 hours, or have an exemption.
COVID-19 remains a risk
It is still possible to catch and spread COVID-19, even if you are fully vaccinated.
Anyone with COVID-19 symptoms or a positive test result should stay at home and self-isolate immediately. If you have symptoms of COVID-19, you should arrange to take a PCR test as soon as possible, even if you’ve had one or more doses of a COVID-19 vaccine.
COVID-19 will be a feature of our lives for the foreseeable future, so we need to learn to live with it and manage the risk to ourselves and others.
A