# Model Inference

## 1. Installs and Imports

In [None]:
!pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
[K     |████████████████████████████████| 306 kB 3.0 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 12.4 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.2 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 62.5 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 32.5 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 344 kB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

In [1]:
import os
import random
import transformers
import pandas as pd
import numpy as np
from datetime import date
from pathlib import Path
from google.colab import drive
from seqeval.metrics import accuracy_score
from IPython.display import display, HTML
from collections import defaultdict, Counter, OrderedDict
from datasets import load_dataset, load_metric, load_from_disk, ClassLabel, Sequence
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification

#scraping
import requests
from bs4 import BeautifulSoup

import spacy
from spacy import displacy

#inference
import torch
from transformers import pipeline

print(transformers.__version__)

In [None]:
# Runtime environment switch read by the setup cell that follows.
system = "COLAB" # allowed values: "AWS", "COLAB"

In [None]:
# Environment-specific setup: defines DATA_DIR (and, on Colab, MODEL_DIR and
# RESULTS_DIR) for the rest of the notebook.
if system == "AWS":
    # Only this branch needs these packages, so import them here. They were
    # previously used without any import, which made this branch fail with
    # NameError.
    import s3fs
    import sagemaker

    fs = s3fs.S3FileSystem()
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f's3://{s3_bucket}/model-data/govner-data'
    for f in fs.ls(DATA_DIR):
        print(f)
    # Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
    # sagemaker session bucket -> used for uploading data, models and logs
    # sagemaker will automatically create this bucket if it does not exist
    sess = sagemaker.Session()
    sagemaker_session_bucket = s3_bucket
    # Never true while the bucket name is hard-coded above; kept as the
    # fallback for the case where no bucket name is given.
    if sagemaker_session_bucket is None and sess is not None:
        # set to default bucket if a bucket name is not given
        sagemaker_session_bucket = sess.default_bucket()

    role = sagemaker.get_execution_role()
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sess.default_bucket()}")
    print(f"sagemaker session region: {sess.boto_region_name}")
    # NOTE(review): MODEL_DIR and RESULTS_DIR are not defined on this branch,
    # yet later cells read MODEL_DIR - confirm the intended S3 locations.
elif system == "COLAB":
    drive.mount("/content/gdrive")
    # Shared project folder on the mounted drive; the three folders below
    # resolve to exactly the same paths as before.
    PROJECT_DIR = os.path.join("/content/gdrive/Shared drives/", "GOV.UK teams/2020-2021/Data labs/content-metadata-2021")
    DATA_DIR = os.path.join(PROJECT_DIR, "Data")
    MODEL_DIR = os.path.join(PROJECT_DIR, "Models")
    RESULTS_DIR = os.path.join(MODEL_DIR, "Metrics")
else:
    # Fail here rather than with a confusing NameError on DATA_DIR later on.
    raise ValueError(f'Unknown system {system!r}; expected "AWS" or "COLAB"')

Mounted at /content/gdrive


In [None]:
# Sanity check: echo each folder and peek at its first three entries.
for label, folder in (("Data", DATA_DIR), ("Model", MODEL_DIR)):
  print(f"{label} Folder: {folder}")
  print(os.listdir(folder)[:3])

Data Folder: /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data
['label_map_09062020_more_ents.json', 'label_map_12062020_more_ents.json', 'line_by_line_NER_data_sampled_09062020_more_ents.csv']
Model Folder: /content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models
['distilbert-base-uncased-finetuned-ner-conll2003', 'distilbert-base-uncased-finetuned-ner-govuk', 'distilbert-base-uncased-finetuned-ner-govuk-SAMPLED-08-12-2021']


## 2. Scrape GOV.UK Content

In [None]:
def get_page_soup(url, timeout=30):
  """Download a web page and parse it into a BeautifulSoup tree.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout the request can block indefinitely.

  Returns:
    A BeautifulSoup object built from the response body with the built-in
    "html.parser".

  Raises:
    requests.HTTPError: If the server answers with an error status.
    requests.RequestException: On connection problems or timeout.
  """
  page = requests.get(url, timeout=timeout)
  # Fail loudly on an error status instead of parsing (and later running
  # NER on) an error page.
  page.raise_for_status()
  return BeautifulSoup(page.content, "html.parser")

In [None]:
def get_sents_from_soup(soup):
  """Extract the text lines of every govspeak block on a parsed page.

  Args:
    soup: Parsed page; any object offering BeautifulSoup's `find_all`.

  Returns:
    A list with one entry per element carrying the "gem-c-govspeak" class.
    Each entry is the list of that element's text lines: split on newlines,
    stripped of surrounding whitespace, with empty lines dropped. The outer
    list is empty when the page has no such element.
  """
  # `find_all` is the current bs4 spelling; `findAll` is its legacy alias.
  blocks = soup.find_all(attrs={"class": "gem-c-govspeak"})
  sents_clean = []
  for block in blocks:
    stripped = (line.strip() for line in block.text.split('\n'))
    # Testing after strip() also removes whitespace-only lines, which the
    # previous `filter(None, ...)` let through as "sentences".
    sents_clean.append([line for line in stripped if line])
  return sents_clean

In [None]:
def url_get_sents(url):
  """Fetch `url` and return its govspeak text lines.

  Chains `get_page_soup` and `get_sents_from_soup`; the return value is
  whatever `get_sents_from_soup` produces for the downloaded page.
  """
  return get_sents_from_soup(get_page_soup(url))

In [None]:
# Smoke-test the scraper on one page.
# NOTE(review): `result` is reassigned further down to hold NER pipeline
# output, so this value does not persist past that point.
result = url_get_sents('https://www.gov.uk/student-visa')

In [None]:
result

[['You can apply for a Student visa to study in the UK if you’re 16 or over and you:',
  'have been offered a place on a course by a licensed student sponsor',
  'have enough money to support yourself and pay for your course - the amount will vary depending on your circumstances',
  'can speak, read, write and understand English',
  'have consent from your parents if you’re 16 or 17 - you’ll need evidence of this when you apply',
  'If you’re 16 or 17 and you want to study at an independent school in the UK, you may be eligible for a Child Student visa instead.',
  'This visa has replaced the Tier 4 (General) student visa.',
  'If you or your family are from the EU, Switzerland, Norway, Iceland or Liechtenstein',
  'If you or your family member started living in the UK by 31 December 2020, you may be able to apply to the free EU Settlement Scheme.',
  'The deadline to apply was 30 June 2021 for most people. You can still apply if either:',
  'you have a later deadline - for example, yo

## 3. Do Inference

### 3A. Load Model

Load the fine-tuned model from the local (mounted) model directory.


In [None]:
# Pieces of the fine-tuned run's folder name; the next cell combines them with
# MODEL_DIR into the checkpoint path. (A stray bare `MODEL_DIR` expression that
# had no effect was removed from the top of this cell.)
model_name = "distilbert-base-uncased"
task = "ner"
dataset_name = "govuk"
req_date = "13-12-2021"
dataset_type = 'FULL'
chkpoint = 'checkpoint-73500'

In [None]:
# Folder name of the fine-tuned run, then the checkpoint folder inside it.
run_name = f"{model_name}-finetuned-{task}-{dataset_name}-{dataset_type}-{req_date}"
OUTPUT_PATH = f"{MODEL_DIR}/{run_name}/{chkpoint}"
OUTPUT_PATH

'/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-FULL-13-12-2021/checkpoint-73500'

In [None]:
os.listdir(OUTPUT_PATH)

['config.json',
 'pytorch_model.bin',
 'tokenizer_config.json',
 'special_tokens_map.json',
 'vocab.txt',
 'tokenizer.json',
 'training_args.bin',
 'optimizer.pt',
 'scheduler.pt',
 'trainer_state.json',
 'rng_state.pth']

Load model and tokeniser

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

In [None]:
# Load the token-classification model from the checkpoint folder listed above.
model = AutoModelForTokenClassification.from_pretrained(OUTPUT_PATH)

In [None]:
# Load the tokenizer from the same checkpoint folder as the model.
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_PATH)

### 3B. Hugging Face Pipelines

Use Hugging Face pipelines

In [None]:
# Adjacent string literals are concatenated by the parser; the first piece
# ends with a space so the two halves do not run together.
sequence = (
  "You must be at least 17 years old to have a drivers licence "
  "failure to provide this certificate will mean imprisonment in the UK and barring from countries like EU and US"
)

In [None]:
# Set up the NER pipeline with the loaded model and tokeniser.
# With aggregation_strategy="simple" the results are grouped entities: dicts
# with 'entity_group', 'score', 'word', 'start' and 'end', as the outputs
# further down show.
token_classifier = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

In [None]:
# Show the test sentence and, on the next line, its length in characters.
print(sequence, len(sequence), sep="\n")

You must be at least 17 years old to have a drivers licence failure to provide this certificate will mean imprisonment in the UK and barring from countries like EU and US
170


In [None]:
result = token_classifier(sequence)

In [None]:
result

[{'end': 33,
  'entity_group': 'STATE',
  'score': 0.6916776,
  'start': 21,
  'word': '17 years old'},
 {'end': 51,
  'entity_group': 'PER',
  'score': 0.99765354,
  'start': 44,
  'word': 'drivers'},
 {'end': 59,
  'entity_group': 'FORM',
  'score': 0.9795044,
  'start': 52,
  'word': 'licence'},
 {'end': 95,
  'entity_group': 'FORM',
  'score': 0.98986244,
  'start': 84,
  'word': 'certificate'},
 {'end': 128,
  'entity_group': 'LOC',
  'score': 0.9997448,
  'start': 122,
  'word': 'the uk'},
 {'end': 155,
  'entity_group': 'LOC',
  'score': 0.99967575,
  'start': 146,
  'word': 'countries'},
 {'end': 163,
  'entity_group': 'LOC',
  'score': 0.9998914,
  'start': 161,
  'word': 'eu'},
 {'end': 170,
  'entity_group': 'LOC',
  'score': 0.99533564,
  'start': 168,
  'word': 'us'}]

Now try with text scraped from GOV.UK pages

In [None]:
page_sents = url_get_sents(url='https://www.gov.uk/marriage-visa/documents-you-will-need')

In [None]:
page_sents

[['You must provide a passport or travel document. Your passport should be valid for the whole of your stay in the UK and contain a blank page for your visa.',
  'You can supply the following to support your application:',
  'details of the marriage or civil partnership and proof that you’ve paid money for some of its costs',
  'proof that you’re planning to get married in the UK, for example a booking confirmation or emails between you and the venue',
  'See the full list of documents you can provide to prove your eligibility.',
  'You’ll need to provide a certified translation of any documents that are not in English or Welsh.',
  'If you’ve been married before',
  'You’ll need to show proof that you’re free to marry or enter into a civil partnership again, for example a:',
  'decree absolute',
  'death certificate of a previous partner',
  'You may need to provide additional documents depending on your circumstances.']]

In [None]:
# Run NER one sentence at a time. Each entity's character offsets are shifted
# by the combined length of the sentences before it, plus one per sentence
# for the single space used when the sentences are joined later.
sent_len = 0
ners = []
for sentence in page_sents[0]:
  result = token_classifier(sentence)
  for entity in result:
    entity['start'] = entity['start'] + sent_len
    entity['end'] = entity['end'] + sent_len
  ners.append(result)
  sent_len = sent_len + len(sentence) + 1

In [None]:
# Collapse the per-sentence result lists into one flat list of entities.
ners_flat = []
for sentence_entities in ners:
  ners_flat.extend(sentence_entities)

In [None]:
ners_flat

[{'end': 27,
  'entity_group': 'FORM',
  'score': 0.99917954,
  'start': 19,
  'word': 'passport'},
 {'end': 37,
  'entity_group': 'EVENT',
  'score': 0.99745935,
  'start': 31,
  'word': 'travel'},
 {'end': 61,
  'entity_group': 'FORM',
  'score': 0.9692217,
  'start': 38,
  'word': 'document. your passport'},
 {'end': 77,
  'entity_group': 'STATE',
  'score': 0.99483514,
  'start': 72,
  'word': 'valid'},
 {'end': 114,
  'entity_group': 'LOC',
  'score': 0.9943054,
  'start': 108,
  'word': 'the uk'},
 {'end': 153,
  'entity_group': 'FORM',
  'score': 0.98904836,
  'start': 144,
  'word': 'your visa'},
 {'end': 211,
  'entity_group': 'FORM',
  'score': 0.9993825,
  'start': 200,
  'word': 'application'},
 {'end': 236,
  'entity_group': 'STATE',
  'score': 0.99277097,
  'start': 228,
  'word': 'marriage'},
 {'end': 257,
  'entity_group': 'STATE',
  'score': 0.99308074,
  'start': 240,
  'word': 'civil partnership'},
 {'end': 267,
  'entity_group': 'FORM',
  'score': 0.927255,
  'start

In [None]:
# Keep only what the visualiser needs: (label, start offset, end offset).
res_ents = []
for entity in ners_flat:
  res_ents.append((entity['entity_group'], entity['start'], entity['end']))
res_ents

[('FORM', 19, 27),
 ('EVENT', 31, 37),
 ('FORM', 38, 61),
 ('STATE', 72, 77),
 ('LOC', 108, 114),
 ('FORM', 144, 153),
 ('FORM', 200, 211),
 ('STATE', 228, 236),
 ('STATE', 240, 257),
 ('FORM', 262, 267),
 ('FINANCE', 280, 290),
 ('FINANCE', 307, 312),
 ('STATE', 347, 354),
 ('LOC', 358, 364),
 ('CONTACT', 388, 400),
 ('CONTACT', 404, 410),
 ('LOC', 431, 436),
 ('FORM', 458, 467),
 ('FORM', 536, 557),
 ('FORM', 565, 574),
 ('LOC', 602, 607),
 ('STATE', 624, 631),
 ('DATE', 632, 638),
 ('FORM', 659, 664),
 ('STATE', 707, 724),
 ('EVENT', 754, 762),
 ('FORM', 769, 780),
 ('PER', 795, 802),
 ('FORM', 838, 847),
 ('STATE', 866, 879)]

Stitch the sentences into one 'doc'

In [None]:
# Join with exactly one space: the offset loop above advances by the sentence
# length plus one per sentence, so the shifted entity offsets line up with
# positions in this string.
seq = ' '.join(page_sents[0])

In [None]:
seq

'You must provide a passport or travel document. Your passport should be valid for the whole of your stay in the UK and contain a blank page for your visa. You can supply the following to support your application: details of the marriage or civil partnership and proof that you’ve paid money for some of its costs proof that you’re planning to get married in the UK, for example a booking confirmation or emails between you and the venue See the full list of documents you can provide to prove your eligibility. You’ll need to provide a certified translation of any documents that are not in English or Welsh. If you’ve been married before You’ll need to show proof that you’re free to marry or enter into a civil partnership again, for example a: decree absolute death certificate of a previous partner You may need to provide additional documents depending on your circumstances.'

In [None]:
# Highlight colour (hex) per entity label, handed to displaCy as the "colors"
# option. "FORM" used to be listed twice in this literal; Python keeps the
# last value for a repeated key, so "#48aba2" was already the colour in
# effect and is the one kept here.
colors = {"ORG": "#7c5cdd",
          "FORM": "#48aba2",
          "LOC": "#eee65c",
          "MONEY": "#80bab2",
          "SCHEME": "#b76d14",
          "DATE": "#bc8251",
          "STATE": "#bd4c33",
          "PER": "#c0970b",
          "FINANCE": "#debdd8",
          "EVENT": "#0a8dd9",
          "CONTACT": "#807388"}

In [None]:
def display_entities(text, entities):
  """Render `text` in the notebook with the given entity spans highlighted.

  Args:
    text: The full string the entity offsets refer to.
    entities: Iterable of `(label, start_char, end_char)` items.

  Returns:
    None; the visualisation is shown as notebook output.
  """
  nlp = spacy.blank("en")
  doc = nlp(text)
  ents = []
  for ee in entities:
    span = doc.char_span(ee[1], ee[2], label=ee[0])
    # char_span yields None when the offsets do not coincide with spaCy token
    # boundaries; assigning None into doc.ents raises, so skip those spans.
    if span is not None:
      ents.append(span)
  doc.ents = ents
  options = {'distance': 90, 'colors': colors}
  displacy.render(doc, style='ent', jupyter=True, options=options)

In [None]:
display_entities(text=seq, entities=res_ents)

In [None]:
def get_ners_and_flatten(sents, classifier=None):
  """Run NER over the first block of sentences and return flat entity tuples.

  Only `sents[0]` is processed, matching the text that is stitched together
  for display elsewhere in this notebook.

  Args:
    sents: List of sentence lists, as returned by `url_get_sents`.
    classifier: Callable mapping one sentence to a list of dicts with
      'entity_group', 'start' and 'end' keys. Defaults to the notebook-level
      `token_classifier` pipeline.

  Returns:
    A list of `(entity_group, start, end)` tuples. Offsets are shifted so
    they index into the sentences of `sents[0]` joined by single spaces.
    Empty when `sents` is empty.
  """
  if not sents:
    # No block at all: previously `sents[0]` raised IndexError here.
    return []
  if classifier is None:
    classifier = token_classifier
  offset = 0
  res_ents = []
  for sentence in sents[0]:
    for ent in classifier(sentence):
      res_ents.append((ent['entity_group'], ent['start'] + offset, ent['end'] + offset))
    # +1 accounts for the space inserted between sentences when joining.
    offset += len(sentence) + 1
  return res_ents

In [None]:
def stitch_sents(sent_list):
  """Join the given sentences into one string separated by single spaces."""
  return ' '.join(sent_list)

In [None]:
def display_entities(text, entities, jupyter=True):
  """Visualise entity spans of `text` with displaCy's entity style.

  Note: this redefines the function of the same name from an earlier cell.

  Args:
    text: The full string the entity offsets refer to.
    entities: Iterable of `(label, start_char, end_char)` items.
    jupyter: Passed through to `displacy.render`. True (the default, and
      the previous hard-coded value) renders into the notebook; False makes
      the call return the markup instead, e.g. for saving to a file.

  Returns:
    Whatever `displacy.render` returns for the chosen `jupyter` mode.
  """
  nlp = spacy.blank("en")
  doc = nlp(text)
  ents = []
  for ee in entities:
    span = doc.char_span(ee[1], ee[2], label=ee[0])
    # char_span yields None when the offsets do not coincide with spaCy token
    # boundaries; assigning None into doc.ents raises, so skip those spans.
    if span is not None:
      ents.append(span)
  doc.ents = ents
  options = {'distance': 90, 'colors': colors}
  return displacy.render(doc, style='ent', jupyter=jupyter, options=options)

## 4. Pipeline

In [None]:
def url_to_spacy_viz(url):
  """Scrape a page, run NER over it and display the highlighted text.

  Args:
    url: Address of the page to process; it is printed first so the output
      can be matched to its page.

  Returns:
    The return value of `display_entities`, or None when the scrape yields
    no text blocks.
  """
  print(url)
  sents = url_get_sents(url)
  if not sents:
    # Nothing was scraped; indexing sents[0] below would raise IndexError.
    print("No content blocks found, nothing to display")
    return None
  flat_ners = get_ners_and_flatten(sents)
  stitched = stitch_sents(sents[0])
  return display_entities(stitched, flat_ners)

In [None]:
url_to_spacy_viz(url='https://www.gov.uk/hmrc-internal-manuals/tobacco-products-duty/tpd3180')

https://www.gov.uk/hmrc-internal-manuals/tobacco-products-duty/tpd3180


In [None]:
paths = ['/student-visa', 
         '/marriage-visa', 
         '/marriage-visa/eligibility', 
         '/marriage-visa/documents-you-will-need', 
         '/marriage-visa/apply', 
         '/guidance/covid-19-coronavirus-restrictions-what-you-can-and-cannot-do#what-has-changed',
         ]

In [None]:
# Visualise every page in `paths`, printing the image name reserved for it.
for p in paths:
  # https, matching the GOV.UK URLs used earlier in this notebook.
  url_p = f"https://www.gov.uk{p}"
  p_dash = p.replace("/", "_")
  fname = f"{DATA_DIR}/Images/img_{p_dash}"
  print(fname)
  url_to_spacy_viz(url_p)
  # NOTE(review): nothing is written to `fname` yet. The visualisation is
  # rendered straight into the notebook, so there is no markup to save here;
  # saving would need displacy.render(..., jupyter=False), whose result for
  # the 'ent' style is HTML rather than SVG.

/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Images/img__student-visa
http://www.gov.uk/student-visa


/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Images/img__marriage-visa
http://www.gov.uk/marriage-visa


/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Images/img__marriage-visa_eligibility
http://www.gov.uk/marriage-visa/eligibility


/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Images/img__marriage-visa_documents-you-will-need
http://www.gov.uk/marriage-visa/documents-you-will-need


/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Images/img__marriage-visa_apply
http://www.gov.uk/marriage-visa/apply


/content/gdrive/Shared drives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Images/img__guidance_covid-19-coronavirus-restrictions-what-you-can-and-cannot-do#what-has-changed
http://www.gov.uk/guidance/covid-19-coronavirus-restrictions-what-you-can-and-cannot-do#what-has-changed
