# Model Inference

## 1. Installs and Imports

In [None]:
# !pip install datasets transformers seqeval

In [None]:
import os
import random
from collections import Counter, OrderedDict, defaultdict
from datetime import date

import numpy as np
import pandas as pd

# inference
import torch
import transformers
from datasets import ClassLabel, Sequence, load_dataset, load_from_disk, load_metric
from google.colab import drive
from IPython.display import HTML, display
from seqeval.metrics import accuracy_score
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)

print(transformers.__version__)

In [None]:
system = "COLAB"  # ["AWS", "COLAB"]

In [None]:
# Configure storage locations for the selected execution environment.
# AWS defines DATA_DIR on S3 plus a SageMaker session; COLAB mounts Google
# Drive and defines DATA_DIR, MODEL_DIR and RESULTS_DIR.
if system == "AWS":
    # Imported here so the COLAB path does not require the AWS SDKs.
    import s3fs
    import sagemaker

    fs = s3fs.S3FileSystem()
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f"s3://{s3_bucket}/model-data/govner-data"
    for f in fs.ls(DATA_DIR):
        print(f)
    # Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
    # SageMaker session bucket -> used for uploading data, models and logs.
    # SageMaker will automatically create this bucket if it does not exist.
    sess = sagemaker.Session()
    sagemaker_session_bucket = s3_bucket
    if sagemaker_session_bucket is None and sess is not None:
        # Fall back to the session's default bucket if no bucket name is given.
        sagemaker_session_bucket = sess.default_bucket()

    role = sagemaker.get_execution_role()
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sess.default_bucket()}")
    print(f"sagemaker session region: {sess.boto_region_name}")
    # NOTE(review): MODEL_DIR and RESULTS_DIR are only assigned on the COLAB
    # branch; cells that read MODEL_DIR will fail on AWS until they are set here.
elif system == "COLAB":
    drive.mount("/content/gdrive")
    DATA_DIR = os.path.join(
        "/content/gdrive/Shared drives/",
        "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data",
    )
    MODEL_DIR = os.path.join(
        "/content/gdrive/Shared drives/",
        "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models",
    )
    RESULTS_DIR = os.path.join(
        "/content/gdrive/Shared drives/",
        "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/Metrics",
    )
else:
    # Fail here rather than with a NameError on DATA_DIR in a later cell.
    raise ValueError(f'Unknown system {system!r}; expected "AWS" or "COLAB"')

In [None]:
# Sanity check: show each configured folder and its first three entries.
print(f"Data Folder: {DATA_DIR}")
print(os.listdir(DATA_DIR)[:3])
print(f"Model Folder: {MODEL_DIR}")
print(os.listdir(MODEL_DIR)[:3])

## Load Model for Inference

Load the fine-tuned model from a local checkpoint directory.


In [None]:
# Components of the fine-tuned run folder name. The next cell combines them
# with MODEL_DIR into OUTPUT_PATH:
#   {MODEL_DIR}/{model_name}-finetuned-{task}-{dataset_name}-{dataset_type}-{req_date}/{chkpoint}
MODEL_DIR  # bare reference; nothing is assigned here
model_name = "distilbert-base-uncased"  # leading segment of the run folder name
task = "ner"
dataset_name = "govuk"
req_date = "13-12-2021"  # date segment of the run folder (looks like DD-MM-YYYY; confirm)
dataset_type = "FULL"
chkpoint = "checkpoint-73500"  # sub-folder inside the run folder that is loaded

In [None]:
OUTPUT_PATH = f"{MODEL_DIR}/{model_name}-finetuned-{task}-{dataset_name}-{dataset_type}-{req_date}/{chkpoint}"
OUTPUT_PATH

In [None]:
os.listdir(OUTPUT_PATH)

## Load model and tokeniser

In [None]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

In [None]:
model = AutoModelForTokenClassification.from_pretrained(OUTPUT_PATH)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_PATH)

In [None]:
sequence = (
    "You must be at least 17 years old to have a drivers licence "
    "failure to provide this certificate will mean imprisonment in the UK and barring from countries like EU and US"
)

In [None]:
inputs = tokenizer(sequence, return_tensors="pt")

In [None]:
inputs = tokenizer(sequence, return_tensors="pt")

In [None]:
inputs

In [None]:
tokens = inputs.tokens()

In [None]:
inputs.word_ids()

In [None]:
tokens

In [None]:
outputs = model(**inputs).logits

In [None]:
predictions = torch.argmax(outputs, dim=2)

In [None]:
predictions

In [None]:
# Show each token next to the label name of its arg-max class id.
id2label = model.config.id2label
for tok, label_id in zip(tokens, predictions[0].numpy()):
    print((tok, id2label[label_id]))

## Using Huggingface Pipelines

In [None]:
# Wrap the model and tokenizer loaded above in a "ner" pipeline. To try a
# different checkpoint, load it into `model` / `tokenizer` before this cell.
# With aggregation_strategy="simple", later cells read the "entity_group",
# "start" and "end" keys from each item of the pipeline output.
token_classifier = pipeline(
    "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple"
)

In [None]:
print(sequence)
print(len(sequence))

In [None]:
result = token_classifier(sequence)

In [None]:
result

## Visualise Entities

In [None]:
import spacy
from spacy import displacy

# Baseline view: this runs spaCy's "en_core_web_sm" pipeline on the example
# sequence, not the fine-tuned transformer model loaded earlier.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sequence)
# Render the entities found by the spaCy pipeline inside the notebook.
# NOTE(review): "distance" looks like a dependency-visualiser option and may
# have no effect with style="ent"; confirm.
displacy.render(doc, style="ent", jupyter=True, options={"distance": 90})

In [None]:
# Hand-written example input for display_entities.
# Each tuple is (label, start_char, end_char); end_char is exclusive,
# e.g. text[11:21] == "John Smith".
text = "My name is John Smith and I live in Paris"
entities = [
    ("Employee", 11, 21),  # John Smith
    ("Location", 36, 41),  # Paris
]

In [None]:
import spacy


def display_entities(text, entities):
    """Render labelled character spans of ``text`` with displaCy.

    Args:
        text: The raw string that the character offsets refer to.
        entities: Iterable of ``(label, start_char, end_char)`` items.

    Offsets that fall inside a token of the blank English tokenizer are
    widened to the enclosing token(s) rather than yielding ``None``, and
    overlapping spans are reduced to a non-overlapping set, because
    ``doc.ents`` rejects both ``None`` entries and overlaps.
    """
    nlp = spacy.blank("en")
    doc = nlp(text)
    spans = []
    for ee in entities:
        # alignment_mode="expand" snaps sub-token offsets (e.g. word-piece
        # boundaries from the transformer tokenizer) out to whole tokens.
        span = doc.char_span(ee[1], ee[2], label=ee[0], alignment_mode="expand")
        if span is not None:
            spans.append(span)
    # Expansion can make neighbouring spans collide; keep a non-overlapping set.
    doc.ents = spacy.util.filter_spans(spans)
    displacy.render(doc, style="ent", jupyter=True, options={"distance": 90})


def tokenise_and_display(text):
    """Run the NER pipeline on ``text`` and render the predicted entities.

    Every pipeline result is reduced to an ``(entity_group, start, end)``
    tuple and the collected tuples are passed to ``display_entities``.
    """
    spans = []
    for item in token_classifier(text):
        spans.append((item["entity_group"], item["start"], item["end"]))
    display_entities(text, entities=spans)

In [None]:
display_entities(text, entities)

In [None]:
result

In [None]:
res_ents = [(i["entity_group"], i["start"], i["end"]) for i in result]
res_ents

In [None]:
display_entities(sequence, res_ents)

In [None]:
tokenise_and_display(sequence)

## Test With GOV.UK Pages

Now, we want to test how the model performs on inference tasks for NER on a selection of pages from GOV.UK.

The pages are:
*   Coronavirus guidance
  * https://www.gov.uk/guidance/covid-19-coronavirus-restrictions-what-you-can-and-cannot-do#what-has-changed

* Visitor Visa
  * Marriage visitor visa - https://www.gov.uk/marriage-visa
  * Marriage visitor visa eligibility - https://www.gov.uk/marriage-visa/eligibility
  * Marriage visitor visa documents you’ll need - https://www.gov.uk/marriage-visa/documents-you-will-need
  * Marriage visitor visa apply from outside the UK - https://www.gov.uk/marriage-visa/apply

* Study in the UK
  * https://www.gov.uk/student-visa


### Approach

1. Get content of pages into local notebook
2. Locate units of the content - title, sub-heading, main body
3. For each unit, split into sentences
4. Run each sentence through the model

#### 1. Get content of pages into local notebook

Download preprocessed content store data from AWS.

In [None]:
content_path = os.path.join(
    DATA_DIR, "govuk_content/preprocessed_content_store_141221.csv"
)
content_path

In [None]:
govuk_content = pd.read_csv(
    content_path, sep="\t", nrows=100, encoding="utf-8", compression="gzip"
)

In [None]:
# Print the base_path of every row in the preview, each followed by a blank
# line. Iterating the column directly avoids building a row Series per index.
for base_path in govuk_content["base_path"]:
    print(base_path)
    print()

#### 2. Locate units of the content

In [None]:
# base_path values of the pages to pull out of the content store.
# These are matched exactly against the "base_path" column, so they must be
# plain URL paths: a "#fragment" would stop the page from ever matching.
paths = [
    "/student-visa",
    "/marriage-visa",
    "/marriage-visa/eligibility",
    "/marriage-visa/documents-you-will-need",
    "/marriage-visa/apply",
    "/guidance/covid-19-coronavirus-restrictions-what-you-can-and-cannot-do",
]

In [None]:
all_govuk_content = pd.read_csv(
    content_path,
    sep="\t",
    encoding="utf-8",
    compression="gzip",
    chunksize=10000,
    iterator=True,
)

In [None]:
# Stream the content store chunk by chunk and keep only the rows whose
# base_path is one of the selected pages.
# Matching rows are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame each call.
matched_chunks = []
count = 0
for chunk in all_govuk_content:
    count += 1
    print(count)  # progress indicator: number of chunks read so far
    rows = chunk[chunk["base_path"].isin(paths)]
    if not rows.empty:
        print(rows["base_path"])
        matched_chunks.append(rows)

# Fall back to an empty frame when no page matched, as the original loop did.
df = pd.concat(matched_chunks) if matched_chunks else pd.DataFrame()

In [None]:
df

In [None]:
df.iloc[0, :]

In [None]:
df.iloc[1, :]["details"]

### Scraping Approach

In [None]:
import requests
from bs4 import BeautifulSoup

# Fetch the live Student visa page and collect the text of every element
# carrying the "gem-c-govspeak" class.
URL = "https://www.gov.uk/student-visa"
# A timeout stops the cell hanging indefinitely on a stalled connection.
page = requests.get(URL, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

# find_all is the current name of the deprecated findAll alias.
body = soup.find_all(attrs={"class": "gem-c-govspeak"})
sent_list = [element.text for element in body]

In [None]:
sent_list[0].split("\n")

In [None]:
import requests
from bs4 import BeautifulSoup

# Fetch the live coronavirus guidance page and split each "gem-c-govspeak"
# element into its non-empty lines of text.
URL = "https://www.gov.uk/guidance/covid-19-coronavirus-restrictions-what-you-can-and-cannot-do"
# A timeout stops the cell hanging indefinitely on a stalled connection.
page = requests.get(URL, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

# find_all is the current name of the deprecated findAll alias.
body = soup.find_all(attrs={"class": "gem-c-govspeak"})
sents = [element.text.split("\n") for element in body]
# Drop the empty strings left behind by consecutive newlines.
sents_clean = [[line for line in lines if line] for lines in sents]

In [None]:
for s in sents_clean[0]:
    print(s)