# Model Inference

## 1. Installs and Imports

In [None]:
!pip install datasets transformers seqeval

In [None]:
import os
import random
from collections import Counter, OrderedDict, defaultdict
from datetime import date
from pathlib import Path

import numpy as np
import pandas as pd

# scraping
import requests
import spacy

# inference
import torch
import transformers
from bs4 import BeautifulSoup
from datasets import ClassLabel, Sequence, load_dataset, load_from_disk, load_metric
from google.colab import drive
from IPython.display import HTML, display
from seqeval.metrics import accuracy_score
from spacy import displacy
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Show which transformers version this session is running.
print(transformers.__version__)

In [None]:
# Runtime environment selector. Only the "COLAB" value is handled by the
# path-setup cell that follows.
system = "COLAB"  # ["AWS", "COLAB"]

In [None]:
if system == "COLAB":
    # Mount Google Drive so the shared-drive project folders are reachable.
    drive.mount("/content/gdrive")
    # All project folders hang off one shared-drive root; build it once so
    # the three locations cannot drift apart.
    _PROJECT_ROOT = os.path.join(
        "/content/gdrive/Shared drives/",
        "GOV.UK teams/2020-2021/Data labs/content-metadata-2021",
    )
    DATA_DIR = os.path.join(_PROJECT_ROOT, "Data")
    MODEL_DIR = os.path.join(_PROJECT_ROOT, "Models")
    RESULTS_DIR = os.path.join(MODEL_DIR, "Metrics")
else:
    # Fail here with a clear message instead of a NameError on the first
    # later use of DATA_DIR / MODEL_DIR.
    raise NotImplementedError(
        f"No data/model paths are configured for system={system!r}; "
        'only "COLAB" is supported.'
    )

In [None]:
# Sanity check: print each configured folder followed by up to three of
# its directory entries.
for label, folder in (("Data", DATA_DIR), ("Model", MODEL_DIR)):
    print(f"{label} Folder: {folder}")
    print(os.listdir(folder)[:3])

## 2. Scrape GOV.UK Content

In [None]:
def get_page_soup(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    page = requests.get(url, timeout=timeout)
    # Stop on error responses so an error page is never parsed as content.
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")

In [None]:
def get_sents_from_soup(soup):
    """Extract the non-empty text lines of every govspeak block in a page.

    Args:
        soup: Parsed page exposing ``find_all`` (e.g. a ``BeautifulSoup``).

    Returns:
        A list with one entry per element whose class is ``gem-c-govspeak``;
        each entry is the list of that element's text lines, split on
        newlines, with empty lines removed.
    """
    # ``find_all`` is the current BeautifulSoup spelling; ``findAll`` is a
    # deprecated alias of it.
    blocks = soup.find_all(attrs={"class": "gem-c-govspeak"})
    return [
        [line for line in block.text.split("\n") if line] for block in blocks
    ]

In [None]:
def url_get_sents(url):
    """Fetch ``url`` and return its cleaned sentence lists.

    Thin convenience wrapper: the page is downloaded and parsed by
    ``get_page_soup`` and the result is passed to ``get_sents_from_soup``,
    whose return value is handed back unchanged.
    """
    return get_sents_from_soup(get_page_soup(url))

In [None]:
# Try the scraper on a single GOV.UK page.
result = url_get_sents("https://www.gov.uk/student-visa")

In [None]:
# Display the scraped sentence lists.
result

## 3. Do Inference

### 3A. Load Model

Load the model from a locally mounted checkpoint directory.


In [None]:
# MODEL_DIR
# model_name = "distilbert-base-uncased"
# task = "ner"
# dataset_name = "govuk"
# req_date = "13-12-2021"
# dataset_type = 'FULL'
# chkpoint = 'checkpoint-73500'

In [None]:
# OUTPUT_PATH = f"{MODEL_DIR}/{model_name}-finetuned-{task}-{dataset_name}-{dataset_type}-{req_date}/{chkpoint}"
# OUTPUT_PATH

In [None]:
# Checkpoint directory handed to ``from_pretrained`` for both the model and
# the tokenizer.
# NOTE(review): this path uses "Shareddrives" while MODEL_DIR is built from
# "Shared drives" — confirm both resolve to the same mounted folder.
OUTPUT_PATH = "/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models/distilbert-base-uncased-finetuned-ner-govuk-14-01-2022-validated_train/checkpoint-1000"

In [None]:
# List the files in the checkpoint directory.
os.listdir(OUTPUT_PATH)

Load model and tokeniser

In [None]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

In [None]:
# Load the token-classification model from the checkpoint directory.
model = AutoModelForTokenClassification.from_pretrained(OUTPUT_PATH)

In [None]:
# Load the tokenizer from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_PATH)

### 3B. Hugging Face Pipelines

Use the Hugging Face `pipeline` API.

In [None]:
# Hand-written example text used to try out the NER pipeline.
sequence = (
    "You must be at least 17 years old to have a drivers licence "
    "failure to provide this certificate will mean imprisonment in the UK and barring from countries like EU and US"
)

In [None]:
# set up pipeline with model and tokeniser
token_classifier = pipeline(
    "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple"
)

In [None]:
# Show the example text and its length in characters.
print(sequence)
print(len(sequence))

In [None]:
# Run the NER pipeline over the example text.
result = token_classifier(sequence)

In [None]:
# Display the pipeline output for the example text.
result

Now try it on text scraped from GOV.UK pages.

In [None]:
# Scrape a GOV.UK page so the classifier can be run on its sentences.
page_sents = url_get_sents(
    url="https://www.gov.uk/marriage-visa/documents-you-will-need"
)

In [None]:
# Display the scraped sentence lists.
page_sents

In [None]:
# Run NER sentence by sentence over the first scraped block and shift each
# entity's character offsets so they index into the space-joined document.
sent_len = 0  # characters consumed by the sentences processed so far
ners = []  # one pipeline result (list of entity dicts) per sentence
for i in page_sents[0]:
    result = token_classifier(i)
    for j in result:
        # Add the running offset to the entity's start/end positions.
        j["start"] += sent_len
        j["end"] += sent_len
    # +1 accounts for the single space used when the sentences are joined.
    sent_len += len(i) + 1
    ners.append(result)

In [None]:
# Flatten the per-sentence entity lists into one list.
ners_flat = [item for sublist in ners for item in sublist]

In [None]:
# Display the flattened entity list.
ners_flat

In [None]:
# Reduce each entity dict to an (entity_group, start, end) tuple.
res_ents = [(i["entity_group"], i["start"], i["end"]) for i in ners_flat]
res_ents

Stitch the sentences into one 'doc'.

In [None]:
# Join the first block's sentences with single spaces; the entity offsets
# were shifted by len(sentence) + 1 to match this separator.
seq = " ".join(page_sents[0])

In [None]:
# Display the stitched document string.
seq

In [None]:
# Highlight colour for each entity label, passed to displaCy through the
# "colors" render option.
colors = {
    "ORG": "#7c5cdd",
    # "FORM" used to be listed twice; a dict literal keeps only the last
    # value for a repeated key, so the effective colour "#48aba2" is kept.
    "FORM": "#48aba2",
    "LOC": "#eee65c",
    "MONEY": "#80bab2",
    "SCHEME": "#b76d14",
    "DATE": "#bc8251",
    "STATE": "#bd4c33",
    "PER": "#c0970b",
    "FINANCE": "#debdd8",
    "EVENT": "#0a8dd9",
    "CONTACT": "#807388",
}

In [None]:
def display_entities(text, entities):
    """Render ``text`` with its entities highlighted via displaCy.

    Args:
        text: The document string the entity offsets refer to.
        entities: Iterable of ``(label, start_char, end_char)`` sequences.

    Entities whose character offsets do not line up with spaCy token
    boundaries are skipped: ``Doc.char_span`` returns ``None`` for them and
    assigning ``None`` to ``doc.ents`` would raise.
    """
    nlp = spacy.blank("en")
    doc = nlp(text)
    spans = (doc.char_span(ent[1], ent[2], ent[0]) for ent in entities)
    # Keep only the spans spaCy could align to its own tokenisation.
    doc.ents = [span for span in spans if span is not None]
    options = {"distance": 90, "colors": colors}
    displacy.render(doc, style="ent", jupyter=True, options=options)

In [None]:
# Render the stitched document with the entities found for it.
display_entities(text=seq, entities=res_ents)

In [None]:
def get_ners_and_flatten(sents, classifier=None):
    """Run NER over the first block of sentences and return flat entity tuples.

    Args:
        sents: List of sentence lists; only the first inner list
            (``sents[0]``) is processed.
        classifier: Callable mapping a sentence string to a list of dicts
            with ``"entity_group"``, ``"start"`` and ``"end"`` keys. Defaults
            to the notebook-level ``token_classifier`` pipeline.

    Returns:
        A list of ``(entity_group, start, end)`` tuples whose offsets index
        into the sentences joined by single spaces. Empty when ``sents`` is
        empty.
    """
    if classifier is None:
        classifier = token_classifier
    if not sents:
        return []

    res_ents = []
    offset = 0  # characters consumed by the sentences processed so far
    for sentence in sents[0]:
        for entity in classifier(sentence):
            res_ents.append(
                (
                    entity["entity_group"],
                    entity["start"] + offset,
                    entity["end"] + offset,
                )
            )
        # +1 for the space that separates sentences once they are joined.
        offset += len(sentence) + 1
    return res_ents

In [None]:
def stitch_sents(sent_list):
    """Return the strings in ``sent_list`` joined by single spaces."""
    separator = " "
    return separator.join(sent_list)

In [None]:
def display_entities(text, entities):
    """Render ``text`` with its entities highlighted and return the result.

    Args:
        text: The document string the entity offsets refer to.
        entities: Iterable of ``(label, start_char, end_char)`` sequences.

    Returns:
        Whatever ``displacy.render`` returns for the annotated document.

    Entities whose character offsets do not line up with spaCy token
    boundaries are skipped: ``Doc.char_span`` returns ``None`` for them and
    assigning ``None`` to ``doc.ents`` would raise.
    """
    nlp = spacy.blank("en")
    doc = nlp(text)
    spans = (doc.char_span(ent[1], ent[2], ent[0]) for ent in entities)
    # Keep only the spans spaCy could align to its own tokenisation.
    doc.ents = [span for span in spans if span is not None]
    options = {"distance": 90, "colors": colors}
    return displacy.render(doc, style="ent", jupyter=True, options=options)

## 4. Pipeline

In [None]:
def url_to_spacy_viz(url):
    """Scrape a page, run NER on its first content block and render it.

    Args:
        url: Address of the page to visualise; it is printed first.

    Returns:
        Whatever ``display_entities`` returns for the stitched text.

    Raises:
        ValueError: If no content block was scraped from the page. This
            replaces the opaque ``IndexError`` that ``sents[0]`` would raise.
    """
    print(url)
    sents = url_get_sents(url)
    if not sents:
        raise ValueError(f"No 'gem-c-govspeak' content found at {url}")
    flat_ners = get_ners_and_flatten(sents)
    stitched = stitch_sents(sents[0])
    return display_entities(stitched, flat_ners)

In [None]:
# Run the whole scrape -> NER -> render chain on one page.
url_to_spacy_viz(
    url="https://www.gov.uk/hmrc-internal-manuals/tobacco-products-duty/tpd3180"
)

In [None]:
# GOV.UK page paths to visualise; each one is appended to the site root in
# the loop that follows.
paths = [
    "/student-visa",
    "/marriage-visa",
    "/marriage-visa/eligibility",
    "/marriage-visa/documents-you-will-need",
    "/marriage-visa/apply",
    "/guidance/covid-19-coronavirus-restrictions-what-you-can-and-cannot-do#what-has-changed",
]

In [None]:
# Render the entity visualisation for every path in ``paths``.
for p in paths:
    # Use HTTPS, consistent with the other URLs requested in this notebook.
    url_p = f"https://www.gov.uk{p}"
    p_dash = p.replace("/", "_")
    fname = f"{DATA_DIR}/Images/img_{p_dash}"
    print(fname)
    disp_ents = url_to_spacy_viz(url_p)
    # NOTE(review): saving is disabled. With jupyter=True, displacy.render
    # may return None rather than markup — confirm before re-enabling.
    # output_path = Path(f"{DATA_DIR}/Images/img_{p_dash}.svg")
    # output_path.open("w", encoding="utf-8").write(disp_ents)