# Existing Data Audit

The purpose of this notebook is to conduct an audit of the data currently on file from the previous iteration of the govNER project.

## 1. Installs and Permissions

In [None]:
# Optional dependency installs, left commented out; uncomment only if the
# packages are missing from the current kernel environment.
# !pip install seqeval
# !pip install transformers
# !pip install torch==1.5.0

In [None]:
import json
import os
import re
from ast import literal_eval
from collections import Counter, OrderedDict, defaultdict
from datetime import date, datetime

import boto
import boto3
import numpy as np
import pandas as pd
import s3fs
import torch
from nltk import sent_tokenize, word_tokenize
from seqeval.metrics import accuracy_score, performance_measure
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from tqdm import notebook, tqdm
from transformers import (
    AdamW,
    BertConfig,
    BertForTokenClassification,
    BertModel,
    BertTokenizerFast,
    get_linear_schedule_with_warmup,
)

# Hook tqdm into pandas (enables the `progress_apply` family of methods).
# NOTE(review): no `progress_apply` call is visible in this section — confirm it is still needed.
tqdm.pandas()

What's your GPU?

In [None]:
# Shell out to report the GPU(s) visible to this machine.
!nvidia-smi

Define seeds for reproducibility

In [None]:
# Fix the NumPy and PyTorch seeds so sampling in this notebook is repeatable
# on a fresh "Restart & Run All".
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

Load GOV.UK NER data

Paths and filenames...

In [None]:
# Execution environment selector. The setup cell that follows handles the
# values "AWS", "COLAB" and "LOCAL".
system = "AWS"

In [None]:
# Resolve DATA_DIR for the selected environment.
# On AWS this also creates `fs`, the S3 filesystem handle, and lists the data directory.
if system == "AWS":
    fs = s3fs.S3FileSystem()
    bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f"s3://{bucket}/model-data/govner-data"
    print(fs.ls(DATA_DIR))
elif system == "COLAB":
    from google.colab import drive

    drive.mount("/content/gdrive")
    DATA_DIR = os.path.join(
        "/content/gdrive/My Drive", "transformer_fun/govner/roberta"
    )
elif system == "LOCAL":
    DATA_DIR = os.getenv("DATA_DIR")
    if not DATA_DIR:
        # Fail here with a clear message instead of later inside os.path.join(None, ...).
        raise EnvironmentError(
            "system is 'LOCAL' but the DATA_DIR environment variable is not set"
        )
else:
    # Previously an unrecognised value fell through and DATA_DIR was never defined.
    raise ValueError(
        f"Unknown system {system!r}; expected 'AWS', 'COLAB' or 'LOCAL'"
    )
DATA_DIR

## 2. Load the Data

In [None]:
# List the contents of the data directory.
# `fs` is only created when system == "AWS"; use a plain directory listing otherwise
# so this cell does not raise NameError on COLAB/LOCAL.
if system == "AWS":
    print(fs.ls(DATA_DIR))
else:
    print(os.listdir(DATA_DIR))

Read the data...

In [None]:
# Build the full paths of the NER data file and the label-map file under DATA_DIR.
ner_data_file, label_map_file = (
    os.path.join(DATA_DIR, filename)
    for filename in (
        "line_by_line_NER_data_sampled_12062020_more_ents.csv",
        "label_map_12062020_more_ents.json",
    )
)
ner_data_file

In [None]:
# Load the NER data. Despite the .csv extension the file is read as tab-separated.
# low_memory=False makes pandas parse the file in one pass rather than in chunks.
df = pd.read_csv(ner_data_file, sep="\t", low_memory=False)

## 3. Inspect the Data

In [None]:
# Print the (rows, columns) shape, then show the first rows.
print(df.shape)
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

To inspect:

1. What is the 'updated' column?
    
    _No evidence for anything useful._
    
    
2. What is the 'original_labels' column?

    _It looks like 'original_labels' is the column indicating what the tokens were originally labelled as, perhaps before the labels were corrected._
    

3. What is the 'sampled' column?

    _There doesn't look to be much in it. It possibly indicates which rows were sampled when choosing a smaller dataframe size. Rows with a True 'sampled' flag don't appear to have any 'Sentence: XXX' values for base_path._


4. What is the 'base path' column?

    _The URL path from where the sentence was taken. There are 4,177 examples of base paths with 'Sentence: XXX'. These look like random sentences taken from elsewhere._

##### 1. What is the updated column?

In [None]:
# Distribution of the 'updated' flag. dropna=False keeps missing values in the
# tally; value_counts() silently drops them by default, which hides them from the audit.
df["updated"].value_counts(dropna=False)

In [None]:
# Five random rows where 'updated' equals True.
df.loc[df["updated"].eq(True)].sample(n=5)

In [None]:
# Same tally as above, repeated before looking at the non-True rows.
# dropna=False keeps missing values visible (they are dropped by default).
df["updated"].value_counts(dropna=False)

In [None]:
# Five random rows where 'updated' is anything other than True
# (the != comparison also keeps rows where the value is missing).
df.loc[df["updated"].ne(True)].sample(n=5)

There doesn't seem to be much difference between the two groups.

##### 2. What is the original_labels column?

In [None]:
# Frequency of each distinct 'original_labels' value. dropna=False also reports
# how many rows have no original labels (dropped from the tally by default).
df["original_labels"].value_counts(dropna=False)

In [None]:
# Five random rows that do have an 'original_labels' value.
df.loc[df["original_labels"].notna()].sample(n=5)

In [None]:
# Five random rows where 'original_labels' is missing.
df.loc[df["original_labels"].isna()].sample(n=5)

It looks like 'original_labels' is the column indicating what the tokens were originally labelled as, perhaps before the labels were corrected.

##### 3. What is the 'sampled' column?

In [None]:
# Distribution of the 'sampled' flag. dropna=False keeps missing values in the
# tally; they are dropped by default.
df["sampled"].value_counts(dropna=False)

In [None]:
# Ten random rows where 'sampled' equals True.
df.loc[df["sampled"].eq(True)].sample(n=10)

In [None]:
# Ten random rows where 'sampled' equals False.
df.loc[df["sampled"].eq(False)].sample(n=10)

There doesn't look to be much in it. It possibly indicates which rows were sampled when choosing a smaller dataframe size.

Rows with a True 'sampled' flag don't appear to have any 'Sentence: XXX' values for base_path.

##### 4. What is the 'base path' column?

In [None]:
# Frequency of each 'base_path' value. dropna=False also reports rows with no
# base_path (dropped from the tally by default).
df["base_path"].value_counts(dropna=False)

In [None]:
# Flag rows whose base_path starts with "Sentence:" rather than being a URL path.
# na=False makes a missing base_path count as "not a sentence". Without it,
# .str.startswith returns NaN for missing values, and NaN is truthy inside
# np.where, so those rows were being flagged True.
df["Sentence"] = df["base_path"].str.startswith("Sentence:", na=False)

In [None]:
# How many rows carry the "Sentence:" flag versus not.
df["Sentence"].value_counts()

There are 4,177 examples of sentences. These look like random sentences taken from elsewhere.

In [None]:
# Five random rows flagged as "Sentence:" rows.
df.loc[df["Sentence"].eq(True)].sample(n=5)

In [None]:
# Five random rows that are not flagged as "Sentence:" rows.
df.loc[df["Sentence"].eq(False)].sample(n=5)

## 4. Inspect the Label Map

In [None]:
# Load the label map JSON: through the S3 filesystem handle on AWS, or with the
# built-in open() everywhere else. Then print it.
label_map_handle = (
    fs.open(label_map_file, "rb") if system == "AWS" else open(label_map_file, "r")
)
with label_map_handle as f:
    label_name_map = json.load(f)
print(label_name_map)

In [None]:
# Pull out the 'label_list' column and report how many rows it has.
label_list = df["label_list"]
print(len(label_list))

In [None]:
from ast import literal_eval

# Convert the column to a plain Python list (same name, now a list rather than a Series).
label_list = list(label_list)
print(len(label_list))

In [None]:
# Each entry of label_list is passed through literal_eval to turn it into a
# Python object (the next cell iterates over each result).
list_of_lists = list(map(literal_eval, label_list))
print(len(list_of_lists))

In [None]:
# Flatten the per-row label lists into a single list of labels, preserving order.
flat_list = []
for row_labels in list_of_lists:
    flat_list.extend(row_labels)

In [None]:
# Report the total number of labels, then preview only the first few.
# Displaying the whole flattened list floods the cell output and bloats the notebook.
print(len(flat_list))
flat_list[:20]

In [None]:
from collections import Counter

In [None]:
# Count how many times each label occurs across all rows.
entity_counts = Counter(flat_list)

In [None]:
# Display the per-label counts.
entity_counts

In [None]:
# Put the counts in a one-column DataFrame: index = label, 'entity_count' = occurrences.
e_c_df = pd.DataFrame.from_dict(
    entity_counts, orient="index", columns=["entity_count"]
)

In [None]:
# Display the label-count table.
e_c_df

In [None]:
import matplotlib.pyplot as plt

# Bar chart of label counts, including the 'O' label.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure canvas
entities, counts = e_c_df.index, e_c_df["entity_count"]
ax.bar(entities, counts)
ax.set_title("Labelled Entity Counts")
ax.set_ylabel("Count")
ax.set_xlabel("Entity Type")
# Fixed y ticks every 500,000 from 0 up to 4,500,000.
ax.set_yticks(np.arange(0, 5000000, 500000))
# Plain integers on the y axis instead of scientific notation.
ax.ticklabel_format(axis="y", style="plain")
plt.xticks(rotation=90)
plt.show()

Without 'O'

In [None]:
# Copy of the count table without the 'O' row, so the other labels are
# readable on the next chart.
e_c_df_no_o = e_c_df.drop(index="O")

In [None]:
import matplotlib.pyplot as plt

# Bar chart of label counts with the 'O' label removed.
# The title now says so: it was previously identical to the chart that includes 'O',
# so the two figures could not be told apart when skimming.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure canvas
entities = e_c_df_no_o.index
counts = e_c_df_no_o["entity_count"]
ax.bar(entities, counts)
plt.title("Labelled Entity Counts (excluding 'O')")
plt.ylabel("Count")
plt.xlabel("Entity Type")
# y ticks are left to matplotlib here; the fixed 500,000 spacing of the previous
# chart is not applied to this one.
ax.ticklabel_format(axis="y", style="plain")
plt.xticks(rotation=90)
plt.show()

## 5. Individual Investigation

In [None]:
from ast import literal_eval

# Use fully qualified option names: the short forms "max_rows" / "max_columns"
# match more than one option on pandas >= 1.4 and raise OptionError.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)


def _parse_if_str(value):
    """Return literal_eval(value) for strings; pass any other value through unchanged."""
    return literal_eval(value) if isinstance(value, str) else value


# Parse the stringified columns into Python objects. Skipping non-strings makes
# the cell safe to re-run: literal_eval raises once the values are already parsed.
for list_column in ("text_token", "label_list"):
    df[list_column] = df[list_column].apply(_parse_if_str)

df.sample(100)

In [None]:
# Record the length of each row's token list and label list.
for source_column in ("text_token", "label_list"):
    df[f"{source_column}_len"] = df[source_column].apply(len)

In [None]:
# Eyeball 100 random rows, now including the two length columns.
df.sample(100)

Map 'label_list' column to numeric representation.

In [None]:
# Show the label map loaded from the JSON file earlier in the notebook.
label_name_map

In [None]:
# Translate every label in each row's label_list through label_name_map.
# .get() is used, so a label that is not in the map becomes None rather than raising.
df["label_list_map"] = df["label_list"].apply(
    lambda labels: [label_name_map.get(label) for label in labels]
)

In [None]:
# Spot-check 20 random rows after adding the mapped-label column.
df.sample(20)