# 015_COLAB_JanuaryTrainingDataPrep



This is a notebook to prepare the datasets for the training jobs run in January 2022 for project playback.

## 1. Installs and Imports

In [None]:
!pip install datasets transformers torch seqeval &> /dev/null

In [None]:
import json
import os
from ast import literal_eval
from collections import Counter

import numpy as np
import pandas as pd
import transformers
from datasets import Dataset, load_dataset, load_from_disk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", None)

## 2. Permissions

In [None]:
system = "COLAB"  # ["AWS", "COLAB"]

In [None]:
if system == "COLAB":
    from google.colab import drive

    # Mount Google Drive so the shared-drive folders are reachable as local paths.
    drive.mount("/content/gdrive")
    # Root folder for the files this notebook reads and writes.
    # (An earlier run used the sibling folder "Jan2022-Data".)
    DATA_DIR = "/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Feb2022-Data"
else:
    # DATA_DIR is only defined for Colab; stop here with a clear message rather
    # than hitting a NameError in a later cell.
    raise NotImplementedError(
        f"No DATA_DIR is configured for system={system!r}; only 'COLAB' is supported."
    )

In [None]:
DATA_DIR

## 3. Load Datasets

* The combined dataset (~230,000)
* The validated dataset (~5,500)

## 3_. HF Datasets

In [None]:
# Folder on the mounted shared drive that holds the saved HF datasets.
hf_exports_root = "/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data"

validated_hf = load_from_disk(f"{hf_exports_root}/validated_ner_hf")
unvalidated_hf = load_from_disk(f"{hf_exports_root}/hf_govuk_data")
samp_unvalidated_hf = load_from_disk(f"{hf_exports_root}/samp_hf_govuk_data")

In [None]:
validated_hf

In [None]:
# Show the first training example field by field. Indexing the split with a
# column name (rather than a row number) returns the whole column, so iterate
# over the row's own items instead of re-indexing the split.
for field, value in validated_hf["train"][0].items():
    print(f"{field}: {value}")

In [None]:
unvalidated_hf

In [None]:
# Show the first training example field by field. Indexing the split with a
# column name (rather than a row number) returns the whole column, so iterate
# over the row's own items instead of re-indexing the split.
for field, value in unvalidated_hf["train"][0].items():
    print(f"{field}: {value}")

In [None]:
samp_unvalidated_hf

### 3A. Validated Dataset

In [None]:
# File name of the validated CSV, located directly inside DATA_DIR.
validated = "govuk-labelled-data-ner-validated.csv"
validated_path = os.path.join(DATA_DIR, validated)
validated_path

In [None]:
# Load the validated CSV (comma-separated, which is the pandas default) and
# report its size before previewing the first rows.
validated_df = pd.read_csv(validated_path)
print(validated_df.shape)
validated_df.head()

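The list-valued columns are read from the CSV as strings; parse them back into Python objects with `literal_eval`.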

In [None]:
# These columns are read from the CSV as text; parse each cell back into the
# Python object it spells out.
literal_columns = ("labels", "labelled_entities", "label_list", "text_tokens")
for column in literal_columns:
    validated_df[column] = validated_df[column].apply(literal_eval)

In [None]:
# Keep only the first row for each distinct `text` value.
validated_df = validated_df.drop_duplicates(subset="text", keep="first")

In [None]:
# Drop the first column by position.
# NOTE(review): presumably a leftover index column from an earlier CSV export — confirm.
# Not idempotent: re-running this cell removes one more column each time.
validated_df = validated_df.iloc[:, 1:]

In [None]:
# Renumber the rows 0..n-1 and discard the old index rather than keeping it
# as a column.
validated_df = validated_df.reset_index(drop=True)

In [None]:
validated_df

In [None]:
# Tag -> integer id for the validated data. Each id is the tag's position in
# this tuple, so the order must not change.
_validated_tags = (
    "O",
    "I-CONTACT",
    "I-DATE",
    "I-EVENT",
    "I-FINANCE",
    "I-FORM",
    "I-LOCATION",
    "I-MISC",
    "I-MONEY",
    "I-ORGANIZATION",
    "I-PERSON",
    "I-SCHEME",
    "I-STATE",
)
validated_labelmap = {tag: tag_id for tag_id, tag in enumerate(_validated_tags)}

In [None]:
def list_map_func(listy, mapping):
    """Return a new list holding ``mapping[item]`` for every item of ``listy``.

    Order and length are preserved. Raises ``KeyError`` if an item is not a
    key of ``mapping``.
    """
    return [mapping[item] for item in listy]

In [None]:
# Translate each row's list of tag strings into the matching list of integer ids.
validated_df["new_label_list_id"] = validated_df["label_list"].apply(
    list_map_func, args=(validated_labelmap,)
)

In [None]:
validated_df.head(10)

**Individual**:

O                 101033

I-ORGANIZATION      5503

I-FINANCE           4409

I-PERSON            3410

I-FORM              2715

I-EVENT             2267

I-DATE              1945

I-STATE             1482

I-LOCATION          1431

I-MISC              1064

I-CONTACT            987


**Multi-word**

ORGANIZATION    3096

PERSON          2883

FINANCE         2648

FORM            1366

EVENT           1324

LOCATION        1029

DATE             791

CONTACT          669

STATE            663

MISC             503

Take 20% of the validated data for testing.

In [None]:
# Random 80/20 row split of the validated data; the fixed random_state makes
# the split repeatable across runs.
validated_train, validated_test = train_test_split(
    validated_df, test_size=0.2, random_state=43
)

In [None]:
validated_test

In [None]:
# One 0/1 indicator column per tag, marking whether the tag appears anywhere in
# that row's `label_list`; joined onto a fresh frame so `validated_test` itself
# is left unchanged.
mlb = MultiLabelBinarizer()
tag_indicators = pd.DataFrame(
    mlb.fit_transform(validated_test["label_list"]),
    columns=mlb.classes_,
    index=validated_test.index,
)
validated_test_counts = validated_test.join(tag_indicators)

In [None]:
validated_test_counts

In [None]:
# For each tag, print how many test rows contain it at least once
# (the sum of that tag's 0/1 indicator column).
reported_tags = (
    "I-CONTACT",
    "I-DATE",
    "I-EVENT",
    "I-FINANCE",
    "I-FORM",
    "I-LOCATION",
    "I-MISC",
    "I-ORGANIZATION",
    "I-PERSON",
    "I-STATE",
    "O",
)
for tag in reported_tags:
    print(tag, validated_test_counts[tag].sum())

In [None]:
validated_train

In [None]:
validated_test

### 3B. Unvalidated Dataset

In [None]:
# File name of the unvalidated CSV, located directly inside DATA_DIR.
unvalidated = "govuk-labelled-data-ner.csv"
unvalidated_path = os.path.join(DATA_DIR, unvalidated)
unvalidated_path

In [None]:
# Load the unvalidated CSV and report the size of the raw table.
unvalidated_df = pd.read_csv(unvalidated_path)
print(unvalidated_df.shape)

# Keep the five columns used below, renaming `text_token` to `text_tokens`
# (the name the validated data uses for its token column).
kept_columns = ["text", "text_token", "labels", "label_list", "new_label_list_id"]
unvalidated_df = unvalidated_df[kept_columns].rename(
    columns={"text_token": "text_tokens"}
)
unvalidated_df.head()

In [None]:
# Keep only the first row for each distinct `text` value.
unvalidated_df = unvalidated_df.drop_duplicates(subset="text", keep="first")

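The list-valued columns are read from the CSV as strings; parse them back into Python objects with `literal_eval`.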

In [None]:
# These columns are read from the CSV as text; parse each cell back into the
# Python object it spells out.
literal_columns = ("labels", "label_list", "text_tokens", "new_label_list_id")
for column in literal_columns:
    unvalidated_df[column] = unvalidated_df[column].apply(literal_eval)

In [None]:
# Renumber the rows 0..n-1 and discard the old index rather than keeping it
# as a column.
unvalidated_df = unvalidated_df.reset_index(drop=True)

In [None]:
print(unvalidated_df.shape)
unvalidated_df.head()

In [None]:
# Tag -> integer id for the unvalidated data. Each id is the tag's position in
# this tuple, so the order must not change.
_unvalidated_tags = (
    "O",
    "I-CONTACT",
    "I-DATE",
    "I-EVENT",
    "I-FINANCE",
    "I-FORM",
    "I-LOC",
    "I-MISC",
    "I-MONEY",
    "I-ORG",
    "I-PER",
    "I-SCHEME",
    "I-STATE",
)
unvalidated_labelmap = {tag: tag_id for tag_id, tag in enumerate(_unvalidated_tags)}

## 4. Save files

### 4A. Save CSVs

In [None]:
print(validated_train.shape)
validated_train.head()

In [None]:
print(validated_test.shape)
validated_test.head()

In [None]:
print(unvalidated_df.shape)
unvalidated_df.head()

In [None]:
DATA_DIR

In [None]:
# Write the three frames into the CSV folder under DATA_DIR, without the index.
csv_outputs = {
    "Feb22-CSV/validated_test.csv": validated_test,
    "Feb22-CSV/validated_train.csv": validated_train,
    "Feb22-CSV/unvalidated_train.csv": unvalidated_df,
}
for relative_path, frame in csv_outputs.items():
    frame.to_csv(os.path.join(DATA_DIR, relative_path), index=None)

### 4B. Save HFs

Validated test

In [None]:
# Re-read the validated test CSV written in section 4A. List-valued columns
# come back as strings; the next cell parses them.
validated_test_df = pd.read_csv(os.path.join(DATA_DIR, "Feb22-CSV/validated_test.csv"))

In [None]:
# Only the token and label-id columns go into the HF dataset, so select them
# first and parse just those two from their string form. The other list
# columns would be discarded straight after parsing. `.copy()` gives an
# independent frame so the assignments below do not touch a slice of the original.
hf_columns = ["text_tokens", "new_label_list_id"]
validated_test_df = validated_test_df[hf_columns].copy()
for column in hf_columns:
    validated_test_df[column] = validated_test_df[column].apply(literal_eval)

In [None]:
# Convert the two-column frame (tokens and label ids) into an HF Dataset.
validated_test_hf = Dataset.from_pandas(validated_test_df)

In [None]:
# Persist the validated test dataset under DATA_DIR/Feb22-HF.
validated_test_hf.save_to_disk(os.path.join(DATA_DIR, "Feb22-HF/validated_test"))

Validated train

In [None]:
# Re-read the validated train CSV written in section 4A. List-valued columns
# come back as strings; the next cell parses them.
validated_train_df = pd.read_csv(
    os.path.join(DATA_DIR, "Feb22-CSV/validated_train.csv")
)

In [None]:
# Only the token and label-id columns go into the HF dataset, so select them
# first and parse just those two from their string form. The other list
# columns would be discarded straight after parsing. `.copy()` gives an
# independent frame so the assignments below do not touch a slice of the original.
hf_columns = ["text_tokens", "new_label_list_id"]
validated_train_df = validated_train_df[hf_columns].copy()
for column in hf_columns:
    validated_train_df[column] = validated_train_df[column].apply(literal_eval)

In [None]:
# Convert the two-column frame (tokens and label ids) into an HF Dataset.
validated_train_hf = Dataset.from_pandas(validated_train_df)

In [None]:
# Persist the validated train dataset under DATA_DIR/Feb22-HF.
validated_train_hf.save_to_disk(os.path.join(DATA_DIR, "Feb22-HF/validated_train"))

Unvalidated train

In [None]:
# Re-read the unvalidated train CSV written in section 4A. List-valued columns
# come back as strings; the next cell parses them.
unvalidated_train_df = pd.read_csv(
    os.path.join(DATA_DIR, "Feb22-CSV/unvalidated_train.csv")
)

In [None]:
# Only the token and label-id columns go into the HF dataset, so select them
# first and parse just those two from their string form. The other list
# columns would be discarded straight after parsing. `.copy()` gives an
# independent frame so the assignments below do not touch a slice of the original.
hf_columns = ["text_tokens", "new_label_list_id"]
unvalidated_train_df = unvalidated_train_df[hf_columns].copy()
for column in hf_columns:
    unvalidated_train_df[column] = unvalidated_train_df[column].apply(literal_eval)

In [None]:
# Convert the two-column frame (tokens and label ids) into an HF Dataset.
unvalidated_train_hf = Dataset.from_pandas(unvalidated_train_df)

In [None]:
# Persist the full unvalidated train dataset under DATA_DIR/Feb22-HF.
unvalidated_train_hf.save_to_disk(os.path.join(DATA_DIR, "Feb22-HF/unvalidated_train"))

Unvalidated train sample


In [None]:
# Draw a fixed-size random subset of rows; the seed makes the draw repeatable.
unvalidated_train_df_sample = unvalidated_train_df.sample(n=10000, random_state=43)
unvalidated_train_df_sample.shape

In [None]:
# `sample()` keeps the original row labels, so this frame no longer has a plain
# 0..n-1 index. By default `from_pandas` would store such an index as an extra
# `__index_level_0__` column; drop it so the sample has the same two columns
# as the full datasets.
unvalidated_train_hf_sample = Dataset.from_pandas(
    unvalidated_train_df_sample, preserve_index=False
)

In [None]:
# Persist the sampled unvalidated dataset under DATA_DIR/Feb22-HF.
unvalidated_train_hf_sample.save_to_disk(
    os.path.join(DATA_DIR, "Feb22-HF/unvalidated_train_sample")
)

### 4C. Save .json mappings

In [None]:
# Store the validated tag -> id mapping next to the HF datasets. Every other
# output of this run goes under Feb22-*, so write here too rather than to the
# "Jan22-HF" folder of the earlier run.
with open(os.path.join(DATA_DIR, "Feb22-HF/validated_labelmap.json"), "w") as outfile:
    json.dump(validated_labelmap, outfile)

In [None]:
# Store the unvalidated tag -> id mapping next to the HF datasets. Every other
# output of this run goes under Feb22-*, so write here too rather than to the
# "Jan22-HF" folder of the earlier run.
with open(os.path.join(DATA_DIR, "Feb22-HF/unvalidated_labelmap.json"), "w") as outfile:
    json.dump(unvalidated_labelmap, outfile)

In [None]:
# Write the unvalidated mapping under the name "full_labelmap.json".
# It assigns the same ids as validated_labelmap; the two differ only in three
# tag spellings (I-LOC / I-ORG / I-PER vs I-LOCATION / I-ORGANIZATION / I-PERSON).
with open(os.path.join(DATA_DIR, "Feb22-HF/full_labelmap.json"), "w") as outfile:
    json.dump(unvalidated_labelmap, outfile)

In [None]:
# Sanity check: reload each dataset saved in section 4B from the Feb22-HF
# folder and print its summary. `load_from_disk` returns the loaded dataset;
# it does not modify an existing object, so the result must be captured.
for saved_name in ("validated_train", "validated_test", "unvalidated_train"):
    reloaded = load_from_disk(os.path.join(DATA_DIR, "Feb22-HF", saved_name))
    print(saved_name, reloaded)