# HuggingFace Data Preparation

This is a notebook to prepare the labelled token dataset for HuggingFace.

## 1. Installs and Imports

In [None]:
# !pip install datasets
# !pip install transformers

In [None]:
import json
import os
from ast import literal_eval

import boto3
import pandas as pd
import s3fs
import sagemaker
import transformers
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split

## 2. Permissions

In [None]:
# Execution-environment switch: the S3-backed cells below only run when this is "AWS".
system = "AWS"

In [None]:
# Default to "no explicit bucket". The session-setup cell reads `s3_bucket`
# unconditionally and falls back to the SageMaker default bucket when it is
# None, so the name must exist even when we are not running against AWS.
s3_bucket = None

if system == "AWS":
    # S3 filesystem handle, reused later to read the label-map JSON files.
    fs = s3fs.S3FileSystem()
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f"s3://{s3_bucket}/model-data/govner-data"
    # List the data directory as a quick check that the bucket is reachable.
    for f in fs.ls(DATA_DIR):
        print(f)

In [None]:
# Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sess = sagemaker.Session()
sagemaker_session_bucket = s3_bucket
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

# Look up the execution role, then recreate the session so that its default
# bucket is the one chosen above.
role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved configuration so the run is self-describing.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

## 3. Load Dataset

In [None]:
# s3 key prefix for the data
s3_prefix = "model-data/govner-data"

# File names of the two input files stored under the prefix above.
dataset1_name = "line_by_line_NER_data_sampled_12062020_more_ents.csv"
dataset2_name = "line_by_line_NER_data_sampled_09062020_more_ents.csv"

# Full S3 URIs, built from the session's default bucket, the prefix and the file name.
dataset1_path = f"s3://{sess.default_bucket()}/{s3_prefix}/{dataset1_name}"
dataset2_path = f"s3://{sess.default_bucket()}/{s3_prefix}/{dataset2_name}"

In [None]:
# sep="\t": the files are parsed as tab-separated even though they are named .csv.
# low_memory=False turns off pandas' chunked low-memory parsing mode.
dataset1 = pd.read_csv(dataset1_path, sep="\t", low_memory=False)
dataset2 = pd.read_csv(dataset2_path, sep="\t", low_memory=False)

## 4. Exploration

In [None]:
# Report (rows, columns) for each frame.
print(f"dataset1 shape: {dataset1.shape}")
print(f"dataset2 shape: {dataset2.shape}")

In [None]:
# Show five randomly chosen rows of dataset1.
dataset1.sample(5)

In [None]:
# Show five randomly chosen rows of dataset2.
dataset2.sample(5)

## 5. Concatenation

We will concatenate the DataFrames. They are likely separate for storage/memory reasons. We will combine and shuffle them anyway. We will also add a flag to show which dataset each row originally came from, for later reference.

In [None]:
# Tag every row with the file it was loaded from. The file-name variables
# defined alongside the S3 paths are reused (instead of repeating the string
# literals) so the tag cannot drift from the file that was actually read.
dataset1["original_file"] = dataset1_name
dataset2["original_file"] = dataset2_name

In [None]:
# Show five random rows of dataset1, now including the original_file column.
dataset1.sample(5)

In [None]:
# Show five random rows of dataset2, now including the original_file column.
dataset2.sample(5)

Combine into one dataset.

In [None]:
# Stack the two frames row-wise (dataset1's rows followed by dataset2's).
# Each frame keeps its own row labels here; they are renumbered after the shuffle.
frames = [dataset1, dataset2]
concat = pd.concat(frames)
# The row count should equal rows(dataset1) + rows(dataset2).
print(concat.shape)

Shuffle dataset.

In [None]:
# Shuffle all rows (frac=1 draws 100% of them without replacement) and rebuild
# a clean 0..n-1 index. The fixed random_state makes the shuffle repeatable;
# without it the seeded train/validation/test splits further down would pick
# different rows on every Restart & Run All.
shuffled_df = concat.sample(frac=1, random_state=42).reset_index(drop=True)
print(shuffled_df.shape)

Convert string list columns to list type.

In [None]:
def _parse_list_cell(value):
    """Return ``value`` as a Python sequence, parsing it if it is still text.

    Values that are already a list or tuple are returned unchanged, so this
    cell can be re-run without error. Anything else is handed to
    ``ast.literal_eval``, which raises on malformed input.
    """
    if isinstance(value, (list, tuple)):
        return value
    return literal_eval(value)


# These columns are converted from their string form into real Python lists.
for list_column in ("text_token", "labels", "label_list"):
    shuffled_df[list_column] = shuffled_df[list_column].apply(_parse_list_cell)

## 6. Label map

In [None]:
# File names of the two label-map JSON files.
label_map1_name = "label_map_12062020_more_ents.json"
label_map2_name = "label_map_09062020_more_ents.json"

In [None]:
# Full S3 URIs of the label maps, under the same bucket and prefix as the datasets.
label_map1_path = f"s3://{sess.default_bucket()}/{s3_prefix}/{label_map1_name}"
label_map2_path = f"s3://{sess.default_bucket()}/{s3_prefix}/{label_map2_name}"

In [None]:
# Read the first label map from S3 and print it for inspection.
if system == "AWS":
    with fs.open(label_map1_path, "rb") as f:
        # Decode the raw bytes of the JSON document into a Python object.
        label_name_map = json.loads(f.read())
    print(label_name_map)

In [None]:
# Read the second label map from S3 and print it for inspection.
# Note: this rebinds label_name_map, replacing whatever it held before.
if system == "AWS":
    with fs.open(label_map2_path, "rb") as f:
        # Decode the raw bytes of the JSON document into a Python object.
        label_name_map = json.loads(f.read())
    print(label_name_map)

Alter label map.

Why:
* We don't need a label for 'PAD'; that will be added later.

In [None]:
# Label name -> integer id. The ids are simply the positions in the list below,
# starting at 0 for "O"; there is deliberately no "PAD" entry.
new_label_map = {
    label_name: label_id
    for label_id, label_name in enumerate(
        [
            "O",
            "CONTACT",
            "DATE",
            "EVENT",
            "FINANCE",
            "FORM",
            "LOCATION",
            "MISC",
            "MONEY",
            "ORGANIZATION",
            "PERSON",
            "SCHEME",
            "STATE",
        ]
    )
}

In [None]:
# Show the first rows of the shuffled frame.
shuffled_df.head()

In [None]:
# Take the label list of the row labelled 0 as a small example input for the
# id-mapping function tried out below.
test = shuffled_df["label_list"][0]
test

In [None]:
def label_list_id(labellist, dictionary):
    """Translate a sequence of labels into their ids.

    Args:
        labellist: Iterable of labels; each one must be a key of ``dictionary``.
        dictionary: Mapping from label to id.

    Returns:
        A new list holding ``dictionary[label]`` for every label, in input order.

    Raises:
        KeyError: If a label is not present in ``dictionary``.
    """
    ids = []
    for label in labellist:
        ids.append(dictionary[label])
    return ids

In [None]:
# Try the mapping on the single example row before applying it to the whole column.
label_list_id(labellist=test, dictionary=new_label_map)

In [None]:
# Translate every row's label names into integer ids. new_label_map is passed
# to label_list_id as its second positional argument.
shuffled_df["new_label_list_id"] = shuffled_df["label_list"].apply(
    label_list_id, args=(new_label_map,)
)

In [None]:
# Show the first rows again, now including the new_label_list_id column.
shuffled_df.head()

## 7. Convert to HuggingFace Dataset

Trim the DataFrame to only the useful columns.

In [None]:
# Keep only the token lists and their integer label ids.
hf_df = shuffled_df[["text_token", "new_label_list_id"]]

In [None]:
# Convert the trimmed DataFrame into a HuggingFace Dataset.
hf_dataset = Dataset.from_pandas(hf_df)

In [None]:
# Display the dataset summary.
hf_dataset

## 8. Train/Eval/Test Splits

We must split the data into Training, Evaluation and Test splits.

The CoNLL dataset has the following splits:
* Training: 14,041
* Evaluation: 3,250
* Test: 3,454

In [None]:
# CoNLL split sizes, kept as a reference point for choosing our own split ratios.
conll_training = dict(name="training", total=14041)
conll_evaluation = dict(name="evaluation", total=3250)
conll_test = dict(name="test", total=3454)

# Number of examples across all three CoNLL splits.
total = sum(
    conll_split["total"]
    for conll_split in (conll_training, conll_evaluation, conll_test)
)
total

In [None]:
# Store each split's share of the total (as a percentage) on its dict and print it.
for conll_split in (conll_training, conll_evaluation, conll_test):
    conll_split["proportion"] = conll_split["total"] / total * 100
    print(conll_split["name"], conll_split["proportion"])

In [None]:
# Display the dataset summary before splitting.
hf_dataset

In [None]:
# Keep 85% of the rows as "train" and hold out the remaining 15% as "test"
# (fixed seed so the split is repeatable).
# NOTE(review): this rebinds hf_dataset from the single Dataset built earlier
# to the train/test container returned by train_test_split, so this cell is
# presumably not safe to re-run on its own — rebuild hf_dataset first. Confirm.
hf_dataset = hf_dataset.train_test_split(train_size=0.85, seed=42)

In [None]:
# Display the train/test split summary.
hf_dataset

In [None]:
# Split the 85% "train" portion again: 80% of it stays as training data and
# 20% is held out. Overall that is 68% train, 17% validation and 15% test.
hf_dataset_clean = hf_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
hf_dataset_clean["validation"] = hf_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
hf_dataset_clean["test"] = hf_dataset["test"]

In [None]:
# Display the final train/validation/test summary.
hf_dataset_clean

## 9. Upload splits to sagemaker_session_bucket

Now that the dataset has been processed and split, we use the `datasets` FileSystem integration to upload it to S3.

In [None]:
# Print the session details again before uploading, to confirm the target bucket.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
# S3 key prefix (inside the session bucket) under which the splits are written.
data_prefix = "model-data/huggingface_transformer_models/hf_data/"
print(data_prefix)

In [None]:
# NOTE(review): botocore is imported here but not referenced in this cell.
import botocore
from datasets.filesystems import S3FileSystem

# Filesystem handle handed to save_to_disk below via its fs argument.
s3 = S3FileSystem()

# Save the whole hf_dataset_clean object (train, validation and test splits)
# under the session bucket and data_prefix on S3.
hf_input_path = f"s3://{sess.default_bucket()}/{data_prefix}"
hf_dataset_clean.save_to_disk(hf_input_path, fs=s3)