# HuggingFace Data Preparation

This is a notebook to prepare the labelled token dataset for HuggingFace.

## 1. Installs and Imports

In [None]:
!pip install datasets
!pip install transformers
!pip install s3fs
!pip install boto3
!pip install sagemaker

In [None]:
import json
import os
from ast import literal_eval

import boto3
import pandas as pd
import s3fs
import sagemaker
import transformers
from datasets import ClassLabel, Dataset, Sequence, load_dataset
from sklearn.model_selection import train_test_split

## 2. Permissions

In [None]:
# Execution environment selector: the setup cell below branches on this value
# to decide where DATA_DIR points and which credentials are initialised.
system = "COLAB"  # ["AWS", "COLAB"]

In [None]:
# Configure storage access for the selected environment and define DATA_DIR,
# the root location the later cells read from and write to.
if system == "AWS":
    fs = s3fs.S3FileSystem()
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f"s3://{s3_bucket}/model-data/govner-data"
    # List the prefix as a quick check that the credentials can read it.
    for s3_entry in fs.ls(DATA_DIR):
        print(s3_entry)

    # The SageMaker session manages interactions with the Amazon SageMaker APIs
    # and any other AWS services needed. Its default bucket is used for uploading
    # data, models and logs; SageMaker creates the bucket if it does not exist.
    # The bucket name is always given here, so the session is built once with it.
    sagemaker_session_bucket = s3_bucket
    role = sagemaker.get_execution_role()
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sess.default_bucket()}")
    print(f"sagemaker session region: {sess.boto_region_name}")
elif system == "COLAB":
    # Imported here because the module only exists inside a Colab runtime.
    from google.colab import drive

    drive.mount("/content/gdrive")
    # DATA_DIR = os.path.join("/content/gdrive/My Drive", "NER/Data")
    DATA_DIR = os.path.join(
        "/content/gdrive/Shareddrives/",
        "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data",
    )
else:
    # Fail here rather than with a NameError on DATA_DIR in a later cell.
    raise ValueError(f"Unsupported system {system!r}; expected 'AWS' or 'COLAB'")

In [None]:
# Show the data directory selected by the setup cell.
DATA_DIR

## 3. Load Dataset

In [None]:
# File name of the labelled dataset CSV inside DATA_DIR.
file_name = "govuk-labelled-data-ner.csv"

# Plain "/" join, which is valid for both the s3:// and the Google Drive form of DATA_DIR.
file_path = f"{DATA_DIR}/{file_name}"

print(file_path)

In [None]:
label_map_name = "new_label_map.json"

label_map_path = f"{DATA_DIR}/{label_map_name}"

print(label_map_path)

# The built-in open() cannot read s3:// URLs, so on AWS the file is opened through
# the s3fs filesystem created in the setup cell. json.load accepts both the text
# handle from open() and the binary handle returned by fs.open().
open_label_map = fs.open if system == "AWS" else open
with open_label_map(label_map_path) as f:
    label_map = json.load(f)

In [None]:
# Inspect the loaded label map.
label_map

In [None]:
# Load the labelled data; list-valued columns arrive as strings and are parsed
# by the literal_eval cell further down.
df = pd.read_csv(file_path)

In [None]:
# Preview the first rows of the raw frame.
df.head()

Evaluate literals: the columns below are read from the CSV as strings, so parse them back into Python objects.

In [None]:
def _parse_literal(value):
    """Parse a stringified Python literal, passing through already-parsed sequences.

    The columns are overwritten in place, so without the pass-through a second
    run of this cell would hand lists to literal_eval and raise. Any other
    non-string value (e.g. a missing cell) still goes through literal_eval and
    fails loudly, as before.
    """
    if isinstance(value, (list, tuple)):
        return value
    return literal_eval(value)


for col in ["text_token", "label_list", "new_label_list_id"]:
    print(col)
    df[col] = df[col].map(_parse_literal)

Trim DataFrame to only the useful columns.

In [None]:
# Columns carried into the HuggingFace dataset: the tokens and their label ids.
keep_columns = ["text_token", "new_label_list_id"]
df_trim = df[keep_columns]

In [None]:
# Convert the trimmed frame into a HuggingFace Dataset.
hf_dataset = Dataset.from_pandas(df_trim)

In [None]:
# Show the dataset summary.
hf_dataset

## 4. Dataset Exploration

In [None]:
# Print the tokens and the label ids of one example row.
example_index = 9
example_row = hf_dataset[example_index]
print(example_row["text_token"])
print(example_row["new_label_list_id"])

In [None]:
# Report the feature type currently attached to each column.
for column_name in ("text_token", "new_label_list_id"):
    column_feature = hf_dataset.features[column_name]
    print(f"{column_name}: {column_feature}")
print()

In [None]:
# Label names are the keys of the label map, in the order they appear in the file.
labels = list(label_map)
print(len(labels))
print(labels)

In [None]:
# Attach the label names to the label-id column so that ids can be decoded to names.
# cast_column returns a new dataset whose schema carries the ClassLabel feature;
# assigning into `hf_dataset.features[...]` is not a supported way of changing the
# schema. The number of classes comes from `labels` instead of a hard-coded 13,
# so the cell keeps working when the label map changes.
# NOTE(review): the cast validates ids against the number of labels — confirm the
# ids in the CSV run from 0 to len(labels) - 1 in the key order of label_map.
hf_dataset = hf_dataset.cast_column(
    "new_label_list_id", Sequence(ClassLabel(names=labels))
)

In [None]:
# Check the feature type now attached to the label-id column.
hf_dataset.features["new_label_list_id"]

In [None]:
# Read the label names back from the column's inner feature.
label_list = hf_dataset.features["new_label_list_id"].feature.names
label_list

## 8. Train/Eval/Test Splits

We must split the data into Training, Evaluation and Test splits.

The CoNLL dataset has the following splits:
* Training: 14,041
* Evaluation: 3,250
* Test: 3,454

In [None]:
# Reference split sizes quoted for the CoNLL dataset, used to derive split proportions.
conll_training = dict(name="training", total=14041)
conll_evaluation = dict(name="evaluation", total=3250)
conll_test = dict(name="test", total=3454)

total = sum(
    split_info["total"]
    for split_info in (conll_training, conll_evaluation, conll_test)
)
total

In [None]:
# Store each split's share of the total as a percentage and print it.
for split_info in (conll_training, conll_evaluation, conll_test):
    split_info["proportion"] = (split_info["total"] / total) * 100
    print(split_info["name"], split_info["proportion"])

In [None]:
# Dataset summary before splitting.
hf_dataset

In [None]:
# Split with 85% of the rows in train and the remainder in test; the seed is fixed
# so the split is reproducible.
# NOTE(review): this rebinds hf_dataset to the result of the split, so the cell is
# not meant to be run twice without re-creating hf_dataset first.
hf_dataset = hf_dataset.train_test_split(train_size=0.85, seed=42)

In [None]:
# Summary of the splits produced above.
hf_dataset

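Add validation split. (Currently disabled: the code below is commented out, so only the train and test splits are saved.)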

In [None]:
# hf_dataset_clean = hf_dataset["train"].train_test_split(train_size=0.8, seed=42)
# # Rename the default "test" split to "validation"
# hf_dataset_clean["validation"] = hf_dataset_clean.pop("test")
# # Add the "test" set to our `DatasetDict`
# hf_dataset_clean["test"] = hf_dataset["test"]

In [None]:
# hf_dataset_clean

## 9. Upload splits to gdrive

Now that the dataset has been processed and split, we save it to gdrive.

In [None]:
# Directory name under DATA_DIR that the saved dataset is written to.
dataset_name = "hf_govuk_data"

In [None]:
# Full output location for the saved dataset.
dataset_name_path = f"{DATA_DIR}/{dataset_name}"
dataset_name_path

In [None]:
# save train_dataset to gdrive
hf_input_path = f"{dataset_name_path}"
hf_dataset.save_to_disk(hf_input_path)

## 10. Download Splits

In [None]:
from datasets import load_dataset, load_from_disk

# load_metric is not used in this section, and newer `datasets` releases appear to
# no longer export it (TODO confirm against the installed version). A hard import
# would then break this cell, so import it defensively and keep the name bound.
try:
    from datasets import load_metric
except ImportError:
    load_metric = None

# Same directory name that the upload section saved to.
hf_data = "hf_govuk_data"

hf_data_path = f"{DATA_DIR}/{hf_data}"
hf_data_path

In [None]:
# Reload the saved splits from disk.
# NOTE(review): the variable shares its name with the `datasets` library; only
# `from datasets import ...` is used in this notebook, so nothing is shadowed here.
datasets = load_from_disk(hf_data_path)

In [None]:
# Summary of the reloaded splits.
datasets

In [None]:
# Check the features stored with the reloaded train split.
datasets["train"].features