# HuggingFace Data Preparation

This is a notebook to prepare the labelled token dataset for HuggingFace.

## 1. Installs and Imports

In [None]:
# Optional dependency installs: uncomment and run once if any of the
# third-party packages imported in the next cell are missing from the kernel.
# !pip install datasets
# !pip install transformers
# !pip install s3fs
# !pip install boto3
# !pip install sagemaker

In [None]:
import json
import os
from ast import literal_eval

import boto3
import pandas as pd
import s3fs
import sagemaker
import transformers
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split

## 2. Permissions

In [None]:
# Execution environment selector. The setup cell below branches on this value
# to decide where DATA_DIR points (S3 for "AWS", mounted Drive for "COLAB").
system = "COLAB"  # ["AWS", "COLAB"]

In [None]:
# Resolve DATA_DIR (and, on AWS, the SageMaker session/role) for the chosen
# environment. Any value of `system` other than "AWS" or "COLAB" fails fast
# here instead of leaving DATA_DIR undefined for the cells that follow.
if system == "AWS":
    fs = s3fs.S3FileSystem()
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f"s3://{s3_bucket}/model-data/govner-data"
    # List the data folder so missing files or permissions show up immediately.
    for f in fs.ls(DATA_DIR):
        print(f)
    # Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
    # sagemaker session bucket -> used for uploading data, models and logs
    # sagemaker will automatically create this bucket if it does not exist
    sess = sagemaker.Session()
    sagemaker_session_bucket = s3_bucket
    # This fallback only takes effect if s3_bucket above is changed to None.
    if sagemaker_session_bucket is None and sess is not None:
        # set to default bucket if a bucket name is not given
        sagemaker_session_bucket = sess.default_bucket()

    role = sagemaker.get_execution_role()
    # Re-create the session so that it uses the chosen bucket as its default.
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sess.default_bucket()}")
    print(f"sagemaker session region: {sess.boto_region_name}")
elif system == "COLAB":
    # Imported here rather than at the top of the notebook so that the import
    # is only attempted when the Colab branch is selected.
    from google.colab import drive

    drive.mount("/content/gdrive")
    # DATA_DIR = os.path.join("/content/gdrive/My Drive", "NER/Data")
    DATA_DIR = os.path.join(
        "/content/gdrive/Shareddrives/",
        "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data",
    )
else:
    raise ValueError(f"Unsupported system {system!r}; expected 'AWS' or 'COLAB'.")

In [None]:
# Display the data location resolved by the setup cell above.
DATA_DIR

## 3. Load Dataset

In [None]:
# File names of the two input files, and their full paths under DATA_DIR
# (an s3:// URI or a mounted Drive folder, depending on `system`).

dataset1_name = "line_by_line_NER_data_sampled_12062020_more_ents.csv"
dataset2_name = "line_by_line_NER_data_sampled_09062020_more_ents.csv"

dataset1_path = f"{DATA_DIR}/{dataset1_name}"
dataset2_path = f"{DATA_DIR}/{dataset2_name}"

In [None]:
# Both files are parsed as tab-separated despite their .csv extension.
# low_memory=False turns off pandas' chunked parsing, which can otherwise
# infer mixed dtypes within a column.
dataset1 = pd.read_csv(dataset1_path, sep="\t", low_memory=False)
dataset2 = pd.read_csv(dataset2_path, sep="\t", low_memory=False)

## 4. Exploration

In [None]:
# Report the shape of each frame followed by the combined row count,
# one item per output line.
print(
    f"dataset1 shape: {dataset1.shape}",
    f"dataset2 shape: {dataset2.shape}",
    f"total rows: {len(dataset1) + len(dataset2)}",
    sep="\n",
)

In [None]:
# Peek at five randomly chosen rows of dataset1 (unseeded, so they vary per run).
dataset1.sample(5)

In [None]:
# Peek at five randomly chosen rows of dataset2 (unseeded, so they vary per run).
dataset2.sample(5)

Investigate some samples...

In [None]:
# Pull one example out of dataset1 by its index label and show the raw text
# followed by its labels, each on its own line.
row = 205652

text = dataset1.loc[row, "text"]
labels = dataset1.loc[row, "labels"]
print(text, labels, sep="\n")

In [None]:
# Print every character of the sampled text next to its position in the string.
# NOTE(review): presumably used to check character offsets against `labels` — confirm.
for idx, char in enumerate(text):
    print(idx, char)

Check for duplication...

In [None]:
# Outer-join the two frames on all of their shared columns; the "Exist"
# indicator column records whether each row came from the left frame only,
# the right frame only, or both. Keep only rows that are NOT in both.
diff_df = (
    dataset1.merge(dataset2, how="outer", indicator="Exist")
    .loc[lambda merged: merged["Exist"] != "both"]
)
print(diff_df.shape)

In [None]:
# Rows found in only one of the two files (the "Exist" column says which one).
diff_df

## 5. Concatenation

We will concatenate the DataFrames. They are likely stored separately for storage/memory reasons. We will combine and shuffle them anyway. We will also add a column recording which file each row originally came from, for later reference.

In [None]:
# Tag every row with the file it was loaded from so that provenance survives
# the concatenation below. The file-name variables defined when the data was
# loaded are reused (rather than repeating the literals) so the tag can never
# drift from the file that was actually read.
dataset1["original_file"] = dataset1_name
dataset2["original_file"] = dataset2_name

In [None]:
# Spot-check that the new "original_file" column is present on dataset1.
dataset1.sample(5)

In [None]:
# Spot-check that the new "original_file" column is present on dataset2.
dataset2.sample(5)

Combine into one dataset.

In [None]:
# Stack the two frames row-wise. Each frame keeps its own index values here;
# the index is rebuilt when the rows are shuffled in the next step.
frames = [dataset1, dataset2]
concat = pd.concat(frames)
print(concat.shape)

Shuffle dataset.

In [None]:
# Shuffle all rows (frac=1 samples every row without replacement) and rebuild
# a clean 0..n-1 index. The fixed seed makes the row order, and therefore the
# files saved from it, reproducible across kernel restarts; change
# SHUFFLE_SEED to draw a different ordering.
SHUFFLE_SEED = 42
shuffled_df = concat.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
print(shuffled_df.shape)

Convert string list columns to list type.

In [None]:
# These columns hold string representations of Python lists; parse them back
# into real list objects. Only string values are parsed, so re-running this
# cell after a successful conversion is a no-op instead of an error.
# NOTE: non-string values (e.g. NaN from an empty field) are passed through
# unchanged rather than raising.
for list_column in ("text_token", "labels", "label_list"):
    shuffled_df[list_column] = shuffled_df[list_column].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

Save to CSV file.

In [None]:
# Destination for the combined, shuffled dataset, alongside the inputs in DATA_DIR.
combined_name = "line_by_line_NER_data_combined.csv"
combined_path = f"{DATA_DIR}/{combined_name}"
# Display the resolved path as the cell output.
combined_path

In [None]:
# Write the combined data as a tab-separated file (matching the input files).
# index=False is the documented boolean for omitting the row index column.
shuffled_df.to_csv(combined_path, sep="\t", index=False)

## 6. Label map

In [None]:
# File names of the two existing label maps; their date suffixes match those
# of the two dataset files.
label_map1_name = "label_map_12062020_more_ents.json"
label_map2_name = "label_map_09062020_more_ents.json"

In [None]:
# Full paths of the label map files under DATA_DIR.
label_map1_path = f"{DATA_DIR}/{label_map1_name}"
label_map2_path = f"{DATA_DIR}/{label_map2_name}"

In [None]:
# Load and print the first label map for inspection.
# NOTE(review): this runs on COLAB only. On AWS the path is an s3:// URI, which
# the built-in open() cannot read, so label_name_map stays undefined there —
# confirm whether AWS support is needed.
if system == "COLAB":
    with open(label_map1_path, "rb") as f:
        label_name_map = json.load(f)
    print(label_name_map)

In [None]:
# Load and print the second label map for inspection (COLAB only).
# NOTE: this rebinds label_name_map, so after this cell the variable holds the
# map read from label_map2_path.
if system == "COLAB":
    with open(label_map2_path, "rb") as f:
        label_name_map = json.load(f)
    print(label_name_map)

Alter label map.

Why:
* We don't need a label for 'PAD'; that will be added later.

In [None]:
# Label name -> integer id. Ids are consecutive from 0 and follow the order of
# the list below, so reordering the list renumbers the labels. There is
# intentionally no "PAD" entry.
new_label_map = {
    label: label_id
    for label_id, label in enumerate(
        [
            "O",
            "CONTACT",
            "DATE",
            "EVENT",
            "FINANCE",
            "FORM",
            "LOCATION",
            "MISC",
            "MONEY",
            "ORGANIZATION",
            "PERSON",
            "SCHEME",
            "STATE",
        ]
    )
}

Save new label map

In [None]:
# Destination for the new label map, alongside the other files in DATA_DIR.
new_label_map_name = "new_label_map.json"
new_label_map_path = f"{DATA_DIR}/{new_label_map_name}"

In [None]:
# Serialise the label map to JSON text and write it out in one go.
with open(new_label_map_path, "w") as fp:
    fp.write(json.dumps(new_label_map))

In [None]:
# Preview the first rows before the label-id column is added.
shuffled_df.head()

In [None]:
# Take the label list of the row with index label 0 as a small example input
# for the mapping function, and display it.
test = shuffled_df.loc[0, "label_list"]
test

In [None]:
def label_list_id(labellist, dictionary):
    """Translate a sequence of labels into their mapped values.

    Args:
        labellist: Iterable of keys to look up (e.g. label names).
        dictionary: Mapping from each key to its value (e.g. label name -> id).

    Returns:
        A new list with one looked-up value per input item, in input order.

    Raises:
        KeyError: If an item of ``labellist`` is not a key of ``dictionary``.
    """
    ids = []
    for label in labellist:
        ids.append(dictionary[label])
    return ids

In [None]:
# Try the mapping on the single example before applying it to the whole column.
label_list_id(labellist=test, dictionary=new_label_map)

In [None]:
# Add a column holding, for every row, the integer ids that correspond to its
# label_list. The extra keyword is forwarded by apply() to label_list_id.
shuffled_df["new_label_list_id"] = shuffled_df["label_list"].apply(
    label_list_id, dictionary=new_label_map
)

In [None]:
# Preview the first rows again to check the new "new_label_list_id" column.
shuffled_df.head()

## 7. Save DataFrame to gdrive

In [None]:
# Destination for the final labelled dataset, under DATA_DIR.
save_df_name = "govuk-labelled-data-ner.csv"

save_df_path = f"{DATA_DIR}/{save_df_name}"

In [None]:
# Display the resolved output path.
save_df_path

In [None]:
# Write the final frame without the row index. index=False is the documented
# boolean for omitting it.
# NOTE: no `sep` is given, so this file is comma-separated (the pandas
# default), unlike the tab-separated combined file written earlier.
shuffled_df.to_csv(save_df_path, index=False)