# HuggingFace Data Preparation

This is a notebook to prepare the labelled token dataset for HuggingFace.

## 1. Installs and Imports

In [None]:
!pip install datasets
!pip install transformers
!pip install s3fs
!pip install boto3
!pip install sagemaker

Collecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 26.5 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 331 kB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 41.0 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 45.9 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 50.8 MB/s 
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (192 kB)


In [None]:
import os
import json
import s3fs
import boto3
import sagemaker 
import transformers
import pandas as pd
from ast import literal_eval
from datasets import load_dataset
from datasets import Dataset
from datasets import ClassLabel, Sequence
from sklearn.model_selection import train_test_split



## 2. Permissions

In [None]:
# Execution environment selector. The next cell branches on this value to set
# up storage access and define DATA_DIR; accepted values are listed inline.
system = "COLAB" #["AWS", "COLAB"]

In [None]:
# Configure storage access for the selected environment and define DATA_DIR,
# the root location that every later cell reads from and writes to.
if system == "AWS":
    fs = s3fs.S3FileSystem()
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f's3://{s3_bucket}/model-data/govner-data'
    # Show what is available in the data directory.
    for f in fs.ls(DATA_DIR):
        print(f)

    # The SageMaker session manages interactions with the Amazon SageMaker APIs
    # and any other AWS services needed. Its bucket is used for uploading data,
    # models and logs. The bucket name is always given here, so the session is
    # built once with it (the former "fall back to the default bucket" branch
    # could never run).
    sagemaker_session_bucket = s3_bucket
    role = sagemaker.get_execution_role()
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sess.default_bucket()}")
    print(f"sagemaker session region: {sess.boto_region_name}")
elif system == "COLAB":
    # Imported here rather than at the top because it is only needed (and
    # presumably only available) when running on Colab.
    from google.colab import drive
    drive.mount("/content/gdrive")
    #DATA_DIR = os.path.join("/content/gdrive/My Drive", "NER/Data")
    DATA_DIR = os.path.join("/content/gdrive/Shareddrives/", "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data")
else:
    # Fail fast: otherwise DATA_DIR stays undefined and a later cell dies with
    # an unrelated-looking NameError.
    raise ValueError(f'Unknown system {system!r}; expected "AWS" or "COLAB"')


Mounted at /content/gdrive


In [None]:
DATA_DIR

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data'

## 3. Load Dataset

In [None]:
# Location of the labelled NER CSV inside DATA_DIR. The parts are joined with
# "/" (not os.path.join) so the result is the same for local and s3:// roots.
file_name = "govuk-labelled-data-ner.csv"
file_path = "/".join((DATA_DIR, file_name))
print(file_path)

/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/govuk-labelled-data-ner.csv


In [None]:
# Load the label-name -> integer-id mapping that accompanies the dataset.
label_map_name = "new_label_map.json"

label_map_path = f"{DATA_DIR}/{label_map_name}"

print(label_map_path)

# The built-in open() cannot read s3:// URLs, so on AWS the file is opened
# through the s3fs filesystem created in the permissions cell.
open_label_map = fs.open if system == "AWS" else open
with open_label_map(label_map_path, "r") as f:
    label_map = json.load(f)

/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/new_label_map.json


In [None]:
label_map

{'CONTACT': 1,
 'DATE': 2,
 'EVENT': 3,
 'FINANCE': 4,
 'FORM': 5,
 'LOCATION': 6,
 'MISC': 7,
 'MONEY': 8,
 'O': 0,
 'ORGANIZATION': 9,
 'PERSON': 10,
 'SCHEME': 11,
 'STATE': 12}

In [None]:
# Read the labelled data. List-valued columns (tokens, labels, label ids) come
# back as strings here and are parsed into Python lists in a later cell.
df = pd.read_csv(file_path)

In [None]:
df.head()

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file,new_label_list_id
0,"However , to be clear , all of the other check...","['However', ',', 'to', 'be', 'clear', ',', 'al...","[[41, 47, 'EVENT'], [81, 90, 'PERSON']]",,,/government/speeches/business-secretarys-state...,True,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",line_by_line_NER_data_sampled_12062020_more_en...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, ..."
1,Organisations that met certain criteria during...,"['Organisations', 'that', 'met', 'certain', 'c...","[[65, 69, 'DATE'], [77, 85, 'FORM']]",,,/guidance/crc-energy-efficiency-scheme-qualifi...,True,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATE...",line_by_line_NER_data_sampled_09062020_more_en...,"[0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 5, 0, 0, 0, ..."
2,Use this leaflet and form to find out about pa...,"['Use', 'this', 'leaflet', 'and', 'form', 'to'...","[[21, 25, 'FORM'], [44, 50, 'FINANCE'], [51, 8...",,,/government/publications/social-security-abroa...,True,"['O', 'O', 'O', 'O', 'FORM', 'O', 'O', 'O', 'O...",line_by_line_NER_data_sampled_12062020_more_en...,"[0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 4, 4, 4, 0, 6, ..."
3,Legal visits must be booked by email : legalvi...,"['Legal', 'visits', 'must', 'be', 'booked', 'b...","[[31, 36, 'CONTACT'], [82, 94, 'PERSON'], [103...",,,/guidance/liverpool-prison,True,"['O', 'O', 'O', 'O', 'O', 'O', 'CONTACT', 'O',...",line_by_line_NER_data_sampled_12062020_more_en...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 10, 10, 0..."
4,RPC checklist for Small and Micro Business Ass...,"['RPC', 'checklist', 'for', 'Small', 'and', 'M...","[[0, 3, 'ORGANIZATION'], [34, 42, 'ORGANIZATIO...",,,/government/publications/small-and-micro-busin...,True,"['ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'ORG...",line_by_line_NER_data_sampled_12062020_more_en...,"[9, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 2, 2, 0]"


Evaluate literals

In [None]:
def _parse_literal(value):
    """Parse a stringified Python literal; return already-parsed values unchanged.

    Passing non-strings through keeps this cell safe to re-run: calling
    literal_eval on a value that is already a list would raise.
    """
    return literal_eval(value) if isinstance(value, str) else value


# Convert the stringified list columns into real Python lists.
for col in ['text_token', 'label_list', 'new_label_list_id']:
    print(col)
    df[col] = df[col].map(_parse_literal)

text_token
label_list
new_label_list_id


Trim DataFrame to only the useful columns.

In [None]:
# Keep only the token lists and their parallel label-id lists.
KEEP_COLUMNS = ['text_token', 'new_label_list_id']
df_trim = df[KEEP_COLUMNS]

In [None]:
# Convert the trimmed DataFrame into a Hugging Face Dataset.
hf_dataset = Dataset.from_pandas(df_trim)

In [None]:
hf_dataset

Dataset({
    features: ['text_token', 'new_label_list_id'],
    num_rows: 347216
})

## 4. Dataset Exploration

In [None]:
# Inspect one example: its tokens followed by the parallel list of label ids.
example_idx = 9
print(hf_dataset['text_token'][example_idx])
print(hf_dataset['new_label_list_id'][example_idx])

['New', 'Style', 'JSA', 'is', 'a', 'fortnightly', 'payment', 'that', 'can', 'be', 'claimed', 'on', 'its', 'own', 'or', 'at', 'the', 'same', 'time', 'as', 'Universal', 'Credit', '.']
[0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 4, 4, 0]


In [None]:
# Print the feature type currently recorded for each column.
for column in ('text_token', 'new_label_list_id'):
    print(f"{column}: {hf_dataset.features[column]}")
print()

text_token: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
new_label_list_id: Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)



In [None]:
# Order the label names by their integer id so that labels[i] is the name for
# id i. Taking the keys in JSON file order only works if the file happens to be
# written in id order; otherwise every ClassLabel name would be silently wrong.
labels = sorted(label_map, key=label_map.get)
# The ids must be exactly 0..n-1 for the positional mapping to be valid.
assert [label_map[name] for name in labels] == list(range(len(labels))), \
    "label ids must be contiguous and start at 0"
print(len(labels))
print(labels)

13
['O', 'CONTACT', 'DATE', 'EVENT', 'FINANCE', 'FORM', 'LOCATION', 'MISC', 'MONEY', 'ORGANIZATION', 'PERSON', 'SCHEME', 'STATE']


In [None]:
# Declare the label-id column as a sequence of ClassLabel values. cast_column
# returns a new dataset whose stored schema and feature metadata agree, which
# editing `hf_dataset.features` in place does not guarantee. The number of
# classes is derived from `labels` rather than hard-coded.
hf_dataset = hf_dataset.cast_column(
    'new_label_list_id',
    Sequence(ClassLabel(names=labels)),
)

In [None]:
hf_dataset.features['new_label_list_id']

Sequence(feature=ClassLabel(num_classes=13, names=['O', 'CONTACT', 'DATE', 'EVENT', 'FINANCE', 'FORM', 'LOCATION', 'MISC', 'MONEY', 'ORGANIZATION', 'PERSON', 'SCHEME', 'STATE'], names_file=None, id=None), length=-1, id=None)

In [None]:
# Read the label names back from the column's inner (per-token) feature.
label_feature = hf_dataset.features["new_label_list_id"].feature
label_list = label_feature.names
label_list

['O',
 'CONTACT',
 'DATE',
 'EVENT',
 'FINANCE',
 'FORM',
 'LOCATION',
 'MISC',
 'MONEY',
 'ORGANIZATION',
 'PERSON',
 'SCHEME',
 'STATE']

## 8. Train/Eval/Test Splits

We must split the data into Training, Evaluation and Test splits.

The CoNLL dataset has the following splits:
* Training: 14,041
* Evaluation: 3,250
* Test: 3,454

In [None]:
# Example counts of the three CoNLL splits listed in the markdown above.
conll_training = dict(name='training', total=14041)
conll_evaluation = dict(name='evaluation', total=3250)
conll_test = dict(name='test', total=3454)

# Combined size across all three splits.
total = sum(
    split['total']
    for split in (conll_training, conll_evaluation, conll_test)
)
total

20745

In [None]:
# Record and print each CoNLL split's share of the total, as a percentage.
for split in (conll_training, conll_evaluation, conll_test):
    split['proportion'] = (split['total'] / total) * 100
    print(split['name'], split['proportion'])

training 67.68377922390938
evaluation 15.66642564473367
test 16.649795131356953


In [None]:
hf_dataset

Dataset({
    features: ['text_token', 'new_label_list_id'],
    num_rows: 347216
})

In [None]:
# Split into train (85%) and test (15%) with a fixed seed.
# NOTE(review): this rebinds hf_dataset from a Dataset to the DatasetDict shown
# in the next output, so the cell cannot be re-run without first re-running
# the cells that build hf_dataset.
hf_dataset = hf_dataset.train_test_split(train_size=0.85, seed=42)

In [None]:
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['text_token', 'new_label_list_id'],
        num_rows: 295133
    })
    test: Dataset({
        features: ['text_token', 'new_label_list_id'],
        num_rows: 52083
    })
})

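Add validation split. (Currently disabled: the code in the next cell is commented out, so only the train and test splits are saved.)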

In [None]:
# hf_dataset_clean = hf_dataset["train"].train_test_split(train_size=0.8, seed=42)
# # Rename the default "test" split to "validation"
# hf_dataset_clean["validation"] = hf_dataset_clean.pop("test")
# # Add the "test" set to our `DatasetDict`
# hf_dataset_clean["test"] = hf_dataset["test"]

In [None]:
# hf_dataset_clean

## 9. Upload splits to gdrive

Now that the dataset has been processed, we save the splits to `DATA_DIR` (Google Drive when running on Colab).

In [None]:
# Directory name (under DATA_DIR) for the saved full dataset.
dataset_name = "hf_govuk_data"

In [None]:
# Full destination path for the saved dataset, displayed for confirmation.
dataset_name_path = f'{DATA_DIR}/{dataset_name}'
dataset_name_path

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/hf_govuk_data'

In [None]:
# Save the whole split dataset (both train and test) under dataset_name_path.
# NOTE(review): when DATA_DIR is an s3:// URL (system == "AWS"), some datasets
# releases need the remote filesystem passed explicitly - confirm against the
# installed version before relying on this in AWS mode.
hf_input_path = dataset_name_path
hf_dataset.save_to_disk(hf_input_path)

Flattening the indices:   0%|          | 0/296 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/53 [00:00<?, ?ba/s]

## 10. Download Splits

In [None]:
# Only load_from_disk is needed in this section. load_dataset is already
# imported at the top of the notebook, and load_metric was unused here and is
# no longer provided by later `datasets` releases, so importing it can make
# this cell fail with the unpinned install.
from datasets import load_from_disk

# Same directory name the full dataset was saved under above.
hf_data = 'hf_govuk_data'

hf_data_path = f'{DATA_DIR}/{hf_data}'
hf_data_path

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/hf_govuk_data'

In [None]:
# Reload the saved splits from disk.
# NOTE(review): the variable name `datasets` matches the Hugging Face package
# name; a later `import datasets` in this kernel would rebind it.
datasets = load_from_disk(hf_data_path)

In [None]:
datasets

DatasetDict({
    train: Dataset({
        features: ['new_label_list_id', 'text_token'],
        num_rows: 295133
    })
    test: Dataset({
        features: ['new_label_list_id', 'text_token'],
        num_rows: 52083
    })
})

In [None]:
datasets['train'].features

{'new_label_list_id': Sequence(feature=ClassLabel(num_classes=13, names=['O', 'CONTACT', 'DATE', 'EVENT', 'FINANCE', 'FORM', 'LOCATION', 'MISC', 'MONEY', 'ORGANIZATION', 'PERSON', 'SCHEME', 'STATE'], names_file=None, id=None), length=-1, id=None),
 'text_token': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

## 11. Make smaller sampled dataset

In [None]:
# Same column selection as the earlier trimming cell, repeated at the start of
# this section.
df_trim = df[['text_token', 'new_label_list_id']]

In [None]:
# Draw a reproducible 10,000-row sample. The fixed random_state makes
# Restart & Run All produce the same rows, and drop=True discards the old row
# index directly instead of adding it as a column that then has to be removed.
samp_df_trim = df_trim.sample(10000, random_state=42).reset_index(drop=True)
samp_df_trim.shape

(10000, 2)

In [None]:
samp_df_trim.head()

Unnamed: 0,text_token,new_label_list_id
0,"[The, newspaper, did, not, say, how, it, obtai...","[0, 9, 0, 0, 0, 0, 0, 0, 0, 7, 0]"
1,"[This, is, necessary, for, wider, prescribing,...","[0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 2, 0]"
2,"[PDF, ,, 330KB, ,, 4, pages, Overview, 3, :, C...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[All, travellers, coming, from, endemic, count...","[0, 10, 0, 0, 0, 6, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0]"
4,"[Children, ’, s, visits, are, where, children,...","[10, 0, 0, 0, 0, 0, 10, 0, 0, 10, 0, 3, 0, 0, ..."


In [None]:
# Convert the sampled DataFrame into a Hugging Face Dataset.
samp_hf_dataset = Dataset.from_pandas(samp_df_trim)

In [None]:
samp_hf_dataset

Dataset({
    features: ['text_token', 'new_label_list_id'],
    num_rows: 10000
})

In [None]:
samp_hf_dataset[0]

{'new_label_list_id': [0, 9, 0, 0, 0, 0, 0, 0, 0, 7, 0],
 'text_token': ['The',
  'newspaper',
  'did',
  'not',
  'say',
  'how',
  'it',
  'obtained',
  'the',
  'photos',
  '.']}

In [None]:
# Declare the label-id column as a sequence of ClassLabel values. cast_column
# returns a new dataset whose stored schema and feature metadata agree, which
# editing `samp_hf_dataset.features` in place does not guarantee. The number of
# classes is derived from `labels` rather than hard-coded.
samp_hf_dataset = samp_hf_dataset.cast_column(
    'new_label_list_id',
    Sequence(ClassLabel(names=labels)),
)

In [None]:
samp_hf_dataset.features['new_label_list_id']

Sequence(feature=ClassLabel(num_classes=13, names=['O', 'CONTACT', 'DATE', 'EVENT', 'FINANCE', 'FORM', 'LOCATION', 'MISC', 'MONEY', 'ORGANIZATION', 'PERSON', 'SCHEME', 'STATE'], names_file=None, id=None), length=-1, id=None)

In [None]:
# Read the label names back from the sampled dataset's inner (per-token) feature.
samp_label_feature = samp_hf_dataset.features["new_label_list_id"].feature
label_list = samp_label_feature.names
label_list

['O',
 'CONTACT',
 'DATE',
 'EVENT',
 'FINANCE',
 'FORM',
 'LOCATION',
 'MISC',
 'MONEY',
 'ORGANIZATION',
 'PERSON',
 'SCHEME',
 'STATE']

In [None]:
# Split the sample into train (85%) and test (15%) with a fixed seed.
# NOTE(review): this rebinds samp_hf_dataset to the DatasetDict shown in the
# next output, so the cell cannot be re-run without rebuilding the sample first.
samp_hf_dataset = samp_hf_dataset.train_test_split(train_size=0.85, seed=42)

In [None]:
samp_hf_dataset

DatasetDict({
    train: Dataset({
        features: ['text_token', 'new_label_list_id'],
        num_rows: 8500
    })
    test: Dataset({
        features: ['text_token', 'new_label_list_id'],
        num_rows: 1500
    })
})

Now that the sampled dataset has been processed, we save its splits to `DATA_DIR` (Google Drive when running on Colab).

In [None]:
# Directory name (under DATA_DIR) for the saved sampled dataset.
samp_dataset_name = "samp_hf_govuk_data"

In [None]:
# Full destination path for the sampled dataset, displayed for confirmation.
samp_dataset_path = f'{DATA_DIR}/{samp_dataset_name}'
samp_dataset_path

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/samp_hf_govuk_data'

In [None]:
# Save the sampled split dataset (both train and test) under samp_dataset_path.
# NOTE(review): when DATA_DIR is an s3:// URL (system == "AWS"), some datasets
# releases need the remote filesystem passed explicitly - confirm against the
# installed version before relying on this in AWS mode.
samp_hf_input_path = samp_dataset_path
samp_hf_dataset.save_to_disk(samp_hf_input_path)

Flattening the indices:   0%|          | 0/9 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/2 [00:00<?, ?ba/s]