# HuggingFace Data Preparation

This is a notebook to prepare the labelled token dataset for HuggingFace.

## 1. Installs and Imports

In [2]:
# !pip install datasets
# !pip install transformers
# !pip install s3fs
# !pip install boto3
# !pip install sagemaker

In [3]:
import os
import json
import s3fs
import boto3
import sagemaker 
import transformers
import pandas as pd
from ast import literal_eval
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split



## 2. Permissions

In [4]:
system = "COLAB"  # execution environment; supported values: "AWS", "COLAB"

In [5]:
# Configure storage access and DATA_DIR for the selected execution environment.
if system == "AWS":
    fs = s3fs.S3FileSystem()
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f's3://{s3_bucket}/model-data/govner-data'
    # List the data directory so a missing or unreadable location shows up straight away.
    for s3_entry in fs.ls(DATA_DIR):
        print(s3_entry)

    # The SageMaker session manages interactions with the SageMaker APIs and any
    # other AWS services needed. Its default bucket is used for uploading data,
    # models and logs; SageMaker creates the bucket automatically if it does not exist.
    # The bucket name is always given here, so no fallback to the account default is needed.
    sagemaker_session_bucket = s3_bucket
    role = sagemaker.get_execution_role()
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sess.default_bucket()}")
    print(f"sagemaker session region: {sess.boto_region_name}")
elif system == "COLAB":
    # Colab-only dependency, so it is imported here rather than with the other imports.
    from google.colab import drive
    drive.mount("/content/gdrive")
    #DATA_DIR = os.path.join("/content/gdrive/My Drive", "NER/Data")
    DATA_DIR = os.path.join("/content/gdrive/Shareddrives/", "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data")
else:
    # Fail here rather than with a NameError on DATA_DIR in a later cell.
    raise ValueError(f"Unknown system {system!r}; expected 'AWS' or 'COLAB'")


Mounted at /content/gdrive


In [9]:
DATA_DIR

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data'

## 3. Load Dataset

In [10]:
# s3 key prefix for the data

# File names of the two labelled NER exports stored under DATA_DIR.
dataset1_name, dataset2_name = (
    'line_by_line_NER_data_sampled_12062020_more_ents.csv',
    'line_by_line_NER_data_sampled_09062020_more_ents.csv',
)

# Full location of each export.
dataset1_path, dataset2_path = (
    f'{DATA_DIR}/{dataset1_name}',
    f'{DATA_DIR}/{dataset2_name}',
)

In [None]:
# Both exports are tab-delimited and are read with identical parser options.
read_options = {"sep": "\t", "low_memory": False}
dataset1 = pd.read_csv(dataset1_path, **read_options)
dataset2 = pd.read_csv(dataset2_path, **read_options)

## 4. Exploration

In [None]:
# Report the size of each frame, then the number of rows across both.
for frame_name, frame in (("dataset1", dataset1), ("dataset2", dataset2)):
    print(f"{frame_name} shape: {frame.shape}")

total_rows = len(dataset1) + len(dataset2)
print("total rows: {}".format(total_rows))

dataset1 shape: (236641, 8)
dataset2 shape: (110575, 8)
total rows: 347216


In [None]:
dataset1.sample(5)

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list
209876,External quality assurance of apprenticeships ...,"['External', 'quality', 'assurance', 'of', 'ap...","[[69, 75, 'ORGANIZATION']]",,,/government/news/external-quality-assurance-of...,True,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
33299,We will get back to you if that is the case fo...,"['We', 'will', 'get', 'back', 'to', 'you', 'if...","[[52, 60, 'ORGANIZATION']]",,,/guidance/money-laundering-regulations-supervi...,True,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
162553,Please send us your feedback on the design gui...,"['Please', 'send', 'us', 'your', 'feedback', '...","[[0, 6, 'MISC'], [43, 48, 'CONTACT']]",,,/government/publications/court-and-tribunal-de...,True,"['MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'C..."
23959,There is also a Plans Privacy Statement ( PDF ...,"['There', 'is', 'also', 'a', 'Plans', 'Privacy...","[[77, 83, 'CONTACT'], [98, 119, 'ORGANIZATION'...",,,/guidance/local-plans,True,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
46541,Antimicrobial consumption data : validation pr...,"['Antimicrobial', 'consumption', 'data', ':', ...","[[57, 60, 'ORGANIZATION'], [67, 73, 'FINANCE']]",,,/government/publications/antimicrobial-consump...,True,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORGANIZAT..."


In [None]:
dataset2.sample(5)

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list
42315,This is for providers who will directly delive...,"['This', 'is', 'for', 'providers', 'who', 'wil...","[[48, 62, 'EVENT'], [63, 71, 'STATE'], [76, 85...",,,/guidance/register-of-apprenticeship-training-...,True,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'EVEN..."
20042,Collection of this evaluation data is a condit...,"['Collection', 'of', 'this', 'evaluation', 'da...","[[40, 49, 'STATE'], [57, 64, 'FINANCE']]",,,/guidance/uk-asylum-migration-and-integration-...,True,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'STATE', '..."
7413,"If it will not be set up in time , you ’ ll ne...","['If', 'it', 'will', 'not', 'be', 'set', 'up',...","[[28, 32, 'EVENT'], [56, 63, 'DATE'], [64, 71,...",,,/guidance/pay-duty-on-biofuels-or-gas-for-road...,True,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'EVEN..."
60342,PHE is not responsible for the supply of PPE .,"['PHE', 'is', 'not', 'responsible', 'for', 'th...","[[0, 3, 'ORGANIZATION']]",,,/government/news/phe-response-to-a-sun-newspap...,True,"['ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O',..."
21509,CHIEF automatically reduces the balance on Sta...,"['CHIEF', 'automatically', 'reduces', 'the', '...","[[32, 39, 'FINANCE'], [123, 130, 'FINANCE'], [...",,,/guidance/automatic-import-and-export-licence-...,True,"['O', 'O', 'O', 'O', 'FINANCE', 'O', 'O', 'O',..."


Investigate some samples...

In [None]:
# Index label of the row picked for manual inspection.
row = 205652

# Look up each cell with a single (row, column) access.
text = dataset1.loc[row, 'text']
labels = dataset1.loc[row, 'labels']
print(text)
print(labels)

In [None]:
# Print every character with its offset so the label spans printed above can be checked by eye.
for position, character in enumerate(text):
    print(position, character)

Check for duplication...

In [None]:
# Outer-merge the two frames on all shared columns; the indicator column 'Exist'
# records whether a row was found in the left frame, the right frame, or both.
# The 'original_file' bookkeeping column (added later in the notebook) is dropped
# first: it always differs between the frames, so leaving it in would make every
# row look unique if this cell is re-run after that column exists.
diff_df = pd.merge(
    dataset1.drop(columns='original_file', errors='ignore'),
    dataset2.drop(columns='original_file', errors='ignore'),
    how='outer',
    indicator='Exist',
)

# Keep only the rows that appear in exactly one of the two files.
diff_df = diff_df.loc[diff_df['Exist'] != 'both']
print(diff_df.shape)

(257674, 9)


In [None]:
diff_df

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,Exist
388,The earliest that leave can be taken is 11 wee...,"['The', 'earliest', 'that', 'leave', 'can', 'b...","[[40, 48, 'DATE'], [60, 73, 'DATE'], [77, 87, ...",True,"[[107, 111, 'EVENT'], [60, 73, 'DATE'], [40, 4...",/employers-maternity-pay-leave,False,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATE...",left_only
632,Where you advertise might cause indirect discr...,"['Where', 'you', 'advertise', 'might', 'cause'...","[[90, 93, 'PERSON'], [98, 107, 'CONTACT']]",True,"[[98, 107, 'CONTACT'], [90, 93, 'PERSON']]",/employer-preventing-discrimination,False,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",left_only
1049,Decisions made before 1998 are listed by year ...,"['Decisions', 'made', 'before', '1998', 'are',...","[[22, 26, 'DATE'], [41, 45, 'DATE']]",True,"[[41, 45, 'DATE'], [22, 26, 'DATE']]",/search-patent-decisions,False,"['O', 'O', 'O', 'DATE', 'O', 'O', 'O', 'DATE',...",left_only
1222,Mae ’ n rhaid i chi wneud cais am Bensiwn y Wl...,"['Mae', '’', 'n', 'rhaid', 'i', 'chi', 'wneud'...","[[34, 55, 'LOCATION'], [101, 112, 'LOCATION'],...",True,"[[34, 55, 'LOCATION'], [101, 112, 'LOCATION'],...",/pensiwn-sylfaenol-y-wladwriaeth,False,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",left_only
1249,Officers Officers in your crew must have the n...,"['Officers', 'Officers', 'in', 'your', 'crew',...","[[0, 17, 'PERSON'], [26, 30, 'PERSON'], [55, 8...",True,"[[84, 88, 'FORM'], [55, 81, 'FORM'], [26, 30, ...",/hiring-crew,False,"['PERSON', 'PERSON', 'O', 'O', 'PERSON', 'O', ...",left_only
...,...,...,...,...,...,...,...,...,...
302440,The report comes as Afghan officials announced...,"['The', 'report', 'comes', 'as', 'Afghan', 'of...","[[4, 10, 'CONTACT'], [20, 26, 'STATE'], [56, 6...",,,Sentence: 47953,False,"['O', 'CONTACT', 'O', 'O', 'STATE', 'O', 'O', ...",right_only
302441,A U.S.-backed operation pushed the Taleban fro...,"['A', 'U.S.-backed', 'operation', 'pushed', 't...","[[35, 42, 'ORGANIZATION'], [68, 75, 'PERSON'],...",,,Sentence: 47954,False,"['O', 'O', 'O', 'O', 'O', 'ORGANIZATION', 'O',...",right_only
302442,Indian border security forces are accusing the...,"['Indian', 'border', 'security', 'forces', 'ar...","[[0, 6, 'STATE'], [49, 58, 'STATE'], [119, 125...",,,Sentence: 47955,False,"['STATE', 'O', 'O', 'O', 'O', 'O', 'O', 'STATE...",right_only
302443,Indian officials said no one was injured in Sa...,"['Indian', 'officials', 'said', 'no', 'one', '...","[[0, 6, 'STATE'], [44, 52, 'DATE'], [56, 64, '...",,,Sentence: 47956,False,"['STATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...",right_only


## 5. Concatenation

We will concatenate the DataFrames. They are likely stored separately for storage/memory reasons, but we will combine and shuffle them anyway. We will also add a flag recording which file each row originally came from, for later reference.

In [None]:
# Tag every row with the file it was loaded from. The names are taken from the
# variables used to load the data, so the tag cannot drift from the actual source.
dataset1['original_file'] = dataset1_name
dataset2['original_file'] = dataset2_name

In [None]:
dataset1.sample(5)

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file
207228,This includes supporting adoption of a Nationa...,"['This', 'includes', 'supporting', 'adoption',...","[[25, 33, 'EVENT'], [48, 60, 'ORGANIZATION'], ...",,,/government/speeches/uk-statement-in-response-...,True,"['O', 'O', 'O', 'EVENT', 'O', 'O', 'O', 'ORGAN...",line_by_line_NER_data_sampled_12062020_more_en...
187605,Company Tax Return guide ( CT600 Guide ( 2007 ...,"['Company', 'Tax', 'Return', 'guide', '(', 'CT...","[[0, 18, 'FORM'], [19, 24, 'CONTACT'], [41, 45...",,,/government/publications/corporation-tax-compa...,True,"['FORM', 'FORM', 'FORM', 'CONTACT', 'O', 'O', ...",line_by_line_NER_data_sampled_12062020_more_en...
69988,A Community Approach on the Prevention of Natu...,"['A', 'Community', 'Approach', 'on', 'the', 'P...","[[2, 11, 'PERSON']]",,,/government/publications/eu-funding-programmes...,True,"['O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', ...",line_by_line_NER_data_sampled_12062020_more_en...
25071,You should instead read the separate guide App...,"['You', 'should', 'instead', 'read', 'the', 's...","[[37, 42, 'CONTACT'], [43, 48, 'FORM'], [84, 9...",,,/guidance/childminders-report-new-adults-in-th...,True,"['O', 'O', 'O', 'O', 'O', 'O', 'CONTACT', 'FOR...",line_by_line_NER_data_sampled_12062020_more_en...
192100,Check if you need to pay excise duty on road f...,"['Check', 'if', 'you', 'need', 'to', 'pay', 'e...","[[21, 24, 'FINANCE'], [25, 36, 'FINANCE'], [72...",,,/guidance/register-for-road-fuel-gas-duty,True,"['O', 'O', 'O', 'O', 'O', 'FINANCE', 'FINANCE'...",line_by_line_NER_data_sampled_12062020_more_en...


In [None]:
dataset2.sample(5)

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file
110098,"Meanwhile , Ethiopia 's rival parties signed a...","['Meanwhile', ',', 'Ethiopia', ""'s"", 'rival', ...","[[12, 20, 'LOCATION'], [30, 37, 'PERSON'], [52...",,,Sentence: 47454,False,"['O', 'O', 'LOCATION', 'O', 'O', 'PERSON', 'O'...",line_by_line_NER_data_sampled_09062020_more_en...
31372,If your current scheme doesn ’ t qualify then ...,"['If', 'your', 'current', 'scheme', 'doesn', '...","[[16, 22, 'SCHEME'], [95, 101, 'DATE']]",,,/guidance/pension-rules-for-charities,True,"['O', 'O', 'O', 'SCHEME', 'O', 'O', 'O', 'O', ...",line_by_line_NER_data_sampled_09062020_more_en...
25749,Read the full DCMS Sectors Economic Estimates ...,"['Read', 'the', 'full', 'DCMS', 'Sectors', 'Ec...","[[14, 18, 'ORGANIZATION'], [51, 57, 'CONTACT']]",,,/government/news/digital-sector-worth-more-tha...,True,"['O', 'O', 'O', 'ORGANIZATION', 'O', 'O', 'O',...",line_by_line_NER_data_sampled_09062020_more_en...
17349,But the process for notifying the UK authoriti...,"['But', 'the', 'process', 'for', 'notifying', ...","[[8, 15, 'EVENT'], [30, 36, 'LOCATION'], [37, ...",,,/guidance/moving-balai-directive-animals-semen...,True,"['O', 'O', 'EVENT', 'O', 'O', 'LOCATION', 'LOC...",line_by_line_NER_data_sampled_09062020_more_en...
65618,We will carry out a wider review of the whole ...,"['We', 'will', 'carry', 'out', 'a', 'wider', '...","[[26, 32, 'CONTACT'], [59, 65, 'MISC']]",,,/guidance/sellafield-nuclear-regulation,True,"['O', 'O', 'O', 'O', 'O', 'O', 'CONTACT', 'O',...",line_by_line_NER_data_sampled_09062020_more_en...


Combine into one dataset.

In [None]:
# Stack the two frames row-wise; original index labels are kept as-is.
# NOTE(review): rows that occur in both files are not removed here. The duplication
# check earlier in the notebook suggests some overlap - confirm whether duplicates
# should be dropped before the data is used for training.
frames = [dataset1, dataset2]
concat = pd.concat(objs=frames, axis=0)
print(concat.shape)

(347216, 9)


Shuffle dataset.

In [None]:
# Fixed seed so that Restart & Run All reproduces the same row order (and therefore
# the same saved files) on every run.
SHUFFLE_SEED = 42

# frac=1 samples every row exactly once, i.e. a full shuffle; the index is then
# rebuilt as 0..n-1 so later positional lookups are well defined.
shuffled_df = concat.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
print(shuffled_df.shape)

(347216, 9)


Convert string list columns to list type.

In [None]:
def _parse_list_cell(cell):
    """Parse a stringified Python list; return an already-parsed list unchanged.

    Passing lists through makes this cell safe to re-run: without the check,
    a second run would hand a list to ``literal_eval`` and raise.
    Any other non-string value still raises, as before.
    """
    return cell if isinstance(cell, list) else literal_eval(cell)


# These columns are read from the CSV as string representations of lists.
for list_column in ('text_token', 'labels', 'label_list'):
    shuffled_df[list_column] = shuffled_df[list_column].apply(_parse_list_cell)

Save to CSV file.

In [None]:
# File name and full location (under DATA_DIR) for the combined, shuffled dataset.
combined_name = 'line_by_line_NER_data_combined.csv'
combined_path = f'{DATA_DIR}/{combined_name}'
combined_path  # display the resolved path

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/line_by_line_NER_data_combined.csv'

In [None]:
# Write the combined data tab-separated, like the input files, without the row index.
# `index` is documented as a boolean, so pass False explicitly rather than None.
shuffled_df.to_csv(combined_path, sep='\t', index=False)

## 6. Label map

In [None]:
# File names of the label maps that accompany the two dataset exports.
label_map1_name, label_map2_name = (
    'label_map_12062020_more_ents.json',
    'label_map_09062020_more_ents.json',
)

In [None]:
# Full location of each label map under DATA_DIR.
label_map1_path, label_map2_path = (
    f'{DATA_DIR}/{label_map1_name}',
    f'{DATA_DIR}/{label_map2_name}',
)

In [None]:
# Read the first label map from disk; this is only done when running on Colab.
if system == "COLAB":
    with open(label_map1_path, 'rb') as map_file:
        label_name_map = json.loads(map_file.read())
    print(label_name_map)

{'PAD': 0, 'O': 1, 'CONTACT': 2, 'DATE': 3, 'EVENT': 4, 'FINANCE': 5, 'FORM': 6, 'LOCATION': 7, 'MISC': 8, 'MONEY': 9, 'ORGANIZATION': 10, 'PERSON': 11, 'SCHEME': 12, 'STATE': 13}


In [None]:
# Read the second label map from disk (Colab only). This rebinds `label_name_map`,
# replacing whatever value it held before.
if system == "COLAB":
    with open(label_map2_path, 'rb') as map_file:
        label_name_map = json.loads(map_file.read())
    print(label_name_map)

{'PAD': 0, 'O': 1, 'CONTACT': 2, 'DATE': 3, 'EVENT': 4, 'FINANCE': 5, 'FORM': 6, 'LOCATION': 7, 'MISC': 8, 'MONEY': 9, 'ORGANIZATION': 10, 'PERSON': 11, 'SCHEME': 12, 'STATE': 13}


Alter label map.

Why:
* We don't need a label for 'PAD'; that will be added later.

In [None]:
# Label names in id order: each label's position in this tuple is its integer id.
# There is no 'PAD' entry, so 'O' takes id 0.
label_names = (
    'O',
    'CONTACT',
    'DATE',
    'EVENT',
    'FINANCE',
    'FORM',
    'LOCATION',
    'MISC',
    'MONEY',
    'ORGANIZATION',
    'PERSON',
    'SCHEME',
    'STATE',
)
new_label_map = {label: label_id for label_id, label in enumerate(label_names)}

Save new label map

In [None]:
# File name and full location (under DATA_DIR) for the re-indexed label map.
new_label_map_name = 'new_label_map.json'
new_label_map_path = f'{DATA_DIR}/{new_label_map_name}'

In [None]:
# Persist the new label map as JSON. On AWS, DATA_DIR is an s3:// URL, which the
# built-in open() cannot write to, so the S3 filesystem object created during setup
# is used there; everywhere else the path is a regular local file.
open_for_write = fs.open if system == "AWS" else open
with open_for_write(new_label_map_path, 'w') as fp:
    json.dump(new_label_map, fp)

In [None]:
shuffled_df.head()

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file
0,"However , to be clear , all of the other check...","[However, ,, to, be, clear, ,, all, of, the, o...","[[41, 47, EVENT], [81, 90, PERSON]]",,,/government/speeches/business-secretarys-state...,True,"[O, O, O, O, O, O, O, O, O, O, EVENT, O, O, O,...",line_by_line_NER_data_sampled_12062020_more_en...
1,Organisations that met certain criteria during...,"[Organisations, that, met, certain, criteria, ...","[[65, 69, DATE], [77, 85, FORM]]",,,/guidance/crc-energy-efficiency-scheme-qualifi...,True,"[O, O, O, O, O, O, O, O, DATE, O, O, FORM, O, ...",line_by_line_NER_data_sampled_09062020_more_en...
2,Use this leaflet and form to find out about pa...,"[Use, this, leaflet, and, form, to, find, out,...","[[21, 25, FORM], [44, 50, FINANCE], [51, 83, F...",,,/government/publications/social-security-abroa...,True,"[O, O, O, O, FORM, O, O, O, O, FINANCE, FINANC...",line_by_line_NER_data_sampled_12062020_more_en...
3,Legal visits must be booked by email : legalvi...,"[Legal, visits, must, be, booked, by, email, :...","[[31, 36, CONTACT], [82, 94, PERSON], [103, 10...",,,/guidance/liverpool-prison,True,"[O, O, O, O, O, O, CONTACT, O, O, O, O, O, PER...",line_by_line_NER_data_sampled_12062020_more_en...
4,RPC checklist for Small and Micro Business Ass...,"[RPC, checklist, for, Small, and, Micro, Busin...","[[0, 3, ORGANIZATION], [34, 42, ORGANIZATION],...",,,/government/publications/small-and-micro-busin...,True,"[ORGANIZATION, O, O, O, O, O, ORGANIZATION, O,...",line_by_line_NER_data_sampled_12062020_more_en...


In [None]:
# Take the label list of the row labelled 0 as a small example for the id conversion.
test = shuffled_df.at[0, 'label_list']
test

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'EVENT',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'PERSON',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [None]:
def label_list_id(labellist, dictionary):
    """Translate a sequence of label names into their ids.

    Args:
        labellist: iterable of label names.
        dictionary: mapping from label name to id.

    Returns:
        A new list holding ``dictionary[name]`` for each name, in input order.

    Raises:
        KeyError: if a name in ``labellist`` is not a key of ``dictionary``.
    """
    label_ids = []
    for label_name in labellist:
        label_ids.append(dictionary[label_name])
    return label_ids

In [None]:
label_list_id(labellist=test, dictionary=new_label_map)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 10,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [None]:
# Convert each row's label names to ids; the extra keyword argument is passed on
# by `apply` to `label_list_id` for every element.
shuffled_df['new_label_list_id'] = shuffled_df['label_list'].apply(label_list_id, dictionary=new_label_map)

In [None]:
shuffled_df.head()

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file,new_label_list_id
0,"However , to be clear , all of the other check...","[However, ,, to, be, clear, ,, all, of, the, o...","[[41, 47, EVENT], [81, 90, PERSON]]",,,/government/speeches/business-secretarys-state...,True,"[O, O, O, O, O, O, O, O, O, O, EVENT, O, O, O,...",line_by_line_NER_data_sampled_12062020_more_en...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, ..."
1,Organisations that met certain criteria during...,"[Organisations, that, met, certain, criteria, ...","[[65, 69, DATE], [77, 85, FORM]]",,,/guidance/crc-energy-efficiency-scheme-qualifi...,True,"[O, O, O, O, O, O, O, O, DATE, O, O, FORM, O, ...",line_by_line_NER_data_sampled_09062020_more_en...,"[0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 5, 0, 0, 0, ..."
2,Use this leaflet and form to find out about pa...,"[Use, this, leaflet, and, form, to, find, out,...","[[21, 25, FORM], [44, 50, FINANCE], [51, 83, F...",,,/government/publications/social-security-abroa...,True,"[O, O, O, O, FORM, O, O, O, O, FINANCE, FINANC...",line_by_line_NER_data_sampled_12062020_more_en...,"[0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 4, 4, 4, 0, 6, ..."
3,Legal visits must be booked by email : legalvi...,"[Legal, visits, must, be, booked, by, email, :...","[[31, 36, CONTACT], [82, 94, PERSON], [103, 10...",,,/guidance/liverpool-prison,True,"[O, O, O, O, O, O, CONTACT, O, O, O, O, O, PER...",line_by_line_NER_data_sampled_12062020_more_en...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 10, 10, 0..."
4,RPC checklist for Small and Micro Business Ass...,"[RPC, checklist, for, Small, and, Micro, Busin...","[[0, 3, ORGANIZATION], [34, 42, ORGANIZATION],...",,,/government/publications/small-and-micro-busin...,True,"[ORGANIZATION, O, O, O, O, O, ORGANIZATION, O,...",line_by_line_NER_data_sampled_12062020_more_en...,"[9, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 2, 2, 0]"


## 7. Save DataFrame to gdrive

In [None]:
# File name for the final labelled dataset.
save_df_name = 'govuk-labelled-data-ner.csv'

# Full output location under DATA_DIR.
save_df_path = f'{DATA_DIR}/{save_df_name}'

In [None]:
save_df_path

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/govuk-labelled-data-ner.csv'

In [None]:
# Write the final frame without the row index. `index` is documented as a boolean,
# so pass False explicitly rather than None.
# NOTE(review): this file uses the default comma separator, whereas the inputs and
# the combined file are tab-separated - confirm downstream readers expect that.
shuffled_df.to_csv(save_df_path, index=False)