# 015_COLAB_JanuaryTrainingDataPrep



This is a notebook to prepare the datasets for the training jobs run in January 2022 for project playback.

## 1. Installs and Imports

In [1]:
!pip install datasets transformers torch seqeval &> /dev/null

In [7]:
import os
import json
import transformers
import numpy as np
import pandas as pd
from ast import literal_eval
from collections import Counter
from datasets import Dataset, load_dataset, load_from_disk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

## 2. Permissions

In [2]:
system = "COLAB" #["AWS", "COLAB"]

In [41]:
if system=="COLAB":
  from google.colab import drive
  drive.mount("/content/gdrive")
  #DATA_DIR = os.path.join("/content/gdrive/Shareddrives/", "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Jan2022-Data")
  DATA_DIR = "/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Feb2022-Data"


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [42]:
DATA_DIR

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Feb2022-Data'

## 3. Load Datasets

* The combined dataset (~230,000)
* The validated dataset (~5,500)

## 3_. HF Datasets

In [None]:
validated_hf = load_from_disk('/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/validated_ner_hf')
unvalidated_hf = load_from_disk('/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/hf_govuk_data')
samp_unvalidated_hf = load_from_disk('/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/samp_hf_govuk_data')

In [None]:
validated_hf

DatasetDict({
    train: Dataset({
        features: ['label_list', 'new_label_list_id', 'text_token'],
        num_rows: 5703
    })
    test: Dataset({
        features: ['label_list', 'new_label_list_id', 'text_token'],
        num_rows: 1426
    })
})

In [None]:
for i in validated_hf['train'][0]:
  print(validated_hf['train'][i])

[['O', 'O', 'O', 'O', 'O', 'O', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'O', 'I-EVENT', 'O', 'O', 'O', 'I-EVENT', 'I-EVENT', 'I-EVENT', 'I-EVENT', 'I-EVENT', 'I-EVENT', 'O'], ['O', 'O', 'O', 'O', 'I-ORGANIZATION', 'O', 'O', 'I-ORGANIZATION', 'O', 'I-ORGANIZATION', 'I-ORGANIZATION', 'O'], ['I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-FINANCE', 'I-FINANCE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PERSON', 'I-PERSON', 'O', 'I-CONTACT', 'I-CONTACT', 'I-CONTACT', 'I-CONTACT', 'I-CONTACT', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-CONTACT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOCATION', 'I-LOCATION', 'O']

In [None]:
unvalidated_hf

DatasetDict({
    train: Dataset({
        features: ['new_label_list_id', 'text_token'],
        num_rows: 295133
    })
    test: Dataset({
        features: ['new_label_list_id', 'text_token'],
        num_rows: 52083
    })
})

In [None]:
for i in unvalidated_hf['train'][0]:
  print(unvalidated_hf['train'][i])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
samp_unvalidated_hf

DatasetDict({
    train: Dataset({
        features: ['new_label_list_id', 'text_token'],
        num_rows: 8500
    })
    test: Dataset({
        features: ['new_label_list_id', 'text_token'],
        num_rows: 1500
    })
})

### 3A. Validated Dataset

In [5]:
# s3 key prefix for the data
validated = 'govuk-labelled-data-ner-validated.csv'
validated_path = f'{DATA_DIR}/{validated}'
validated_path

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Jan2022-Data/govuk-labelled-data-ner-validated.csv'

In [8]:
validated_df = pd.read_csv(validated_path, sep=',')
print(validated_df.shape)
validated_df.head()

(7129, 6)


Unnamed: 0.1,Unnamed: 0,text,labels,labelled_entities,label_list,text_tokens
0,0,If you decide not to be paid Child Benefit you...,"[[29, 42, 'FINANCE'], [48, 70, 'FINANCE']]","[('Child Benefit', 'FINANCE'), ('Guardian ’ s ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-FINANCE...","['If', 'you', 'decide', 'not', 'to', 'be', 'pa..."
1,1,Moving somewhere to study does not count as no...,"[[7, 16, 'LOCATION']]","[('somewhere', 'LOCATION')]","['O', 'I-LOCATION', 'O', 'O', 'O', 'O', 'O', '...","['Moving', 'somewhere', 'to', 'study', 'does',..."
2,2,Your partner must apply to their own employer ...,"[[5, 12, 'PERSON'], [37, 45, 'ORGANIZATION'], ...","[('partner', 'PERSON'), ('employer', 'ORGANIZA...","['O', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'I-...","['Your', 'partner', 'must', 'apply', 'to', 'th..."
3,3,You have to pay tax on it if your income is ov...,"[[34, 40, 'FINANCE'], [53, 71, 'FINANCE']]","[('income', 'FINANCE'), ('Personal Allowance',...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['You', 'have', 'to', 'pay', 'tax', 'on', 'it'..."
4,4,Apply for Widowed Parent ’ s Allowance within ...,"[[10, 38, 'FINANCE'], [46, 54, 'DATE'], [70, 7...","[('Widowed Parent ’ s Allowance', 'FINANCE'), ...","['O', 'O', 'I-FINANCE', 'I-FINANCE', 'I-FINANC...","['Apply', 'for', 'Widowed', 'Parent', '’', 's'..."


Literal eval

In [9]:
for i in ['labels', 'labelled_entities', 'label_list', 'text_tokens']:
  validated_df[i] = validated_df[i].apply(lambda x: literal_eval(x))

In [10]:
validated_df = validated_df[~validated_df.text.duplicated()]

In [11]:
validated_df = validated_df.iloc[:, 1:]

In [12]:
validated_df = validated_df.reset_index()
validated_df = validated_df.iloc[:, 1:]

In [13]:
validated_df

Unnamed: 0,text,labels,labelled_entities,label_list,text_tokens
0,If you decide not to be paid Child Benefit you...,"[[29, 42, FINANCE], [48, 70, FINANCE]]","[(Child Benefit, FINANCE), (Guardian ’ s Allow...","[O, O, O, O, O, O, O, I-FINANCE, I-FINANCE, O,...","[If, you, decide, not, to, be, paid, Child, Be..."
1,Moving somewhere to study does not count as no...,"[[7, 16, LOCATION]]","[(somewhere, LOCATION)]","[O, I-LOCATION, O, O, O, O, O, O, O, O, O, O]","[Moving, somewhere, to, study, does, not, coun..."
2,Your partner must apply to their own employer ...,"[[5, 12, PERSON], [37, 45, ORGANIZATION], [64,...","[(partner, PERSON), (employer, ORGANIZATION), ...","[O, I-PERSON, O, O, O, O, O, I-ORGANIZATION, O...","[Your, partner, must, apply, to, their, own, e..."
3,You have to pay tax on it if your income is ov...,"[[34, 40, FINANCE], [53, 71, FINANCE]]","[(income, FINANCE), (Personal Allowance, FINAN...","[O, O, O, O, O, O, O, O, O, I-FINANCE, O, O, O...","[You, have, to, pay, tax, on, it, if, your, in..."
4,Apply for Widowed Parent ’ s Allowance within ...,"[[10, 38, FINANCE], [46, 54, DATE], [70, 75, E...","[(Widowed Parent ’ s Allowance, FINANCE), (3 m...","[O, O, I-FINANCE, I-FINANCE, I-FINANCE, I-FINA...","[Apply, for, Widowed, Parent, ’, s, Allowance,..."
...,...,...,...,...,...
5900,It ’ s against the law for a school or other e...,"[[29, 35, ORGANIZATION], [45, 63, ORGANIZATION...","[(school, ORGANIZATION), (education provider, ...","[O, O, O, O, O, O, O, O, I-ORGANIZATION, O, O,...","[It, ’, s, against, the, law, for, a, school, ..."
5901,This may mean you have difficulty getting a mo...,"[[44, 52, FINANCE], [65, 69, LOCATION]]","[(mortgage, FINANCE), (home, LOCATION)]","[O, O, O, O, O, O, O, O, I-FINANCE, O, O, O, I...","[This, may, mean, you, have, difficulty, getti..."
5902,The qualified person can include you if they a...,"[[4, 20, PERSON], [62, 81, STATE]]","[(qualified person, PERSON), (permanent reside...","[O, I-PERSON, I-PERSON, O, O, O, O, O, O, O, O...","[The, qualified, person, can, include, you, if..."
5903,You must see a border officer when you arrive ...,"[[15, 29, PERSON], [53, 55, LOCATION]]","[(border officer, PERSON), (UK, LOCATION)]","[O, O, O, O, I-PERSON, I-PERSON, O, O, O, O, O...","[You, must, see, a, border, officer, when, you..."


In [17]:
validated_labelmap = {"O": 0,
                      "I-CONTACT": 1,
                      "I-DATE": 2,
                      "I-EVENT": 3,
                      "I-FINANCE": 4,
                      "I-FORM": 5,
                      "I-LOCATION": 6,
                      "I-MISC": 7,
                      "I-MONEY": 8,
                      "I-ORGANIZATION": 9,
                      "I-PERSON": 10,
                      "I-SCHEME": 11,
                      "I-STATE": 12}

In [18]:
def list_map_func(listy, mapping):
  new_list = []
  for i in listy:
    new_list.append(mapping[i])
  return new_list

In [19]:
validated_df['new_label_list_id'] = validated_df['label_list'].apply(lambda x: list_map_func(x, validated_labelmap))

In [20]:
validated_df.head(10)

Unnamed: 0,text,labels,labelled_entities,label_list,text_tokens,new_label_list_id
0,If you decide not to be paid Child Benefit you...,"[[29, 42, FINANCE], [48, 70, FINANCE]]","[(Child Benefit, FINANCE), (Guardian ’ s Allow...","[O, O, O, O, O, O, O, I-FINANCE, I-FINANCE, O,...","[If, you, decide, not, to, be, paid, Child, Be...","[0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 4, 4, 4, 4, 0, ..."
1,Moving somewhere to study does not count as no...,"[[7, 16, LOCATION]]","[(somewhere, LOCATION)]","[O, I-LOCATION, O, O, O, O, O, O, O, O, O, O]","[Moving, somewhere, to, study, does, not, coun...","[0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Your partner must apply to their own employer ...,"[[5, 12, PERSON], [37, 45, ORGANIZATION], [64,...","[(partner, PERSON), (employer, ORGANIZATION), ...","[O, I-PERSON, O, O, O, O, O, I-ORGANIZATION, O...","[Your, partner, must, apply, to, their, own, e...","[0, 10, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 9, 0, 5, 0]"
3,You have to pay tax on it if your income is ov...,"[[34, 40, FINANCE], [53, 71, FINANCE]]","[(income, FINANCE), (Personal Allowance, FINAN...","[O, O, O, O, O, O, O, O, O, I-FINANCE, O, O, O...","[You, have, to, pay, tax, on, it, if, your, in...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 4, 0]"
4,Apply for Widowed Parent ’ s Allowance within ...,"[[10, 38, FINANCE], [46, 54, DATE], [70, 75, E...","[(Widowed Parent ’ s Allowance, FINANCE), (3 m...","[O, O, I-FINANCE, I-FINANCE, I-FINANCE, I-FINA...","[Apply, for, Widowed, Parent, ’, s, Allowance,...","[0, 0, 4, 4, 4, 4, 4, 0, 2, 2, 0, 0, 0, 0, 3, ..."
5,This page is also available in Welsh ( Cymraeg...,"[[31, 36, MISC], [39, 46, MISC]]","[(Welsh, MISC), (Cymraeg, MISC)]","[O, O, O, O, O, O, I-MISC, O, I-MISC, O, O]","[This, page, is, also, available, in, Welsh, (...","[0, 0, 0, 0, 0, 0, 7, 0, 7, 0, 0]"
6,Contact the National Concessionary Fuel Office...,"[[12, 46, ORGANIZATION], [49, 53, ORGANIZATION]]","[(National Concessionary Fuel Office, ORGANIZA...","[O, O, I-ORGANIZATION, I-ORGANIZATION, I-ORGAN...","[Contact, the, National, Concessionary, Fuel, ...","[0, 0, 9, 9, 9, 9, 0, 9, 0, 0, 0, 0]"
7,You may want to get legal advice or contact Th...,"[[44, 92, ORGANIZATION]]",[(The Human Fertilisation and Embryology Autho...,"[O, O, O, O, O, O, O, O, O, I-ORGANIZATION, I-...","[You, may, want, to, get, legal, advice, or, c...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, ..."
8,Your application will be sent to an independen...,"[[48, 63, ORGANIZATION], [105, 117, PERSON]]","[(fostering panel, ORGANIZATION), (foster care...","[O, O, O, O, O, O, O, O, I-ORGANIZATION, I-ORG...","[Your, application, will, be, sent, to, an, in...","[0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 0, 0, 0, 0, 0, ..."
9,Once the Child Maintenance Service calculates ...,"[[9, 34, ORGANIZATION], [50, 61, FINANCE], [69...","[(Child Maintenance Service, ORGANIZATION), (m...","[O, O, I-ORGANIZATION, I-ORGANIZATION, I-ORGAN...","[Once, the, Child, Maintenance, Service, calcu...","[0, 0, 9, 9, 9, 0, 0, 4, 0, 4, 0, 0, 0, 0, 10, 0]"


**Individual**:

O                 101033

I-ORGANIZATION      5503

I-FINANCE           4409

I-PERSON            3410

I-FORM              2715

I-EVENT             2267

I-DATE              1945

I-STATE             1482

I-LOCATION          1431

I-MISC              1064

I-CONTACT            987


**Multi-word**

ORGANIZATION    3096

PERSON          2883

FINANCE         2648

FORM            1366

EVENT           1324

LOCATION        1029

DATE             791

CONTACT          669

STATE            663

MISC             503

Take 20% of the validated data for testing.

In [21]:
validated_train, validated_test = train_test_split(validated_df, test_size=0.2, random_state=43)

In [22]:
validated_test

Unnamed: 0,text,labels,labelled_entities,label_list,text_tokens,new_label_list_id
4956,Contact your local council for more information .,"[[19, 26, ORGANIZATION]]","[(council, ORGANIZATION)]","[O, O, O, I-ORGANIZATION, O, O, O, O]","[Contact, your, local, council, for, more, inf...","[0, 0, 0, 9, 0, 0, 0, 0]"
5486,Long - term Staff Apply for this visa if you ’...,"[[0, 17, PERSON], [33, 37, FORM], [141, 148, O...","[(Long - term Staff, PERSON), (visa, FORM), (c...","[I-PERSON, I-PERSON, I-PERSON, I-PERSON, O, O,...","[Long, -, term, Staff, Apply, for, this, visa,...","[10, 10, 10, 10, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0,..."
398,Pay the court fee Pay by credit or debit card ...,"[[8, 17, FINANCE], [25, 45, FINANCE], [67, 79,...","[(court fee, FINANCE), (credit or debit card, ...","[O, O, I-FINANCE, I-FINANCE, O, O, I-FINANCE, ...","[Pay, the, court, fee, Pay, by, credit, or, de...","[0, 0, 4, 4, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, ..."
495,After you ’ ve signed the declaration or form ...,"[[26, 37, FINANCE], [75, 82, PERSON], [95, 106...","[(declaration, FINANCE), (members, PERSON), (l...","[O, O, O, O, O, O, I-FINANCE, O, O, O, O, O, O...","[After, you, ’, ve, signed, the, declaration, ...","[0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1366,There ’ s a different form for individuals and...,"[[31, 42, PERSON], [47, 56, ORGANIZATION], [68...","[(individuals, PERSON), (companies, ORGANIZATI...","[O, O, O, O, O, O, O, I-PERSON, O, I-ORGANIZAT...","[There, ’, s, a, different, form, for, individ...","[0, 0, 0, 0, 0, 0, 0, 10, 0, 9, 0, 0, 4, 4, 4,..."
...,...,...,...,...,...,...
150,Child Benefit can only be backdated for up to ...,"[[0, 13, FINANCE], [26, 45, EVENT], [46, 54, D...","[(Child Benefit, FINANCE), (backdated for up t...","[I-FINANCE, I-FINANCE, O, O, O, I-EVENT, I-EVE...","[Child, Benefit, can, only, be, backdated, for...","[4, 4, 0, 0, 0, 3, 3, 3, 3, 2, 2, 0]"
4091,If you only get the family element of Child Ta...,"[[38, 54, FINANCE], [60, 93, EVENT]]","[(Child Tax Credit, FINANCE), (payments will b...","[O, O, O, O, O, O, O, O, I-FINANCE, I-FINANCE,...","[If, you, only, get, the, family, element, of,...","[0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 0, 3, 3, 3, ..."
1877,You must apply to the court within 5 working d...,"[[22, 27, ORGANIZATION]]","[(court, ORGANIZATION)]","[O, O, O, O, O, I-ORGANIZATION, O, O, O, O, O,...","[You, must, apply, to, the, court, within, 5, ...","[0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2865,You must give a declaration to each charity yo...,"[[16, 27, CONTACT], [36, 43, ORGANIZATION]]","[(declaration, CONTACT), (charity, ORGANIZATION)]","[O, O, O, O, I-CONTACT, O, O, I-ORGANIZATION, ...","[You, must, give, a, declaration, to, each, ch...","[0, 0, 0, 0, 1, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
mlb = MultiLabelBinarizer()
validated_test_counts = validated_test.copy()
validated_test_counts['label_list_enc'] = validated_test_counts['label_list']
validated_test_counts = validated_test_counts.join(pd.DataFrame(mlb.fit_transform(validated_test_counts.pop('label_list_enc')),
                                                                columns=mlb.classes_,
                                                                index=validated_test_counts.index))

In [None]:
validated_test_counts

Unnamed: 0,text,labels,labelled_entities,label_list,text_tokens,new_label_list_id,I-CONTACT,I-DATE,I-EVENT,I-FINANCE,I-FORM,I-LOCATION,I-MISC,I-ORGANIZATION,I-PERSON,I-STATE,O
4956,Contact your local council for more information .,"[[19, 26, ORGANIZATION]]","[(council, ORGANIZATION)]","[O, O, O, I-ORGANIZATION, O, O, O, O]","[Contact, your, local, council, for, more, inf...","[0, 0, 0, 8, 0, 0, 0, 0]",0,0,0,0,0,0,0,1,0,0,1
5486,Long - term Staff Apply for this visa if you ’...,"[[0, 17, PERSON], [33, 37, FORM], [141, 148, O...","[(Long - term Staff, PERSON), (visa, FORM), (c...","[I-PERSON, I-PERSON, I-PERSON, I-PERSON, O, O,...","[Long, -, term, Staff, Apply, for, this, visa,...","[9, 9, 9, 9, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,1,0,0,1,1,0,1
398,Pay the court fee Pay by credit or debit card ...,"[[8, 17, FINANCE], [25, 45, FINANCE], [67, 79,...","[(court fee, FINANCE), (credit or debit card, ...","[O, O, I-FINANCE, I-FINANCE, O, O, I-FINANCE, ...","[Pay, the, court, fee, Pay, by, credit, or, de...","[0, 0, 4, 4, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, ...",0,0,0,1,1,0,0,0,0,0,1
495,After you ’ ve signed the declaration or form ...,"[[26, 37, FINANCE], [75, 82, PERSON], [95, 106...","[(declaration, FINANCE), (members, PERSON), (l...","[O, O, O, O, O, O, I-FINANCE, O, O, O, O, O, O...","[After, you, ’, ve, signed, the, declaration, ...","[0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1,0,0,0,0,1,0,1
1366,There ’ s a different form for individuals and...,"[[31, 42, PERSON], [47, 56, ORGANIZATION], [68...","[(individuals, PERSON), (companies, ORGANIZATI...","[O, O, O, O, O, O, O, I-PERSON, O, I-ORGANIZAT...","[There, ’, s, a, different, form, for, individ...","[0, 0, 0, 0, 0, 0, 0, 9, 0, 8, 0, 0, 4, 4, 4, ...",0,0,0,1,0,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,Child Benefit can only be backdated for up to ...,"[[0, 13, FINANCE], [26, 45, EVENT], [46, 54, D...","[(Child Benefit, FINANCE), (backdated for up t...","[I-FINANCE, I-FINANCE, O, O, O, I-EVENT, I-EVE...","[Child, Benefit, can, only, be, backdated, for...","[4, 4, 0, 0, 0, 3, 3, 3, 3, 2, 2, 0]",0,1,1,1,0,0,0,0,0,0,1
4091,If you only get the family element of Child Ta...,"[[38, 54, FINANCE], [60, 93, EVENT]]","[(Child Tax Credit, FINANCE), (payments will b...","[O, O, O, O, O, O, O, O, I-FINANCE, I-FINANCE,...","[If, you, only, get, the, family, element, of,...","[0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 0, 3, 3, 3, ...",0,0,1,1,0,0,0,0,0,0,1
1877,You must apply to the court within 5 working d...,"[[22, 27, ORGANIZATION]]","[(court, ORGANIZATION)]","[O, O, O, O, O, I-ORGANIZATION, O, O, O, O, O,...","[You, must, apply, to, the, court, within, 5, ...","[0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,0,0,0,0,1,0,0,1
2865,You must give a declaration to each charity yo...,"[[16, 27, CONTACT], [36, 43, ORGANIZATION]]","[(declaration, CONTACT), (charity, ORGANIZATION)]","[O, O, O, O, I-CONTACT, O, O, I-ORGANIZATION, ...","[You, must, give, a, declaration, to, each, ch...","[0, 0, 0, 0, 1, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,0,0,0,0,1,0,0,1


In [None]:
for i in ['I-CONTACT', 'I-DATE', 'I-EVENT', 'I-FINANCE', 'I-FORM', 'I-LOCATION', 'I-MISC', 'I-ORGANIZATION', 'I-PERSON', 'I-STATE',	'O']:
  print(i, validated_test_counts[i].sum())

I-CONTACT 104
I-DATE 105
I-EVENT 198
I-FINANCE 305
I-FORM 177
I-LOCATION 144
I-MISC 56
I-ORGANIZATION 424
I-PERSON 394
I-STATE 87
O 1180


In [23]:
validated_train

Unnamed: 0,text,labels,labelled_entities,label_list,text_tokens,new_label_list_id
4623,You ’ ll need to restore your company to claim...,"[[30, 37, ORGANIZATION]]","[(company, ORGANIZATION)]","[O, O, O, O, O, O, O, I-ORGANIZATION, O, O, O,...","[You, ’, ll, need, to, restore, your, company,...","[0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, ..."
2336,An accounting period is 3 months unless you ’ ...,"[[24, 32, DATE], [63, 85, ORGANIZATION], [88, ...","[(3 months, DATE), (HM Revenue and Customs, OR...","[O, O, O, O, I-DATE, I-DATE, O, O, O, O, O, O,...","[An, accounting, period, is, 3, months, unless...","[0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 9, 9, 9, ..."
5061,Your search may return no results if HM Land R...,"[[5, 11, EVENT], [37, 53, ORGANIZATION], [83, ...","[(search, EVENT), (HM Land Registry, ORGANIZAT...","[O, I-EVENT, O, O, O, O, O, I-ORGANIZATION, I-...","[Your, search, may, return, no, results, if, H...","[0, 3, 0, 0, 0, 0, 0, 9, 9, 9, 0, 0, 0, 0, 0, ..."
2694,Your supplier may give you a form for this .,"[[5, 13, ORGANIZATION], [29, 33, FORM]]","[(supplier, ORGANIZATION), (form, FORM)]","[O, I-ORGANIZATION, O, O, O, O, I-FORM, O, O, O]","[Your, supplier, may, give, you, a, form, for,...","[0, 9, 0, 0, 0, 0, 5, 0, 0, 0]"
3087,You can apply for HM Revenue and Customs ( HMR...,"[[18, 40, ORGANIZATION]]","[(HM Revenue and Customs, ORGANIZATION)]","[O, O, O, O, I-ORGANIZATION, I-ORGANIZATION, I...","[You, can, apply, for, HM, Revenue, and, Custo...","[0, 0, 0, 0, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
307,This means you will not need to pay Income Tax...,"[[36, 46, FINANCE], [50, 55, FINANCE], [90, 10...","[(Income Tax, FINANCE), (money, FINANCE), (acc...","[O, O, O, O, O, O, O, O, I-FINANCE, I-FINANCE,...","[This, means, you, will, not, need, to, pay, I...","[0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 4, 0, 0, 0, ..."
2064,Example You were given income from the trust i...,"[[23, 29, FINANCE], [39, 44, FINANCE], [48, 59...","[(income, FINANCE), (trust, FINANCE), (August ...","[O, O, O, O, I-FINANCE, O, O, I-FINANCE, O, I-...","[Example, You, were, given, income, from, the,...","[0, 0, 0, 0, 4, 0, 0, 4, 0, 2, 2, 0]"
2325,Download a Bereavement Support Payment pack ( ...,"[[46, 55, FORM], [101, 115, ORGANIZATION]]","[(form BSP1, FORM), (Jobcentre Plus, ORGANIZAT...","[O, O, O, O, O, O, O, I-FORM, I-FORM, O, O, O,...","[Download, a, Bereavement, Support, Payment, p...","[0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, ..."
2303,Check with your pension provider to find out h...,"[[16, 32, ORGANIZATION], [61, 66, FINANCE]]","[(pension provider, ORGANIZATION), (taxed, FIN...","[O, O, O, I-ORGANIZATION, I-ORGANIZATION, O, O...","[Check, with, your, pension, provider, to, fin...","[0, 0, 0, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0]"


In [24]:
validated_test

Unnamed: 0,text,labels,labelled_entities,label_list,text_tokens,new_label_list_id
4956,Contact your local council for more information .,"[[19, 26, ORGANIZATION]]","[(council, ORGANIZATION)]","[O, O, O, I-ORGANIZATION, O, O, O, O]","[Contact, your, local, council, for, more, inf...","[0, 0, 0, 9, 0, 0, 0, 0]"
5486,Long - term Staff Apply for this visa if you ’...,"[[0, 17, PERSON], [33, 37, FORM], [141, 148, O...","[(Long - term Staff, PERSON), (visa, FORM), (c...","[I-PERSON, I-PERSON, I-PERSON, I-PERSON, O, O,...","[Long, -, term, Staff, Apply, for, this, visa,...","[10, 10, 10, 10, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0,..."
398,Pay the court fee Pay by credit or debit card ...,"[[8, 17, FINANCE], [25, 45, FINANCE], [67, 79,...","[(court fee, FINANCE), (credit or debit card, ...","[O, O, I-FINANCE, I-FINANCE, O, O, I-FINANCE, ...","[Pay, the, court, fee, Pay, by, credit, or, de...","[0, 0, 4, 4, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, ..."
495,After you ’ ve signed the declaration or form ...,"[[26, 37, FINANCE], [75, 82, PERSON], [95, 106...","[(declaration, FINANCE), (members, PERSON), (l...","[O, O, O, O, O, O, I-FINANCE, O, O, O, O, O, O...","[After, you, ’, ve, signed, the, declaration, ...","[0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1366,There ’ s a different form for individuals and...,"[[31, 42, PERSON], [47, 56, ORGANIZATION], [68...","[(individuals, PERSON), (companies, ORGANIZATI...","[O, O, O, O, O, O, O, I-PERSON, O, I-ORGANIZAT...","[There, ’, s, a, different, form, for, individ...","[0, 0, 0, 0, 0, 0, 0, 10, 0, 9, 0, 0, 4, 4, 4,..."
...,...,...,...,...,...,...
150,Child Benefit can only be backdated for up to ...,"[[0, 13, FINANCE], [26, 45, EVENT], [46, 54, D...","[(Child Benefit, FINANCE), (backdated for up t...","[I-FINANCE, I-FINANCE, O, O, O, I-EVENT, I-EVE...","[Child, Benefit, can, only, be, backdated, for...","[4, 4, 0, 0, 0, 3, 3, 3, 3, 2, 2, 0]"
4091,If you only get the family element of Child Ta...,"[[38, 54, FINANCE], [60, 93, EVENT]]","[(Child Tax Credit, FINANCE), (payments will b...","[O, O, O, O, O, O, O, O, I-FINANCE, I-FINANCE,...","[If, you, only, get, the, family, element, of,...","[0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 0, 3, 3, 3, ..."
1877,You must apply to the court within 5 working d...,"[[22, 27, ORGANIZATION]]","[(court, ORGANIZATION)]","[O, O, O, O, O, I-ORGANIZATION, O, O, O, O, O,...","[You, must, apply, to, the, court, within, 5, ...","[0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2865,You must give a declaration to each charity yo...,"[[16, 27, CONTACT], [36, 43, ORGANIZATION]]","[(declaration, CONTACT), (charity, ORGANIZATION)]","[O, O, O, O, I-CONTACT, O, O, I-ORGANIZATION, ...","[You, must, give, a, declaration, to, each, ch...","[0, 0, 0, 0, 1, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, ..."


## 3B. Unvalidated Dataset

In [28]:
# s3 key prefix for the data
unvalidated = 'govuk-labelled-data-ner.csv'
unvalidated_path = f'{DATA_DIR}/{unvalidated}'
unvalidated_path

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Jan2022-Data/govuk-labelled-data-ner.csv'

In [29]:
unvalidated_df = pd.read_csv(unvalidated_path, sep=',')
print(unvalidated_df.shape)
unvalidated_df = unvalidated_df[['text', 'text_token', 'labels', 'label_list', 'new_label_list_id']]
unvalidated_df = unvalidated_df.rename(columns={"text_token":"text_tokens"})
unvalidated_df.head()

(347216, 10)


Unnamed: 0,text,text_tokens,labels,label_list,new_label_list_id
0,"However , to be clear , all of the other check...","['However', ',', 'to', 'be', 'clear', ',', 'al...","[[41, 47, 'EVENT'], [81, 90, 'PERSON']]","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, ..."
1,Organisations that met certain criteria during...,"['Organisations', 'that', 'met', 'certain', 'c...","[[65, 69, 'DATE'], [77, 85, 'FORM']]","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATE...","[0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 5, 0, 0, 0, ..."
2,Use this leaflet and form to find out about pa...,"['Use', 'this', 'leaflet', 'and', 'form', 'to'...","[[21, 25, 'FORM'], [44, 50, 'FINANCE'], [51, 8...","['O', 'O', 'O', 'O', 'FORM', 'O', 'O', 'O', 'O...","[0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 4, 4, 4, 0, 6, ..."
3,Legal visits must be booked by email : legalvi...,"['Legal', 'visits', 'must', 'be', 'booked', 'b...","[[31, 36, 'CONTACT'], [82, 94, 'PERSON'], [103...","['O', 'O', 'O', 'O', 'O', 'O', 'CONTACT', 'O',...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 10, 10, 0..."
4,RPC checklist for Small and Micro Business Ass...,"['RPC', 'checklist', 'for', 'Small', 'and', 'M...","[[0, 3, 'ORGANIZATION'], [34, 42, 'ORGANIZATIO...","['ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'ORG...","[9, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 2, 2, 0]"


In [30]:
unvalidated_df = unvalidated_df[~unvalidated_df.text.duplicated()]

Literal eval

In [31]:
for i in ['labels', 'label_list', 'text_tokens', 'new_label_list_id']:
  unvalidated_df[i] = unvalidated_df[i].apply(lambda x: literal_eval(x))

In [32]:
unvalidated_df = unvalidated_df.reset_index()
unvalidated_df = unvalidated_df.iloc[:, 1:]

In [33]:
print(unvalidated_df.shape)
unvalidated_df.head()

(277345, 5)


Unnamed: 0,text,text_tokens,labels,label_list,new_label_list_id
0,"However , to be clear , all of the other check...","[However, ,, to, be, clear, ,, all, of, the, o...","[[41, 47, EVENT], [81, 90, PERSON]]","[O, O, O, O, O, O, O, O, O, O, EVENT, O, O, O,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, ..."
1,Organisations that met certain criteria during...,"[Organisations, that, met, certain, criteria, ...","[[65, 69, DATE], [77, 85, FORM]]","[O, O, O, O, O, O, O, O, DATE, O, O, FORM, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 5, 0, 0, 0, ..."
2,Use this leaflet and form to find out about pa...,"[Use, this, leaflet, and, form, to, find, out,...","[[21, 25, FORM], [44, 50, FINANCE], [51, 83, F...","[O, O, O, O, FORM, O, O, O, O, FINANCE, FINANC...","[0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 4, 4, 4, 0, 6, ..."
3,Legal visits must be booked by email : legalvi...,"[Legal, visits, must, be, booked, by, email, :...","[[31, 36, CONTACT], [82, 94, PERSON], [103, 10...","[O, O, O, O, O, O, CONTACT, O, O, O, O, O, PER...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 10, 10, 0..."
4,RPC checklist for Small and Micro Business Ass...,"[RPC, checklist, for, Small, and, Micro, Busin...","[[0, 3, ORGANIZATION], [34, 42, ORGANIZATION],...","[ORGANIZATION, O, O, O, O, O, ORGANIZATION, O,...","[9, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 2, 2, 0]"


In [34]:
unvalidated_labelmap = {"O": 0,
                        "I-CONTACT": 1,
                        "I-DATE": 2,
                        "I-EVENT": 3,
                        "I-FINANCE": 4,
                        "I-FORM": 5,
                        "I-LOC": 6,
                        "I-MISC": 7,
                        "I-MONEY": 8,
                        "I-ORG": 9,
                        "I-PER": 10,
                        "I-SCHEME": 11,
                        "I-STATE": 12}

## 4. Save files

### 4A. Save CSVs

In [35]:
print(validated_train.shape)
validated_train.head()

(4724, 6)


Unnamed: 0,text,labels,labelled_entities,label_list,text_tokens,new_label_list_id
4623,You ’ ll need to restore your company to claim...,"[[30, 37, ORGANIZATION]]","[(company, ORGANIZATION)]","[O, O, O, O, O, O, O, I-ORGANIZATION, O, O, O,...","[You, ’, ll, need, to, restore, your, company,...","[0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, ..."
2336,An accounting period is 3 months unless you ’ ...,"[[24, 32, DATE], [63, 85, ORGANIZATION], [88, ...","[(3 months, DATE), (HM Revenue and Customs, OR...","[O, O, O, O, I-DATE, I-DATE, O, O, O, O, O, O,...","[An, accounting, period, is, 3, months, unless...","[0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 9, 9, 9, ..."
5061,Your search may return no results if HM Land R...,"[[5, 11, EVENT], [37, 53, ORGANIZATION], [83, ...","[(search, EVENT), (HM Land Registry, ORGANIZAT...","[O, I-EVENT, O, O, O, O, O, I-ORGANIZATION, I-...","[Your, search, may, return, no, results, if, H...","[0, 3, 0, 0, 0, 0, 0, 9, 9, 9, 0, 0, 0, 0, 0, ..."
2694,Your supplier may give you a form for this .,"[[5, 13, ORGANIZATION], [29, 33, FORM]]","[(supplier, ORGANIZATION), (form, FORM)]","[O, I-ORGANIZATION, O, O, O, O, I-FORM, O, O, O]","[Your, supplier, may, give, you, a, form, for,...","[0, 9, 0, 0, 0, 0, 5, 0, 0, 0]"
3087,You can apply for HM Revenue and Customs ( HMR...,"[[18, 40, ORGANIZATION]]","[(HM Revenue and Customs, ORGANIZATION)]","[O, O, O, O, I-ORGANIZATION, I-ORGANIZATION, I...","[You, can, apply, for, HM, Revenue, and, Custo...","[0, 0, 0, 0, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, ..."


In [36]:
print(validated_test.shape)
validated_test.head()

(1181, 6)


Unnamed: 0,text,labels,labelled_entities,label_list,text_tokens,new_label_list_id
4956,Contact your local council for more information .,"[[19, 26, ORGANIZATION]]","[(council, ORGANIZATION)]","[O, O, O, I-ORGANIZATION, O, O, O, O]","[Contact, your, local, council, for, more, inf...","[0, 0, 0, 9, 0, 0, 0, 0]"
5486,Long - term Staff Apply for this visa if you ’...,"[[0, 17, PERSON], [33, 37, FORM], [141, 148, O...","[(Long - term Staff, PERSON), (visa, FORM), (c...","[I-PERSON, I-PERSON, I-PERSON, I-PERSON, O, O,...","[Long, -, term, Staff, Apply, for, this, visa,...","[10, 10, 10, 10, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0,..."
398,Pay the court fee Pay by credit or debit card ...,"[[8, 17, FINANCE], [25, 45, FINANCE], [67, 79,...","[(court fee, FINANCE), (credit or debit card, ...","[O, O, I-FINANCE, I-FINANCE, O, O, I-FINANCE, ...","[Pay, the, court, fee, Pay, by, credit, or, de...","[0, 0, 4, 4, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, ..."
495,After you ’ ve signed the declaration or form ...,"[[26, 37, FINANCE], [75, 82, PERSON], [95, 106...","[(declaration, FINANCE), (members, PERSON), (l...","[O, O, O, O, O, O, I-FINANCE, O, O, O, O, O, O...","[After, you, ’, ve, signed, the, declaration, ...","[0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1366,There ’ s a different form for individuals and...,"[[31, 42, PERSON], [47, 56, ORGANIZATION], [68...","[(individuals, PERSON), (companies, ORGANIZATI...","[O, O, O, O, O, O, O, I-PERSON, O, I-ORGANIZAT...","[There, ’, s, a, different, form, for, individ...","[0, 0, 0, 0, 0, 0, 0, 10, 0, 9, 0, 0, 4, 4, 4,..."


In [37]:
print(unvalidated_df.shape)
unvalidated_df.head()

(277345, 5)


Unnamed: 0,text,text_tokens,labels,label_list,new_label_list_id
0,"However , to be clear , all of the other check...","[However, ,, to, be, clear, ,, all, of, the, o...","[[41, 47, EVENT], [81, 90, PERSON]]","[O, O, O, O, O, O, O, O, O, O, EVENT, O, O, O,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, ..."
1,Organisations that met certain criteria during...,"[Organisations, that, met, certain, criteria, ...","[[65, 69, DATE], [77, 85, FORM]]","[O, O, O, O, O, O, O, O, DATE, O, O, FORM, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 5, 0, 0, 0, ..."
2,Use this leaflet and form to find out about pa...,"[Use, this, leaflet, and, form, to, find, out,...","[[21, 25, FORM], [44, 50, FINANCE], [51, 83, F...","[O, O, O, O, FORM, O, O, O, O, FINANCE, FINANC...","[0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 4, 4, 4, 0, 6, ..."
3,Legal visits must be booked by email : legalvi...,"[Legal, visits, must, be, booked, by, email, :...","[[31, 36, CONTACT], [82, 94, PERSON], [103, 10...","[O, O, O, O, O, O, CONTACT, O, O, O, O, O, PER...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 10, 10, 0..."
4,RPC checklist for Small and Micro Business Ass...,"[RPC, checklist, for, Small, and, Micro, Busin...","[[0, 3, ORGANIZATION], [34, 42, ORGANIZATION],...","[ORGANIZATION, O, O, O, O, O, ORGANIZATION, O,...","[9, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 2, 2, 0]"


In [38]:
DATA_DIR

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/Jan2022-Data'

In [45]:
validated_test.to_csv(os.path.join(DATA_DIR, 'Feb22-CSV/validated_test.csv'), index=None)
validated_train.to_csv(os.path.join(DATA_DIR, 'Feb22-CSV/validated_train.csv'), index=None)
unvalidated_df.to_csv(os.path.join(DATA_DIR, 'Feb22-CSV/unvalidated_train.csv'), index=None)

### 4B. Save HFs

Validated test

In [48]:
validated_test_df = pd.read_csv(os.path.join(DATA_DIR, 'Feb22-CSV/validated_test.csv'))

In [49]:
for i in ['labels', 'labelled_entities', 'label_list', 'text_tokens', 'new_label_list_id']:
  validated_test_df[i] = validated_test_df[i].apply(lambda x: literal_eval(x))
validated_test_df = validated_test_df[['text_tokens', 'new_label_list_id']]

In [50]:
# validated_test_hf = Dataset.from_csv(os.path.join(DATA_DIR, 'Jan22-CSV/validated_test.csv'))
validated_test_hf = Dataset.from_pandas(validated_test_df)

In [51]:
validated_test_hf.save_to_disk(os.path.join(DATA_DIR, 'Feb22-HF/validated_test'))

Validated train

In [52]:
validated_train_df = pd.read_csv(os.path.join(DATA_DIR, 'Feb22-CSV/validated_train.csv'))

In [53]:
for i in ['labels', 'labelled_entities', 'label_list', 'text_tokens', 'new_label_list_id']:
  validated_train_df[i] = validated_train_df[i].apply(lambda x: literal_eval(x))
validated_train_df = validated_train_df[['text_tokens', 'new_label_list_id']]

In [54]:
# validated_test_hf = Dataset.from_csv(os.path.join(DATA_DIR, 'Jan22-CSV/validated_test.csv'))
validated_train_hf = Dataset.from_pandas(validated_train_df)

In [55]:
validated_train_hf.save_to_disk(os.path.join(DATA_DIR, 'Feb22-HF/validated_train'))

Unvalidated train

In [56]:
unvalidated_train_df = pd.read_csv(os.path.join(DATA_DIR, 'Feb22-CSV/unvalidated_train.csv'))

In [57]:
for i in ['labels', 'label_list', 'text_tokens', 'new_label_list_id']:
  unvalidated_train_df[i] = unvalidated_train_df[i].apply(lambda x: literal_eval(x))
unvalidated_train_df = unvalidated_train_df[['text_tokens', 'new_label_list_id']]

In [58]:
unvalidated_train_hf = Dataset.from_pandas(unvalidated_train_df)

In [59]:
unvalidated_train_hf.save_to_disk(os.path.join(DATA_DIR, 'Feb22-HF/unvalidated_train'))

Unvalidated train sample


In [60]:
unvalidated_train_df_sample = unvalidated_train_df.sample(10000, random_state=43)
unvalidated_train_df_sample.shape

(10000, 2)

In [61]:
unvalidated_train_hf_sample = Dataset.from_pandas(unvalidated_train_df_sample)

In [62]:
unvalidated_train_hf_sample.save_to_disk(os.path.join(DATA_DIR, 'Feb22-HF/unvalidated_train_sample'))

### 4C. Save .json mappings

In [None]:
with open(os.path.join(DATA_DIR, "Jan22-HF/validated_labelmap.json"), "w") as outfile:
    json.dump(validated_labelmap, outfile)

In [None]:
with open(os.path.join(DATA_DIR, "Jan22-HF/unvalidated_labelmap.json"), "w") as outfile:
    json.dump(unvalidated_labelmap, outfile)

In [63]:
with open(os.path.join(DATA_DIR, "Feb22-HF/full_labelmap.json"), "w") as outfile:
    json.dump(unvalidated_labelmap, outfile)

In [None]:
validated_train_hf.load_from_disk(os.path.join(DATA_DIR, 'Jan22-HF/validated_train'))
validated_test_hf.load_from_disk(os.path.join(DATA_DIR, 'Jan22-HF/validated_test'))
validated_train_hf.load_from_disk(os.path.join(DATA_DIR, 'Jan22-HF/unvalidated_train'))

Dataset({
    features: ['text_tokens', 'new_label_list_id'],
    num_rows: 277345
})