# Literal Eval Conversions

In [None]:
# !pip install datasets
# !pip install transformers

In [None]:
import json
import os
from ast import literal_eval
from collections import OrderedDict

import pandas as pd
import transformers
from datasets import ClassLabel, Dataset, Sequence, load_dataset
from sklearn.model_selection import train_test_split

In [None]:
# Selects which environment's setup to run; only "COLAB" is handled below.
system = "COLAB"

if system == "COLAB":
    # Imported here rather than at the top so the import only runs in the COLAB branch.
    from google.colab import drive

    # Mount Google Drive under /content/gdrive so the data folder is reachable as a path.
    drive.mount("/content/gdrive")
    # DATA_DIR = os.path.join("/content/gdrive/My Drive", "NER/Data")
    # Base data folder on the shared drive; used below to locate the input CSV.
    DATA_DIR = os.path.join(
        "/content/gdrive/Shareddrives/",
        "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data",
    )
# NOTE: DATA_DIR is only assigned inside the COLAB branch; with any other
# `system` value the cells that read DATA_DIR will raise NameError.

## 1. New Label Map

In [None]:
# Integer id for each tag of the original (un-prefixed) labelling scheme.
# Ids follow the order of the names below, starting at 0 for "O".
old_label_map = {
    tag: tag_id
    for tag_id, tag in enumerate(
        [
            "O",
            "CONTACT",
            "DATE",
            "EVENT",
            "FINANCE",
            "FORM",
            "LOCATION",
            "MISC",
            "MONEY",
            "ORGANIZATION",
            "PERSON",
            "SCHEME",
            "STATE",
        ]
    )
}

In [None]:
# Translation table from the original tag names to their IO-scheme names
# ("O" stays "O"; every entity tag gains an "I-" prefix, some are shortened).
new_label_map_mapping = dict(
    CONTACT="I-CONTACT",
    DATE="I-DATE",
    EVENT="I-EVENT",
    FINANCE="I-FINANCE",
    FORM="I-FORM",
    LOCATION="I-LOC",
    MISC="I-MISC",
    MONEY="I-MONEY",
    O="O",
    ORGANIZATION="I-ORG",
    PERSON="I-PER",
    SCHEME="I-SCHEME",
    STATE="I-STATE",
)

## 2. Convert Label Convention

Read in dataset from CSV...

In [None]:
# File name of the labelled NER dataset and its full path inside DATA_DIR.
save_df_name = "govuk-labelled-data-ner.csv"
save_df_path = f"{DATA_DIR}/{save_df_name}"
save_df_path  # echo the resolved path in the cell output

In [None]:
# Load the labelled dataset from the CSV at save_df_path.
all_data = pd.read_csv(save_df_path)

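Evaluate literals: the `text_token`, `labels` and `label_list` columns are read from the CSV as strings, so parse them back into Python objects...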

In [None]:
# These columns hold Python literals serialised as strings in the CSV;
# parse each cell back into the object it represents.
for column in ("text_token", "labels", "label_list"):
    all_data[column] = all_data[column].apply(literal_eval)

In [None]:
# Preview the first rows to check the parsed columns.
all_data.head()

Cast the replacement tags. The tags need replacing in the `labels` and `label_list` columns...

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# the column rewrites applied to df_IO below also change all_data.
df_IO = all_data

In [None]:
def list_rep(lists, lookup):
    """Return a copy of ``lists`` with every string item translated via ``lookup``.

    Args:
        lists: Iterable of iterables (e.g. a list of lists).
        lookup: Mapping from each string found in the inner iterables to its
            replacement.

    Returns:
        A new list of lists with the same nesting. String items are replaced
        by ``lookup[item]``; items of any other type are kept unchanged.

    Raises:
        KeyError: If a string item is not a key of ``lookup``.
    """
    return [
        [lookup[item] if isinstance(item, str) else item for item in inner]
        for inner in lists
    ]

In [None]:
# Rewrite the tag strings inside every entry of each row's `labels` value.
df_IO["labels"] = df_IO["labels"].apply(
    lambda x: list_rep(x, lookup=new_label_map_mapping)
)
# Rewrite the flat per-row tag list. A strict lookup is used (not `.get`) so an
# unmapped tag raises KeyError, as list_rep does, instead of silently turning
# into None in the saved data.
df_IO["label_list"] = df_IO["label_list"].apply(
    lambda tags: [new_label_map_mapping[tag] for tag in tags]
)

In [None]:
# Preview the first ten rows after the tag replacement.
df_IO.head(10)

In [None]:
# Random subset of up to 10,000 rows. The size is capped at the frame length
# so a smaller dataset does not raise ValueError, and the seed is fixed (42, as
# in the train/test splits below) so the saved sample is reproducible.
samp_df_IO = df_IO.sample(n=min(10000, len(df_IO)), random_state=42)

Save the main dataset to file...

In [None]:
# Output folder for everything produced under the IO annotation scheme.
save_path = "/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/IO-annotation_scheme"
# Create the folder if it is missing so the write below cannot fail on it.
os.makedirs(save_path, exist_ok=True)
save_name = os.path.join(save_path, "i-o_labelled_data.csv")
# index=False is the documented way to leave the row index out of the CSV.
df_IO.to_csv(save_name, index=False)

Save the sampled dataset to file...

In [None]:
# Output folder for everything produced under the IO annotation scheme.
save_path = "/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/IO-annotation_scheme"
# Create the folder if it is missing so the write below cannot fail on it.
os.makedirs(save_path, exist_ok=True)
save_name = os.path.join(save_path, "i-o_labelled_data_samp.csv")
# index=False is the documented way to leave the row index out of the CSV.
samp_df_IO.to_csv(save_name, index=False)

Save the label map to a JSON file...

In [None]:
# Integer id for each tag of the IO labelling scheme.
# Ids follow the order of the names below, starting at 0 for "O".
label_map = {
    tag: tag_id
    for tag_id, tag in enumerate(
        [
            "O",
            "I-CONTACT",
            "I-DATE",
            "I-EVENT",
            "I-FINANCE",
            "I-FORM",
            "I-LOC",
            "I-MISC",
            "I-MONEY",
            "I-ORG",
            "I-PER",
            "I-SCHEME",
            "I-STATE",
        ]
    )
}

In [None]:
# Folder and full path for the JSON copy of the label map.
save_path = "/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/IO-annotation_scheme"
save_name = os.path.join(save_path, "i-o_label_map.json")

In [None]:
# Serialise the tag -> id mapping as JSON at save_name.
with open(save_name, "w") as json_file:
    json.dump(label_map, json_file)

## 3. Huggingface Preparation

### 3.1 Main Data

In [None]:
# Folder holding the IO-scheme outputs written earlier in the notebook.
file_path = "/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/IO-annotation_scheme"
# Join on file_path (defined on the line above), not on a `save_path` left
# over from an earlier cell, so this cell runs on its own.
file_name = os.path.join(file_path, "i-o_labelled_data.csv")
# NOTE(review): list-valued columns come back from CSV as strings; confirm
# whether they need literal_eval before the Hugging Face dataset is built.
df = pd.read_csv(file_name)

In [None]:
# Narrow view with just the token column and the label-id column.
# NOTE(review): `new_label_list_id` is not created in any cell visible here, so
# it presumably already exists in the saved CSV — confirm.
df_trim = df[["text_token", "new_label_list_id"]]
df_trim.head()

In [None]:
# Build the Hugging Face Dataset from `df`, i.e. the full frame.
# NOTE(review): `df_trim` is not used here — confirm whether the trimmed frame
# was the intended input.
hf_dataset = Dataset.from_pandas(df)

In [None]:
# Tag names in label_map's insertion order, which matches their integer ids.
labels = list(label_map)
labels

In [None]:
# Show the feature types inferred for the two modelling columns.
for j in ["text_token", "new_label_list_id"]:
    print("{}: {}".format(j, hf_dataset.features[j]))
print()

# Declare the label-id column as a variable-length (length=-1) sequence of
# class labels. The class count is taken from `labels` instead of a hard-coded
# 13, so the two cannot drift apart.
# NOTE(review): this edits the feature metadata in place; `cast_column` is the
# library's documented way to change a column's feature type — confirm this
# shortcut still behaves as intended on the installed `datasets` version.
hf_dataset.features["new_label_list_id"] = Sequence(
    ClassLabel(names=labels), length=-1, id=None
)

In [None]:
# Split into train/test with 85% of the rows in train; the seed is fixed so the
# split is repeatable.
hf_dataset = hf_dataset.train_test_split(train_size=0.85, seed=42)

In [None]:
# Display the split dataset in the cell output.
hf_dataset

In [None]:
# Folder name under which the full split dataset is saved.
dataset_name = "hf_govuk_data_i-o"

In [None]:
# Full output path: file_path joined with the dataset folder name.
dataset_name_path = f"{file_path}/{dataset_name}"
dataset_name_path  # echo the resolved path in the cell output

In [None]:
# Write hf_dataset (the result of the train/test split) to Google Drive.
hf_input_path = dataset_name_path
hf_dataset.save_to_disk(hf_input_path)

### 3.2 Sampled Data

In [None]:
# Folder holding the IO-scheme outputs written earlier in the notebook.
file_path = "/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data/IO-annotation_scheme"
# Join on file_path (defined on the line above), not on a `save_path` left
# over from an earlier cell, so this cell runs on its own.
file_name = os.path.join(file_path, "i-o_labelled_data_samp.csv")
# NOTE(review): list-valued columns come back from CSV as strings; confirm
# whether they need literal_eval before the Hugging Face dataset is built.
df = pd.read_csv(file_name)

In [None]:
# Build the Hugging Face Dataset from the sampled frame.
hf_dataset = Dataset.from_pandas(df)

In [None]:
# Split into train/test with 85% of the rows in train; the seed is fixed so the
# split is repeatable.
hf_dataset = hf_dataset.train_test_split(train_size=0.85, seed=42)

In [None]:
# Display the split dataset in the cell output.
hf_dataset

In [None]:
labels = list(label_map)

# By this point hf_dataset has been through train_test_split, which returns a
# DatasetDict; that container has no `.features`, so the feature edit has to be
# applied to each split. DatasetDict subclasses dict, so the isinstance check
# also lets this cell work on an unsplit Dataset.
splits = hf_dataset.values() if isinstance(hf_dataset, dict) else [hf_dataset]

for split in splits:
    # Show the feature types inferred for the two modelling columns.
    for j in ["text_token", "new_label_list_id"]:
        print("{}: {}".format(j, split.features[j]))
    print()

    # Declare the label-id column as a variable-length (length=-1) sequence of
    # class labels; the class count comes from `labels`, not a hard-coded 13.
    # NOTE(review): this edits the feature metadata in place; `cast_column` is
    # the library's documented way to change a column's feature type — confirm
    # this shortcut behaves as intended on the installed `datasets` version.
    split.features["new_label_list_id"] = Sequence(
        ClassLabel(names=labels), length=-1, id=None
    )

In [None]:
# Folder name under which the sampled split dataset is saved.
dataset_name = "samp_hf_govuk_data_i-o"

In [None]:
# Full output path: file_path joined with the dataset folder name.
dataset_name_path = f"{file_path}/{dataset_name}"
dataset_name_path  # echo the resolved path in the cell output

In [None]:
# Write hf_dataset (the result of the train/test split) to Google Drive.
hf_input_path = dataset_name_path
hf_dataset.save_to_disk(hf_input_path)