# COLAB_LabelledDataAnalysis



This is a notebook to prepare the labelled token dataset for HuggingFace.

## 1. Installs and Imports

In [None]:
# Uncomment to install these packages on a fresh runtime; all five are
# imported in the next cell.
# !pip install datasets
# !pip install transformers
# !pip install s3fs
# !pip install boto3
# !pip install sagemaker

In [None]:
import json
import os
from ast import literal_eval
from collections import Counter

import boto3
import numpy as np
import pandas as pd
import s3fs
import sagemaker
import transformers
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Raise pandas' display limits: show up to 500 rows and never hide columns.
for _option_name, _option_value in {
    "display.max_rows": 500,
    "display.max_columns": None,
}.items():
    pd.set_option(_option_name, _option_value)

## 2. Permissions

In [None]:
# Execution environment. Selects which branch of the setup cell below runs,
# and therefore where DATA_DIR points.
system = "COLAB"  # ["AWS", "COLAB"]

In [None]:
# Environment-specific setup: define DATA_DIR (and, on AWS, the S3 filesystem
# handle `fs` plus the SageMaker session/role) according to `system`.
if system == "AWS":
    fs = s3fs.S3FileSystem()
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f"s3://{s3_bucket}/model-data/govner-data"
    # Print the contents of the data directory.
    for s3_path in fs.ls(DATA_DIR):
        print(s3_path)
    # Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
    # sagemaker session bucket -> used for uploading data, models and logs
    # sagemaker will automatically create this bucket if it does not exist
    sess = sagemaker.Session()
    sagemaker_session_bucket = s3_bucket
    # With s3_bucket hard-coded above this fallback is never taken; it only
    # matters if the bucket name is changed to None.
    if sagemaker_session_bucket is None and sess is not None:
        # set to default bucket if a bucket name is not given
        sagemaker_session_bucket = sess.default_bucket()

    role = sagemaker.get_execution_role()
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sess.default_bucket()}")
    print(f"sagemaker session region: {sess.boto_region_name}")
elif system == "COLAB":
    # Imported inside this branch so the AWS path never needs google.colab.
    from google.colab import drive

    drive.mount("/content/gdrive")
    # DATA_DIR = os.path.join("/content/gdrive/My Drive", "NER/Data")
    DATA_DIR = os.path.join(
        "/content/gdrive/Shareddrives/",
        "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data",
    )
else:
    # Fail here rather than with a NameError on DATA_DIR in a later cell.
    raise ValueError(f"Unknown system {system!r}; expected 'AWS' or 'COLAB'.")

In [None]:
DATA_DIR

## 3. Load Dataset

In [None]:
# File name of the dataset inside DATA_DIR (a Drive folder on Colab, an
# s3:// prefix on AWS).
dataset_name = "line_by_line_NER_data_combined.csv"

# Full path/URL that is passed to pd.read_csv in the next cell.
dataset_path = f"{DATA_DIR}/{dataset_name}"

In [None]:
# Despite the .csv extension, the file is parsed as tab-separated.
df = pd.read_csv(dataset_path, sep="\t")

In [None]:
df.shape

In [None]:
df.head()

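Parse the list-valued columns (`text_token`, `labels`, `label_list`) from their string form back into Python lists using `literal_eval`.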

In [None]:
# Convert the stringified list columns into real Python lists.
for list_column in ["text_token", "labels", "label_list"]:
    # Values that are already lists are left alone so the cell can be re-run
    # without literal_eval raising on non-string input. Anything else
    # (including NaN) still goes through literal_eval and fails loudly.
    df[list_column] = df[list_column].apply(
        lambda cell: cell if isinstance(cell, list) else literal_eval(cell)
    )

In [None]:
print(df["text_token"][0])
print(df["text_token"][0][0])

## Labelled Counts

Now compute, for each row, a 0/1 flag per label indicating whether that label occurs anywhere in the row's `label_list`.

In [None]:
# Load the label map JSON that sits next to the dataset in DATA_DIR.
label_map_path = f"{DATA_DIR}/new_label_map.json"

# Built-in open() cannot read s3:// URLs, so go through the S3 filesystem
# handle when running on AWS. The file is read as bytes in both cases;
# json.load detects the UTF encoding itself, independent of the locale.
_open_label_map = fs.open if system == "AWS" else open
with _open_label_map(label_map_path, "rb") as f:
    data = json.load(f)

label_map = data

In [None]:
# Label names are the keys of the label map, in the map's own order.
labels = [label_name for label_name in label_map]

In [None]:
labels

In [None]:
test_labs = ["O", "O", "FINANCE", "PERSON", "O", "O", "FINANCE", "O"]

In [None]:
# Small working sample for prototyping. It is copied so that columns added
# to it later do not write into a slice of `df` (SettingWithCopyWarning).
df_mini = df.head(5).copy()
df_mini

In [None]:
df_mini

In [None]:
# Prototype of the per-label presence flags on the small sample: one 0/1
# column per distinct label found in `label_list`.
mlb = MultiLabelBinarizer()
label_flags = pd.DataFrame(
    mlb.fit_transform(df_mini["label_list"]),
    columns=mlb.classes_,
    index=df_mini.index,
)
# Drop flag columns left over from a previous run of this cell; join() raises
# on overlapping column names otherwise. This is a no-op on the first run.
df_mini = df_mini.drop(columns=label_flags.columns, errors="ignore").join(label_flags)

In [None]:
df_mini

In [None]:
# Print each row's label list. Iterating the values avoids the integer
# label lookup, which only works while the index happens to be 0..n-1.
for row_labels in df_mini["label_list"]:
    print(row_labels)

In [None]:
def occurence_flag(df, column):
    """Add one 0/1 presence column per distinct label found in ``df[column]``.

    Each cell of ``df[column]`` is expected to be an iterable of labels. The
    labels are binarized with ``MultiLabelBinarizer`` and the resulting
    indicator columns (named after the labels) are joined onto the frame.

    Args:
        df: Input DataFrame. It is not modified.
        column: Name of the column holding the per-row label collections.

    Returns:
        A new DataFrame with the original columns followed by one indicator
        column per label class.
    """
    mlb = MultiLabelBinarizer()
    flags = pd.DataFrame(
        mlb.fit_transform(df[column]),
        columns=mlb.classes_,
        index=df.index,
    )
    # If the frame already carries flag columns (e.g. from an earlier call),
    # replace them instead of letting join() raise on overlapping names.
    # The source column itself is never dropped.
    stale = [name for name in flags.columns if name in df.columns and name != column]
    return df.drop(columns=stale).join(flags)

In [None]:
df_counts = occurence_flag(df_mini, column="label_list")

In [None]:
df_counts

Apply `occurence_flag` to the full dataframe.

In [None]:
df_counts = occurence_flag(df, column="label_list")

In [None]:
df_counts.head(20)

## Inspect Elements

In [None]:
main_df = pd.DataFrame()

In [None]:
# Build a review set: for every label, draw a random sample of the rows in
# which that label occurs, and tag each drawn row with the label it was
# sampled for.
SAMPLE_SIZE = 2000  # maximum rows drawn per label
SAMPLE_SEED = 42  # fixed seed so the exported sample is reproducible

label_samples = []
for label in labels:
    print(label)
    label_rows = df_counts[df_counts[label] == 1]
    # sample() raises when asked for more rows than exist, so cap the size
    # at the number of rows actually available for this label.
    label_sample = label_rows.sample(
        n=min(SAMPLE_SIZE, len(label_rows)), random_state=SAMPLE_SEED
    ).assign(sample=label)
    label_samples.append(label_sample)

# DataFrame.append was removed in pandas 2.0; concatenate once at the end
# instead of growing the frame inside the loop.
main_df = pd.concat(label_samples) if label_samples else pd.DataFrame()

In [None]:
main_df.shape

In [None]:
def _pair_tokens_with_labels(row):
    """Return the row's tokens zipped with its label_list as a list of tuples."""
    return list(zip(row["text_token"], row["label_list"]))


# One (token, label) pair per token, to make the annotations easy to read.
main_df["zip_tok_ent"] = main_df.apply(_pair_tokens_with_labels, axis=1)

In [None]:
# Renumber the rows 0..n-1 and discard the old index in one step, rather
# than materialising it as an "index" column and then dropping that column.
main_df = main_df.reset_index(drop=True)
main_df

In [None]:
main_df["zip_tok_ent"][:5]

In [None]:
# Write the sampled breakdown next to the input data. `index` is a boolean
# flag; False states the intent explicitly rather than relying on None being falsy.
main_df.to_excel(f"{DATA_DIR}/NER_data_combined_BREAKDOWN.xlsx", index=False)