### Exploring Masking Algorithm

**Objective:** Mask an specific percentage of words that are label related using whole-word masking 

In [53]:
import pandas as pd
import math
import random
import numpy as np
import collections
from typing import Tuple
from ast import literal_eval
import re
import os

In [2]:
# Uncomment this cell if working in collab
# from google.colab import drive
# drive.mount('/content/drive')

# path = "/content/drive/MyDrive/Qbot-gpt/data"

In [3]:
# Uncomment this line if working locally
path = "/Users/algiraldoh/Development/qbot-gpt/data"

### 1. Load the necessary data

In [4]:

# labelled skills
labeled_skills_file = os.path.join(path, "skills_sample.csv")
df_skills = pd.read_csv(labeled_skills_file)
# filter some columns out
df_skills = df_skills.iloc[:,1:3].copy()

#  linkedin data
linkedin_file = os.path.join(path, "jobs_230612.csv")
df_desc = pd.read_csv(linkedin_file, escapechar="\\")

# matched testing data
matches_file = os.path.join(path,'matches.csv')
df_matches = pd.read_csv(matches_file)
df_matches['0_'] = df_matches['0'].apply(lambda x: literal_eval(str(x)))

### 2. Cleansing the data

Starting with the skills column from the skills_data df.

In [5]:
def skills_cleaning(
        df: pd.DataFrame,
        skills_column: str,
        label_column: str,
        re_preparent: re.Pattern=re.compile(r'^([\w\s]*) \('),
        re_parent: re.Pattern=re.compile(r'\(([a-z\s]+)\)'),
        re_hasparent: re.Pattern=re.compile(r'\([a-z\s]+\)'),
) -> pd.DataFrame:
    """
    Function to clean the skills scraped from LinkedIn. The function
    removes the "(programming language)" from the skills. It identifies
    further skills containing parentheses, the splits these in two rows.
    One row with the text before the parentheses and another row with
    text within the parentheses. It also gets rid of duplicate rows and
    rows with missing labels.

    :param df: pandas.DataFrame containing the skills to be cleaned
    :param skills_column: string value of the header of the column with
    the skills
    :param label_column: string value of the header of the column with
    the labels
    :param re_preparent: re.Pattern regex to match text in front of a
    parentheses
    :param re_parent: re.Pattern regex to match text inside a parentheses
    :param re_hasparent: re.Pattern to match if a skill has parentheses
    :return: pandas.DataFrame with the cleaned skills column
    """

    # copying the DF, so we won't overwrite the original
    dfc = df.copy()

    dfc = dfc.dropna(subset=label_column).reset_index(drop=True)

    dfc[skills_column] = dfc[skills_column].str.replace(
        ' (programming language)',
        '',
        regex=False,
    )
    # creating two arrays with the texts from before the parentheses and
    # from within the parentheses
    preparent = dfc[skills_column].str.extract(re_preparent)
    parent = dfc[skills_column].str.extract(re_parent)

    # creating a boolean mask to note which rows have parentheses
    mask = dfc[skills_column].str.contains(re_hasparent, regex=True)

    # if there are parentheses, we create a copy of the parentheses rows
    # and append it to the modified original DF
    if np.any(mask):
        df_new = dfc.loc[mask].copy()
        # the original parentheses rows will have the preparentheses text
        dfc.loc[mask, skills_column] = preparent.loc[mask, 0]
        # the new rows will have the post parentheses text
        df_new.loc[:, skills_column] = parent.loc[mask, 0]

        # appending the new DF to the old, also dropping duplicates
        return pd.concat([dfc, df_new]).drop_duplicates(subset=skills_column)
    else:
        # if no changes are needed, we just drop the duplicates
        return dfc.drop_duplicates(subset=skills_column)

In [6]:
def description_cleaning(
        df: pd.DataFrame,
        desc_column: str,
        re_punc: re.Pattern=re.compile(
            r'([!"#$%\'()*+,./:;<=>?@\]\[\\^_`{|}~])'
        ),
        re_spac: re.Pattern=re.compile(r'[\n\s]+'),
        re_apos: re.Pattern=re.compile(r'\' '),
) -> pd.DataFrame:
    """
    Function to clean the job descriptions, making them ready to be
    "tokenised" by the label creation function. It pads punctuation marks
    with spaces, eliminates redundant white spaces and removes the "about
    the job" header.

    :param df: pandas.DataFrame to clean
    :param desc_column: string value of the column header with the
    description to clean
    :param re_punc: re.Pattern regex to match punctuation marks
    :param re_spac: re.Pattern regex to match white spaces
    :param re_apos: re.Pattern regex to match apostrophes
    :return: pandas.DataFrame with the cleaned job description column
    """

    # copying the DF, so the original gets preserved
    dfc = df.copy()
    dfc[desc_column] = dfc[desc_column].str.lower()
    # padding punctuation marks with white spaces
    dfc[desc_column] = dfc[desc_column].str.replace(
        re_punc,
        r' \1 ', # inserting the first captured group (padded)
        regex=True,
    )
    # removing redundant white spaces
    dfc[desc_column] = dfc[desc_column].str.replace(re_spac, ' ', regex=True)
    # removing the second padding space from apostrophes. so "job ' s"
    # becomes "job 's"
    dfc[desc_column] = dfc[desc_column].str.replace(re_apos, '\'', regex=True)
    dfc[desc_column] = dfc[desc_column].str.replace(
        'about the job',
        '',
        regex=False,
    ).str.strip()

    return dfc

In [7]:
# Cleaning the skills
cleaned_skills_df = skills_cleaning(df=df_skills, skills_column='skills', label_column='label_gen')
# Cleaning job description
cleaned_desc_df = description_cleaning(df=df_desc, desc_column='job_description')



In [8]:
# Additional pre-processing
cleaned_skills_df = cleaned_skills_df.dropna().reset_index(drop=True)

In [9]:
cleaned_skills_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6672 entries, 0 to 6671
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   skills     6672 non-null   object
 1   label_gen  6672 non-null   object
dtypes: object(2)
memory usage: 104.4+ KB


### 3. Labelling

In [10]:
category_dict = {
    'Business': ('B-BUS', 'I-BUS'),
    'Technology': ('B-TECHNOLOGY', 'I-TECHNOLOGY'),
    'Technical': ('B-TECHNICAL', 'I-TECHNICAL'),
    'Soft': ('B-SOFT', 'I-SOFT'),
}

id2label = {
    0: "O",
    1: "B-BUS",
    2: "I-BUS",
    3: "B-TECHNOLOGY",
    4: "I-TECHNOLOGY",
    5: "B-TECHNICAL",
    6: "I-TECHNICAL",
    7: "B-SOFT",
    8: "I-SOFT",
}

label2id = {
    "O": 0,
    "B-BUS": 1,
    "I-BUS": 2,
    "B-TECHNOLOGY": 3,
    "I-TECHNOLOGY": 4,
    "B-TECHNICAL": 5,
    "I-TECHNICAL": 6,
    "B-SOFT": 7,
    "I-SOFT": 8,
}

In [11]:
def make_skills_label_dict(
        df: pd.DataFrame,
        skills_column: str,
        cat_column: str,
        label_dict: dict,
) -> dict:
    """
    Function that takes in a DF with the skills and their categories along with a dictionary with keys corrsponding
    to the skill categories. The function creates a dictionary with the skills as keys and the beginning and
    intermadiate labels as values.

    :param df: pandas.DataFrame with the skills and their categories
    :param skills_column: string value of the column header containing the skills
    :param cat_column: string value of the column header containing the skill category
    :param label_dict: dictionary with the category names as keys and the label tuples as values
    :return: dictionary where the skills are the keys and values are the label tuples based on their category
    """

    skills_label_dict = dict()

    for i,s in enumerate(df[skills_column]):
        value_range = np.arange(len(s.split(" ")))
        bool_array = value_range == 0
        labelled_array = np.where(bool_array, label2id[category_dict[df[cat_column].iloc[i]][0]], label2id[category_dict[df[cat_column].iloc[i]][1]])
        skills_label_dict[s] = list(labelled_array)

    return skills_label_dict

In [12]:

skills_label_dict = make_skills_label_dict(
    df=cleaned_skills_df,
    skills_column='skills',
    cat_column='label_gen',
    label_dict=category_dict,
)


Finding the matches of skills in every description text

In [13]:
def wrap_in_spaces(
    string_val: str,
    re_string: re.Pattern=re.compile(r'([\w\s]+)'),
) -> str:
    """
    Simple function to wrap the input string in spaces.

    :param string_val: string value to wrap
    :param re_string: re.Pattern regex matching word characters and
    spaces
    :return: cleaned string value
    """

    return re.sub(re_string, r' \1 ', string_val)

In [14]:
def get_skills_boolean_matrix(
    df: pd.DataFrame,
    desc_column: str,
    skills_dictionary: dict,
    *args,
    **kwargs,
) -> np.ndarray:
    """
    Function returning a boolean array indicating if the skills found in
    the keys of the input dictionary are in the data frame for each row.
    Therefore, the shape of the output array is
    (df.shape[0], len(skills_dictionary)).

    :param df: pandas.DataFrame to check
    :param desc_column: name of the column to perform the matching on
    :param skills_dictionary: dictionary whose keys will be used for
    matching
    :param args: positional arguments
    :param kwargs: keyword arguments
    :return: numpy.ndarray containing the boolean mask indicating whether
    the skill is present in a row or not
    """

    # creating a container for boolean arrays for each skill
    arrays = []
    # iterating through the skills and checking if they ar present in the
    # descriptions
    for skill_key in list(skills_dictionary.keys()):
        # checking whether the skill is present in each deascription
        array = (
            df[desc_column]
            .str
            .contains(
                # wrapping the skill in spaces, so only relevant matches
                # are picked up
                wrap_in_spaces(
                    string_val=skill_key,
                    *args,
                    **kwargs,
                ),
                regex=False,
            )
        )

        arrays.append(array)

    # we arrange the arrays so there is a row corresponding to each DF row with
    # a boolean corresponding to each skill
    return np.column_stack(arrays)




In [15]:
def label_entry(
        skill: str,
        skills_label_dictionary: dict,
        description_split: list,
        label_list: list,
) -> None:
    """
    Function to detect the position of a skill in a list of strings from the
    description and write the corresponding labels into the labels list -
    corresponding to the description list.

    :param skill: string value of the skill
    :param skills_label_dictionary: dictionary where the keys are the skills and
    values are the associated labels
    :param description_split: list of words from the description (split on white spaces)
    :param label_list: list of labels corresponding to the description_split list (initially all 'O'-s)
    :return: None
    """

    skill_split = skill.split(' ')
    len_ds = len(description_split)
    len_ss = len(skill_split)

    for i in range(len_ds - len_ss + 1):
        if description_split[i: i + len_ss] == skill_split:
            #print(f'{i}, {len_ds=}, {len_ss=}, {len(label_list)=}')
            label_list[i] = skills_label_dictionary[skill][0]
            label_list[i + 1: i + len_ss] = [skills_label_dictionary[skill][1] for _ in range(len_ss - 1)]
        else:
            pass

In [16]:
def label_df(
        df: pd.DataFrame,
        desc_column: str,
        skills_list: list,
        boolean_masks: np.ndarray,
        *args,
        **kwargs,
) -> tuple:
    """
    Function to iterate through a data frame's description column returning two lists, one with the split description
    and one with the corresponding labels.

    :param df: pandas.DataFrame with the description to label
    :param desc_column: string value of the column header
    :param skills_list: list of string values of skills
    :param boolean_masks: numpy.ndarray of boolean values of shape (df.shape[0], len(skills_list)). it corresponds to
    whether a skill is present in a row of the data frame
    :param args: positional arguments
    :param kwargs: keyword arguments
    :return: tuple of lists. one with the split description strings, one with the corresponding label strings
    """

    input_array = []
    label_array = []

    for i in range(df.shape[0]):
        description_split = df[desc_column].values[i].split(' ')
        label_list = [0 for _ in range(len(description_split))]

        for skill in skills_list[boolean_masks[i, :]]:
            label_entry(
                skill=skill,
                description_split=description_split,
                label_list=label_list,
                *args,
                **kwargs,
            )

        input_array.append(description_split)
        label_array.append(label_list)

    return input_array, label_array

In [17]:
re_string = re.compile(r'([\w\s]+)')

bool_masks = get_skills_boolean_matrix(
    df=cleaned_desc_df,
    desc_column='job_description',
    skills_dictionary=skills_label_dict,
    re_string=re_string,
)

bool_masks.shape

(15301, 6672)

In [18]:
skills_all = np.array(list(skills_label_dict.keys()))

input_array, label_array = label_df(
    df=cleaned_desc_df,
    desc_column='job_description',
    skills_list=skills_all,
    boolean_masks=bool_masks,
    skills_label_dictionary=skills_label_dict,
)

In [19]:
# Creating the dataset for the model

df_labelled_desc = pd.DataFrame()
df_labelled_desc['text'] = input_array
df_labelled_desc['ner_tags'] = label_array

In [20]:
df_labelled_desc

Unnamed: 0,text,ner_tags
0,"[full, time, ,, in, person, joblocation, :, ne...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[role, :, python, developer, /, professional, ...","[0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[jjob, overviewbased, in, guildford, ,, dps, g...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, ..."
3,"[my, client, is, seeking, a, skilled, and, mot...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,"[crossover, is, the, world, 's, #, 1, source, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
15296,"[job, title, :, manufacturing, quality, engine...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..."
15297,"[the, client, are, a, global, designer, and, m...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, ..."
15298,"[recruit, central, ,, are, currently, on, the,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, ..."
15299,"[software, engineer, ,, systematic, equities, ...","[5, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### 4. Chunking and Tokenising

In [21]:
# ! pip install transformers datasets scikit-learn huggingface-hub evaluate

In [22]:
# Loading hugging face libraries

from transformers import AutoModelForTokenClassification, TrainingArguments\
    , Trainer, AutoTokenizer, DataCollatorForTokenClassification\
    , DataCollatorWithPadding, AutoModelForTokenClassification, pipeline\
    , AutoModelForMaskedLM, default_data_collator

from datasets import ClassLabel, Features, Sequence, Value, Dataset, DatasetDict
from huggingface_hub import notebook_login

import evaluate
import datasets

from sklearn.model_selection import train_test_split




In [23]:

def split_lists_into_chunks(row:pd.Series, n:int, columns:list) -> None:
    # Determine the length of each chunk

    text = row['text']
    labels = row['ner_tags']

    # Split the lists into chunks
    chunks_text = [(text[i:i+n], labels[i:i+n]) for i in range(0, len(text), n)]
    chunks_df = pd.DataFrame(chunks_text, columns=columns)

    return chunks_df



In [24]:

n = 25
df_columns = ['tokens', 'ner_tags']
arrays = []
stop = df_labelled_desc.index[-1]

output = df_labelled_desc.apply(split_lists_into_chunks
                                     , args=(n,df_columns)
                                     , axis=1)




In [25]:
df_tokens = pd.concat([df for df in output], ignore_index=True)

In [26]:
# Setting the features of the dataset object
features = Features(
    {'tokens': Sequence(
        feature=Value(dtype='string')
     ),
     'ner_tags': Sequence(
         feature=ClassLabel(
             num_classes=9,
             names=[
                 'O',
                 'B-BUS',
                 'I-BUS',
                 'B-TECHNOLOGY',
                 'I-TECHNOLOGY',
                 'B-TECHNICAL',
                 'I-TECHNICAL',
                 'B-SOFT',
                 'I-SOFT',
             ],
         )
     )}
)

features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-BUS', 'I-BUS', 'B-TECHNOLOGY', 'I-TECHNOLOGY', 'B-TECHNICAL', 'I-TECHNICAL', 'B-SOFT', 'I-SOFT'], id=None), length=-1, id=None)}

In [27]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Convert the pandas DataFrame to a Hugging Face Dataset
train_test_valid_dataset = datasets.Dataset.from_pandas(df_tokens, features=features)


In [28]:
ner_feature = train_test_valid_dataset.features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-BUS', 'I-BUS', 'B-TECHNOLOGY', 'I-TECHNOLOGY', 'B-TECHNICAL', 'I-TECHNICAL', 'B-SOFT', 'I-SOFT'], id=None), length=-1, id=None)

In [29]:
label_names = ner_feature.feature.names
label_names

['O',
 'B-BUS',
 'I-BUS',
 'B-TECHNOLOGY',
 'I-TECHNOLOGY',
 'B-TECHNICAL',
 'I-TECHNICAL',
 'B-SOFT',
 'I-SOFT']

In [30]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [31]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    list_word_ids = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
        list_word_ids.append(word_ids)
    tokenized_inputs["ner_labels"] = new_labels
    tokenized_inputs["word_ids"] = list_word_ids

    return tokenized_inputs

In [32]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Convert the pandas DataFrame to a Hugging Face Dataset
train_test_valid_dataset = datasets.Dataset.from_pandas(df_tokens, features=features)


In [33]:
# Tokenise and align all the tokens and labels

ner_tokenised_datasets = train_test_valid_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=train_test_valid_dataset.column_names,
)

Map:   0%|          | 0/361321 [00:00<?, ? examples/s]

In [64]:
ner_tokenised_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 252924
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 54199
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 54198
    })
})

In [71]:
# Input validation after tokenization and label alignment process
import random

idx = random.randint(0, ner_tokenised_datasets['train'].num_rows-1)
print(idx)
inputs = ner_tokenised_datasets['train']['input_ids'][idx]
print(inputs)
print(ner_tokenised_datasets['train']['ner_labels'][idx])
print([id2label[v] if v not in [-100, 0] else 0 for v in ner_tokenised_datasets['train']['ner_labels'][idx]])

labeled_idx = [inputs[i] for i,label in enumerate(ner_tokenised_datasets['train']['ner_labels'][idx]) if label not in [-100, 0]]
print(tokenizer.decode(labeled_idx))


85410
[101, 2559, 2000, 5926, 1011, 2707, 2115, 2476, 1999, 2622, 2968, 2306, 1996, 3435, 1011, 13823, 2088, 1997, 2009, 1029, 2298, 2053, 2582, 2084, 23600, 2102, 3806, 1005, 2658, 4746, 2007, 29003, 102]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 5, -100]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 'B-BUS', 'I-BUS', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'B-BUS', 0, 0, 0, 0, 'B-TECHNICAL', 0]
project management gas agile


In [38]:
# add new column with the mask labels
ner_tokenised_datasets = ner_tokenised_datasets.add_column("labels", ner_tokenised_datasets['input_ids'])

In [39]:
# Split trainning/val/test set
# 70% train, 30% test + validation
train_test_valid_dataset = ner_tokenised_datasets.train_test_split(test_size=0.3)

# Split the 30% test + valid in half test, half valid
test_valid_dataset = train_test_valid_dataset['test'].train_test_split(test_size=0.5)

# Organise to have a single DatasetDict
ner_tokenised_datasets = DatasetDict({
    'train': train_test_valid_dataset['train'],
    'test': test_valid_dataset['test'],
    'valid': test_valid_dataset['train']})

In [40]:
ner_tokenised_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 252924
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 54199
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 54198
    })
})

In [41]:
input_ids = ner_tokenised_datasets["train"]["input_ids"][0][:16]
sample_word_ids = ner_tokenised_datasets["train"]["word_ids"][0][:16]
labels = ner_tokenised_datasets["train"]["labels"][0][:16]

In [42]:
# Sample description of the datasets

print(f"Length {len(input_ids)} | {input_ids}")
print(f"Length {len(labels)} | {labels}")
print(f"Length {len(sample_word_ids)} | {sample_word_ids}")
print(tokenizer.decode(input_ids))


Length 16 | [101, 1005, 2128, 2467, 2330, 2000, 6594, 1012, 2061, 1010, 2065, 2017, 1005, 2128, 7568, 2055]
Length 16 | [101, 1005, 2128, 2467, 2330, 2000, 6594, 1012, 2061, 1010, 2065, 2017, 1005, 2128, 7568, 2055]
Length 16 | [None, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 12]
[CLS]'re always open to discussion. so, if you're excited about


#### Data Collation Exploration


---

#### Sample Test Custom Masking 
Using random masking in word related labels

In [45]:
for i in zip(ner_tokenised_datasets["train"]['input_ids'][:2], ner_tokenised_datasets["train"]['labels'][:2]):
  print(len(i[0]), len(i[1]))

31 31
27 27


In [46]:
# Percentage of words for masking
wwm_probability = 0.2

In [47]:
# Take the sample from dataset to create the mapping
sample = ner_tokenised_datasets["train"][1]

word_ids = sample.pop("word_ids")

# Create a map between words and corresponding token indices
mapping = collections.defaultdict(list)
current_word_index = -1
current_word = None
for idx, word_id in enumerate(word_ids):
    if word_id is not None:
        if word_id != current_word:
            current_word = word_id
            current_word_index += 1
        mapping[current_word_index].append(idx)

In [49]:
# Create the n-dimension random-mask and original word based dimension array

original_array = [idx for idx in mapping.keys()]
random_mask = np.zeros(len(original_array), dtype=int)

# Array with the indexes of words allowed to mask
subset = [idx for idx,label in enumerate(sample["ner_labels"]) if label in [-100,0]]

# Binomial distribution of masking words
mask = np.random.binomial(1, wwm_probability, (len(subset),))


mask_words_index = np.array(subset)
mask_words_index = mask_words_index[mask==1]

print(mask)
print(subset)
print(f"Words idxs to be masked: {mask_words_index}\n")




[0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0
 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0
 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1
 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0]
[0, 1, 2, 4, 5, 7, 8, 11, 12, 13, 16, 17, 19, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124, 125, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 1

In [53]:
print(tokenizer.decode(sample['input_ids']))
print(sample['input_ids'])
print(sample['ner_labels'])

[CLS] [UNK] - software [UNK] - c #, asp. net, mvc, web api, javascript, react, sql, [UNK] - hybrid ( travel to their northampton office twice per week ) [UNK] - salary up to £50, 000 d. o. e digital waffle is partnered with a leading law firm who are looking out for a software engineer who will be an integral part of a team with a focus on development and maintenance for their it systems and services. in your day to day you will be operating in all aspects of the development for business changing solutions such as, designing, developing, and maintaining software applications and testing to resolve all issues and ensure the systems operate efficiently. what skillset you will hold : proven back - end experience with c #, asp. net, mvc, webapicompetent on the front end with javascript, react / angular, html, csssolid understanding of sql and t - sqlgreat problem solving and analytical abilities what's on offer? salary up to £50, 000 d. o. ehybrid working ( travel to the office twice per [

In [87]:
# Create a boolean matrix with words space dimension "(len(mapping))"

label_word_id = None
input_ids = sample["input_ids"]
labels = sample["labels"]
new_labels = [-100] * len(labels)

for i,j in enumerate(mask_words_index):

  if label_word_id ==  word_ids[j]: # Don't search the mapping again since the word was already masked
    continue

  label_word_id = word_ids[j]

  # Iterate over the mapped array to keep the labels in the specified tokens
  for idx in mapping[label_word_id]:
    new_labels[idx] = labels[idx]

    # Mask the input
    input_ids[idx] = tokenizer.mask_token_id

sample['labels'] = new_labels


In [88]:
# Validate masking against the real labels
val_slice = mask_words_index[4]

print(sample['input_ids'][:val_slice+1])
print(sample['labels'][:val_slice+1])
print(word_ids[:val_slice+1])

[101, 103, 1997, 2740, 1004, 3808, 7040, 1010, 103, 103, 103, 5918, 1010, 11163, 7706, 2487, 1998, 11163, 26776, 2629, 6515, 6078, 1998, 7882, 103, 103]
[-100, 4824, -100, -100, -100, -100, -100, -100, 2522, 4095, 2232, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 6370, 1011]
[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 18, 19, 19]


In [89]:
# Checking dimension of objects
for k,v in sample.items():
  print(k, " : ", len(v))

print(f"word ids: {len(word_ids)}")
print(f"word mapping: {len(mapping.keys())}")


input_ids  :  228
attention_mask  :  228
ner_labels  :  228
labels  :  228
word ids: 228
word mapping: 200


In [60]:
wwm_probability = 0.25

def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None

        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                    
                mapping[current_word_index].append(idx)

        # Create the n-dimension random-mask and original word based dimension array

        original_array = [idx for idx in mapping.keys()]
        random_mask = np.zeros(len(original_array), dtype=int)

        # Array with the indexes of words allowed to mask
        subset = [idx for idx,label in enumerate(feature["ner_labels"]) if label not in [-100,0]]

        # Binomial distribution of masking words
        mask = np.random.binomial(1, wwm_probability, (len(subset),))


        mask_words_index = np.array(subset)
        mask_words_index = mask_words_index[mask==1]

        
        # Create a boolean matrix with words space dimension "(len(mapping))"

        label_word_id = None
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)

        for i,j in enumerate(mask_words_index):

            if label_word_id ==  word_ids[j]: # Don't search the mapping again since the word was already masked
                continue

            label_word_id = word_ids[j]

            # Iterate over the mapped array to keep the labels in the specified tokens
            for idx in mapping[label_word_id]:
                new_labels[idx] = labels[idx]

                # Mask the input
                input_ids[idx] = tokenizer.mask_token_id

        feature['labels'] = new_labels

    return default_data_collator(features)

In [74]:
samples = ner_tokenised_datasets['train'][85410]
batch = whole_word_masking_data_collator([samples])

In [62]:
for chunk, label in zip(batch["input_ids"], lm_datasets['train']['input_ids'][:10]):
    print(f"\n'>>> {tokenizer.decode(chunk)}'")
    print(f"\n'>>> {tokenizer.decode(label)}'")



'>>> [CLS] [MASK] [MASK] always [MASK] to [MASK] [MASK] so [MASK] if you [MASK] [MASK] excited [MASK] [MASK] with [MASK] [MASK] but [MASK] [MASK] [MASK] sure'

'>>> [CLS]'re always open to discussion. so, if you're excited about working with us, but aren't sure'

'>>> if you're 100 [SEP] [CLS] first to [MASK]. responsibilities include [MASK] architecture, design [MASK] [MASK] [MASK] delivery of an enterprise [MASK]'

'>>> if you're 100 [SEP] [CLS] first to know. responsibilities include : architecture, design, development and delivery of an enterprise grade'

'>>> business intelligence solutions based on the microsoft [SEP] [CLS], predominantly [MASK] the [MASK], with a [MASK] of offices including australia, spain and'

'>>> business intelligence solutions based on the microsoft [SEP] [CLS], predominantly in the uk, with a number of offices including australia, spain and'

'>>> the united states. [MASK] commitment to yousecuring and [SEP] [CLS] [MASK] track record of building models (

#### Sample Test Data Collator
After masking the words, we need to fetch a batch, pad it and input it into the model.

In [47]:
# Initiate variables
result = {}

In [None]:
# DEfinning the chunk size
chunk_size = 25

In [None]:
# Concatenate all the texts into a single list
concatenated_examples = {k: sum(ner_tokenised_datasets["train"][k][:100], []) for k in list(ner_tokenised_datasets["train"].features)}

In [None]:
# This sample is the same object "example" that would be passed into the map function
sample = {k:concatenated_examples[k][:630] for k in list(ner_tokenised_datasets["train"].features)}


In [None]:
for k in sample.keys():
    print(k, len(sample[k]))

In [None]:
# Identifying the pad token id
tokenizer.pad_token_id

In [None]:
# Compute length of concatenated texts
total_length = len(sample[list(sample.keys())[0]])
total_length

In [None]:
# We drop the last chunk if it's smaller than chunk_size
total_chunks_length = (total_length // chunk_size) * chunk_size
total_chunks_length

In [None]:
# Get the difference between the full length and the chunks total length
remaining_chunk_length = total_length - total_chunks_length
remaining_chunk_length

In [None]:
result = {
    k: [t[i : i + chunk_size] for i in range(0, total_chunks_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

In [None]:
# Output from the chunks is equal to the cocient between the full length of concatenated
# texts and the chunk_size

len(result['input_ids']) == (total_length // chunk_size)


In [None]:
# Verifying the remaining chunk length matches 
len(sample['input_ids'][total_chunks_length:])

In [None]:
# For the last chunk of text
if remaining_chunk_length > 0:

    pad_token_id = tokenizer.pad_token_id
    # Create the padded inputs
    pad_output =  {
        k: [t[i : i + remaining_chunk_length] + [pad_token_id]*(chunk_size - remaining_chunk_length)
            for i in range(total_chunks_length, total_length, remaining_chunk_length)]
            for k, t in concatenated_examples.items()
        }
    
    # append the padded inputs
    {
        k: t.append(pad_output[k][0])
        for k,t in result.items()
    }
    


In [None]:
for k in result.keys():
    print(f"{k} | {len(result[k])} | {len(result[k][0])}")

In [None]:
# Get the examples to pad
# This dictionary comprehension will go from the last iterated item that belongs to a chunk 
# and will go to total length of the concatenated original text in steps of the remaining chunk 
# length to pick up what was left out.

pad_input =  {
        k: [t[i : i + chunk_size] for i in range(total_chunks_length, total_length, remaining_chunk_length)]
        for k, t in sample.items()
        }


In [48]:
def group_texts(examples):
    # Setting padding token
    pad_token_id = tokenizer.pad_token_id
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_chunks_length = (total_length // chunk_size) * chunk_size
    # Get the difference between the full length and the chunks total length
    remaining_chunk_length = total_length - total_chunks_length
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_chunks_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }

    # For the last chunk of text
    if remaining_chunk_length > 0:
        # Create the padded inputs
        pad_inputs =  {
            k: [t[i : i + remaining_chunk_length] + [pad_token_id]*(chunk_size - remaining_chunk_length)
                for i in range(total_chunks_length, total_length, remaining_chunk_length)]
                for k, t in concatenated_examples.items()
            }
        
        # append the padded inputs to the result array
        {
            k: t.append(pad_inputs[k][0])
            for k,t in result.items()
        }
    

    return result

In [49]:
lm_datasets = ner_tokenised_datasets.map(group_texts, batched=True)

Map:   0%|          | 0/252924 [00:00<?, ? examples/s]

Map:   0%|          | 0/54199 [00:00<?, ? examples/s]

Map:   0%|          | 0/54198 [00:00<?, ? examples/s]

In [50]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 299491
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 64096
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 64241
    })
})

In [66]:
# Validate operations
def get_chunk_dimensions(example):
    result =  {
    k: [len(chunk)
        for chunk in t ]
        for k, t in example.items()
    }

    return result
    
    

In [67]:
val_datasets = lm_datasets.map(get_chunk_dimensions, batched=True)

Map:   0%|          | 0/55132 [00:00<?, ? examples/s]

Map:   0%|          | 0/11826 [00:00<?, ? examples/s]

Map:   0%|          | 0/11758 [00:00<?, ? examples/s]

In [77]:
print(val_datasets['valid']['input_ids'])

[128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,