### Exploring a hand-crafted pipeline for data processing


In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive, which the
# data path used by the loading cells points into, become readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import math
import random
import numpy as np
from typing import Tuple
from ast import literal_eval
import re
import os

### 1. Load the necessary data

In [None]:
path = "/content/drive/MyDrive/Qbot-gpt/data"

In [None]:

# Labelled skills: read the CSV of skills and their category labels.
labeled_skills_file = os.path.join(path, "skills_sample.csv")
df_skills = pd.read_csv(labeled_skills_file)
# Keep only the columns at positions 1 and 2; later cells address them as
# 'skills' and 'label_gen'.
df_skills = df_skills.iloc[:,1:3].copy()

# LinkedIn job postings; a backslash is treated as the escape character.
linkedin_file = os.path.join(path, "jobs_230612.csv")
df_desc = pd.read_csv(linkedin_file, escapechar="\\")

# Matched testing data.
matches_file = os.path.join(path,'matches.csv')
df_matches = pd.read_csv(matches_file)
# Every value of column '0' is stringified and parsed with literal_eval; the
# parsed objects are stored in the new column '0_'.
# presumably column '0' holds stringified Python literals (e.g. lists) - TODO confirm
df_matches['0_'] = df_matches['0'].apply(lambda x: literal_eval(str(x)))

### 2. Cleansing the data

Starting with the `skills` column of the `df_skills` DataFrame.

In [None]:
def skills_cleaning(
        df: pd.DataFrame,
        skills_column: str,
        label_column: str,
        re_preparent: re.Pattern=re.compile(r'^([\w\s]*) \('),
        re_parent: re.Pattern=re.compile(r'\(([a-z\s]+)\)'),
        re_hasparent: re.Pattern=re.compile(r'\([a-z\s]+\)'),
) -> pd.DataFrame:
    """
    Function to clean the skills scraped from LinkedIn.

    Steps, in order:

    1. rows with a missing label are dropped and the index is renumbered;
    2. the literal text " (programming language)" is removed from the skills;
    3. every remaining skill that contains a parenthesised part is split
       in two rows: the original row keeps the text in front of the
       parentheses, and a copy of the row (same label) gets the text within
       the parentheses;
    4. duplicate skills are dropped, keeping the first occurrence.

    Rows with a missing skill are left untouched (they are neither split nor
    removed). The index of the returned frame is not renumbered after the
    split, so appended rows repeat the index value of their source row.

    :param df: pandas.DataFrame containing the skills to be cleaned
    :param skills_column: string value of the header of the column with
    the skills
    :param label_column: string value of the header of the column with
    the labels
    :param re_preparent: re.Pattern regex to match text in front of a
    parentheses
    :param re_parent: re.Pattern regex to match text inside a parentheses
    :param re_hasparent: re.Pattern to match if a skill has parentheses
    :return: pandas.DataFrame with the cleaned skills column
    """

    # copying the DF, so we won't overwrite the original
    dfc = df.copy()

    # a list is passed to `subset` so this also works on pandas versions
    # that do not accept a bare column label here
    dfc = dfc.dropna(subset=[label_column]).reset_index(drop=True)

    dfc[skills_column] = dfc[skills_column].str.replace(
        ' (programming language)',
        '',
        regex=False,
    )
    # text in front of the parentheses and text within the parentheses;
    # with the default patterns a prefix containing anything other than
    # word characters and spaces does not match and yields NaN
    preparent = dfc[skills_column].str.extract(re_preparent)
    parent = dfc[skills_column].str.extract(re_parent)

    # boolean mask of the rows that have parentheses; na=False keeps the mask
    # strictly boolean when a skill is missing, otherwise the NaN in the mask
    # makes the .loc selection below raise
    mask = dfc[skills_column].str.contains(
        re_hasparent,
        regex=True,
        na=False,
    )

    if mask.any():
        # copy of the parentheses rows, to be appended with the inner text
        df_new = dfc.loc[mask].copy()
        # the original parentheses rows will have the preparentheses text
        dfc.loc[mask, skills_column] = preparent.loc[mask, 0]
        # the new rows will have the text from within the parentheses
        df_new.loc[:, skills_column] = parent.loc[mask, 0]
        dfc = pd.concat([dfc, df_new])

    # duplicates are dropped whether or not any row was split
    return dfc.drop_duplicates(subset=[skills_column])

In [None]:
def description_cleaning(
        df: pd.DataFrame,
        desc_column: str,
        re_punc: re.Pattern=re.compile(
            r'([!"#$%\'()*+,./:;<=>?@\]\[\\^_`{|}~])'
        ),
        re_spac: re.Pattern=re.compile(r'[\n\s]+'),
        re_apos: re.Pattern=re.compile(r'\' '),
) -> pd.DataFrame:
    """
    Function to clean the job descriptions, making them ready to be
    "tokenised" by the label creation function. It lower-cases the text,
    pads punctuation marks with spaces, collapses runs of white space into a
    single space and removes every occurrence of the text "about the job".

    The cleaned text never contains two consecutive spaces, so splitting it
    on a single space yields no empty tokens.

    :param df: pandas.DataFrame to clean
    :param desc_column: string value of the column header with the
    description to clean
    :param re_punc: re.Pattern regex to match punctuation marks
    :param re_spac: re.Pattern regex to match white spaces
    :param re_apos: re.Pattern regex to match apostrophes
    :return: pandas.DataFrame with the cleaned job description column
    """

    # copying the DF, so the original gets preserved
    dfc = df.copy()
    dfc[desc_column] = dfc[desc_column].str.lower()
    # padding punctuation marks with white spaces
    dfc[desc_column] = dfc[desc_column].str.replace(
        re_punc,
        r' \1 ', # inserting the first captured group (padded)
        regex=True,
    )
    # removing redundant white spaces
    dfc[desc_column] = dfc[desc_column].str.replace(re_spac, ' ', regex=True)
    # removing the second padding space from apostrophes. so "job ' s"
    # becomes "job 's"
    dfc[desc_column] = dfc[desc_column].str.replace(re_apos, '\'', regex=True)
    # removing the "about the job" text; when it sits in the middle of a
    # description the removal leaves two adjacent spaces behind, so the white
    # space is collapsed once more before the final strip
    dfc[desc_column] = (
        dfc[desc_column]
        .str.replace('about the job', '', regex=False)
        .str.replace(re_spac, ' ', regex=True)
        .str.strip()
    )

    return dfc

In [None]:
# Cleaning the skills
cleaned_skills_df = skills_cleaning(df=df_skills, skills_column='skills', label_column='label_gen')
# Cleaning job description
cleaned_desc_df = description_cleaning(df=df_desc, desc_column='job_description')



In [None]:
# Additional pre-processing
# Drop every row that still has a missing value in any column and renumber
# the index from 0.
cleaned_skills_df = cleaned_skills_df.dropna().reset_index(drop=True)

In [None]:
cleaned_skills_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6672 entries, 0 to 6671
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   skills     6672 non-null   object
 1   label_gen  6672 non-null   object
dtypes: object(2)
memory usage: 104.4+ KB


### 3. Labelling

In [None]:
# Skill category -> (beginning tag, inside tag).
category_dict = {
    'Business': ('B-BUS', 'I-BUS'),
    'Technology': ('B-TECHNOLOGY', 'I-TECHNOLOGY'),
    'Technical': ('B-TECHNICAL', 'I-TECHNICAL'),
    'Soft': ('B-SOFT', 'I-SOFT'),
}

# Integer id -> tag name. "O" is id 0; the tag pairs follow in the insertion
# order of category_dict, so each beginning tag gets an odd id and its inside
# tag the next even id.
id2label = dict(enumerate(
    ["O"] + [tag for tag_pair in category_dict.values() for tag in tag_pair]
))

# Tag name -> integer id (the inverse of id2label).
label2id = {tag: tag_id for tag_id, tag in id2label.items()}

In [None]:
def make_skills_label_dict(
        df: pd.DataFrame,
        skills_column: str,
        cat_column: str,
        label_dict: dict,
        label_to_id: dict = None,
) -> dict:
    """
    Function that takes in a DF with the skills and their categories along with a dictionary with keys corresponding
    to the skill categories. The function creates a dictionary with the skills as keys and, as values, one integer
    label id per space-separated word of the skill: the id of the beginning label for the first word and the id of
    the intermediate label for every following word.

    :param df: pandas.DataFrame with the skills and their categories
    :param skills_column: string value of the column header containing the skills
    :param cat_column: string value of the column header containing the skill category
    :param label_dict: dictionary with the category names as keys and the (beginning, intermediate) label tuples
    as values
    :param label_to_id: dictionary mapping the label names to their integer ids; when omitted the notebook-level
    label2id mapping is used
    :return: dictionary where the skills are the keys and the values are lists of integer label ids
    :raises KeyError: if a category is missing from label_dict or a label is missing from label_to_id
    """

    if label_to_id is None:
        # fall back to the mapping defined at notebook level
        label_to_id = label2id

    skills_label_dict = dict()

    for skill, category in zip(df[skills_column], df[cat_column]):
        # the labels come from the label_dict argument, not from a global
        begin_label, inside_label = label_dict[category]
        # splitting on a single space always yields at least one word
        n_words = len(skill.split(" "))
        skills_label_dict[skill] = (
            [label_to_id[begin_label]] + [label_to_id[inside_label]] * (n_words - 1)
        )

    return skills_label_dict

In [None]:

skills_label_dict = make_skills_label_dict(
    df=cleaned_skills_df,
    skills_column='skills',
    cat_column='label_gen',
    label_dict=category_dict,
)


In [None]:
skills_label_dict

{'ilm': [1],
 'energy systems': [1, 2],
 'help desk support': [3, 4, 4],
 'currency': [1],
 'business planning': [1, 2],
 'open-source development': [3, 4],
 'zeromq': [3],
 'manuscript': [1],
 'formal verification': [1, 2],
 'templating': [1],
 'environmental reports': [1, 2],
 'escalation process': [1, 2],
 'video game production': [3, 4, 4],
 'interior architecture': [1, 2],
 'trading software': [3, 4],
 'electronic medical record': [3, 4, 4],
 'mathematics': [5],
 'general administration': [1, 2],
 'aws glue': [3, 4],
 'tariffs': [1],
 'go': [3],
 'webgl': [3],
 'imperative programming': [3, 4],
 'sql database administration': [3, 4, 4],
 'energy markets': [1, 2],
 'precision medicine': [1, 2],
 'requirements engineering': [1, 2],
 'hedge funds': [1, 2],
 'indemnity': [1],
 'library services': [3, 4],
 'report preparation': [1, 2],
 'budget forecasting': [1, 2],
 'mechanical systems': [1, 2],
 'occupational health': [1, 2],
 'nosql': [3],
 'software coding': [3, 4],
 'market knowle

Finding the matches of skills in every description text

In [None]:
def wrap_in_spaces(
    string_val: str,
    re_string: re.Pattern=re.compile(r'([\w\s]+)'),
) -> str:
    """
    Pad every match of the regex in the input string with one space on each
    side. With the default pattern each run of word characters and spaces is
    padded, while other characters are kept as they are, e.g. 'python'
    becomes ' python ' and 'c++' becomes ' c ++'.

    :param string_val: string value to wrap
    :param re_string: re.Pattern regex whose first group is re-inserted
    between two spaces
    :return: string value with the matches padded
    """

    padded_group = r' \1 '
    matcher = re.compile(re_string)

    return matcher.sub(padded_group, string_val)

In [None]:
def get_skills_boolean_matrix(
    df: pd.DataFrame,
    desc_column: str,
    skills_dictionary: dict,
    *args,
    **kwargs,
) -> np.ndarray:
    """
    Function returning a boolean array indicating if the skills found in
    the keys of the input dictionary are in the data frame for each row.
    Therefore, the shape of the output array is
    (df.shape[0], len(skills_dictionary)).

    Each skill is wrapped in spaces by wrap_in_spaces and searched for as a
    literal substring. The descriptions are padded with one space on each
    side first, so a skill that is the very first or very last word of a
    description is found as well. Missing descriptions match no skill.

    :param df: pandas.DataFrame to check
    :param desc_column: name of the column to perform the matching on
    :param skills_dictionary: dictionary whose keys will be used for
    matching
    :param args: positional arguments forwarded to wrap_in_spaces
    :param kwargs: keyword arguments forwarded to wrap_in_spaces
    :return: numpy.ndarray containing the boolean mask indicating whether
    the skill is present in a row or not
    """

    # padding once, outside of the loop; without it a skill at the start or
    # end of a description has no surrounding space to match against
    padded_descriptions = ' ' + df[desc_column] + ' '

    # one boolean array per skill
    arrays = []
    # iterating through the skills and checking if they are present in the
    # descriptions
    for skill_key in skills_dictionary:
        # wrapping the skill in spaces, so only relevant matches are
        # picked up
        wrapped_skill = wrap_in_spaces(
            string_val=skill_key,
            *args,
            **kwargs,
        )
        arrays.append(
            padded_descriptions
            .str
            .contains(
                wrapped_skill,
                regex=False,
                # missing descriptions give False instead of NaN, so the
                # result stays a boolean array usable as an index mask
                na=False,
            )
            .to_numpy(dtype=bool)
        )

    if not arrays:
        # np.column_stack cannot stack an empty list
        return np.zeros((df.shape[0], 0), dtype=bool)

    # we arrange the arrays so there is a row corresponding to each DF row with
    # a boolean corresponding to each skill
    return np.column_stack(arrays)




In [None]:
def label_entry(
        skill: str,
        skills_label_dictionary: dict,
        description_split: list,
        label_list: list,
) -> None:
    """
    Write the labels of a skill into the labels list at every position where
    the words of the skill occur, in order, in the description list. The
    first matched position gets the first label of the skill and each further
    matched position gets its second label. The labels list is modified in
    place; positions outside of a match are left as they are.

    :param skill: string value of the skill (words separated by single spaces)
    :param skills_label_dictionary: dictionary where the keys are the skills and
    values are the associated labels; it is only looked up when a match is found
    :param description_split: list of words from the description (split on white spaces)
    :param label_list: list of labels corresponding to the description_split list
    :return: None
    """

    skill_words = skill.split(' ')
    width = len(skill_words)
    last_start = len(description_split) - width

    for start in range(last_start + 1):
        stop = start + width
        if description_split[start:stop] != skill_words:
            continue

        skill_labels = skills_label_dictionary[skill]
        label_list[start] = skill_labels[0]
        # the second label is only read when the skill has more than one word
        label_list[start + 1:stop] = [skill_labels[1] for _ in range(width - 1)]

In [None]:
def label_df(
        df: pd.DataFrame,
        desc_column: str,
        skills_list: list,
        boolean_masks: np.ndarray,
        *args,
        **kwargs,
) -> tuple:
    """
    Go through the description column of a data frame row by row. Each description is split on single spaces and
    gets a label list of the same length that starts as all zeros; label_entry is then called for every skill the
    boolean mask marks for that row.

    :param df: pandas.DataFrame with the description to label
    :param desc_column: string value of the column header
    :param skills_list: array of string values of skills; it is indexed with a row of boolean_masks
    :param boolean_masks: numpy.ndarray of boolean values of shape (df.shape[0], len(skills_list)). it corresponds to
    whether a skill is present in a row of the data frame
    :param args: positional arguments forwarded to label_entry
    :param kwargs: keyword arguments forwarded to label_entry
    :return: tuple of lists. one with the split description strings, one with the corresponding labels
    """

    token_rows, tag_rows = [], []

    for row_idx, description in enumerate(df[desc_column].values):
        tokens = description.split(' ')
        tags = [0] * len(tokens)

        # only the skills flagged for this row are passed on
        skills_in_row = skills_list[boolean_masks[row_idx, :]]
        for present_skill in skills_in_row:
            label_entry(
                skill=present_skill,
                description_split=tokens,
                label_list=tags,
                *args,
                **kwargs,
            )

        token_rows.append(tokens)
        tag_rows.append(tags)

    return token_rows, tag_rows

In [None]:
# Pattern handed to wrap_in_spaces through the keyword arguments of
# get_skills_boolean_matrix; it is the same pattern as that function's default.
re_string = re.compile(r'([\w\s]+)')

# One row per job description, one column per skill of skills_label_dict.
bool_masks = get_skills_boolean_matrix(
    df=cleaned_desc_df,
    desc_column='job_description',
    skills_dictionary=skills_label_dict,
    re_string=re_string,
)

bool_masks.shape

(15301, 6672)

In [None]:
# The skills as a numpy array, in the key order of skills_label_dict (the same
# order as the columns of bool_masks), so a row of bool_masks can index it.
skills_all = np.array(list(skills_label_dict.keys()))

# skills_label_dictionary is not a parameter of label_df itself; it travels
# through **kwargs to label_entry.
input_array, label_array = label_df(
    df=cleaned_desc_df,
    desc_column='job_description',
    skills_list=skills_all,
    boolean_masks=bool_masks,
    skills_label_dictionary=skills_label_dict,
)

In [None]:
# Creating the dataset for the model

df_labelled_desc = pd.DataFrame()
df_labelled_desc['text'] = input_array
df_labelled_desc['ner_tags'] = label_array

In [None]:
df_labelled_desc

Unnamed: 0,text,ner_tags
0,"[full, time, ,, in, person, joblocation, :, ne...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[role, :, python, developer, /, professional, ...","[0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[jjob, overviewbased, in, guildford, ,, dps, g...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, ..."
3,"[my, client, is, seeking, a, skilled, and, mot...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,"[crossover, is, the, world, 's, #, 1, source, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
15296,"[job, title, :, manufacturing, quality, engine...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ..."
15297,"[the, client, are, a, global, designer, and, m...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, ..."
15298,"[recruit, central, ,, are, currently, on, the,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, ..."
15299,"[software, engineer, ,, systematic, equities, ...","[5, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### 4. Chunking and Tokenising

In [None]:
! pip install transformers datasets scikit-learn huggingface-hub evaluate

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━

In [None]:
# Loading hugging face libraries

from transformers import AutoModelForTokenClassification, TrainingArguments\
    , Trainer, AutoTokenizer, DataCollatorForTokenClassification\
    , DataCollatorWithPadding, AutoModelForTokenClassification, pipeline\
    , AutoModelForMaskedLM

from datasets import ClassLabel, Features, Sequence, Value, Dataset, DatasetDict
from huggingface_hub import notebook_login

import evaluate
import datasets

from sklearn.model_selection import train_test_split




In [None]:

def split_lists_into_chunks(row:pd.Series, n:int, columns:list) -> pd.DataFrame:
    """
    Split the 'text' and 'ner_tags' lists of a row into consecutive chunks of
    at most n items; the last chunk may be shorter.

    :param row: pandas.Series with a 'text' entry and a 'ner_tags' entry, both sliceable sequences
    :param n: maximum number of items per chunk, must be positive
    :param columns: list with the two column names of the returned frame (text chunks first, label chunks second)
    :return: pandas.DataFrame with one row per chunk
    :raises ValueError: if n is not positive
    """
    if n <= 0:
        # a zero step makes range() fail and a negative step would silently
        # produce no chunks at all
        raise ValueError(f"n must be a positive integer, got {n}")

    text = row['text']
    labels = row['ner_tags']

    # Split both lists at the same positions so text and labels stay aligned
    chunk_pairs = [
        (text[start:start + n], labels[start:start + n])
        for start in range(0, len(text), n)
    ]

    return pd.DataFrame(chunk_pairs, columns=columns)



In [None]:

# Maximum number of words per chunk
n = 200
# Column names of the chunked frames
df_columns = ['tokens', 'ner_tags']
# NOTE(review): `arrays` and `stop` are not used by any other cell visible in
# this notebook section - confirm whether they can be removed.
arrays = []
stop = df_labelled_desc.index[-1]

# Row-wise apply: each description yields its own DataFrame of chunks
output = df_labelled_desc.apply(split_lists_into_chunks
                                     , args=(n,df_columns)
                                     , axis=1)




In [None]:
df_tokens = pd.concat([df for df in output], ignore_index=True)

In [None]:
# Setting the features of the dataset object
features = Features(
    {'tokens': Sequence(
        feature=Value(dtype='string')
     ),
     'ner_tags': Sequence(
         feature=ClassLabel(
             num_classes=9,
             names=[
                 'O',
                 'B-BUS',
                 'I-BUS',
                 'B-TECHNOLOGY',
                 'I-TECHNOLOGY',
                 'B-TECHNICAL',
                 'I-TECHNICAL',
                 'B-SOFT',
                 'I-SOFT',
             ],
         )
     )}
)

features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-BUS', 'I-BUS', 'B-TECHNOLOGY', 'I-TECHNOLOGY', 'B-TECHNICAL', 'I-TECHNICAL', 'B-SOFT', 'I-SOFT'], id=None), length=-1, id=None)}

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Convert the pandas DataFrame to a Hugging Face Dataset
train_test_valid_dataset = datasets.Dataset.from_pandas(df_tokens, features=features)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
ner_feature = train_test_valid_dataset.features["ner_tags"]
ner_feature

AttributeError: ignored

In [None]:
label_names = ner_feature.feature.names
label_names

['O',
 'B-BUS',
 'I-BUS',
 'B-TECHNOLOGY',
 'I-TECHNOLOGY',
 'B-TECHNICAL',
 'I-TECHNICAL',
 'B-SOFT',
 'I-SOFT']

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """
    Expand word-level labels to token level.

    Tokens without a word id get -100. The first token of a word gets the
    label of that word. Every further token of the same word gets the label
    of the word as well, except that an odd label is replaced by the next
    even one (B-XXX becomes I-XXX in this notebook's label ids).

    :param labels: list of integer labels, one per word
    :param word_ids: list with the word index of each token, None for tokens
    that belong to no word
    :return: list of integer labels, one per token
    """
    aligned = []
    previous_word = None

    for word_id in word_ids:
        if word_id is None:
            # Special token
            previous_word = None
            aligned.append(-100)
            continue

        word_label = labels[word_id]
        if word_id != previous_word:
            # Start of a new word!
            previous_word = word_id
        elif word_label % 2 == 1:
            # Same word as previous token: B-XXX is changed to I-XXX
            word_label += 1
        aligned.append(word_label)

    return aligned

In [None]:
def tokenize_and_align_labels(examples):
    """
    Tokenise a batch of pre-split examples with the notebook-level tokenizer
    (truncation enabled) and attach two extra entries to the result:
    "ner_labels", the word-level "ner_tags" expanded to token level by
    align_labels_with_tokens, and "word_ids", the word index of every token.

    :param examples: batch with a "tokens" entry (lists of words) and a
    "ner_tags" entry (lists of integer labels, one per word)
    :return: the tokenizer output with "ner_labels" and "word_ids" added
    """
    encoding = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    # word ids of every example, in batch order
    word_id_rows = [
        encoding.word_ids(row_idx)
        for row_idx in range(len(examples["ner_tags"]))
    ]

    encoding["ner_labels"] = [
        align_labels_with_tokens(word_tags, row_word_ids)
        for word_tags, row_word_ids in zip(examples["ner_tags"], word_id_rows)
    ]
    encoding["word_ids"] = word_id_rows

    return encoding

In [None]:
# NOTE(review): this cell repeats, statement for statement, the earlier cell
# that loads the tokenizer and builds the dataset - confirm one can be removed.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Convert the pandas DataFrame to a Hugging Face Dataset
train_test_valid_dataset = datasets.Dataset.from_pandas(df_tokens, features=features)


In [None]:
# Tokenise and align all the tokens and labels

ner_tokenised_datasets = train_test_valid_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=train_test_valid_dataset.column_names,
)

Map:   0%|          | 0/51837 [00:00<?, ? examples/s]

In [None]:
ner_tokenised_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids'],
    num_rows: 51837
})

In [None]:
# Add a "labels" column whose values are the token ids of "input_ids"; the NER
# tags stay in the separate "ner_labels" column.
ner_tokenised_datasets = ner_tokenised_datasets.add_column("labels", ner_tokenised_datasets['input_ids'])

In [None]:
# Split training/validation/test sets
# 70% train, 30% test + validation
# NOTE(review): no seed is passed to either train_test_split call, so the
# split is presumably different on every run - confirm whether a fixed seed
# is wanted.
# Both train_test_valid_dataset and ner_tokenised_datasets are rebound here:
# earlier cells bound them to single Dataset objects.
train_test_valid_dataset = ner_tokenised_datasets.train_test_split(test_size=0.3)

# Split the 30% test + valid in half test, half valid
test_valid_dataset = train_test_valid_dataset['test'].train_test_split(test_size=0.5)

# Organise to have a single DatasetDict
ner_tokenised_datasets = DatasetDict({
    'train': train_test_valid_dataset['train'],
    'test': test_valid_dataset['test'],
    'valid': test_valid_dataset['train']})

In [None]:
ner_tokenised_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 36285
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 7776
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'ner_labels', 'word_ids', 'labels'],
        num_rows: 7776
    })
})

In [None]:
input_ids = ner_tokenised_datasets["train"]["input_ids"][0][:16]
sample_word_ids = ner_tokenised_datasets["train"]["word_ids"][0][:16]
labels = ner_tokenised_datasets["train"]["labels"][0][:16]

In [None]:
# Sample description of the datasets

print(f"Length {len(input_ids)} | {input_ids}")
print(f"Length {len(labels)} | {labels}")
print(f"Length {len(sample_word_ids)} | {sample_word_ids}")
print(tokenizer.decode(input_ids))


Length 16 | [101, 6887, 27292, 2050, 24501, 8162, 6129, 7300, 1006, 11409, 7476, 10975, 2015, 1007, 2038, 1996]
Length 16 | [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Length 16 | [None, 0, 0, 0, 1, 1, 1, 2, 3, 4, 4, 5, 5, 6, 7, 8]
[CLS] pharma resourcing solutions ( linical prs ) has the


In [None]:
tokenizer.decode([3891, 10615])

'risk governance'

#### Data Collation Exploration


---

In [None]:
def tokenize_function(examples):
    """
    Tokenise a batch of pre-split examples with the notebook-level tokenizer
    (no truncation). When the tokenizer is a fast one, a "word_ids" entry
    with the word index of every token is added to the result.

    :param examples: batch with a "tokens" entry (lists of words)
    :return: the tokenizer output, with "word_ids" for fast tokenizers
    """
    encoding = tokenizer(examples["tokens"], is_split_into_words=True)
    if not tokenizer.is_fast:
        return encoding

    n_examples = len(encoding["input_ids"])
    encoding["word_ids"] = [
        encoding.word_ids(row_idx) for row_idx in range(n_examples)
    ]
    return encoding


# Use batched=True to activate fast multithreading!
# NOTE(review): `linkedin_dataset` is not defined in any cell visible in this
# notebook section; on a fresh kernel this call raises NameError unless the
# name is created elsewhere - confirm where it comes from.
tokenized_datasets = linkedin_dataset.map(
    tokenize_function, batched=True, remove_columns=["tokens", "ner_tags"]
)
tokenized_datasets

Map:   0%|          | 0/36285 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/7776 [00:00<?, ? examples/s]

Map:   0%|          | 0/7776 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 36285
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 7776
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 7776
    })
})

In [None]:
# Defining a manageable chunk size (number of tokens per chunk)
chunk_size = 128

In [None]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets['train'][:5]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 230'
'>>> Review 1 length: 191'
'>>> Review 2 length: 134'
'>>> Review 3 length: 214'
'>>> Review 4 length: 242'


In [None]:
# Concatenating all the tokens into a single list of tokens
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 1011'


In [None]:
concatenated_examples["input_ids"][:10]

[101, 6887, 27292, 2050, 24501, 8162, 6129, 7300, 1006, 11409]

In [None]:
# Split all the tokens into manageable chunks
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 115'


In [None]:
def group_texts(examples, block_size=None):
    """
    Concatenate every column of a batch into one long list and cut it into
    chunks of equal size. The tail that does not fill a whole chunk is
    dropped. A "labels" column holding a copy of the "input_ids" chunks is
    added.

    :param examples: batch mapping column names to lists of lists; must
    contain "input_ids"
    :param block_size: number of items per chunk; when omitted the
    notebook-level chunk_size is used
    :return: dictionary with the chunked columns plus "labels"
    :raises ValueError: if the chunk size is not positive
    """
    size = chunk_size if block_size is None else block_size
    if size <= 0:
        raise ValueError(f"chunk size must be a positive integer, got {size}")

    # Concatenate all texts; a flat comprehension is linear in the number of
    # items, whereas sum(list_of_lists, []) re-copies the growing list on
    # every addition
    concatenated_examples = {
        key: [item for sequence in examples[key] for item in sequence]
        for key in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than the chunk size
    total_length = (total_length // size) * size
    # Split by chunks of the chunk size
    result = {
        key: [values[i : i + size] for i in range(0, total_length, size)]
        for key, values in concatenated_examples.items()
    }
    # Create a new labels column; every chunk is copied so that changing
    # input_ids in place later cannot change the labels as well
    result["labels"] = [chunk[:] for chunk in result["input_ids"]]
    return result

In [None]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/36285 [00:00<?, ? examples/s]

Map:   0%|          | 0/7776 [00:00<?, ? examples/s]

Map:   0%|          | 0/7776 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 55048
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 11837
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 11779
    })
})

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Checking out random masking
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
     _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] pharm [MASK] [MASK]ourcing solutions [MASK] linical prs ) [MASK] the experience and a clear [MASK] of the importance of our clients ’ resourcing strategies. with the flexibility that [MASK] level of service requires [MASK] linical is able to [MASK] a [MASK] [MASK] of resourcing requirements as per our clients ’ needs and expectations. linical prs is supported by a strong and continuous recruitment activity [MASK] accompanied by [MASK] and highly experienced line [MASK] personnel as well [MASK] strong leadership oversight throughout the [MASK]. thus, prs can guarantee success and commitment to the project objectives of our clients [MASK] we truly believe people are the real [MASK] of a [MASK] and'

'>>> we gambia [MASK] their performance and [MASK] professional satisfaction. description the ep methodistmiology & real world evidence function [MASK] [MASK] [MASK] contributes to the successful development and [MASK]ization of new therapies in rare disease, providing ł and exper

In [None]:
import collections
import numpy as np

from transformers.data.data_collator import tf_default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """
    Mask whole words at random in every feature and collate the result.

    Each feature is modified in place: its "word_ids" entry is removed, the
    token ids of the selected words in "input_ids" are overwritten with the
    tokenizer's mask token id, and "labels" is replaced by a list that keeps
    the original label only at the masked positions and holds -100
    everywhere else. Every word is selected independently with probability
    wwm_probability (a notebook-level variable).

    :param features: list of dict-like features, each with "word_ids",
    "input_ids" and "labels" entries
    :return: the output of tf_default_data_collator for the modified features
    """
    # NOTE(review): tf_default_data_collator is imported from
    # transformers.data.data_collator; by its name it is the TensorFlow
    # variant - confirm this is the intended framework.
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices.
        # Words are numbered in order of appearance; tokens without a word id
        # are left out of the map.
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words: one 0/1 draw per word
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        # -100 marks the positions that were not masked
        # (presumably ignored by the loss - TODO confirm)
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            # numpy integer -> plain Python int
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return tf_default_data_collator(features)

In [None]:
samples = [lm_datasets["train"][i] for i in range(2)]

for feature in samples:
    word_ids = feature.pop("word_ids")

    # Create a map between words and corresponding token indices
    mapping = collections.defaultdict(list)
    current_word_index = -1
    current_word = None
    for idx, word_id in enumerate(word_ids):
        if word_id is not None:
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

In [None]:
print(len(lm_datasets["train"][1]['input_ids']))
print(len(mapping))

128
110


In [None]:
# Randomly mask words
wwm_probability = 0.2

mask = np.random.binomial(1, wwm_probability, (len(mapping),))
input_ids = feature["input_ids"]
labels = feature["labels"]
new_labels = [-100] * len(labels)
for word_id in np.where(mask)[0]:
    print(word_id)
    word_id = word_id.item()
    for idx in mapping[word_id]:
        new_labels[idx] = labels[idx]
        print("\t", idx, mapping[word_id], labels[idx])
        input_ids[idx] = tokenizer.mask_token_id
feature["labels"] = new_labels


1
	 1 [1] 15697
8
	 8 [8] 9967
12
	 12 [12, 13, 14, 15] 4958
	 13 [12, 13, 14, 15] 5178
	 14 [12, 13, 14, 15] 4328
	 15 [12, 13, 14, 15] 6779
17
	 20 [20] 3853
25
	 28 [28] 2458
30
	 34 [34, 35, 36] 1996
	 35 [34, 35, 36] 2527
	 36 [34, 35, 36] 13046
32
	 38 [38] 4678
34
	 40 [40] 1010
36
	 42 [42] 4105
37
	 43 [43] 1998
41
	 47 [47] 1998
43
	 49 [49] 2613
48
	 54 [54] 1996
50
	 56 [56] 1997
54
	 60 [60] 1996
61
	 70 [70] 3026
63
	 72 [72] 1997
64
	 73 [73] 1996
68
	 77 [77] 15873
70
	 80 [80] 2470
75
	 85 [85] 2458
86
	 98 [98] 12353
89
	 103 [103] 1025
91
	 105 [105] 2000
106
	 124 [124] 1998
107
	 125 [125] 5475


In [None]:
print(new_labels)
print(len(new_labels))

[-100, 15697, -100, -100, -100, -100, -100, -100, 9967, -100, -100, -100, 4958, 5178, 4328, 6779, -100, -100, -100, -100, 3853, -100, -100, -100, -100, -100, -100, -100, 2458, -100, -100, -100, -100, -100, 1996, 2527, 13046, -100, 4678, -100, 1010, -100, 4105, 1998, -100, -100, -100, 1998, -100, 2613, -100, -100, -100, -100, 1996, -100, 1997, -100, -100, -100, 1996, -100, -100, -100, -100, -100, -100, -100, -100, -100, 3026, -100, 1997, 1996, -100, -100, -100, 15873, -100, -100, 2470, -100, -100, -100, -100, 2458, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 12353, -100, -100, -100, -100, 1025, -100, 2000, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1998, 5475, -100, -100]
128


In [None]:
# Masking only labeled related words
wwm_probability = 0.2

In [None]:
#original_array = [idx for idx in mapping.keys()]


In [None]:
# Keys of `mapping`. Defined in this cell because the only visible definition
# above is commented out, while this cell and the subset cell below both read it.
original_array = list(mapping.keys())

# One zero-initialised slot per key of `mapping`; entries are set to 1 later on
random_mask = np.zeros(len(original_array), dtype=int)

In [None]:
# Display the "train" split of `ner_tokenised_datasets`
ner_tokenised_datasets["train"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'word_ids'],
    num_rows: 36285
})

In [None]:
# Indexes that are eligible for masking
subset = [0, 3, 4]

# True at every position of the word space whose index appears in `subset`
bool_array = np.isin(np.arange(len(original_array)), subset)

# Candidate indexes, recovered from the boolean array
labels_subset = np.flatnonzero(bool_array)

# Pick one candidate at random and flag it in the mask (mutates `random_mask`)
idx_label = np.random.choice(labels_subset)
random_mask[idx_label] = 1



In [None]:
# Number of keys currently held in `mapping`
len(mapping)

110

In [None]:
# Shape of the mask array, for comparison with len(mapping)
random_mask.shape

(110,)

#### Sample Test Custom Collator Execution
Using random masking on label-related words, and padding.

In [None]:
# Sanity check on the first two training examples: print the length of
# `input_ids` next to the length of `labels`
first_two_inputs = ner_tokenised_datasets["train"]['input_ids'][:2]
first_two_labels = ner_tokenised_datasets["train"]['labels'][:2]
for token_ids, token_labels in zip(first_two_inputs, first_two_labels):
  print(len(token_ids), len(token_labels))

123 123
187 187


In [None]:
# Percentage of words for masking
# Used below as the success probability of the binomial draw over candidate positions
wwm_probability = 0.2

In [None]:
# Pull one example out of the training split and detach its word ids
sample = ner_tokenised_datasets["train"][1]
word_ids = sample.pop("word_ids")

# mapping: running word counter -> list of token positions belonging to that word
mapping = collections.defaultdict(list)
current_word, current_word_index = None, -1
for idx, word_id in enumerate(word_ids):
    # Positions without a word id are left out of the mapping
    if word_id is None:
        continue
    # A change of word id starts a new word, so advance the counter
    if current_word != word_id:
        current_word_index += 1
        current_word = word_id
    mapping[current_word_index].append(idx)

In [None]:
# Build the word-space helpers and draw which candidate positions get masked

# Keys of `mapping` and a zero-filled array of the same length
original_array = list(mapping.keys())
random_mask = np.zeros(len(original_array), dtype=int)

# Positions in sample["ner_labels"] whose label is -100 or 0; only these may be masked
subset = [
    position
    for position, label in enumerate(sample["ner_labels"])
    if label in (-100, 0)
]

# One 0/1 draw per candidate position
mask = np.random.binomial(1, wwm_probability, size=len(subset))

# Keep the candidate positions whose draw came up 1
mask_words_index = np.array(subset)[mask == 1]

print(mask)
print(subset)
print(f"Words idxs to be masked: {mask_words_index}\n")




[0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0]
[0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 15

In [None]:
# NOTE(review): this loop repeats the masking loop of the earlier
# whole-word-masking cell. It reads `new_labels`, `labels`, `input_ids` and
# `feature` without assigning them, so it relies on values left over from
# previously executed cells. `mask` was last drawn with one entry per element
# of `subset`, whereas `mapping` is keyed by the running word counter, so the
# positions returned by np.where(mask) may not be the intended `mapping`
# keys — confirm whether this cell is still needed.
for word_id in np.where(mask)[0]:
    print(word_id)
    word_id = word_id.item()
    for idx in mapping[word_id]:
        new_labels[idx] = labels[idx]
        print("\t", idx, mapping[word_id], labels[idx])
        input_ids[idx] = tokenizer.mask_token_id
feature["labels"] = new_labels

In [None]:
# Mask every word that owns at least one selected token position, keeping the
# original label only at the masked positions (all other labels become -100).
#
# `mask_words_index` holds TOKEN positions (indexes into sample["ner_labels"]),
# while `mapping` is keyed by the running word counter. Positions are therefore
# translated through an inverted lookup instead of through `word_ids`: this
# keeps working when word ids do not start at 0 or are not contiguous, and it
# avoids looking up `mapping[None]`, which would silently add a `None` key to
# the defaultdict for positions that have no word.
#
# NOTE: `input_ids` is the same list object as sample["input_ids"] and
# sample['labels'] is overwritten, so reload `sample` before re-running this cell.

# token position -> key of the word in `mapping` that owns it
token_to_word = {
    token_idx: word_key
    for word_key, token_idxs in mapping.items()
    for token_idx in token_idxs
}

label_word_id = None
input_ids = sample["input_ids"]
labels = sample["labels"]
new_labels = [-100] * len(labels)

for j in mask_words_index:

  word_key = token_to_word.get(int(j))

  # Positions that belong to no word (word id None) have nothing to mask
  if word_key is None:
    continue

  # Positions are in ascending order, so tokens of one word are adjacent:
  # skip the word if it was masked on the previous iteration
  if label_word_id == word_key:
    continue

  label_word_id = word_key

  # Iterate over the mapped array to keep the labels in the specified tokens
  for idx in mapping[label_word_id]:
    new_labels[idx] = labels[idx]

    # Mask the input
    input_ids[idx] = tokenizer.mask_token_id

sample['labels'] = new_labels


In [None]:
# Compare masked inputs, kept labels and word ids side by side, up to and
# including the fifth selected token position
val_slice = mask_words_index[4]

for sequence in (sample['input_ids'], sample['labels'], word_ids):
  print(sequence[:val_slice+1])

[101, 2951, 7316, 4813, 103, 7812, 103, 2097, 2031, 103, 2204, 4824, 1997, 1996, 11573, 2791, 103, 103]
[-100, -100, -100, -100, 2019, -100, 4018, -100, -100, 1037, -100, -100, -100, -100, -100, -100, 7312, 2552]
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15]


In [None]:
# Report the length of every field of `sample`, then of the word-level helpers
for field, values in sample.items():
  print(field, " : ", len(values))

print(f"word ids: {len(word_ids)}")
print(f"word mapping: {len(mapping.keys())}")


input_ids  :  187
attention_mask  :  187
ner_labels  :  187
labels  :  187
word ids: 187
word mapping: 171


### 5. Training

In [None]:
task = "domain-adaptation"
model_checkpoint = "distilbert-base-uncased"
# Last "/"-separated component of the checkpoint id (the whole id when it has no "/")
model_name = model_checkpoint.rsplit("/", 1)[-1]

batch_size = 64
# Show the training loss with every epoch:
# number of full batches contained in the training split
logging_steps = len(linkedin_mask_dataset["train"]) // batch_size

In [None]:
# Load the pretrained checkpoint named by `model_checkpoint` as a masked-LM model
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
# Collator handed to the Trainer below to assemble batches, built from `tokenizer`
# NOTE(review): presumably `linkedin_mask_dataset` already carries masked inputs
# and their labels, since no masking is configured here — confirm
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:

# Training configuration for the fine-tuning run; the output directory (and
# Hub repository, since push_to_hub is enabled) is named "<model_name>-finetuned-<task>".
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    # `logging_steps` is computed in the configuration cell so the training loss
    # is reported about once per epoch, but it was never passed on. Clamp to 1
    # because the integer division yields 0 when the split is smaller than one batch.
    logging_steps=max(1, logging_steps),
)

In [None]:
# ! huggingface-cli login

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want t

In [None]:
# Wire the model, training arguments, the "train"/"valid" splits of
# `linkedin_mask_dataset`, the collator and the tokenizer into a Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=linkedin_mask_dataset["train"],
    eval_dataset=linkedin_mask_dataset["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
# Start the training run
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

/Users/algiraldoh/Development/qbot-gpt/distilbert-base-uncased-finetuned-domain-adaptation is already a clone of https://huggingface.co/algiraldohe/distilbert-base-uncased-finetuned-domain-adaptation. Make sure you pull the latest changes with `repo.git_pull()`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



  0%|          | 0/1701 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Save the trained model
# The target directory uses the same "<model_name>-finetuned-<task>" name that
# was given to TrainingArguments, relative to the current working directory
trainer.save_model(f"./{model_name}-finetuned-{task}")

In [None]:
# ! pip install git-lfs

In [None]:
# Push the trainer's model to the Hugging Face Hub (args were created with push_to_hub=True)
trainer.push_to_hub()