In [1]:
cd ../src

/workspace/Script/NLP/PII/src


In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [3]:
from data.data_utils import get_offset_mapping,clean_text
from data.dataset import FeedbackDataset

In [4]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModel, AutoConfig

%env TOKENIZERS_PARALLELISM = true

env: TOKENIZERS_PARALLELISM=true


# Params

In [5]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'Fake_data_1850_218.json',
 'test.json',
 'archive.zip',
 'archive',
 'pii-masking-200k.csv',
 'moredata_dataset_fixed.csv',
 'sample_submission.csv',
 'mpware_mixtral8x7b_v1.1.json']

In [6]:
sample_df = pd.read_csv(data_path/'sample_submission.csv')
sample_df.shape

(26, 4)

In [7]:
sample_df.head(5)

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,482,B-NAME_STUDENT
3,3,7,483,I-NAME_STUDENT
4,4,7,741,B-NAME_STUDENT


In [8]:
sample_df.label.unique()

array(['B-NAME_STUDENT', 'I-NAME_STUDENT'], dtype=object)

In [9]:
df = pd.read_json(data_path/'train.json')
df.shape

(6807, 5)

In [10]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."


In [11]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
len(LABEL2TYPE)

8

In [12]:
for name in LABEL2TYPE[:-1]:
    df[name] = ((df['labels'].transform(lambda x:len([i for i in x if i.split('-')[-1]==name ])>0)))*1

In [13]:
df['nb_labels'] = df['labels'].transform(lambda x:len([i for i in x if i!="O" ]))

In [14]:
df['nb_labels'].value_counts() 

0     5862
2      599
4      108
1       86
3       52
6       46
8       14
5       10
12       6
10       5
11       3
9        3
15       2
14       2
21       2
7        1
23       1
18       1
17       1
26       1
34       1
22       1
Name: nb_labels, dtype: int64

In [15]:
df[list(LABEL2TYPE)[:-1]+['nb_labels']].sum()

NAME_STUDENT       891
EMAIL               24
USERNAME             5
ID_NUM              33
PHONE_NUM            4
URL_PERSONAL        72
STREET_ADDRESS       2
nb_labels         2739
dtype: int64

In [16]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [17]:
seeds = [42]
folds_names = []
for K in [5]:  
    for seed in seeds:
        mskf = MultilabelStratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
        name = f"fold_msk_{K}_seed_{seed}"
        df[name] = -1
        for fold, (trn_, val_) in enumerate(mskf.split(df,df[list(LABEL2TYPE)[:-1]])):
            df.loc[val_, name] = fold

In [18]:
df.groupby(name)[list(LABEL2TYPE)[:-1]].sum()

Unnamed: 0_level_0,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS
fold_msk_5_seed_42,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,178,5,1,7,1,14,0
1,178,5,1,7,1,14,0
2,179,5,1,6,1,15,1
3,178,4,1,6,0,15,1
4,178,5,1,7,1,14,0


In [19]:
att = "ID_NUM"

In [20]:
list_of = df[(df[att]>0)]

In [21]:
list_of

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,nb_labels,fold_msk_5_seed_42
29,609,Date:14-09-2021\n\nNEWS PAPER\n\nProject: News...,"[Date:14, -, 09, -, 2021, \n\n, NEWS, PAPER, \...","[False, False, False, False, False, False, Tru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1,0,0,1,0,0,0,26,1
99,2926,NAME: Ignacia Hernandez Roll No. : 932353568...,"[NAME, :, Ignacia, Hernandez, , Roll, No, .,...","[False, True, True, True, False, True, False, ...","[O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O, O...",1,0,0,1,0,0,0,3,3
124,3565,Project: Experiment sheet (5W+H)\n\nInterview...,"[Project, :, , Experiment, sheet, (, 5W+H, ),...","[False, True, False, True, True, False, False,...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...",1,0,0,1,0,0,0,6,3
213,4717,STORY TELLING\n\nPIN NO. :163133980712 NAME ...,"[STORY, TELLING, \n\n, PIN, NO, ., :, 16313398...","[True, False, False, True, False, True, False,...","[O, O, O, O, O, O, O, B-ID_NUM, O, O, O, B-NAM...",1,0,0,1,0,0,0,3,0
231,4913,DESIGN THINKING\n\nEXPERIMENT – 2\n\nName: Niz...,"[DESIGN, THINKING, \n\n, EXPERIMENT, –, 2, \n\...","[True, False, False, True, True, False, False,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",1,0,0,1,0,0,0,9,4
237,4971,USELESS TO USEFULL\n\nThe colony I live had th...,"[USELESS, TO, USEFULL, \n\n, The, colony, I, l...","[True, True, False, False, True, True, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1,0,0,1,0,0,0,9,4
240,5023,Project: Shopping Website\n\nInterviewer Name:...,"[Project, :, Shopping, Website, \n\n, Intervie...","[False, True, True, False, False, True, False,...","[O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-NAM...",1,0,0,1,0,0,0,6,0
246,5069,Marwadi University\n\nAl Akhawayn University\n...,"[Marwadi, University, \n\n, Al, Akhawayn, Univ...","[True, False, False, True, True, False, False,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,0,0,1,0,0,0,1,2
289,5606,Project: SPECTACLES\n\nInterviewer Name: Vijay...,"[Project, :, SPECTACLES, \n\n, Interviewer, Na...","[False, True, False, False, True, False, True,...","[O, O, O, O, O, O, O, B-NAME_STUDENT, I-NAME_S...",1,0,0,1,0,0,0,23,0
295,5653,This was entirely a new experience for me and ...,"[This, was, entirely, a, new, experience, for,...","[True, True, True, True, True, True, True, Tru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1,0,0,1,0,0,0,3,2


In [22]:
def get_labels(x,att):
    offset_mapping = get_offset_mapping(x.full_text, x.tokens)
    offset_mapping_ = [x for (x,y) in zip(offset_mapping,x.labels) if att in y]
    v = [x.full_text[max(int(offset[0]),0):(int(offset[1]))] for offset in offset_mapping_]
    t = [x for (x,y) in zip(x.tokens,x.labels) if att in y]
    
    return (v,t)

In [23]:
list_of["v"] = list_of.apply(lambda x:get_labels(x,att),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  list_of["v"] = list_of.apply(lambda x:get_labels(x,att),axis=1)


In [24]:
for x,t in list_of["v"]:
    print(f"{t} : {x}")

['860632713425', '530670102508', '530670102508', '875673967537', '860632713425', '557349702179', '784372734211', '054176622314', '674915248960'] : ['860632713425', '530670102508', '530670102508', '875673967537', '860632713425', '557349702179', '784372734211', '054176622314', '674915248960']
['932353568953'] : ['932353568953']
['982645662261', '409046248321'] : ['982645662261', '409046248321']
['163133980712', '186941941714'] : ['163133980712', '186941941714']
['159531167997', '159531167997', '046922558887'] : ['159531167997', '159531167997', '046922558887']
['943063077874', '792389774673', '167695383458'] : ['943063077874', '792389774673', '167695383458']
['Iz.:999893751750', 'Kl.:838901042770'] : ['Iz.:999893751750', 'Kl.:838901042770']
['06EYD876'] : ['06EYD876']
['143860010348', 'Ei:556799175487', '143860010348', 'Un:705491035775', '143860010348', 'Kh:360595695159', '143860010348', 'Kh:217952887271'] : ['143860010348', 'Ei:556799175487', '143860010348', 'Un:705491035775', '143860010

In [15]:
from spacy.lang.en import English
import re
en_tokenizer = English().tokenizer

def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    tokenized_text = tokenizer(text)
    tokens = [token.text for token in tokenized_text]
    offset_mapping = [(token.idx,token.idx+len(token)) for token in tokenized_text]
    return {'tokens': tokens, 'offset_mapping': offset_mapping}


def strip_offset_mapping(text, offset_mapping):
    ret = []
    for start, end in offset_mapping:
        match = list(re.finditer('\S+', text[start:end]))
        if len(match) == 0:
            ret.append((start, end))
        else:
            span_start, span_end = match[0].span()
            ret.append((start + span_start, start + span_end))
    return np.array(ret)

In [17]:
text= " Janet.Koch_8388"
tokenize_with_spacy(text, tokenizer=en_tokenizer)

{'tokens': [' ', 'Janet', '.', 'Koch_8388'],
 'offset_mapping': [(0, 1), (1, 6), (6, 7), (7, 16)]}

In [25]:
import re
import torch
import random
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from data.data_utils import clean_text,get_start_end,get_offset_mapping,get_start_end_offset,create_mapper_n_clean,tokenize_with_spacy,en_tokenizer

from tqdm.auto import tqdm


LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
TYPE2LABEL = {t: l for l, t in enumerate(LABEL2TYPE)}
ID_TYPE = {"0-0":0,"0-1":1,
           "1-0":2,"1-1":3,
           "2-0":4,"2-1":5,
           "3-0":6,"3-1":7,
           "4-0":8,"4-1":9,
           "5-0":10,"5-1":11,
           "6-0":12,"6-1":13
          }

ID_NAME = {"0-0":"B-NAME_STUDENT","0-1":"I-NAME_STUDENT",
           "1-0":"B-EMAIL","1-1":"I-EMAIL",
           "2-0":"B-USERNAME","2-1":"I-USERNAME",
           "3-0":"B-ID_NUM","3-1":"I-ID_NUM",
           "4-0":"B-PHONE_NUM","4-1":"I-PHONE_NUM",
           "5-0":"B-URL_PERSONAL","5-1":"I-URL_PERSONAL",
           "6-0":"B-STREET_ADDRESS","6-1":"I-STREET_ADDRESS",
           "7-0":"O","7-1":"O"
          }

RE_ID_PHONE = r"""(\(?\+\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(\s*\d{3}\s*\)\s*\d{3}\s*\-\s*\d{4}\s*\w{0,3}(\s*\d{1,8}\s*)?|\b\d{2,}-\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{2,}-\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\d{3}\s*\.\s*\d{3}\s*\.\s*\d{1,5}|\d{3}\s*\-\s*\d{3}\s*\-\s*\d{1,5}|\d{3}\s*x\s*\d{3}\s*x\s*\d{1,5}|\d{1,3}\s{0,2}\d{1,}\s{0,2}\d{1,}|\b\d{1,}\s*\d{1,}\s*\d{1,}|\b\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{1,}-\d{1,}|[\w\.\:\-\_\|]*\d{6,})"""
# Compile the regex pattern
REGEX_COMPILE = re.compile(RE_ID_PHONE)


## =============================================================================== ##
class FeedbackDataset(Dataset):
    def __init__(self,
                 df,
                 tokenizer,
                 mask_prob=0.0,
                 mask_ratio=0.0,
                 ):
        
        self.tokenizer = tokenizer

        if len(self.tokenizer.encode("\n\n"))==2:
            print("Warning : n SEP will be replace by | ")
            df["full_text_clean"] = df['full_text'].transform(lambda x:x.str.replace("\n\n"," | ").replace("\n"," [BR] "))
            df["tokens_clean"] = df['tokens'].transform(lambda x:[str(i).replace("\n\n"," | ").replace("\n"," [BR] ") for i in x])

        self.df = self.prepare_df(df)

        print(f'Loaded {len(self)} samples.')

        assert 0 <= mask_prob <= 1
        assert 0 <= mask_ratio <= 1
        self.mask_prob = mask_prob
        self.mask_ratio = mask_ratio

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        df = self.df.iloc[index]
        text = df['full_text_clean']
        text_id = df['document']
        labels = df['labels']

        tokens = self.tokenizer(text, return_offsets_mapping=True)
        input_ids = torch.LongTensor(tokens['input_ids'])
        attention_mask = torch.LongTensor(tokens['attention_mask'])
        offset_mapping = np.array(tokens['offset_mapping'])
        # offset_mapping = self.strip_offset_mapping(text, offset_mapping)
        num_tokens = len(input_ids)

        # token slices of words
        woff = np.array(df['offset_mapping_init'])
        toff = offset_mapping
        wx1, wx2 = woff.T
        tx1, tx2 = toff.T
        ix1 = np.maximum(wx1[..., None], tx1[None, ...])
        ix2 = np.minimum(wx2[..., None], tx2[None, ...])
        ux1 = np.minimum(wx1[..., None], tx1[None, ...])
        ux2 = np.maximum(wx2[..., None], tx2[None, ...])
        ious = (ix2 - ix1).clip(min=0) / (ux2 - ux1)
#         assert (ious > 0).any(-1).all()

        word_boxes = []
#         err = []
        for i,row in enumerate(ious):
            inds = row.nonzero()[0]
            try:
                word_boxes.append([inds[0], 0, inds[-1] + 1, 1])
            except:
                word_boxes.append([-100, 0, -99, 1])
#                 err.append(i)
                
        word_boxes = torch.FloatTensor(word_boxes)

        # word slices of ground truth spans
        gt_spans = []        
        for i,label in enumerate(labels) :
            gt_spans.append([i,TYPE2LABEL[label.split('-')[-1] if label!="O" else "O"],0 if label.split('-')[0]=="B" else 1])
            
        gt_spans = torch.LongTensor(gt_spans)

        # random mask augmentation
        if np.random.random() < self.mask_prob:
            all_inds = np.arange(1, len(input_ids) - 1)
            n_mask = max(int(len(all_inds) * self.mask_ratio), 1)
            np.random.shuffle(all_inds)
            mask_inds = all_inds[:n_mask]
            input_ids[mask_inds] = self.tokenizer.mask_token_id

        return dict(
                    text_id=text_id,
                    text=text,
                    labels = labels,
                    tokens = df['tokens'],
                    tokens_clean = df['tokens_clean'],
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    word_boxes=word_boxes,
                    gt_spans=gt_spans)
    
    def prepare_df(self,test_df):
        test_df['full_text_clean'] = test_df['full_text_clean'].transform(clean_text)        
        test_df['tokens_clean'] = test_df['tokens_clean'].transform(lambda x:[clean_text(i) for i in x])
        test_df['offset_mapping_init'] = test_df["offset_mapping"] if "offset_mapping" in test_df.columns else test_df.apply(get_start_end_offset('tokens'),axis=1)
        return test_df
    
    def strip_offset_mapping(self, text, offset_mapping):
        ret = []
        for start, end in offset_mapping:
            match = list(re.finditer('\S+', text[start:end]))
            if len(match) == 0:
                ret.append((start, end))
            else:
                span_start, span_end = match[0].span()
                ret.append((start + span_start, start + span_end))
        return np.array(ret)

    def get_word_offsets(self, text):
        matches = re.finditer("\S+", text)
        spans = []
        words = []
        for match in matches:
            span = match.span()
            word = match.group()
            spans.append(span)
            words.append(word)
        assert tuple(words) == tuple(text.split())
        return np.array(spans)
    
## =============================================================================== ##
class CustomCollator(object):
    def __init__(self, tokenizer, model):
        self.pad_token_id = tokenizer.pad_token_id
        if hasattr(model.config, 'attention_window'):
            # For longformer
            # https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/longformer/modeling_longformer.py#L1548
            self.attention_window = (model.config.attention_window
                                     if isinstance(
                                         model.config.attention_window, int)
                                     else max(model.config.attention_window))
        else:
            self.attention_window = None

    def __call__(self, samples):
        batch_size = len(samples)
        assert batch_size == 1, f'Only batch_size=1 supported, got batch_size={batch_size}.'

        sample = samples[0]

        max_seq_length = len(sample['input_ids'])
        if self.attention_window is not None:
            attention_window = self.attention_window
            padded_length = (attention_window -
                             max_seq_length % attention_window
                             ) % attention_window + max_seq_length
        else:
            padded_length = max_seq_length

        input_shape = (1, padded_length)
        input_ids = torch.full(input_shape,
                               self.pad_token_id,
                               dtype=torch.long)
        attention_mask = torch.zeros(input_shape, dtype=torch.long)

        seq_length = len(sample['input_ids'])
        input_ids[0, :seq_length] = sample['input_ids']
        attention_mask[0, :seq_length] = sample['attention_mask']

        text_id = sample['text_id']
        tokens = sample['tokens']
        tokens_clean = sample['tokens_clean']
        # text = sample['text']
        word_boxes = sample['word_boxes']
        gt_spans = sample['gt_spans']

        return dict(text_id=text_id,
                    tokens_clean=tokens_clean,
                    tokens = tokens,
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    word_boxes=word_boxes,
                    gt_spans=gt_spans)

In [26]:
model_name = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [27]:
ds = FeedbackDataset(df.copy(),tokenizer)

Loaded 6807 samples.


In [30]:
idx = random.choice(ds.df[ds.df.PHONE_NUM>0].index)
# idx = 219
# doc = 204
# idx = ds.df[ds.df.document==doc].index[0]
# Example usage:
# idx = 80
full_text_ds = ds.df.iloc[idx]['full_text_clean']
tokens_ds = ds.df.iloc[idx]['tokens_clean']
labels_ds = ds.df.iloc[idx]['labels']
offset_mapping_init = ds.df.iloc[idx]['offset_mapping_init']
# offset_mapping_ds = get_offset_mapping(full_text_ds, tokens_ds)
# offset_mapping_ds1 = tokenize_with_spacy(full_text_ds)['offset_mapping']
# offset_mapping_ds = strip_offset_mapping(full_text_ds,offset_mapping_ds)
# idx,ds.df.iloc[idx]['nb_labels']
len(tokens_ds),len(offset_mapping_init)#len(offset_mapping_ds1),len(tokens_ds)

(707, 707)

In [32]:
full_text = ds.df.iloc[idx]['full_text']
tokens = ds.df.iloc[idx]['tokens']
labels = ds.df.iloc[idx]['labels']
# offset_mapping = 
# idx,df.iloc[idx]['nb_labels']

In [33]:
offset_mapping = get_offset_mapping(full_text, tokens)
for token, offset,l in zip(tokens, offset_mapping,labels):
    if l!="O":
        print(f"{token}: {offset} : {l}")

Waseem: (0, 6) : B-NAME_STUDENT
Mabunda: (7, 14) : I-NAME_STUDENT
591: (16, 19) : B-STREET_ADDRESS
Smith: (20, 25) : I-STREET_ADDRESS
Centers: (26, 33) : I-STREET_ADDRESS
Apt: (34, 37) : I-STREET_ADDRESS
.: (37, 38) : I-STREET_ADDRESS
656: (39, 42) : I-STREET_ADDRESS

: (42, 43) : I-STREET_ADDRESS
Joshuamouth: (43, 54) : I-STREET_ADDRESS
,: (54, 55) : I-STREET_ADDRESS
RI: (56, 58) : I-STREET_ADDRESS
95963: (59, 64) : I-STREET_ADDRESS
410.526.1667: (85, 97) : B-PHONE_NUM
https://www.youtube.com/watch?v=tIBN9VJ0S4a: (3242, 3285) : B-URL_PERSONAL


In [34]:
ds[idx]['word_boxes'].shape

torch.Size([707, 4])

In [35]:
# ds[idx]['word_boxes'][:38,:]

In [36]:
# text = "Reflection – Visualization   Deiby"
print((full_text_ds))

Waseem Mabunda 591 Smith Centers Apt. 656 Joshuamouth, RI 95963 ( The Netherlands) 410.526.1667 vpi@mn.nl | Mind Mapping, Challenge: For several years I have been working for an Asset manager in the Netherlands. During this period I have been involved in many projects. Certainly in the world of asset management, much has changed in recent years in the area of Law and Regulations. What I mainly experience in these projects is that all departments have a different interest in starting a new project. This certainly does not benefit the project. How do you get everyone to complete a project in the common interest and how do you motivate everyone who participate in the project? Selection: An improvement project can be approached in different ways. The most common way is the scrum approach. We work in multidisciplinary teams that work in short sprints, with a fixed length of 1 to 4 weeks. Cooperation is very important and everyone must be able to respond quickly to changing circumstances. Sc

In [67]:
RE_ID_PHONE = r"""(\(?\+\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(\s*\d{3}\s*\)\s*\d{3}\s*\-\s*\d{4}\s*\w{0,3}(\s*\d{1,8}\s*)?|\b\d{2,}-\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{2,}-\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\d{3}\s*\.\s*\d{3}\s*\.\s*\d{1,5}|\d{3}\s*\-\s*\d{3}\s*\-\s*\d{1,5}|\d{3}\s*x\s*\d{3}\s*x\s*\d{1,5}|\d{1,3}\s{0,2}\d{1,}\s{0,2}\d{1,}|\b\d{1,}\s*\d{1,}\s*\d{1,}|\b\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{1,}-\d{1,}|[\w\.\:\-\_\|]*\d{6,})"""

In [68]:
# Compile the regex pattern
REGEX_COMPILE = re.compile(RE_ID_PHONE)

In [69]:
def strip_offset_mapping(text, offset_mapping):
    ret = []
    for start, end in offset_mapping:
        match = list(re.finditer('\S+', text[start:end]))
        
        if len(match) == 0:
            ret.append((start, end))
        else:
            span_start, span_end = match[0].span()
            ret.append((start + span_start, start + span_end))
    return np.array(ret)

In [70]:
def find_patterns(text,regex):
    matches = [(match.group(0), match.start(), match.end()) for match in regex.finditer(text)]
    offsets = strip_offset_mapping(text,[(m[1],m[2]) for m in matches])
    return [m[0].strip() for m in matches],offsets

In [166]:
lab,off_matc = find_patterns(full_text_ds,regex)

In [167]:
off_matc

array([[  74,   93],
       [4135, 4139],
       [4188, 4192],
       [5763, 5766],
       [6080, 6084]])

In [168]:
lab

['(320)202-0688x95843', '2013', '2014', '342', '2014']

In [169]:
offset_mapping_init

[(0, 7),
 (8, 14),
 (14, 14),
 (15, 20),
 (21, 22),
 (23, 27),
 (27, 28),
 (29, 32),
 (33, 40),
 (40, 40),
 (41, 46),
 (47, 54),
 (55, 61),
 (61, 62),
 (63, 68),
 (69, 70),
 (71, 73),
 (73, 73),
 (74, 75),
 (75, 82),
 (82, 83),
 (83, 93),
 (93, 93),
 (94, 111),
 (111, 111),
 (112, 146),
 (147, 148),
 (149, 153),
 (154, 161),
 (162, 163),
 (164, 171),
 (172, 178),
 (179, 187),
 (188, 193),
 (194, 195),
 (196, 200),
 (201, 208),
 (209, 211),
 (212, 214),
 (215, 224),
 (225, 230),
 (231, 233),
 (234, 238),
 (239, 244),
 (245, 248),
 (249, 259),
 (260, 265),
 (266, 272),
 (272, 273),
 (274, 275),
 (275, 275),
 (276, 280),
 (281, 284),
 (285, 293),
 (294, 301),
 (302, 306),
 (307, 308),
 (309, 316),
 (317, 322),
 (323, 326),
 (327, 335),
 (336, 338),
 (339, 342),
 (343, 346),
 (347, 354),
 (354, 354),
 (355, 360),
 (361, 366),
 (367, 374),
 (375, 378),
 (379, 383),
 (384, 387),
 (388, 394),
 (394, 395),
 (396, 398),
 (399, 407),
 (408, 410),
 (411, 414),
 (415, 420),
 (421, 428),
 (429, 433

In [180]:
ious.shape

(1222, 5)

In [179]:
offset_mapping_init

[(0, 7),
 (8, 14),
 (14, 14),
 (15, 20),
 (21, 22),
 (23, 27),
 (27, 28),
 (29, 32),
 (33, 40),
 (40, 40),
 (41, 46),
 (47, 54),
 (55, 61),
 (61, 62),
 (63, 68),
 (69, 70),
 (71, 73),
 (73, 73),
 (74, 75),
 (75, 82),
 (82, 83),
 (83, 93),
 (93, 93),
 (94, 111),
 (111, 111),
 (112, 146),
 (147, 148),
 (149, 153),
 (154, 161),
 (162, 163),
 (164, 171),
 (172, 178),
 (179, 187),
 (188, 193),
 (194, 195),
 (196, 200),
 (201, 208),
 (209, 211),
 (212, 214),
 (215, 224),
 (225, 230),
 (231, 233),
 (234, 238),
 (239, 244),
 (245, 248),
 (249, 259),
 (260, 265),
 (266, 272),
 (272, 273),
 (274, 275),
 (275, 275),
 (276, 280),
 (281, 284),
 (285, 293),
 (294, 301),
 (302, 306),
 (307, 308),
 (309, 316),
 (317, 322),
 (323, 326),
 (327, 335),
 (336, 338),
 (339, 342),
 (343, 346),
 (347, 354),
 (354, 354),
 (355, 360),
 (361, 366),
 (367, 374),
 (375, 378),
 (379, 383),
 (384, 387),
 (388, 394),
 (394, 395),
 (396, 398),
 (399, 407),
 (408, 410),
 (411, 414),
 (415, 420),
 (421, 428),
 (429, 433

In [71]:
def spacy_to_re_off(text,tokens,offset_mapping_init):
    lab,off_matc = find_patterns(text,REGEX_COMPILE)
    
    
    if len(lab):
        
        spacy_to_re = []
        re_oof = []
        tokens_re = []

        # token slices of words
        woff = np.array(offset_mapping_init)
        toff = off_matc
        wx1, wx2 = woff.T
        tx1, tx2 = toff.T
        ix1 = np.maximum(wx1[..., None], tx1[None, ...])
        ix2 = np.minimum(wx2[..., None], tx2[None, ...])
        ux1 = np.minimum(wx1[..., None], tx1[None, ...])
        ux2 = np.maximum(wx2[..., None], tx2[None, ...])
        ious = (ix2 - ix1).clip(min=0) / (ux2 - ux1)

        for i,(spcy_of_set,tok,row) in enumerate(zip(offset_mapping_init,tokens,ious)):
            inds = row.nonzero()[0]
            if len(inds):
                spacy_to_re.append(inds[0])
                re_oof.append((off_matc[inds[0]].tolist()[0],off_matc[inds[0]].tolist()[1]))
                tokens_re.append(lab[inds[0]])
            else:
                spacy_to_re.append(len(lab)+i)
                re_oof.append(spcy_of_set)
                tokens_re.append(tok)

        re_oof = [x for i, x in enumerate(re_oof) if re_oof.index(x) == i]
    else:
        spacy_to_re = np.arange(len(offset_mapping_init)).tolist()
        re_oof = offset_mapping_init
        tokens_re = tokens

    return re_oof,spacy_to_re,tokens_re

In [72]:
re_oof,spacy_to_re,tokens_re = spacy_to_re_off(full_text_ds,tokens_ds,offset_mapping_init)

In [73]:
len(re_oof),len(spacy_to_re)

(721, 730)

In [75]:
x = pd.DataFrame({'tokens':tokens_ds,'tokens_re':tokens_re,
              'labels':labels_ds,'spacy_to_re':spacy_to_re})

x[x.spacy_to_re<=3]

Unnamed: 0,tokens,tokens_re,labels,spacy_to_re
0,Name,Name,O,3
11,(,(820)913-3241x894,B-PHONE_NUM,0
12,820)913,(820)913-3241x894,I-PHONE_NUM,0
13,-,(820)913-3241x894,I-PHONE_NUM,0
14,3241x894,(820)913-3241x894,I-PHONE_NUM,0
442,(,(820)913-3241x894,B-PHONE_NUM,1
443,820)913,(820)913-3241x894,I-PHONE_NUM,1
444,-,(820)913-3241x894,I-PHONE_NUM,1
445,3241x894,(820)913-3241x894,I-PHONE_NUM,1
469,(,(820)913-3241x894,B-PHONE_NUM,2


In [54]:
spacy_to_re

[2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 0,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 18

In [82]:
offset_mapping_init[584]

(3087, 3096)

In [57]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': 'd4dd80',
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        ents.append({
                        'start': int(offset[0]), 
                         'end': int(offset[1]), 
                         'label': str(lab.split('-')[-1]) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
#         "title": "idx"
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [28]:
offset_mapping = get_offset_mapping(full_text, tokens)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [29]:
visualize(full_text,offset_mapping_,labels_)

In [58]:
d = ds[idx]
full_text_ds = d['text']
tokens_ds = d['tokens']
labels_ds = d['labels']

In [30]:
# offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping_init,labels_ds) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]

In [31]:
# for (x,y) in zip(offset,labels_ds):
#     print(y)

In [32]:
visualize(full_text_ds,offset_mapping_,labels_)

In [1]:
# ======================================================================================== #
import numpy as np

def find_successive_numbers(input_array):
    result = []
    current_sublist = []

    for num in input_array:
        if not current_sublist or num == current_sublist[-1] + 1:
            current_sublist.append(num)
        else:
            result.append(current_sublist)
            current_sublist = [num]

    if current_sublist:
        result.append(current_sublist)

    return result
# ======================================================================================== #

import re
from mimesis import Generic

def generate_random_data_with_probabilities():
    generic = Generic()

    # Probabilities for each country
    country_probabilities = {'fr': 0.5, 'en': 0.1, 'it': 0.1, 'de': 0.1, 'es': 0.2}

    # Function to randomly choose a country based on probabilities
    def choose_country():
        return generic.random.choices(list(country_probabilities.keys()), weights=country_probabilities.values())

    # Generate random data
    country = choose_country()[0]
    generic = Generic(locale=country)
    name = generic.person.full_name()
#     phone_number = generic.person.telephone()
#     username = generic.person.username()
#     email = generic.person.email()
#     address = generic.address.address()
#     surname = generic.person.surname()
    ret = dict(
              NAME_STUDENT=name
              )
    return ret
# ======================================================================================== #

def generate_ent(labels,tokens):
    
    idx_lab = np.argwhere(np.array(labels)!="O").reshape(-1)
    pos = sorted(find_successive_numbers(idx_lab),reverse=True,key=len)

    lab = np.array(labels)
    toks = np.array(tokens)

    ent = {}
    for i,p in enumerate(pos):
        l = np.unique([x.split('-')[-1] for x in lab[p]]).tolist()
        t = toks[p].tolist()
        if 'NAME_STUDENT' in l:
            full_name = " ".join(t)
            ent[clean_text(full_name)] = l[-1]

        else:
            full_name = " ".join(t)
            ent[clean_text(full_name)] = l[-1]
    return ent
# ======================================================================================== #
from spacy.lang.en import English
en_tokenizer = English().tokenizer

def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    tokenized_text = tokenizer(text)
    tokens = [token.text for token in tokenized_text]
    offset_mapping = [(token.idx,token.idx+len(token)) for token in tokenized_text]
    return {'tokens': tokens, 'offset_mapping': offset_mapping}
# ======================================================================================== #

def create_mapper_n_clean(full_text,labels,tokens,attribut=["NAME_STUDENT"]):
    ent = generate_ent(labels,tokens)
    print(ent)
    mapper = {}
    label_mapper = {}
    for k,v in ent.items():
#         k,v = clean_text(k),clean_text(v)
        if v in attribut:      
            dc_ent = generate_random_data_with_probabilities()
            if 'NAME_STUDENT' in v:
                names = k.split()
                if k not in mapper.keys():
                    mapper[k] = dc_ent[v]
                    label_mapper[dc_ent[v]] = v
                if len(k.split())>1:
                    map_ = dc_ent[v].split()
                    if names[0] not in mapper.keys():
                        mapper[names[0]] = map_[0]
                        label_mapper[map_[0]] = v
                    if " ".join(names[1:]) not in mapper.keys():
                        mapper[" ".join(names[1:])] = " ".join(map_[1:]) 
                        label_mapper[" ".join(names[1:])] = v
                else:
                    map_ = dc_ent[v].split()
                    if names[0] not in mapper.keys():
                        mapper[names[0]] = map_[0]
                        label_mapper[map_[0]] = v      

            else:
                mapper[k] = dc_ent[v]
                label_mapper[dc_ent[v]] = v

            if k in mapper.keys():
                full_text = re.sub(k,mapper[k],full_text)
            else:
                full_text = re.sub(k,dc_ent[v],full_text)
        else:
            label_mapper[k] = v
            
    
    print(mapper)
    print(label_mapper)
    
    full_text = clean_text(full_text)
    
    tokenized_text = tokenize_with_spacy(full_text, tokenizer=en_tokenizer)
    tokens = tokenized_text['tokens']
    tg = get_offset_mapping(full_text, list(label_mapper.keys()))
    
    offs = {}
    for s in list(label_mapper.keys()):
        res = [(m.start(0), m.end(0)) for m in re.finditer(s,full_text)]
        if len(res):
            offs[s] = res
            
    labels = []
    for tok, off in zip(tokens, tokenized_text['offset_mapping']):
        found_label = False
        for k, ofs in offs.items():
            for o in ofs:
                if o[0] <= off[0] <= o[1] or o[0] <= off[1] <=o[1]:
                    labels.append(label_mapper[k])
                    found_label = True
                    break
            if found_label:
                break
        else:
            labels.append('O')
        
    return full_text,labels,tokens

In [183]:
# cachez l'id num dans URL ou mixer chiffre et lettre

In [184]:
full_text

'Name: Jana Telfah  Email: nbarker@hotmail.com  Mobile: (820)913-3241x894\n\nDesign Thinking For Innovation\n\nChallenge & Selection  Jordan’s economic growth and stability is strengthened by thriving entrepreneurial ecosystem focused  on tech innovation and tech-based startups (TBS). While past efforts have improved ecosystem  dynamics and innovation input; nonetheless, innovation outputs and socioeconomic impact remains  marginal for many reasons, among which are inefficiencies, replication, lack of creativity, and low- quality pipeline.\n\nMid last year, UVA (University of Bedfordshire) in Jordan decided to contribute to this conversation  and to the Jordanian tech ecosystem by establishing, a tech accelerator called (The Belvedere Vodka). I was  chosen as the Program Manager for this project, and my current challenge was to develop new models  that would steer the university and other educational institutions towards more responsible and value- focused models of entrepreneurial sup

In [185]:
full_text,labels,tokens = create_mapper_n_clean(full_text,labels,tokens,attribut=["NAME_STUDENT"])

{'( 820)913 - 3241x894': 'PHONE_NUM', 'Jana Telfah': 'NAME_STUDENT', 'nbarker@hotmail.com': 'EMAIL'}
{'Jana Telfah': 'Rafa Pardo', 'Jana': 'Rafa', 'Telfah': 'Pardo'}
{'( 820)913 - 3241x894': 'PHONE_NUM', 'Rafa Pardo': 'NAME_STUDENT', 'Rafa': 'NAME_STUDENT', 'Telfah': 'NAME_STUDENT', 'nbarker@hotmail.com': 'EMAIL'}


In [186]:
# labels

In [187]:
offset_mapping = get_offset_mapping(full_text, tokens)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [188]:
print(full_text)

Name: Rafa Pardo  Email: nbarker@hotmail.com  Mobile: (820)913-3241x894

Design Thinking For Innovation

Challenge & Selection  Jordan’s economic growth and stability is strengthened by thriving entrepreneurial ecosystem focused  on tech innovation and tech-based startups (TBS). While past efforts have improved ecosystem  dynamics and innovation input; nonetheless, innovation outputs and socioeconomic impact remains  marginal for many reasons, among which are inefficiencies, replication, lack of creativity, and low- quality pipeline.

Mid last year, UVA (University of Bedfordshire) in Jordan decided to contribute to this conversation  and to the Jordanian tech ecosystem by establishing, a tech accelerator called (The Belvedere Vodka). I was  chosen as the Program Manager for this project, and my current challenge was to develop new models  that would steer the university and other educational institutions towards more responsible and value- focused models of entrepreneurial support.

W

In [189]:
visualize(full_text,offset_mapping_,labels_)

In [190]:
import torch
from torch.utils.data.sampler import WeightedRandomSampler

# Assuming you have a dataset `my_dataset` and you want to assign different probabilities to each sample
weights = [0.1, 0.5, 0.2, 0.2,1.]

# Create a WeightedRandomSampler
sampler = WeightedRandomSampler(weights, len(weights), replacement=False)

# Print sampled indices for three epochs
for epoch in range(3):
    sampled_indices = list(sampler)
    print(f"Epoch {epoch + 1}: {sampled_indices}")


Epoch 1: [3, 0, 4, 1, 2]
Epoch 2: [4, 1, 3, 2, 0]
Epoch 3: [4, 3, 1, 2, 0]


In [141]:
728//512

1

In [142]:
df

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,nb_labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",1,0,0,0,0,0,0,6
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",1,0,0,0,0,0,0,4
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...",1,0,0,0,0,0,0,2
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...",1,0,0,0,0,0,0,2
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...",1,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6802,22678,EXAMPLE – JOURNEY MAP\n\nTHE CHALLENGE My w...,"[EXAMPLE, –, JOURNEY, MAP, \n\n, THE, CHALLENG...","[True, True, True, False, False, True, True, F...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,0,0,0,0,0,0,0
6803,22679,Why Mind Mapping?\n\nMind maps are graphical r...,"[Why, Mind, Mapping, ?, \n\n, Mind, maps, are,...","[True, True, False, False, False, True, True, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,0,0,0,0,0,0,0
6804,22681,"Challenge\n\nSo, a few months back, I had chos...","[Challenge, \n\n, So, ,, a, few, months, back,...","[False, False, False, True, True, True, True, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,0,0,0,0,0,0,0
6805,22684,Brainstorming\n\nChallenge & Selection\n\nBrai...,"[Brainstorming, \n\n, Challenge, &, Selection,...","[False, False, True, True, False, False, True,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,0,0,0,0,0,0,0


In [246]:
from spacy.lang.en import English
en_tokenizer = English().tokenizer

def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    tokenized_text = tokenizer(text)
    tokens = [token.text for token in tokenized_text]
    offset_mapping = [(token.idx,token.idx+len(token)) for token in tokenized_text]
    return {'tokens': tokens, 'offset_mapping': offset_mapping}

In [258]:
idx = random.choice(df[df.NAME_STUDENT>0].index)
# idx = 219
# doc = 204
# idx = ds.df[ds.df.document==doc].index[0]
# Example usage:
# idx = 80
full_text = df.iloc[idx]['full_text']
tokens = df.iloc[idx]['tokens']
labels = df.iloc[idx]['labels']
idx,df.iloc[idx]['nb_labels']

(400, 3)

In [259]:
offset_mapping = get_offset_mapping(full_text, tokens)
for token, offset,l in zip(tokens, offset_mapping,labels):
    if l!="O":
        print(f"{token}: {offset} : {l}")

Mohinder: (4980, 4988) : B-NAME_STUDENT
Kumar: (4989, 4994) : I-NAME_STUDENT
https://www.linkedin.com/in/michael16: (5007, 5044) : B-URL_PERSONAL


In [260]:
offset_mapping = tokenize_with_spacy(full_text, en_tokenizer)['offset_mapping']
# offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels_ds) if y!="O"]

# offset_mapping = get_offset_mapping(full_text, tokens)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

# labels_ = [x for x in labels if x!="O"]
# labels_

In [261]:
len(offset_mapping),len(labels_ds),len(offset_mapping_)

(995, 1126, 3)

In [262]:
print(full_text)

Example – Process change – Storytelling

Challenge & Selection.   There is guy who works as a Talent acquisition professional in a multinational organization, his day to day  activity revolves around interacting with different level of professionals, He get involved into the process  of human resource starting from recruitment till separation. He regularly meets with different  stakeholders like hiring managers, head of departments, chief officers of my company also with the  representative of recruitment agencies.   But, why are we discussing about a day to day life of an employee, any clue?  Well, to answer your question this is the story of mine where I tried to show a glimpse of how my days  looks like any fine day. Meeting with various stakeholders require some data, bullets or facts to present  or to provide them update on day to day activities, SLA, projections, selections, forecasting and many  more data and figures. And here I use the method of Storytelling - Storytelling is a

In [263]:
# Compile the regex pattern
# regex = re.compile(r'\s{2,}')
# matches = [(match.group(0), match.start(), match.end()) for match in regex.finditer(full_text) if match.group(0)!='\n\n']
# matches

In [264]:
visualize(full_text,offset_mapping_,labels_)

In [None]:
my_list = [1, 2, 3, 4, 2, 3, 5, 6, 5]

# Using list comprehension with a conditional check
unique_list = [x for i, x in enumerate(my_list) if my_list.index(x) == i]

print(unique_list)
