In [1]:
cd ../src

/workspace/Script/NLP/PII/src


In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [3]:
from data.data_utils import get_offset_mapping,clean_text
from data.dataset import FeedbackDataset

In [4]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModel, AutoConfig

%env TOKENIZERS_PARALLELISM = true

env: TOKENIZERS_PARALLELISM=true


In [5]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': 'd4dd80',
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        ents.append({
                        'start': int(offset[0]), 
                         'end': int(offset[1]), 
                         'label': str(lab.split('-')[-1]) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
#         "title": "idx"
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

# Params

In [6]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'mpware_mixtral8x7b_v1.1-no-i-username.json.zip',
 'Fake_data_1850_218.json',
 'test.json',
 'archive.zip',
 'archive',
 'pii-masking-200k.csv',
 'moredata_dataset_fixed.csv',
 'sample_submission.csv',
 'mpware_mixtral8x7b_v1.1.json']

In [7]:
sample_df = pd.read_csv(data_path/'sample_submission.csv')
sample_df.shape

(26, 4)

In [8]:
sample_df.head(5)

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,482,B-NAME_STUDENT
3,3,7,483,I-NAME_STUDENT
4,4,7,741,B-NAME_STUDENT


In [9]:
sample_df.label.unique()

array(['B-NAME_STUDENT', 'I-NAME_STUDENT'], dtype=object)

In [10]:
df = pd.read_json(data_path/'train.json')
df.shape

(6807, 5)

In [11]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."


In [12]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
len(LABEL2TYPE)

8

In [13]:
for name in LABEL2TYPE[:-1]:
    df[name] = ((df['labels'].transform(lambda x:len([i for i in x if i.split('-')[-1]==name ])>0)))*1

In [14]:
df['nb_labels'] = df['labels'].transform(lambda x:len([i for i in x if i!="O" ]))

In [15]:
df['nb_labels'].value_counts() 

In [16]:
df[list(LABEL2TYPE)[:-1]+['nb_labels']].sum()

NAME_STUDENT       891
EMAIL               24
USERNAME             5
ID_NUM              33
PHONE_NUM            4
URL_PERSONAL        72
STREET_ADDRESS       2
nb_labels         2739
dtype: int64

In [17]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [18]:
seeds = [42]
folds_names = []
for K in [5]:  
    for seed in seeds:
        mskf = MultilabelStratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
        name = f"fold_msk_{K}_seed_{seed}"
        df[name] = -1
        for fold, (trn_, val_) in enumerate(mskf.split(df,df[list(LABEL2TYPE)[:-1]])):
            df.loc[val_, name] = fold

In [19]:
df.groupby(name)[list(LABEL2TYPE)[:-1]].sum()

Unnamed: 0_level_0,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS
fold_msk_5_seed_42,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,178,5,1,7,1,14,0
1,178,5,1,7,1,14,0
2,179,5,1,6,1,15,1
3,178,4,1,6,0,15,1
4,178,5,1,7,1,14,0


In [20]:
att = "ID_NUM"

In [21]:
list_of = df[(df[att]>0)]

In [22]:
list_of

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,nb_labels,fold_msk_5_seed_42
29,609,Date:14-09-2021\n\nNEWS PAPER\n\nProject: News...,"[Date:14, -, 09, -, 2021, \n\n, NEWS, PAPER, \...","[False, False, False, False, False, False, Tru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1,0,0,1,0,0,0,26,1
99,2926,NAME: Ignacia Hernandez Roll No. : 932353568...,"[NAME, :, Ignacia, Hernandez, , Roll, No, .,...","[False, True, True, True, False, True, False, ...","[O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O, O...",1,0,0,1,0,0,0,3,3
124,3565,Project: Experiment sheet (5W+H)\n\nInterview...,"[Project, :, , Experiment, sheet, (, 5W+H, ),...","[False, True, False, True, True, False, False,...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...",1,0,0,1,0,0,0,6,3
213,4717,STORY TELLING\n\nPIN NO. :163133980712 NAME ...,"[STORY, TELLING, \n\n, PIN, NO, ., :, 16313398...","[True, False, False, True, False, True, False,...","[O, O, O, O, O, O, O, B-ID_NUM, O, O, O, B-NAM...",1,0,0,1,0,0,0,3,0
231,4913,DESIGN THINKING\n\nEXPERIMENT – 2\n\nName: Niz...,"[DESIGN, THINKING, \n\n, EXPERIMENT, –, 2, \n\...","[True, False, False, True, True, False, False,...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",1,0,0,1,0,0,0,9,4
237,4971,USELESS TO USEFULL\n\nThe colony I live had th...,"[USELESS, TO, USEFULL, \n\n, The, colony, I, l...","[True, True, False, False, True, True, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1,0,0,1,0,0,0,9,4
240,5023,Project: Shopping Website\n\nInterviewer Name:...,"[Project, :, Shopping, Website, \n\n, Intervie...","[False, True, True, False, False, True, False,...","[O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-NAM...",1,0,0,1,0,0,0,6,0
246,5069,Marwadi University\n\nAl Akhawayn University\n...,"[Marwadi, University, \n\n, Al, Akhawayn, Univ...","[True, False, False, True, True, False, False,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,0,0,1,0,0,0,1,2
289,5606,Project: SPECTACLES\n\nInterviewer Name: Vijay...,"[Project, :, SPECTACLES, \n\n, Interviewer, Na...","[False, True, False, False, True, False, True,...","[O, O, O, O, O, O, O, B-NAME_STUDENT, I-NAME_S...",1,0,0,1,0,0,0,23,0
295,5653,This was entirely a new experience for me and ...,"[This, was, entirely, a, new, experience, for,...","[True, True, True, True, True, True, True, Tru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1,0,0,1,0,0,0,3,2


In [23]:
def get_labels(x,att):
    offset_mapping = get_offset_mapping(x.full_text, x.tokens)
    offset_mapping_ = [x for (x,y) in zip(offset_mapping,x.labels) if att in y]
    v = [x.full_text[max(int(offset[0]),0):(int(offset[1]))] for offset in offset_mapping_]
    t = [x for (x,y) in zip(x.tokens,x.labels) if att in y]
    
    return (v,t)

In [24]:
list_of["v"] = list_of.apply(lambda x:get_labels(x,att),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  list_of["v"] = list_of.apply(lambda x:get_labels(x,att),axis=1)


In [25]:
for x,t in list_of["v"]:
    m = [len(y) for y in x]
    mm = min(m)
    mx = max(m)
    print(f"{x}:{mm} {mx}")

['860632713425', '530670102508', '530670102508', '875673967537', '860632713425', '557349702179', '784372734211', '054176622314', '674915248960']:12 12
['932353568953']:12 12
['982645662261', '409046248321']:12 12
['163133980712', '186941941714']:12 12
['159531167997', '159531167997', '046922558887']:12 12
['943063077874', '792389774673', '167695383458']:12 12
['Iz.:999893751750', 'Kl.:838901042770']:16 16
['06EYD876']:8 8
['143860010348', 'Ei:556799175487', '143860010348', 'Un:705491035775', '143860010348', 'Kh:360595695159', '143860010348', 'Kh:217952887271']:12 15
['35615904922']:11 11
['696135165639', 'Vw.:403489591437']:12 16
['141774671173', '747051878431']:12 12
['779875708882', '800306846075', '955487471144']:12 12
['276795361801', '276795361801']:12 12
['034626995785']:12 12
['VZ:775Y6A5764']:13 13
['DM:705244534902', '132305666219', '789323889085']:12 15
['762035863358']:12 12
['188408534931']:12 12
['nMFtUVxSUI|33529258', 'nMFtUVxSUI|33529258']:19 19
['51,00,23,0', '342998677

In [26]:
from spacy.lang.en import English
import re
en_tokenizer = English().tokenizer

def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    tokenized_text = tokenizer(text)
    tokens = [token.text for token in tokenized_text]
    offset_mapping = [(token.idx,token.idx+len(token)) for token in tokenized_text]
    return {'tokens': tokens, 'offset_mapping': offset_mapping}


def strip_offset_mapping(text, offset_mapping):
    ret = []
    for start, end in offset_mapping:
        match = list(re.finditer('\S+', text[start:end]))
        if len(match) == 0:
            ret.append((start, end))
        else:
            span_start, span_end = match[0].span()
            ret.append((start + span_start, start + span_end))
    return np.array(ret)

In [27]:
text= " Janet.Koch_8388"
tokenize_with_spacy(text, tokenizer=en_tokenizer)

{'tokens': [' ', 'Janet', '.', 'Koch_8388'],
 'offset_mapping': [(0, 1), (1, 6), (6, 7), (7, 16)]}

In [29]:
import re
import torch
import random
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from data.data_utils import clean_text #,get_start_end,get_offset_mapping,get_start_end_offset,create_mapper_n_clean,tokenize_with_spacy,en_tokenizer

from tqdm.auto import tqdm
import re
from difflib import SequenceMatcher

import codecs
import os
from collections import Counter
from typing import Dict, List, Tuple

from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from text_unidecode import unidecode
import joblib
import torch


from faker import Faker
fake = Faker()


def process_regex(pattern, reverse=False):
    replacements = {
        '(': r'\(',
        ')': r'\)',
        '[': r'\[',
        ']': r'\]',
        '|': r'\|',
        '?': r'\?',
        '*': r'\*',
        '+': r'\+'
    }
    
    if reverse:
        replacements = {v: k for k, v in replacements.items()}
    
    for old, new in replacements.items():
        pattern = pattern.replace(old, new)
    
    return pattern



LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
TYPE2LABEL = {t: l for l, t in enumerate(LABEL2TYPE)}
ID_TYPE = {"0-0":0,"0-1":1,
           "1-0":2,"1-1":3,
           "2-0":4,"2-1":5,
           "3-0":6,"3-1":7,
           "4-0":8,"4-1":9,
           "5-0":10,"5-1":11,
           "6-0":12,"6-1":13
          }

ID_NAME = {"0-0":"B-NAME_STUDENT","0-1":"I-NAME_STUDENT",
           "1-0":"B-EMAIL","1-1":"I-EMAIL",
           "2-0":"B-USERNAME","2-1":"I-USERNAME",
           "3-0":"B-ID_NUM","3-1":"I-ID_NUM",
           "4-0":"B-PHONE_NUM","4-1":"I-PHONE_NUM",
           "5-0":"B-URL_PERSONAL","5-1":"I-URL_PERSONAL",
           "6-0":"B-STREET_ADDRESS","6-1":"I-STREET_ADDRESS",
           "7-0":"O","7-1":"O"
          }

# RE_ID_PHONE = r"""(\(?\+\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(\s*\d{3}\s*\)\s*\d{3}\s*\-\s*\d{4}\s*\w{0,3}(\s*\d{1,8}\s*)?|\b\d{2,}-\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{2,}-\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\d{3}\s*\.\s*\d{3}\s*\.\s*\d{1,5}|\d{3}\s*\-\s*\d{3}\s*\-\s*\d{1,5}|\d{3}\s*x\s*\d{3}\s*x\s*\d{1,5}|\d{1,3}\s{0,2}\d{1,}\s{0,2}\d{1,}|\b\d{1,}\s*\d{1,}\s*\d{1,}|\b\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{1,}-\d{1,}|[\w\.\:\-\_\|]*\d{6,})"""
# Compile the regex pattern

RE_ID_PHONE = r"""(\(?\+\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\s{0,2}\d{0,5}|\(?\+\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(\s*\d{3}\s*\)\s*\d{3}\s*\-\s*\d{4}\s*\w{0,3}(\s*\d{1,8}\s*)?|\b\d{2,}-\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{2,}-\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\d{3}\s*\.\s*\d{3}\s*\.\s*\d{1,5}|\d{3}\s*\-\s*\d{3}\s*\-\s*\d{1,5}|\d{3}\s*x\s*\d{3}\s*x\s*\d{1,5}|\d{1,3}\s{0,2}\d{1,}\s{0,2}\d{1,}|\b\d{1,}\s*\d{1,}\s*\d{1,}|\b\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{1,}-\d{1,}|[\w\.\:\-\_\|]*\d{6,})"""
REGEX_COMPILE = re.compile(RE_ID_PHONE)


## =============================================================================== ##
class FeedbackDataset(Dataset):
    def __init__(self,
                 df,
                 tokenizer,
                 mask_prob=0.0,
                 mask_ratio=0.0,
                 ):
        
        self.tokenizer = tokenizer
        self.df = df

        print(f'Loaded {len(self)} samples.')

        assert 0 <= mask_prob <= 1
        assert 0 <= mask_ratio <= 1
        self.mask_prob = mask_prob
        self.mask_ratio = mask_ratio

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        
        df = self.df.iloc[index]
        text_id = df['document']
        if len(self.tokenizer.encode("\n\n"))==2:
            text = clean_text(df['full_text'].replace("\n\n"," | ").replace("\n"," [BR] "))
            txt_tokens = [clean_text(x.replace("\n\n"," | ").replace("\n"," [BR] ")) for x in df['tokens']]
        else:
            text = clean_text(df['full_text'])
            txt_tokens = [clean_text(x) for x in df['tokens']]
        labels = df['labels']
        
        offset_mapping_init = self.get_offset_mapping(text,txt_tokens)
        re_offset_mapping,spacy_to_re,re_tokens = self.spacy_to_re_off(text,txt_tokens,offset_mapping_init)


        hf_tokens = self.tokenizer(text, return_offsets_mapping=True)
        input_ids = torch.LongTensor(hf_tokens['input_ids'])
        attention_mask = torch.LongTensor(hf_tokens['attention_mask'])
        hf_offset_mapping = np.array(hf_tokens['offset_mapping'])
        # offset_mapping = self.strip_offset_mapping(text, offset_mapping)
        num_tokens = len(input_ids)

        # token slices of words
        woff = np.array(re_offset_mapping)
        toff = np.array(hf_offset_mapping)
        wx1, wx2 = woff.T
        tx1, tx2 = toff.T
        ix1 = np.maximum(wx1[..., None], tx1[None, ...])
        ix2 = np.minimum(wx2[..., None], tx2[None, ...])
        ux1 = np.minimum(wx1[..., None], tx1[None, ...])
        ux2 = np.maximum(wx2[..., None], tx2[None, ...])
        ious = (ix2 - ix1).clip(min=0) / (ux2 - ux1)
#         assert (ious > 0).any(-1).all()

        word_boxes = []
#         err = []
        for i,row in enumerate(ious):
            inds = row.nonzero()[0]
            try:
                word_boxes.append([inds[0], 0, inds[-1] + 1, 1])
            except:
                word_boxes.append([-100, 0, -99, 1])
#                 err.append(i)
                
        word_boxes = torch.FloatTensor(word_boxes)

        # word slices of ground truth spans
        gt_spans = []        
        for i,label in enumerate(labels) :
            gt_spans.append([i,TYPE2LABEL[label.split('-')[-1] if label!="O" else "O"],0 if label.split('-')[0]=="B" else 1])
            
        gt_spans = torch.LongTensor(gt_spans)

        # random mask augmentation
        if np.random.random() < self.mask_prob:
            all_inds = np.arange(1, len(input_ids) - 1)
            n_mask = max(int(len(all_inds) * self.mask_ratio), 1)
            np.random.shuffle(all_inds)
            mask_inds = all_inds[:n_mask]
            input_ids[mask_inds] = self.tokenizer.mask_token_id

        return dict(
                    text_id=text_id,
                    text=text,
                    labels = labels,
                    re_tokens = re_tokens,
                    spacy_to_re = spacy_to_re,
                    tokens = df['tokens'],
                    tokens_clean = txt_tokens,
                    input_ids=input_ids,
                    offset_mapping_init = offset_mapping_init,
                    re_offset_mapping = re_offset_mapping,
                    attention_mask=attention_mask,
                    word_boxes=word_boxes,
                    gt_spans=gt_spans)
    

    def get_offset_mapping(self,full_text, tokens):
        offset_mapping = []

        current_offset = 0
        for token in tokens:
            start, end = self.get_text_start_end(full_text, token, search_from=current_offset)
            offset_mapping.append((start, end))
            current_offset = end

        return offset_mapping
    
    def get_text_start_end(self,txt, s, search_from=0):
        txt = txt[int(search_from):]
        try:
            idx = txt.find(s)
            if idx >= 0:
                st = idx
                ed = st + len(s)
            else:
                raise ValueError('Error')
        except:
            res = [(m.start(0), m.end(0)) for m in re.finditer(s, txt)]
            if len(res):
                st, ed = res[0][0], res[0][1]
            else:
                m = SequenceMatcher(None, s, txt).get_opcodes()
                for tag, i1, i2, j1, j2 in m:
                    if tag == 'replace':
                        s = s[:i1] + txt[j1:j2] + s[i2:]
                    if tag == "delete":
                        s = s[:i1] + s[i2:]

                res = [(m.start(0), m.end(0)) for m in re.finditer(s, txt)]
                if len(res):
                    st, ed = res[0][0], res[0][1]
                else:
                    idx = txt.find(s)
                    if idx >= 0:
                        st = idx
                        ed = st + len(s)
                    else:
                        st, ed = 0, 0
        return st + search_from, ed + search_from
    
    
    
    def find_patterns(self,text,regex):
        matches = [(match.group(0), match.start(), match.end()) for match in regex.finditer(text)]
        offsets = self.strip_offset_mapping(text,[(m[1],m[2]) for m in matches])
        return [m[0].strip() for m in matches],offsets

    def spacy_to_re_off(self,text,tokens,offset_mapping_init):
        

        lab,off_matc = self.find_patterns(text,REGEX_COMPILE)


        if len(lab):

            spacy_to_re = []
            re_oof = []
            tokens_re = []

            # token slices of words
            woff = np.array(offset_mapping_init)
            toff = off_matc
            wx1, wx2 = woff.T
            tx1, tx2 = toff.T
            ix1 = np.maximum(wx1[..., None], tx1[None, ...])
            ix2 = np.minimum(wx2[..., None], tx2[None, ...])
            ux1 = np.minimum(wx1[..., None], tx1[None, ...])
            ux2 = np.maximum(wx2[..., None], tx2[None, ...])
            ious = (ix2 - ix1).clip(min=0) / (ux2 - ux1+1e-12)

            for i,(spcy_of_set,tok,row) in enumerate(zip(offset_mapping_init,tokens,ious)):
                inds = row.nonzero()[0]
                if len(inds):
                    spacy_to_re.append(inds[0])
                    re_oof.append((off_matc[inds[0]].tolist()[0],off_matc[inds[0]].tolist()[1]))
                    tokens_re.append(lab[inds[0]] if len(lab[inds[0]])>len(tok) else tok)
                else:
                    spacy_to_re.append(len(lab)+i)
                    re_oof.append(spcy_of_set)
                    tokens_re.append(tok)

            re_oof = [x for i, x in enumerate(re_oof) if re_oof.index(x) == i]
        else:
            spacy_to_re = np.arange(len(offset_mapping_init)).tolist()
            re_oof = offset_mapping_init
            tokens_re = tokens

        return re_oof,spacy_to_re,tokens_re
    
    def strip_offset_mapping(self, text, offset_mapping):
        ret = []
        for start, end in offset_mapping:
            match = list(re.finditer('\S+', text[start:end]))
            if len(match) == 0:
                ret.append((start, end))
            else:
                span_start, span_end = match[0].span()
                ret.append((start + span_start, start + span_end))
        return np.array(ret)

    def get_word_offsets(self, text):
        matches = re.finditer("\S+", text)
        spans = []
        words = []
        for match in matches:
            span = match.span()
            word = match.group()
            spans.append(span)
            words.append(word)
        assert tuple(words) == tuple(text.split())
        return np.array(spans)
    
## =============================================================================== ##
class CustomCollator(object):
    def __init__(self, tokenizer, model):
        self.pad_token_id = tokenizer.pad_token_id
        if hasattr(model.config, 'attention_window'):
            # For longformer
            # https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/longformer/modeling_longformer.py#L1548
            self.attention_window = (model.config.attention_window
                                     if isinstance(
                                         model.config.attention_window, int)
                                     else max(model.config.attention_window))
        else:
            self.attention_window = None

    def __call__(self, samples):
        batch_size = len(samples)
        assert batch_size == 1, f'Only batch_size=1 supported, got batch_size={batch_size}.'

        sample = samples[0]

        max_seq_length = len(sample['input_ids'])
        if self.attention_window is not None:
            attention_window = self.attention_window
            padded_length = (attention_window -
                             max_seq_length % attention_window
                             ) % attention_window + max_seq_length
        else:
            padded_length = max_seq_length

        input_shape = (1, padded_length)
        input_ids = torch.full(input_shape,
                               self.pad_token_id,
                               dtype=torch.long)
        attention_mask = torch.zeros(input_shape, dtype=torch.long)

        seq_length = len(sample['input_ids'])
        input_ids[0, :seq_length] = sample['input_ids']
        attention_mask[0, :seq_length] = sample['attention_mask']

        text_id = sample['text_id']
        tokens = sample['tokens']
        tokens_clean = sample['tokens_clean']
        re_tokens = sample['re_tokens']
        spacy_to_re = sample['spacy_to_re']
        # text = sample['text']
        word_boxes = sample['word_boxes']
        gt_spans = sample['gt_spans']

        return dict(text_id=text_id,
                    tokens_clean=tokens_clean,
                    tokens = tokens,
                    re_tokens = re_tokens,
                    spacy_to_re = spacy_to_re,
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    word_boxes=word_boxes,
                    gt_spans=gt_spans)

In [30]:
model_name = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [31]:
# df[df.full_text.str.contains('\?')]

In [32]:
ds = FeedbackDataset(df.copy(),tokenizer)

Loaded 6807 samples.


In [33]:
idx = random.choice(ds.df[ds.df.ID_NUM>0].index)
# idx = 219
# doc = 204
# idx = ds.df[ds.df.document==9854].index[0]
# Example usage:
# idx = 80
full_text_ds = ds[idx]['text']
tokens_ds = ds[idx]['tokens_clean']
labels_ds = ds[idx]['labels']
offset_mapping_init = ds[idx]['offset_mapping_init']
re_offset_mapping = ds[idx]['re_offset_mapping']
spacy_to_re = ds[idx]['spacy_to_re']
re_tokens = ds[idx]['re_tokens']
# offset_mapping_ds = get_offset_mapping(full_text_ds, tokens_ds)
# offset_mapping_ds1 = tokenize_with_spacy(full_text_ds)['offset_mapping']
# offset_mapping_ds = strip_offset_mapping(full_text_ds,offset_mapping_ds)
# idx,ds.df.iloc[idx]['nb_labels']
len(tokens_ds),len(offset_mapping_init),len(re_tokens),len(re_offset_mapping) #len(offset_mapping_ds1),len(tokens_ds)

(915, 915, 915, 911)

In [34]:
# ds.df.iloc[idx]['document']

In [35]:
full_text = df.iloc[idx]['full_text']
tokens = df.iloc[idx]['tokens']
labels = df.iloc[idx]['labels']
offset_mapping = get_offset_mapping(full_text, tokens)
# idx,df.iloc[idx]['nb_labels']

In [36]:
offset_mapping = get_offset_mapping(full_text, tokens)
for token, offset,l in zip(tokens, offset_mapping,labels):
    if l!="O":
        print(f"{token}: {offset} : {l}")

Gabriel: (112, 119) : B-NAME_STUDENT
Bravo: (120, 125) : I-NAME_STUDENT
860632713425: (156, 168) : B-ID_NUM
Hlengiwe: (191, 199) : B-NAME_STUDENT
Swetha: (200, 206) : I-NAME_STUDENT
530670102508: (230, 242) : B-ID_NUM
530670102508: (274, 286) : B-ID_NUM
Tino: (290, 294) : B-NAME_STUDENT
Swetha: (295, 301) : I-NAME_STUDENT
875673967537: (303, 315) : B-ID_NUM
860632713425: (332, 344) : B-ID_NUM
Tino: (348, 352) : B-NAME_STUDENT
Lopez: (353, 358) : I-NAME_STUDENT
557349702179: (361, 373) : B-ID_NUM
Swetha: (377, 383) : B-NAME_STUDENT
Swetha: (384, 390) : I-NAME_STUDENT
784372734211: (393, 405) : B-ID_NUM
Alex: (409, 413) : B-NAME_STUDENT
Swetha: (414, 420) : I-NAME_STUDENT
054176622314: (422, 434) : B-ID_NUM
Alex: (438, 442) : B-NAME_STUDENT
Bravo: (443, 448) : I-NAME_STUDENT
674915248960: (469, 481) : B-ID_NUM
Tino: (485, 489) : B-NAME_STUDENT
Lopez: (490, 495) : I-NAME_STUDENT
Hlengiwe: (640, 648) : B-NAME_STUDENT


In [37]:
ds[idx]['word_boxes'].shape

torch.Size([911, 4])

In [38]:
# text = "Reflection – Visualization   Deiby"
print((full_text_ds))

Date:14-09-2021 | NEWS PAPER | Project: News Paper Designing Interviewer Name : Gabriel Bravo Interviewer PinNo : 860632713425 Interviewee Name : Hlengiwe Swetha Interviewee PinNo : 530670102508 Team: “CSE AIML group 5 ” 530670102508 - Tino Swetha 875673967537 - SSRK Kasyap 860632713425 - Tino Lopez 557349702179 - Swetha Swetha 784372734211 - Alex Swetha 054176622314 - Alex Bravo ( Representative ) 674915248960 - Tino Lopez Experiment - 1 | Introduction: | ● Introduce yourself to the bench? ● Have you ever faced hardship in your | life? | ➔ I am Hlengiwe from Jogipet in the Sanga | Reddy district. I have completed my bachelor's degree in design thinking by holding 79 percent. I have done my bachelor's degree in IIT MADRAS. I have skills in HTML, soft skills. I have worked at Grassroots BPO. I like to improve my skills and knowledge at your company. I will assure you that I will show my skills and knowledge in my work 100 percent. | ➔ I grew up confronting a lot of challenges | since I 

In [39]:
x = pd.DataFrame({'tokens':tokens_ds,'tokens_re':re_tokens,
              'labels':labels_ds,'spacy_to_re':spacy_to_re})

x[x.spacy_to_re<=15]

Unnamed: 0,tokens,tokens_re,labels,spacy_to_re
0,Date:14,14-09-2021,O,0
1,-,14-09-2021,O,0
2,09,14-09-2021,O,0
3,-,14-09-2021,O,0
4,2021,14-09-2021,O,0
26,860632713425,860632713425,B-ID_NUM,1
39,530670102508,530670102508,B-ID_NUM,2
50,530670102508,530670102508,B-ID_NUM,3
56,875673967537,875673967537,B-ID_NUM,4
62,860632713425,860632713425,B-ID_NUM,5


In [40]:
x[x.tokens!=x.tokens_re]

Unnamed: 0,tokens,tokens_re,labels,spacy_to_re
0,Date:14,14-09-2021,O,0
1,-,14-09-2021,O,0
2,09,14-09-2021,O,0
3,-,14-09-2021,O,0
4,2021,14-09-2021,O,0


In [41]:
offset_mapping = get_offset_mapping(full_text, tokens)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [42]:
visualize(full_text,offset_mapping_,labels_)

In [43]:
# offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping_init,labels_ds) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]

In [44]:
# for (x,y) in zip(offset,labels_ds):
#     print(y)

In [45]:
visualize(full_text_ds,offset_mapping_,labels_)

In [916]:
#url and usernam
fake.user_name(),generate_fake_social_media_urls()

('gmorales', 'https://instagram.com/annanewton')

In [121]:
# id num
fake.passport_number(),fake.bban(), fake.iban(), #fake.license_plate(), fake.ssn()

('U57484780', 'TEKU96707208815715', 'GB87YITO06564710518565')

In [918]:
# names
fake.name(),fake.first_name(), fake.last_name()

('Ashley Tran', 'Courtney', 'Reeves')

In [929]:
# mail
fake.ascii_free_email()

'ann65@yahoo.com'

In [926]:
# phone
s = fake.phone_number()
print(s)

886.547.1178x7725


In [871]:
RE_ID_PHONE = r"""(\(?\+\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\s{0,2}\d{0,5}|\(?\+\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(\s*\d{3}\s*\)\s*\d{3}\s*\-\s*\d{4}\s*\w{0,3}(\s*\d{1,8}\s*)?|\b\d{2,}-\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{2,}-\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\d{3}\s*\.\s*\d{3}\s*\.\s*\d{1,5}|\d{3}\s*\-\s*\d{3}\s*\-\s*\d{1,5}|\d{3}\s*x\s*\d{3}\s*x\s*\d{1,5}|\d{1,3}\s{0,2}\d{1,}\s{0,2}\d{1,}|\b\d{1,}\s*\d{1,}\s*\d{1,}|\b\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{1,}-\d{1,}|[\w\.\:\-\_\|]*\d{6,})"""

In [872]:
re.findall(RE_ID_PHONE,s)

[('682-678-9880', '')]

In [932]:
# Adress
z = [fake.address() for i in range(3) ]
z

['1972 Goodman Hills\nWest Jeffrey, FL 13169',
 '9981 Ashley Points Apt. 934\nNorth Matthewland, CA 93374',
 '6664 Bryant Mountains\nStephensmouth, HI 78633']

In [None]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')

In [154]:
# ======================================================================================== #
import numpy as np
Faker.seed(0)
def successive_positions(input_list):
    result = []
    current_group = []
    prev_element = None

    for i, element in enumerate(input_list):
        if element == prev_element:
            current_group.append(i)
        else:
            if current_group:
                result.append(current_group)
            current_group = [i]
        prev_element = element

    if current_group:
        result.append(current_group)

    return result

def find_successive_numbers(input_array):
    result = []
    current_sublist = []

    for num in input_array:
        if not current_sublist or num == current_sublist[-1] + 1:
            current_sublist.append(num)
        else:
            result.append(current_sublist)
            current_sublist = [num]

    if current_sublist:
        result.append(current_sublist)

    return result
# ======================================================================================== #

def generate_fake_social_media_urls(num_urls=1):
    social_media_platforms = {
        'LinkedIn': 'linkedin.com/in/',
        'YouTube': 'youtube.com/c/',
        'Instagram': 'instagram.com/',
        'GitHub': 'github.com/',
        'Facebook': 'facebook.com/',
        'Twitter': 'twitter.com/'
    }

    fake_social_media_urls = []

    for _ in range(num_urls):
        fake_user_name = fake.user_name()
        platform, domain = random.choice(list(social_media_platforms.items()))
        fake_url = f'https://{domain}{fake_user_name}'
        fake_social_media_urls.append(fake_url)

    return fake_social_media_urls[0]
    
def generate_random_number(length):
    return ''.join(random.choice('0123456789') for _ in range(length))

def generate_similar_number():
    length = random.randint(14, 22)
    return generate_random_number(length)

def generate_random_data_with_probabilities():

    name = random.choices([fake.name(),fake.first_name(), fake.last_name()], weights = [0.7,0.15,0.15], k = 1)[0]  #generic.person.full_name()
    phone_number =  fake.phone_number()
    username = fake.user_name()
    email = fake.ascii_free_email()
    address = fake.address()
    id_num = random.choices([fake.passport_number(),fake.bban(),
                             fake.iban(),generate_random_number(12)],k=1,weights = [0.2,0.15,0.15,0.5])[0]
    url_pers = generate_fake_social_media_urls()
#     surname = generic.person.surname()
    # a nezttoter
    ret = dict(
              NAME_STUDENT=name,
              EMAIL=email,
              USERNAME=username,
              ID_NUM=id_num,
              URL_PERSONAL=url_pers,
              PHONE_NUM=phone_number,
              STREET_ADDRESS=address
              )
    
    for k,v in ret.items():
        ret[k] = clean_text(v.replace("\n\n"," | ").replace("\n"," [BR] "))
    return ret
# ======================================================================================== #

def generate_ent(text,labels,offset_mapping):
    
    idx_lab = np.argwhere(np.array(labels)!="O").reshape(-1)
#     print(idx_lab)
#     pos = sorted(find_successive_numbers(idx_lab),reverse=True,key=len)
    pos = find_successive_numbers(idx_lab)
#     print(pos)
    lab = np.array(labels)
#     toks = np.array(tokens)

    ent = {}
    ent_order = []
    ent_offset_in_order = []
    for i,p in enumerate(pos):
        l = [x.split('-')[-1] for x in lab[p]]
        
        if len(np.unique(l))>1:
            px = successive_positions(l)
            for pp in px:
                full_name = text[offset_mapping[p[pp[0]]][0]:offset_mapping[p[pp[-1]]][1]].strip()
                ent[full_name] = l[pp[-1]]
                ent_order.append(full_name)
                ent_offset_in_order.append((offset_mapping[p[pp[0]]][0],
                                            offset_mapping[p[pp[-1]]][1]))
#                 print(l)
#         print(p)
        else:
            full_name = text[offset_mapping[p[0]][0]:offset_mapping[p[-1]][1]].strip()
            ent[full_name] = l[-1]
            ent_order.append(full_name)
            ent_offset_in_order.append((offset_mapping[p[0]][0],offset_mapping[p[-1]][1]))
#         if 'NAME_STUDENT' in l:
# #             full_name = " ".join(t)
#             ent[clean_text(full_name)] = l[-1]

#         else:
#             full_name = " ".join(t)
#             ent[clean_text(full_name)] = l[-1]
    return ent,ent_order,ent_offset_in_order
# ======================================================================================== #
from spacy.lang.en import English
en_tokenizer = English().tokenizer

def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    tokenized_text = tokenizer(text)
    tokens = [token.text for token in tokenized_text]
    offset_mapping = [(token.idx,token.idx+len(token)) for token in tokenized_text]
    return {'tokens': tokens, 'offset_mapping': offset_mapping}
# ======================================================================================== #

def create_mapper_n_clean(full_text,labels,offset_mapping,attribut=["NAME_STUDENT"]):
    ent,ent_order,ent_offset_in_order = generate_ent(full_text,labels,offset_mapping)
    print(ent)
    mapper = {}
    label_mapper = {}
    new_tokens = []
    txt_added = 0
    for num,k in enumerate(ent_order):
        v = ent[k]
#         k,v = clean_text(k),clean_text(v)
        if v in attribut:      
            dc_ent = generate_random_data_with_probabilities()
            mapper[k] = dc_ent[v]
            label_mapper[dc_ent[v]] = v
            
            old_len = len(full_text)
            if k in mapper.keys():
                full_text = full_text[:ent_offset_in_order[num][0]+txt_added] +" " +mapper[k] + " "+full_text[ent_offset_in_order[num][-1]+txt_added:]
                txt_added+= len(full_text)-old_len
#                 full_text = re.sub(k,mapper[k],full_text)
                new_tokens.append(mapper[k])
            else:
                full_text = full_text[:ent_offset_in_order[num][0]+txt_added] + " "+dc_ent[v] +" "+ full_text[ent_offset_in_order[num][-1]+txt_added:]
#                 full_text = re.sub(k,dc_ent[v],full_text)
                new_tokens.append(dc_ent[v])
                txt_added+= len(full_text)-old_len
        else:
            label_mapper[k] = v
            new_tokens.append(k)
            
    
    print(mapper)
    print(label_mapper)
#     print(full_text)
#     print(ent_order)
    full_text = clean_text(full_text)
    
    tokenized_text = tokenize_with_spacy(full_text, tokenizer=en_tokenizer)
    tokens = tokenized_text['tokens']
    tg = get_offset_mapping(full_text, new_tokens)
    
#     print(len(tg))
#     offs = {}
#     for s in list(label_mapper.keys()):
#         res = [(m.start(0), m.end(0)) for m in re.finditer(s,full_text)]
#         print(s)
#         if len(res):
#             offs[s] = res
    
#     print(full_text)
    woff = np.array(tokenized_text['offset_mapping'])
    labels = np.array(["O"]*len(woff),dtype='<U50')
#     for k, ofs in offs.items():
    toff = np.array(tg)
    wx1, wx2 = woff.T
    tx1, tx2 = toff.T
    ix1 = np.maximum(wx1[..., None], tx1[None, ...])
    ix2 = np.minimum(wx2[..., None], tx2[None, ...])
    ux1 = np.minimum(wx1[..., None], tx1[None, ...])
    ux2 = np.maximum(wx2[..., None], tx2[None, ...])
    ious = (ix2 - ix1).clip(min=0) / (ux2 - ux1)


    for i,row in enumerate(ious):
        inds = row.nonzero()[0]
        if len(inds):
#             print(inds)
            labels[i] = label_mapper[new_tokens[inds[0]]]

    labels = labels.tolist()
    
    
    
#     for tok, off in zip(tokens, tokenized_text['offset_mapping']):
#         found_label = False
#         for k, ofs in offs.items():
#             for o in ofs:
#                 if o[0] <= off[0] <= o[1] or o[0] <= off[1] <=o[1]:
#                     labels.append(label_mapper[k])
#                     found_label = True
#                     break
#             if found_label:
#                 break
#         else:
#             labels.append('O')
        
    return full_text,labels,tokens

In [155]:
# cachez l'id num dans URL ou mixer chiffre et lettre

In [156]:
idx = random.choice(ds.df[ds.df.ID_NUM>0].index)
idx = 428
full_text = df.iloc[idx]['full_text']
tokens = df.iloc[idx]['tokens']
labels = df.iloc[idx]['labels']
offset_mapping = get_offset_mapping(full_text, tokens)


full_text_ds = ds[idx]['text']
tokens_ds = ds[idx]['tokens_clean']
labels_ds = ds[idx]['labels']
offset_mapping_init = ds[idx]['offset_mapping_init']
re_offset_mapping = ds[idx]['re_offset_mapping']
spacy_to_re = ds[idx]['spacy_to_re']
re_tokens = ds[idx]['re_tokens']


# idx,df.iloc[idx]['nb_labels']

In [157]:
# offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping_init,labels_ds) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]

visualize(full_text_ds,offset_mapping_,labels_)

In [158]:
full_text_ds

'NAME: Zhe Basmah ROLL DM:705244534902 | EXPERIMENT-1 | Project: | Interviewer Name: Zhe | Interviewer Roll No.: 132305666219 | Team: 10 | Interviewee Name: Zhe Basmah | Interviewee Roll No.: 789323889085 | Date: 28th September 2021 | Date:5-10-2021 | Introduction | 1. What is your profession 2. Tell me what you experienced recently 3. Tell me what annoyed you recently | 1. I am a student of Lycée of Dignity University 2. I Observed that many plastic bottles were | thrown at the beach side and every where In the city which leads to environmental pollution | 3. The usage and the chemicals used in plastic | are dangerous. The cause environmental pollution. Many animals were died by eating them | Get to know the entire story: | 1. What helps you to save money? 2. What is biggest challenge? | 3. Do you think you can earn profits | through this service by providing products with low cost? | 4. Is there any companies or apps which | have done this type of service before? | 1. Making Of Yarn 

In [159]:
full_text_ds,labels,tokens_ds = create_mapper_n_clean(full_text_ds,labels_ds,offset_mapping_init,attribut=['NAME_STUDENT','EMAIL','USERNAME','ID_NUM',
                                                                                                        'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS'])

{'Zhe Basmah': 'NAME_STUDENT', 'DM:705244534902': 'ID_NUM', 'Zhe': 'NAME_STUDENT', '132305666219': 'ID_NUM', '789323889085': 'ID_NUM'}
{'Zhe Basmah': 'Gwendolyn Mendoza', 'DM:705244534902': '402119838146', 'Zhe': 'Rachel Moore', '132305666219': '773116600706', '789323889085': '472112081194'}
{'Jonathan': 'NAME_STUDENT', '402119838146': 'ID_NUM', 'Rachel Moore': 'NAME_STUDENT', '773116600706': 'ID_NUM', 'Gwendolyn Mendoza': 'NAME_STUDENT', '472112081194': 'ID_NUM'}


In [160]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

visualize(full_text_ds,offset_mapping_,labels_)

In [2613]:
def custom_distribution(n):
    distribution = [0] * n
    middle_index = n // 2
    for i in range(middle_index):
        distribution[i] = (middle_index - i) / middle_index
        distribution[n - 1 - i] = (middle_index - i) / middle_index
    return distribution

In [2589]:
def add_text(full_text,tokens,labels,offset_mapping,new_text,new_tokens,new_labels,new_offset_mapping):
    
    s = full_text.split('|')
    prob_dist = custom_distribution(len(s))
    id_ = random.choices(np.arange(len(s)),k=1,weights = prob_dist)[0]
    
    idx = [len(s[i]) for i in range(id_+1)]
    idx = sum(idx)
    full_text = full_text[:idx+id_+1] +" "+ new_text + full_text[idx+id_+1:]

    
    t_idx = [i for i,x in enumerate(offset_mapping) if x[1]==idx+id_+1][-1]+1

    
    tokens = tokens[:t_idx]+new_tokens+tokens[t_idx:]
    labels = labels[:t_idx]+new_labels+labels[t_idx:]
    
    
    
    v = offset_mapping[:t_idx][-1][1]
    new_offset_mappings = [(x[0]+v+1,x[1]+v+1) for x in new_offset_mapping]
    v1 = new_offset_mappings[-1][1]
    vx = v1-v
    old_offset_mapping =  [(x[0]+vx,x[1]+vx) for x in offset_mapping[t_idx:]]
    offset_mapping = offset_mapping[:t_idx]+new_offset_mappings+old_offset_mapping


#     v = offset_mapping[:t_idx][-1][1]+2
# #     print(v)
#     new_offset_mappings = [(x[0]+v+1,x[1]+v+1) for x in new_offset_mapping]
    
#     v1 = new_offset_mappings[-1][1]
# #     print(v1)
# #     print(ifc)
#     old_offset_mapping =  [(x[0]+v1,x[1]+v1) for x in offset_mapping[t_idx:]]
    
#     offset_mapping = offset_mapping[:t_idx]+new_offset_mapping+old_offset_mapping
    
    return full_text,tokens,labels,offset_mapping

In [2600]:
idx = random.choice(ds.df[ds.df.ID_NUM>0].index)
# full_text = df.iloc[idx]['full_text']
# tokens = df.iloc[idx]['tokens']
# labels = df.iloc[idx]['labels']
# offset_mapping = get_offset_mapping(full_text, tokens)


full_text_ds = ds[idx]['text']
tokens_ds = ds[idx]['tokens_clean']
labels_ds = ds[idx]['labels']
offset_mapping_init = ds[idx]['offset_mapping_init']
re_offset_mapping = ds[idx]['re_offset_mapping']
spacy_to_re = ds[idx]['spacy_to_re']
re_tokens = ds[idx]['re_tokens']


# idx,df.iloc[idx]['nb_labels']

In [2575]:
# df_ai = pd.read_csv(data_path/'pii-masking-200k.csv')
# df_ai.shape

In [2576]:
# df_ai["offset_mapping"] = df_ai["offset_mapping"].transform(lambda x:eval(x))
# df_ai["labels"] = df_ai["labels"].transform(lambda x:eval(x))
# df_ai["tokens"] = df_ai["tokens"].transform(lambda x:eval(x))

In [2577]:
idx = random.choice(df_ai.index)
full_text = df_ai.iloc[idx]['full_text']
tokens = df_ai.iloc[idx]['tokens']
labels = df_ai.iloc[idx]['labels']
offset_mapping = df_ai.iloc[idx]['offset_mapping']

In [2578]:
full_text 

'Update aus der Intensivstation bezüglich Patient[in] Icie58: Der Zustand des Patienten bleibt kritisch. Weitere Informationen werden an Keegan.Kunde@yahoo.com gesendet. Unser primärer Ansprechpartner bleibt Future Integration Consultant, Ernest Corkery. Bitte kontaktieren Sie uns für ein detailliertes Update.'

In [2579]:
# offset_mapping = get_offset_mapping(full_text, tokens)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [2580]:
visualize(full_text,offset_mapping_,labels_)

In [2601]:
# offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping_init,labels_ds) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]

visualize(full_text_ds,offset_mapping_,labels_)

In [2582]:
full_texts,tokenss,labelss,offset_mappings  = add_text(full_text_ds,tokens_ds,labels_ds,offset_mapping_init,
                                                   full_text,tokens,labels,offset_mapping)

In [2583]:
len(full_texts),len(tokenss),len(labelss),len(offset_mappings)

(1979, 476, 476, 476)

In [2584]:
offset_mapping_ = [x for (x,y) in zip(offset_mappings,labelss) if y!="O"]
labels_ = [x for x in labelss if x!="O"]
visualize(full_texts,offset_mapping_,labels_)

In [1945]:
offset_mapping = get_offset_mapping(full_text, tokens)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [1946]:
visualize(full_text,offset_mapping_,labels_)

In [1947]:
# k,en,o = generate_ent(full_text_ds,labels,offset_mapping_init)
# k,en,o

In [1948]:
print(full_text_ds)

Project: SPECTACLES | Interviewer Name: Vijay Shende Interviewee Name: Vijay Shende | Interviewer Pin No: 143860010348 Interviewee Pin Ei:556799175487 | Team: 2 Date:09.07.2013 | Introduction: (Define the questions) | • Do you want an advanced way to look at | stuff around you? | • Of course. I would love to look at the world in a | different way. | Get to know the entire story: (Which questions help to understand the hopes, fears and motivations of the interviewers) | • If you wear a modern spectacle, do you | think it should have an in-built camera? | • Would you want a voice assistant in your | specs? | • In clicking pictures when your hands are busy, I | would say yes. But if there is a feature like that then there might be a huge number of people who might miss use it. | • Maybe, it might not always be useful but in | most of the cases it will. | Conclusion: (Explain what happens with the answers and thank the Interviewee for the discussion) | • These responses are taken for more 

In [2615]:
print(full_text_ds)

Environment | • Evening time | • Wind is blowing | • Birds are charming | • Sun is looking red | • Sky is blue and red | • Dogs are running here and there | Interaction | • My group members to band | • Students to band | • Teacher to band | • Worker to band | • Some students are studying late night in the room so more students facing light reflect . | Objects | • Bluetooth connection | • An set of microphones | • Soft cotton Band | • With eye band | • Smartphone | • Flexible band | Activities • Some student is working on laptop. | • Student are going here and there. | • Some students are studying. | • Lift is going up and down with the students. | • Some students are riding on bikes. | • Some people are walking | Users • Students | • Workers | • Personal uses | • Old People | • Anybody can use | Kanali Foundation Summary Group id : GB04HOKU62136491888841 Date : 2nd August 2020 Version : Domain Name : VERSATILE BAND |


In [2604]:
# labels

In [2616]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping_init,labels_ds) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]

In [2617]:
visualize(full_text_ds,offset_mapping_,labels_)

In [None]:
(full_text_ds,tokens_ds,labels_ds,offset_mapping_init,
                                                   full_text,tokens,labels,offset_mapping)

In [2472]:
full_text

'Mr. Moen, Ihre Konsultation wurde erfolgreich für den 21/02/1984 gebucht. Die Gebühr wird Ihrem Konto Checking Account belastet.'

In [2473]:
s = full_text_ds.split('|')
prob_dist = custom_distribution(len(s))
id_ = random.choices(np.arange(len(s)),k=1,weights = prob_dist)[0]
idx = [len(s[i]) for i in range(id_+1)]
idx = sum(idx)
new_text = full_text_ds[:idx+id_+1] +" "+ full_text + full_text_ds[idx+id_+1:]
idx

([39, 29, 27, 54, 18, 9, 15, 34, 37, 51], 9)

313

In [2475]:
s[id_]

' • what are the features in Life cycle assesment-- '

In [2476]:
full_text_ds[:idx+id_+1]

'Design thinking and product innovation | (Analysis-question-builder) | Madison Tate 034626995785 | Life cycle assessment manufacture marketing Handling | update operation | workout | app structure | logo free result Design products | Define the questions: Data sources: | • what are the features in Life cycle assesment-- |'

In [2477]:
idx,id_

(313, 9)

In [2478]:
ifc = len(full_text_ds[:idx+1] +" "+ full_text)
ifc

443

In [2480]:
print(new_text)

Design thinking and product innovation | (Analysis-question-builder) | Madison Tate 034626995785 | Life cycle assessment manufacture marketing Handling | update operation | workout | app structure | logo free result Design products | Define the questions: Data sources: | • what are the features in Life cycle assesment-- | Mr. Moen, Ihre Konsultation wurde erfolgreich für den 21/02/1984 gebucht. Die Gebühr wird Ihrem Konto Checking Account belastet. the app? App should consists the | • How much data it may most comfortable features | consume for each updation? for the users and should be user friendly and it | should be prepared with | advanced software | • How can people attract the app? Design-- • Why should people choose this App logo should be bit clear | app? about the intension of the app and discription of the | app should provide all the | answers for the doubts of users | • Can we operate the app without Result: | prior experience? Most of the people are lazy | • Why this app d

In [2481]:
t_idx = [i for i,x in enumerate(offset_mapping_init) if x[1]==idx+id_+1][-1]+1
t_idx

67

In [2482]:
tokens_ds[:t_idx]

['Design',
 'thinking',
 'and',
 'product',
 'innovation',
 '|',
 '(',
 'Analysis',
 '-',
 'question',
 '-',
 'builder',
 ')',
 '|',
 'Madison',
 'Tate',
 '034626995785',
 '|',
 'Life',
 'cycle',
 'assessment',
 '',
 'manufacture',
 '',
 'marketing',
 '',
 'Handling',
 '|',
 'update',
 '',
 'operation',
 '|',
 'workout',
 '|',
 'app',
 'structure',
 '|',
 'logo',
 '',
 'free',
 '',
 'result',
 '',
 'Design',
 '',
 'products',
 '|',
 'Define',
 'the',
 'questions',
 ':',
 '',
 'Data',
 'sources',
 ':',
 '|',
 '•',
 'what',
 'are',
 'the',
 'features',
 'in',
 '',
 'Life',
 'cycle',
 'assesment--',
 '|']

In [2483]:
tokenss = tokens_ds[:t_idx]+tokens+tokens_ds[t_idx:]
labelss = labels_ds[:t_idx]+labels+labels_ds[t_idx:]

In [2484]:
v = offset_mapping_init[:t_idx][-1][1]
new_offset_mappings = [(x[0]+v+1,x[1]+v+1) for x in offset_mapping]
v1 = new_offset_mappings[-1][1]
vx = v1-v
old_offset_mapping =  [(x[0]+vx,x[1]+vx) for x in offset_mapping_init[t_idx:]]
offset_mappingvf = offset_mapping_init[:t_idx]+new_offset_mappings+old_offset_mapping

323

In [2495]:
# offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mappingvf,labelss) if y!="O"]
labels_ = [x for x in labelss if x!="O"]

In [2496]:
visualize(new_text,offset_mapping_,labels_)