In [1]:
cd ../src

/workspace/Script/NLP/PII/src


In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [3]:
from data.data_utils import get_offset_mapping
from data.dataset import FeedbackDataset

In [4]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModel, AutoConfig

%env TOKENIZERS_PARALLELISM = true

env: TOKENIZERS_PARALLELISM=true


# Params

In [5]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json',
 'dubai-ar.zip',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'openaddr-collected-global.zip',
 'lecture2.pptx',
 'openaddr-collected-us_west-sa.zip',
 'test.json',
 'moredata_dataset_fixed.csv',
 'sample_submission.csv']

In [6]:
sample_df = pd.read_csv(data_path/'sample_submission.csv')
sample_df.shape

(26, 4)

In [7]:
sample_df.head(5)

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,482,B-NAME_STUDENT
3,3,7,483,I-NAME_STUDENT
4,4,7,741,B-NAME_STUDENT


In [8]:
sample_df.label.unique()

array(['B-NAME_STUDENT', 'I-NAME_STUDENT'], dtype=object)

In [9]:
df = pd.read_json(data_path/'train.json')
df.shape

(6807, 5)

In [10]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."


In [11]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
len(LABEL2TYPE)

8

In [118]:
for name in LABEL2TYPE[:-1]:
    df[name] = ((df['labels'].transform(lambda x:len([i for i in x if i.split('-')[-1]==name ]))))*1

In [13]:
df['nb_labels'] = df['labels'].transform(lambda x:len([i for i in x if i!="O" ]))

In [14]:
df['nb_labels'].value_counts() 

0     5862
2      599
4      108
1       86
3       52
6       46
8       14
5       10
12       6
10       5
11       3
9        3
15       2
14       2
21       2
7        1
23       1
18       1
17       1
26       1
34       1
22       1
Name: nb_labels, dtype: int64

In [90]:
df[list(LABEL2TYPE)[:-1]+['nb_labels']].sum()

NAME_STUDENT       891
EMAIL               24
USERNAME             5
ID_NUM              33
PHONE_NUM            4
URL_PERSONAL        72
STREET_ADDRESS       2
nb_labels         2739
dtype: int64

In [16]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [17]:
seeds = [42]
folds_names = []
for K in [5]:  
    for seed in seeds:
        mskf = MultilabelStratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
        name = f"fold_msk_{K}_seed_{seed}"
        df[name] = -1
        for fold, (trn_, val_) in enumerate(mskf.split(df,df[list(LABEL2TYPE)[:-1]])):
            df.loc[val_, name] = fold

In [91]:
df.groupby(name)[list(LABEL2TYPE)[:-1]].sum()

Unnamed: 0_level_0,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS
fold_msk_5_seed_42,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,178,5,1,7,1,14,0
1,178,5,1,7,1,14,0
2,179,5,1,6,1,15,1
3,178,4,1,6,0,15,1
4,178,5,1,7,1,14,0


In [94]:
model_name = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [95]:
ds = FeedbackDataset(df.copy(),tokenizer)

Loaded 6807 samples.


In [803]:
idx = random.choice(ds.df[ds.df.ID_NUM>8].index)
# doc = 204
# idx = ds.df[ds.df.document==doc].index[0]
# Example usage:
# idx = 80
full_text_ds = ds.df.iloc[idx]['full_text']
tokens_ds = ds.df.iloc[idx]['tokens']
labels_ds = ds.df.iloc[idx]['labels']
idx,ds.df.iloc[idx]['nb_labels']

(29, 26)

In [804]:
len(labels_ds),len(tokens_ds)

(915, 915)

In [805]:
tokens_ds[:15]

['Date:14',
 '-',
 '09',
 '-',
 '2021',
 '|',
 'NEWS',
 'PAPER',
 '|',
 'Project',
 ':',
 'News',
 'Paper',
 'Designing',
 '']

In [806]:
ds[idx]

{'text_id': 609,
 'tokens': ['Date:14',
  '-',
  '09',
  '-',
  '2021',
  '|',
  'NEWS',
  'PAPER',
  '|',
  'Project',
  ':',
  'News',
  'Paper',
  'Designing',
  '',
  'Interviewer',
  'Name',
  ':',
  '',
  'Gabriel',
  'Bravo',
  '',
  'Interviewer',
  'PinNo',
  ':',
  '',
  '860632713425',
  '',
  'Interviewee',
  'Name',
  ':',
  '',
  'Hlengiwe',
  'Swetha',
  '',
  'Interviewee',
  'PinNo',
  ':',
  '',
  '530670102508',
  '',
  'Team',
  ':',
  '“',
  'CSE',
  'AIML',
  'group',
  '5',
  '”',
  '',
  '530670102508',
  '-',
  '',
  'Tino',
  'Swetha',
  '',
  '875673967537',
  '-',
  '',
  'SSRK',
  'Kasyap',
  '',
  '860632713425',
  '-',
  '',
  'Tino',
  'Lopez',
  '',
  '557349702179',
  '-',
  '',
  'Swetha',
  'Swetha',
  '',
  '784372734211',
  '-',
  '',
  'Alex',
  'Swetha',
  '',
  '054176622314',
  '-',
  '',
  'Alex',
  'Bravo',
  '(',
  'Representative',
  ')',
  '',
  '674915248960',
  '-',
  '',
  'Tino',
  'Lopez',
  '',
  'Experiment',
  '-',
  '1',
  '|',
  

In [807]:
ds[idx]['word_boxes'][:15,:]

tensor([[   1.,    0.,    4.,    1.],
        [   4.,    0.,    5.,    1.],
        [   5.,    0.,    6.,    1.],
        [   6.,    0.,    7.,    1.],
        [   7.,    0.,    8.,    1.],
        [   8.,    0.,    9.,    1.],
        [   9.,    0.,   10.,    1.],
        [  10.,    0.,   11.,    1.],
        [  11.,    0.,   12.,    1.],
        [  12.,    0.,   13.,    1.],
        [  13.,    0.,   14.,    1.],
        [  14.,    0.,   15.,    1.],
        [  15.,    0.,   16.,    1.],
        [  16.,    0.,   17.,    1.],
        [-100.,    0.,  -99.,    1.]])

In [868]:
full_text = df.iloc[idx]['full_text']
tokens = df.iloc[idx]['tokens']
labels = df.iloc[idx]['labels']
idx,df.iloc[idx]['nb_labels']

(29, 26)

In [869]:
offset_mapping = get_offset_mapping(full_text, tokens)
for token, offset,l in zip(tokens, offset_mapping,labels):
    if l!="O":
        print(f"{token}: {offset} : {l}")

Gabriel: (112, 119) : B-NAME_STUDENT
Bravo: (120, 125) : I-NAME_STUDENT
860632713425: (156, 168) : B-ID_NUM
Hlengiwe: (191, 199) : B-NAME_STUDENT
Swetha: (200, 206) : I-NAME_STUDENT
530670102508: (230, 242) : B-ID_NUM
530670102508: (274, 286) : B-ID_NUM
Tino: (290, 294) : B-NAME_STUDENT
Swetha: (295, 301) : I-NAME_STUDENT
875673967537: (303, 315) : B-ID_NUM
860632713425: (332, 344) : B-ID_NUM
Tino: (348, 352) : B-NAME_STUDENT
Lopez: (353, 358) : I-NAME_STUDENT
557349702179: (361, 373) : B-ID_NUM
Swetha: (377, 383) : B-NAME_STUDENT
Swetha: (384, 390) : I-NAME_STUDENT
784372734211: (393, 405) : B-ID_NUM
Alex: (409, 413) : B-NAME_STUDENT
Swetha: (414, 420) : I-NAME_STUDENT
054176622314: (422, 434) : B-ID_NUM
Alex: (438, 442) : B-NAME_STUDENT
Bravo: (443, 448) : I-NAME_STUDENT
674915248960: (469, 481) : B-ID_NUM
Tino: (485, 489) : B-NAME_STUDENT
Lopez: (490, 495) : I-NAME_STUDENT
Hlengiwe: (640, 648) : B-NAME_STUDENT


In [870]:
# text = "Reflection – Visualization   Deiby"
print((full_text))

Date:14-09-2021

NEWS PAPER

Project: News Paper Designing                                 Interviewer Name :   Gabriel Bravo         Interviewer PinNo :   860632713425  Interviewee Name :   Hlengiwe Swetha   Interviewee PinNo :  530670102508     Team: “CSE AIML group 5 ”  530670102508 -  Tino Swetha  875673967537 -  SSRK Kasyap  860632713425 -  Tino Lopez   557349702179 -  Swetha Swetha   784372734211 -  Alex Swetha  054176622314 -  Alex Bravo ( Representative )  674915248960 -  Tino Lopez    Experiment - 1

Introduction:

● Introduce yourself to the bench?                      ● Have you ever faced hardship in your

life?

➔ I am Hlengiwe from Jogipet in the Sanga

Reddy district. I have completed my  bachelor's degree in design thinking by  holding 79 percent. I have done my  bachelor's degree in IIT MADRAS. I have  skills in HTML, soft skills. I have worked  at Grassroots BPO. I like to improve my  skills and knowledge at your company. I  will assure you that I will show my skills 

In [811]:
print(full_text_ds)

Date:14-09-2021 | NEWS PAPER | Project: News Paper Designing Interviewer Name : Gabriel Bravo Interviewer PinNo : 860632713425 Interviewee Name : Hlengiwe Swetha Interviewee PinNo : 530670102508 Team: “CSE AIML group 5 ” 530670102508 - Tino Swetha 875673967537 - SSRK Kasyap 860632713425 - Tino Lopez 557349702179 - Swetha Swetha 784372734211 - Alex Swetha 054176622314 - Alex Bravo ( Representative ) 674915248960 - Tino Lopez Experiment - 1 | Introduction: | ● Introduce yourself to the bench? ● Have you ever faced hardship in your | life? | ➔ I am Hlengiwe from Jogipet in the Sanga | Reddy district. I have completed my bachelor's degree in design thinking by holding 79 percent. I have done my bachelor's degree in IIT MADRAS. I have skills in HTML, soft skills. I have worked at Grassroots BPO. I like to improve my skills and knowledge at your company. I will assure you that I will show my skills and knowledge in my work 100 percent. | ➔ I grew up confronting a lot of challenges | since I 

In [871]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': 'd4dd80',
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        ents.append({
                        'start': int(offset[0]), 
                         'end': int(offset[1]), 
                         'label': str(lab.split('-')[1]) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
#         "title": "idx"
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [872]:
offset_mapping = get_offset_mapping(full_text, tokens)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [873]:
visualize(full_text,offset_mapping_,labels_)

In [715]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels_ds) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [716]:
visualize(full_text_ds,offset_mapping_,labels_)

In [874]:
import numpy as np

def find_successive_numbers(input_array):
    result = []
    current_sublist = []

    for num in input_array:
        if not current_sublist or num == current_sublist[-1] + 1:
            current_sublist.append(num)
        else:
            result.append(current_sublist)
            current_sublist = [num]

    if current_sublist:
        result.append(current_sublist)

    return result

# Example usage
input_array = np.array([0, 1, 3, 895, 896, 901, 902, 903])
result = find_successive_numbers(input_array)
print(result)


[[0, 1], [3], [895, 896], [901, 902, 903]]


In [None]:
# cachez l'id num dans URL ou mixer chiffre et lettre

In [875]:
idx_lab = np.argwhere(np.array(labels)!="O").reshape(-1)

In [876]:
np.array(labels)[idx_lab]

array(['B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-ID_NUM', 'B-NAME_STUDENT',
       'I-NAME_STUDENT', 'B-ID_NUM', 'B-ID_NUM', 'B-NAME_STUDENT',
       'I-NAME_STUDENT', 'B-ID_NUM', 'B-ID_NUM', 'B-NAME_STUDENT',
       'I-NAME_STUDENT', 'B-ID_NUM', 'B-NAME_STUDENT', 'I-NAME_STUDENT',
       'B-ID_NUM', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-ID_NUM',
       'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-ID_NUM', 'B-NAME_STUDENT',
       'I-NAME_STUDENT', 'B-NAME_STUDENT'], dtype='<U14')

In [877]:
idx_lab

array([ 19,  20,  26,  32,  33,  39,  50,  53,  54,  56,  62,  65,  66,
        68,  71,  72,  74,  77,  78,  80,  83,  84,  89,  92,  93, 125])

In [878]:
pos = sorted(find_successive_numbers(idx_lab),reverse=True,key=len)
pos

[[19, 20],
 [32, 33],
 [53, 54],
 [65, 66],
 [71, 72],
 [77, 78],
 [83, 84],
 [92, 93],
 [26],
 [39],
 [50],
 [56],
 [62],
 [68],
 [74],
 [80],
 [89],
 [125]]

In [879]:
lab = np.array(labels)

In [880]:
toks = np.array(tokens)

In [881]:
ent = {}
for i,p in enumerate(pos):
    l = np.unique([x.split('-')[-1] for x in lab[p]]).tolist()
    t = toks[p].tolist()
    if 'NAME_STUDENT' in l:
        full_name = " ".join(t)
        ent[full_name] = l[-1]
#         if len(t)==2:
#             ent[t[0]] = l[0]
#             ent[t[1]] = l[0]
#         if len(t)>2:
#             ent[t[0]] = l[0]
#             ent[" ".join(t[1:])] = l[0]
            
    else:
        full_name = " ".join(t)
        ent[full_name] = l[-1]

In [882]:
ent

{'Gabriel Bravo': 'NAME_STUDENT',
 'Hlengiwe Swetha': 'NAME_STUDENT',
 'Tino Swetha': 'NAME_STUDENT',
 'Tino Lopez': 'NAME_STUDENT',
 'Swetha Swetha': 'NAME_STUDENT',
 'Alex Swetha': 'NAME_STUDENT',
 'Alex Bravo': 'NAME_STUDENT',
 '860632713425': 'ID_NUM',
 '530670102508': 'ID_NUM',
 '875673967537': 'ID_NUM',
 '557349702179': 'ID_NUM',
 '784372734211': 'ID_NUM',
 '054176622314': 'ID_NUM',
 '674915248960': 'ID_NUM',
 'Hlengiwe': 'NAME_STUDENT'}

In [883]:
import re
from mimesis import Generic

def generate_random_data_with_probabilities():
    generic = Generic()

    # Probabilities for each country
    country_probabilities = {'fr': 0.5, 'en': 0.1, 'it': 0.1, 'de': 0.1, 'es': 0.2}

    # Function to randomly choose a country based on probabilities
    def choose_country():
        return generic.random.choices(list(country_probabilities.keys()), weights=country_probabilities.values())

    # Generate random data
    country = choose_country()[0]
    generic = Generic(locale=country)
    name = generic.person.full_name()
    phone_number = generic.person.telephone()
    username = generic.person.username()
    email = generic.person.email()
    address = generic.address.address()
    surname = generic.person.surname()
    
    return {"NAME_STUDENT":name,"EMAIL":email}

In [884]:
dc_ent = generate_random_data_with_probabilities()
dc_ent

{'NAME_STUDENT': 'Donatien Lussier', 'EMAIL': 'herself2090@gmail.com'}

In [885]:
l

['NAME_STUDENT']

In [886]:
mapper = {}
label_mapper = {}
for k,v in ent.items():
    
    if v in ["NAME_STUDENT","EMAIL"]:      
        dc_ent = generate_random_data_with_probabilities()
        if 'NAME_STUDENT' in l:
            names = k.split()
            if k not in mapper.keys():
                mapper[k] = dc_ent[v]
                label_mapper[dc_ent[v]] = v
            if len(k.split())>1:
                
                map_ = dc_ent[v].split()
                if names[0] not in mapper.keys():
                    mapper[names[0]] = map_[0]
                    label_mapper[map_[0]] = v
                if " ".join(names[1:]) not in mapper.keys():
                    mapper[" ".join(names[1:])] = " ".join(map_[1:]) 
                    label_mapper[" ".join(names[1:])] = v
            else:
                map_ = dc_ent[v].split()
                if names[0] not in mapper.keys():
                    mapper[names[0]] = map_[0]
                    label_mapper[map_[0]] = v      
        
        else:
            mapper[k] = dc_ent[v]
            label_mapper[dc_ent[v]] = v
            
        if k in mapper.keys():
            full_text = re.sub(k,mapper[k],full_text)
        else:
            full_text = re.sub(k,dc_ent[v],full_text)

In [887]:
mapper

{'Gabriel Bravo': 'Maimouna Monier',
 'Gabriel': 'Maimouna',
 'Bravo': 'Monier',
 'Hlengiwe Swetha': 'Manon Baillargeon',
 'Hlengiwe': 'Manon',
 'Swetha': 'Baillargeon',
 'Tino Swetha': 'Diane Sicard',
 'Tino': 'Diane',
 'Tino Lopez': 'Marwane Busson',
 'Lopez': 'Busson',
 'Swetha Swetha': 'Sabato Fragale',
 'Alex Swetha': 'Leelou Champagne',
 'Alex': 'Leelou',
 'Alex Bravo': 'Léon Berger'}

In [892]:
label_mapper

{'Maimouna Monier': 'NAME_STUDENT',
 'Maimouna': 'NAME_STUDENT',
 'Bravo': 'NAME_STUDENT',
 'Manon Baillargeon': 'NAME_STUDENT',
 'Manon': 'NAME_STUDENT',
 'Swetha': 'NAME_STUDENT',
 'Diane Sicard': 'NAME_STUDENT',
 'Diane': 'NAME_STUDENT',
 'Marwane Busson': 'NAME_STUDENT',
 'Lopez': 'NAME_STUDENT',
 'Sabato Fragale': 'NAME_STUDENT',
 'Leelou Champagne': 'NAME_STUDENT',
 'Leelou': 'NAME_STUDENT',
 'Léon Berger': 'NAME_STUDENT'}

In [888]:
from spacy.lang.en import English
en_tokenizer = English().tokenizer

def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    tokenized_text = tokenizer(text)
    tokens = [token.text for token in tokenized_text]
    offset_mapping = [(token.idx,token.idx+len(token)) for token in tokenized_text]
    return {'tokens': tokens, 'offset_mapping': offset_mapping}

In [889]:
tokenized_text = tokenize_with_spacy(full_text, tokenizer=en_tokenizer)

In [890]:
tokens = tokenized_text['tokens']

In [893]:
tg = get_offset_mapping(full_text, list(label_mapper.keys()))

In [907]:
offs = {}
for s in list(label_mapper.keys()):
    res = [(m.start(0), m.end(0)) for m in re.finditer(s,full_text)]
    if len(res):
        offs[s] = res
    
offs

{'Maimouna Monier': [(112, 127)],
 'Maimouna': [(112, 120)],
 'Manon Baillargeon': [(193, 210)],
 'Manon': [(193, 198), (660, 665)],
 'Diane Sicard': [(294, 306)],
 'Diane': [(294, 299)],
 'Marwane Busson': [(353, 367), (501, 515)],
 'Sabato Fragale': [(386, 400)],
 'Leelou Champagne': [(419, 435)],
 'Leelou': [(419, 425)],
 'Léon Berger': [(453, 464)]}

In [None]:
labels = []
for tok,off in zip(tokens,tokenized_text['offset_mapping']):
    for k in list(label_mapper.keys()):
        if off[0]>=

In [829]:
offset_mapping = get_offset_mapping(full_text, tokens)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

error: missing ), unterminated subpattern at position 0

In [759]:
print(full_text)

REFLECTION-STORYTELLING

Challenge & Selection   As a Human Resource (HR) Manager I handle all employee issues and ensure that the work  environment is conducive for highest possible human productivity. Before I was promoted to this  position, the Office Administrator used to handle this task despite her not being well conversant with  Hr and its procedures. A lot of our employees within the company felt unattended, ignored and greatly  demotivated. In addition, managers and their supervisors had major challenges dealing and handling  discipline matters within their departments.  As soon as I started, I realized that I needed to relate closely with my fellow colleagues in order to solve  their issues and build more cohesion within departments and the company. I choose to use the  Storytelling tool described in Module 2.

Storytelling allowed me to interact and engage with employees. It also helped to soften ground so  employees spoke more freely.


Benoit Fortin could not understand wh

In [760]:
visualize(full_text,offset_mapping_,labels_)