In [1]:
cd ../src

/workspace/Script/NLP/PII/src


In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [3]:
from data.data_utils import get_offset_mapping,clean_text
from data.dataset import FeedbackDataset

In [4]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModel, AutoConfig

%env TOKENIZERS_PARALLELISM = true

env: TOKENIZERS_PARALLELISM=true


# Params

In [5]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'Fake_data_1850_218.json',
 'test.json',
 'archive.zip',
 'archive',
 'moredata_dataset_fixed.csv',
 'sample_submission.csv',
 'mpware_mixtral8x7b_v1.1.json']

In [6]:
sample_df = pd.read_csv(data_path/'sample_submission.csv')
sample_df.shape

(26, 4)

In [7]:
sample_df.head(5)

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,482,B-NAME_STUDENT
3,3,7,483,I-NAME_STUDENT
4,4,7,741,B-NAME_STUDENT


In [8]:
sample_df.label.unique()

array(['B-NAME_STUDENT', 'I-NAME_STUDENT'], dtype=object)

In [9]:
df = pd.read_json(data_path/'train.json')
df.shape

(6807, 5)

In [10]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."


In [11]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
len(LABEL2TYPE)

8

In [12]:
for name in LABEL2TYPE[:-1]:
    df[name] = ((df['labels'].transform(lambda x:len([i for i in x if i.split('-')[-1]==name ])>0)))*1

In [13]:
df['nb_labels'] = df['labels'].transform(lambda x:len([i for i in x if i!="O" ]))

In [14]:
df['nb_labels'].value_counts() 

0     5862
2      599
4      108
1       86
3       52
6       46
8       14
5       10
12       6
10       5
11       3
9        3
15       2
14       2
21       2
7        1
23       1
18       1
17       1
26       1
34       1
22       1
Name: nb_labels, dtype: int64

In [15]:
df[list(LABEL2TYPE)[:-1]+['nb_labels']].sum()

NAME_STUDENT       891
EMAIL               24
USERNAME             5
ID_NUM              33
PHONE_NUM            4
URL_PERSONAL        72
STREET_ADDRESS       2
nb_labels         2739
dtype: int64

In [16]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [17]:
seeds = [42]
folds_names = []
for K in [5]:  
    for seed in seeds:
        mskf = MultilabelStratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
        name = f"fold_msk_{K}_seed_{seed}"
        df[name] = -1
        for fold, (trn_, val_) in enumerate(mskf.split(df,df[list(LABEL2TYPE)[:-1]])):
            df.loc[val_, name] = fold

In [18]:
df.groupby(name)[list(LABEL2TYPE)[:-1]].sum()

Unnamed: 0_level_0,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS
fold_msk_5_seed_42,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,178,5,1,7,1,14,0
1,178,5,1,7,1,14,0
2,179,5,1,6,1,15,1
3,178,4,1,6,0,15,1
4,178,5,1,7,1,14,0


In [19]:
model_name = 'microsoft/deberta-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [20]:
ds = FeedbackDataset(df.copy(),tokenizer,en_train=True)

Loaded 6807 samples.


In [167]:
idx = random.choice(ds.df[ds.df.PHONE_NUM>1].index)
# idx = 219
# doc = 204
# idx = ds.df[ds.df.document==doc].index[0]
# Example usage:
# idx = 80
full_text_ds = ds.df.iloc[idx]['full_text']
tokens_ds = ds.df.iloc[idx]['tokens']
labels_ds = ds.df.iloc[idx]['labels']
idx,ds.df.iloc[idx]['nb_labels']

(350, 21)

In [168]:
len(labels_ds),len(tokens_ds)

(730, 730)

In [169]:
tokens_ds[:15]

['Name',
 ':',
 'Jana',
 'Telfah',
 '',
 'Email',
 ':',
 'nbarker@hotmail.com',
 '',
 'Mobile',
 ':',
 '(',
 '820)913',
 '-',
 '3241x894']

In [170]:
ds[idx]

{'text_id': 6243,
 'text': 'Name: Calandra Fazio  Email: nbarker@hotmail.com  Mobile: (820)913-3241x894\n\nDesign Thinking For Innovation\n\nChallenge & Selection  Jordan’s economic growth and stability is strengthened by thriving entrepreneurial ecosystem focused  on tech innovation and tech-based startups (TBS). While past efforts have improved ecosystem  dynamics and innovation input; nonetheless, innovation outputs and socioeconomic impact remains  marginal for many reasons, among which are inefficiencies, replication, lack of creativity, and low- quality pipeline.\n\nMid last year, UVA (University of Bedfordshire) in Jordan decided to contribute to this conversation  and to the Jordanian tech ecosystem by establishing, a tech accelerator called (The Belvedere Vodka). I was  chosen as the Program Manager for this project, and my current challenge was to develop new models  that would steer the university and other educational institutions towards more responsible and value- focused

In [171]:
ds[idx]['word_boxes'][:15,:]

tensor([[ 1.,  0.,  2.,  1.],
        [ 2.,  0.,  3.,  1.],
        [ 3.,  0.,  6.,  1.],
        [ 6.,  0.,  9.,  1.],
        [ 9.,  0., 10.,  1.],
        [10.,  0., 11.,  1.],
        [11.,  0., 12.,  1.],
        [12.,  0., 21.,  1.],
        [21.,  0., 22.,  1.],
        [22.,  0., 23.,  1.],
        [23.,  0., 24.,  1.],
        [24.,  0., 25.,  1.],
        [25.,  0., 29.,  1.],
        [29.,  0., 30.,  1.],
        [30.,  0., 35.,  1.]])

In [172]:
full_text = df.iloc[idx]['full_text']
tokens = df.iloc[idx]['tokens']
labels = df.iloc[idx]['labels']
idx,df.iloc[idx]['nb_labels']

(350, 21)

In [173]:
offset_mapping = get_offset_mapping(full_text, tokens)
for token, offset,l in zip(tokens, offset_mapping,labels):
    if l!="O":
        print(f"{token}: {offset} : {l}")

Jana: (6, 10) : B-NAME_STUDENT
Telfah: (11, 17) : I-NAME_STUDENT
nbarker@hotmail.com: (26, 45) : B-EMAIL
(: (55, 56) : B-PHONE_NUM
820)913: (56, 63) : I-PHONE_NUM
-: (63, 64) : I-PHONE_NUM
3241x894: (64, 72) : I-PHONE_NUM
Jana: (2365, 2369) : B-NAME_STUDENT
Telfah: (2370, 2376) : I-NAME_STUDENT
nbarker@hotmail.com: (2385, 2404) : B-EMAIL
(: (2414, 2415) : B-PHONE_NUM
820)913: (2415, 2422) : I-PHONE_NUM
-: (2422, 2423) : I-PHONE_NUM
3241x894: (2423, 2431) : I-PHONE_NUM
Jana: (2486, 2490) : B-NAME_STUDENT
Telfah: (2491, 2497) : I-NAME_STUDENT
nbarker@hotmail.com: (2506, 2525) : B-EMAIL
(: (2535, 2536) : B-PHONE_NUM
820)913: (2536, 2543) : I-PHONE_NUM
-: (2543, 2544) : I-PHONE_NUM
3241x894: (2544, 2552) : I-PHONE_NUM


In [174]:
full_text

'Name: Jana Telfah  Email: nbarker@hotmail.com  Mobile: (820)913-3241x894\n\nDesign Thinking For Innovation\n\nChallenge & Selection  Jordan’s economic growth and stability is strengthened by thriving entrepreneurial ecosystem focused  on tech innovation and tech-based startups (TBS). While past efforts have improved ecosystem  dynamics and innovation input; nonetheless, innovation outputs and socioeconomic impact remains  marginal for many reasons, among which are inefficiencies, replication, lack of creativity, and low- quality pipeline.\n\nMid last year, UVA (University of Bedfordshire) in Jordan decided to contribute to this conversation  and to the Jordanian tech ecosystem by establishing, a tech accelerator called (The Belvedere Vodka). I was  chosen as the Program Manager for this project, and my current challenge was to develop new models  that would steer the university and other educational institutions towards more responsible and value- focused models of entrepreneurial sup

In [175]:
# text = "Reflection – Visualization   Deiby"
print((full_text))

Name: Jana Telfah  Email: nbarker@hotmail.com  Mobile: (820)913-3241x894

Design Thinking For Innovation

Challenge & Selection  Jordan’s economic growth and stability is strengthened by thriving entrepreneurial ecosystem focused  on tech innovation and tech-based startups (TBS). While past efforts have improved ecosystem  dynamics and innovation input; nonetheless, innovation outputs and socioeconomic impact remains  marginal for many reasons, among which are inefficiencies, replication, lack of creativity, and low- quality pipeline.

Mid last year, UVA (University of Bedfordshire) in Jordan decided to contribute to this conversation  and to the Jordanian tech ecosystem by establishing, a tech accelerator called (The Belvedere Vodka). I was  chosen as the Program Manager for this project, and my current challenge was to develop new models  that would steer the university and other educational institutions towards more responsible and value- focused models of entrepreneurial support.



In [176]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': 'd4dd80',
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        ents.append({
                        'start': int(offset[0]), 
                         'end': int(offset[1]), 
                         'label': str(lab.split('-')[-1]) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
#         "title": "idx"
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [177]:
offset_mapping = get_offset_mapping(full_text, tokens)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [178]:
visualize(full_text,offset_mapping_,labels_)

In [179]:
d = ds[idx]
full_text_ds = d['text']
tokens_ds = d['tokens']
labels_ds = d['labels']

In [180]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels_ds) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [181]:
visualize(full_text_ds,offset_mapping_,labels_)

In [182]:
# ======================================================================================== #
import numpy as np

def find_successive_numbers(input_array):
    result = []
    current_sublist = []

    for num in input_array:
        if not current_sublist or num == current_sublist[-1] + 1:
            current_sublist.append(num)
        else:
            result.append(current_sublist)
            current_sublist = [num]

    if current_sublist:
        result.append(current_sublist)

    return result
# ======================================================================================== #

import re
from mimesis import Generic

def generate_random_data_with_probabilities():
    generic = Generic()

    # Probabilities for each country
    country_probabilities = {'fr': 0.5, 'en': 0.1, 'it': 0.1, 'de': 0.1, 'es': 0.2}

    # Function to randomly choose a country based on probabilities
    def choose_country():
        return generic.random.choices(list(country_probabilities.keys()), weights=country_probabilities.values())

    # Generate random data
    country = choose_country()[0]
    generic = Generic(locale=country)
    name = generic.person.full_name()
#     phone_number = generic.person.telephone()
#     username = generic.person.username()
#     email = generic.person.email()
#     address = generic.address.address()
#     surname = generic.person.surname()
    ret = dict(
              NAME_STUDENT=name
              )
    return ret
# ======================================================================================== #

def generate_ent(labels,tokens):
    
    idx_lab = np.argwhere(np.array(labels)!="O").reshape(-1)
    pos = sorted(find_successive_numbers(idx_lab),reverse=True,key=len)

    lab = np.array(labels)
    toks = np.array(tokens)

    ent = {}
    for i,p in enumerate(pos):
        l = np.unique([x.split('-')[-1] for x in lab[p]]).tolist()
        t = toks[p].tolist()
        if 'NAME_STUDENT' in l:
            full_name = " ".join(t)
            ent[clean_text(full_name)] = l[-1]

        else:
            full_name = " ".join(t)
            ent[clean_text(full_name)] = l[-1]
    return ent
# ======================================================================================== #
from spacy.lang.en import English
en_tokenizer = English().tokenizer

def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    tokenized_text = tokenizer(text)
    tokens = [token.text for token in tokenized_text]
    offset_mapping = [(token.idx,token.idx+len(token)) for token in tokenized_text]
    return {'tokens': tokens, 'offset_mapping': offset_mapping}
# ======================================================================================== #

def create_mapper_n_clean(full_text,labels,tokens,attribut=["NAME_STUDENT"]):
    ent = generate_ent(labels,tokens)
    print(ent)
    mapper = {}
    label_mapper = {}
    for k,v in ent.items():
#         k,v = clean_text(k),clean_text(v)
        if v in attribut:      
            dc_ent = generate_random_data_with_probabilities()
            if 'NAME_STUDENT' in v:
                names = k.split()
                if k not in mapper.keys():
                    mapper[k] = dc_ent[v]
                    label_mapper[dc_ent[v]] = v
                if len(k.split())>1:
                    map_ = dc_ent[v].split()
                    if names[0] not in mapper.keys():
                        mapper[names[0]] = map_[0]
                        label_mapper[map_[0]] = v
                    if " ".join(names[1:]) not in mapper.keys():
                        mapper[" ".join(names[1:])] = " ".join(map_[1:]) 
                        label_mapper[" ".join(names[1:])] = v
                else:
                    map_ = dc_ent[v].split()
                    if names[0] not in mapper.keys():
                        mapper[names[0]] = map_[0]
                        label_mapper[map_[0]] = v      

            else:
                mapper[k] = dc_ent[v]
                label_mapper[dc_ent[v]] = v

            if k in mapper.keys():
                full_text = re.sub(k,mapper[k],full_text)
            else:
                full_text = re.sub(k,dc_ent[v],full_text)
        else:
            label_mapper[k] = v
            
    
    print(mapper)
    print(label_mapper)
    
    full_text = clean_text(full_text)
    
    tokenized_text = tokenize_with_spacy(full_text, tokenizer=en_tokenizer)
    tokens = tokenized_text['tokens']
    tg = get_offset_mapping(full_text, list(label_mapper.keys()))
    
    offs = {}
    for s in list(label_mapper.keys()):
        res = [(m.start(0), m.end(0)) for m in re.finditer(s,full_text)]
        if len(res):
            offs[s] = res
            
    labels = []
    for tok, off in zip(tokens, tokenized_text['offset_mapping']):
        found_label = False
        for k, ofs in offs.items():
            for o in ofs:
                if o[0] <= off[0] <= o[1] or o[0] <= off[1] <=o[1]:
                    labels.append(label_mapper[k])
                    found_label = True
                    break
            if found_label:
                break
        else:
            labels.append('O')
        
    return full_text,labels,tokens

In [183]:
# cachez l'id num dans URL ou mixer chiffre et lettre

In [184]:
full_text

'Name: Jana Telfah  Email: nbarker@hotmail.com  Mobile: (820)913-3241x894\n\nDesign Thinking For Innovation\n\nChallenge & Selection  Jordan’s economic growth and stability is strengthened by thriving entrepreneurial ecosystem focused  on tech innovation and tech-based startups (TBS). While past efforts have improved ecosystem  dynamics and innovation input; nonetheless, innovation outputs and socioeconomic impact remains  marginal for many reasons, among which are inefficiencies, replication, lack of creativity, and low- quality pipeline.\n\nMid last year, UVA (University of Bedfordshire) in Jordan decided to contribute to this conversation  and to the Jordanian tech ecosystem by establishing, a tech accelerator called (The Belvedere Vodka). I was  chosen as the Program Manager for this project, and my current challenge was to develop new models  that would steer the university and other educational institutions towards more responsible and value- focused models of entrepreneurial sup

In [185]:
full_text,labels,tokens = create_mapper_n_clean(full_text,labels,tokens,attribut=["NAME_STUDENT"])

{'( 820)913 - 3241x894': 'PHONE_NUM', 'Jana Telfah': 'NAME_STUDENT', 'nbarker@hotmail.com': 'EMAIL'}
{'Jana Telfah': 'Rafa Pardo', 'Jana': 'Rafa', 'Telfah': 'Pardo'}
{'( 820)913 - 3241x894': 'PHONE_NUM', 'Rafa Pardo': 'NAME_STUDENT', 'Rafa': 'NAME_STUDENT', 'Telfah': 'NAME_STUDENT', 'nbarker@hotmail.com': 'EMAIL'}


In [186]:
# labels

In [187]:
offset_mapping = get_offset_mapping(full_text, tokens)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [188]:
print(full_text)

Name: Rafa Pardo  Email: nbarker@hotmail.com  Mobile: (820)913-3241x894

Design Thinking For Innovation

Challenge & Selection  Jordan’s economic growth and stability is strengthened by thriving entrepreneurial ecosystem focused  on tech innovation and tech-based startups (TBS). While past efforts have improved ecosystem  dynamics and innovation input; nonetheless, innovation outputs and socioeconomic impact remains  marginal for many reasons, among which are inefficiencies, replication, lack of creativity, and low- quality pipeline.

Mid last year, UVA (University of Bedfordshire) in Jordan decided to contribute to this conversation  and to the Jordanian tech ecosystem by establishing, a tech accelerator called (The Belvedere Vodka). I was  chosen as the Program Manager for this project, and my current challenge was to develop new models  that would steer the university and other educational institutions towards more responsible and value- focused models of entrepreneurial support.

W

In [189]:
visualize(full_text,offset_mapping_,labels_)

In [190]:
import torch
from torch.utils.data.sampler import WeightedRandomSampler

# Assuming you have a dataset `my_dataset` and you want to assign different probabilities to each sample
weights = [0.1, 0.5, 0.2, 0.2,1.]

# Create a WeightedRandomSampler
sampler = WeightedRandomSampler(weights, len(weights), replacement=False)

# Print sampled indices for three epochs
for epoch in range(3):
    sampled_indices = list(sampler)
    print(f"Epoch {epoch + 1}: {sampled_indices}")


Epoch 1: [3, 0, 4, 1, 2]
Epoch 2: [4, 1, 3, 2, 0]
Epoch 3: [4, 3, 1, 2, 0]


In [141]:
728//512

1

In [142]:
384/2

192.0