In [1]:
cd ../src

/workspace/Script/NLP/PII/src


In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [3]:
import re
import os
import gc
import math
import time
import json
import random
import numpy as np
import pandas as pd
import wandb

from pathlib import Path

import torch 
import torch.nn as nn
from torch.cuda import amp
import torch.optim as optim
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel, AutoConfig
 
from data.data_utils import to_gpu,to_np
from data.dataset import FeedbackDataset,CustomCollator
from torch.utils.data import DataLoader

from model_zoo.models import FeedbackModel,span_nms,aggregate_tokens_to_words
from metrics_loss.metrics import score_feedback,score,pii_fbeta_score_v2,compute_metrics
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup,get_cosine_with_hard_restarts_schedule_with_warmup

from sklearn.metrics import log_loss 
from tqdm.auto import tqdm

from utils.utils import count_parameters
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Params

In [5]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'mpware_mixtral8x7b_v1.1-no-i-username.json.zip',
 'Fake_data_1850_218.json',
 'test.json',
 'archive.zip',
 'archive',
 'pii-masking-200k.csv',
 'sample_submission.csv',
 'mpware_mixtral8x7b_v1.1.json']

In [6]:
CHECKPOINT_PATH = Path(r"/database/kaggle/PII/checkpoint")

In [7]:
df = pd.read_csv(data_path/'pii-masking-200k.csv')
df.shape

(165750, 5)

In [7]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
for name in LABEL2TYPE[:-1]:
    df[name] = ((df['labels'].transform(lambda x:len([i for i in eval(x) if i==name ])))>0)*1

NameError: name 'df' is not defined

In [9]:
df["offset_mapping"] = df["offset_mapping"].transform(lambda x:eval(x))
df["labels"] = df["labels"].transform(lambda x:eval(x))
df["tokens"] = df["tokens"].transform(lambda x:eval(x))

In [30]:
df = pd.read_json(data_path/'mixtral-8x7b-v1.json')
df.shape

(2355, 5)

In [31]:
df.head(2)

Unnamed: 0,document,full_text,tokens,labels,trailing_whitespace
0,dtduupvzgt,"Tiburce Evans, https://www.instagram.com/tibur...","[Tiburce, Evans, ,, https://www.instagram.com/...","[B-NAME_STUDENT, I-NAME_STUDENT, O, B-URL_PERS...","[True, False, True, False, True, True, True, F..."
1,uejmzisyyh,Rose-Mai Rodriguez | PIN # 3814374\n501 Andrea...,"[Rose, -, Mai, Rodriguez, |, PIN, #, 3814374, ...","[B-NAME_STUDENT, I-NAME_STUDENT, I-NAME_STUDEN...","[False, False, True, True, True, True, True, F..."


In [32]:
from sklearn.model_selection import GroupKFold,StratifiedGroupKFold,KFold,StratifiedKFold

In [33]:
# df['has_label'] = (df['labels'].transform(lambda x:len([i for i in x if i!="O" ]))>0)*1
# seeds = [42]
# folds_names = []
# for K in [5]:  
#     for seed in seeds:
#         mskf = StratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
#         name = f"fold_sk_{K}_seed_{seed}"
#         df[name] = -1
#         for fold, (trn_, val_) in enumerate(mskf.split(df,df['has_label'])):
#             df.loc[val_, name] = fold

In [34]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [35]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
for name in LABEL2TYPE[:-1]:
    df[name] = ((df['labels'].transform(lambda x:len([i for i in x if i.split('-')[-1]==name ])))>0)*1

seeds = [42]
folds_names = []
for K in [5]:  
    for seed in seeds:
        mskf = MultilabelStratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
        name = f"fold_msk_{K}_seed_{seed}"
        df[name] = -1
        for fold, (trn_, val_) in enumerate(mskf.split(df,df[list(LABEL2TYPE)[:-1]])):
            df.loc[val_, name] = fold

In [24]:
external_data = True

In [18]:
if external_data:
    print("Using external data")
    dx = pd.read_json(data_path/f'mixtral-8x7b-v1.json')
    LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
    for name in LABEL2TYPE[:-1]:
        dx[name] = ((dx['labels'].transform(lambda x:len([i for i in x if i.split('-')[-1]==name ])))>0)*1

    seeds = [42]
    folds_names = []
    for K in [5]:  
        for seed in seeds:
            mskf = MultilabelStratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
            name = f"fold_msk_{K}_seed_{seed}"
            dx[name] = -1
            for fold, (trn_, val_) in enumerate(mskf.split(dx,dx[list(LABEL2TYPE)[:-1]])):
                dx.loc[val_, name] = fold

    # dx[name] = -1
    df = pd.concat([df,dx],axis=0).reset_index(drop=True)

Using external data


# Data

In [36]:
from train_utils import inference_step
from types import SimpleNamespace

In [37]:
ID_TYPE = {"0-0":0,"0-1":1,
           "1-0":2,"1-1":3,
           "2-0":4,"2-1":5,
           "3-0":6,"3-1":7,
           "4-0":8,"4-1":9,
           "5-0":10,"5-1":11,
           "6-0":12,"6-1":13
          }
ID_NAME = {"0-0":"B-NAME_STUDENT","0-1":"I-NAME_STUDENT",
           "1-0":"B-EMAIL","1-1":"I-EMAIL",
           "2-0":"B-USERNAME","2-1":"I-USERNAME",
           "3-0":"B-ID_NUM","3-1":"I-ID_NUM",
           "4-0":"B-PHONE_NUM","4-1":"I-PHONE_NUM",
           "5-0":"B-URL_PERSONAL","5-1":"I-URL_PERSONAL",
           "6-0":"B-STREET_ADDRESS","6-1":"I-STREET_ADDRESS",
           "7-0":"O","7-1":"O"
          }

def inference_steps(df,folder,bs=1,folds=[0]):
    
    # ==== Loading Args =========== #
    f = open(f'{folder}/params.json')
    args = json.load(f)
    args = SimpleNamespace(**args)
    args.val_loader['batch_size'] = bs
    args.model['pretrained_tokenizer'] = f"{folder}/tokenizer"
    args.model['model_params']['config_path'] = f"{folder}/config.pth"
    args.model['pretrained_weights'] = None
    args.model["model_params"]['pretrained_path'] = None
    args.data['params_valid'] = {"add_text_prob":0,
                                          "replace_text_prob":0,
                                          "use_re":False
                                         }
    
    args.device = 1
    f.close()
    device = torch.device(f"cuda:{args.device}" if torch.cuda.is_available() else "cpu")
    
    # ==== Loading dataset =========== #
    tokenizer = AutoTokenizer.from_pretrained(args.model["model_params"]['model_name'])
    valid_dataset = eval(args.dataset)(df,tokenizer,**args.data["params_valid"])
    
    
    
    # ==== Loading checkpoints =========== #
    checkpoints = [x.as_posix() for x in (Path(folder)).glob("*.pth") if f"config" not in x.as_posix()]
    checkpoints = [ x for x in checkpoints if any([f"fold_{fold}" in x for fold in folds])]
    print(checkpoints)
    weights = [1/len(checkpoints)]*len(checkpoints)
    
    
    # ==== Loop Inference =========== #
    doc_ids = []
    tokens = []
    tokens_v = []
    predictions = None
    gt_df = []
    for j,(checkpoint,weight) in enumerate(zip(checkpoints,weights)):
        net = FeedbackModel(**args.model["model_params"])
        net.load_state_dict(torch.load(checkpoint, map_location=lambda storage, loc: storage))
        net = net.to(device)
        net.eval()
        
        collator = CustomCollator(tokenizer,net)
        val_loader = DataLoader(valid_dataset,**args.val_loader,collate_fn=collator)
    

        
        preds = []
        with torch.no_grad():
            for data in tqdm(val_loader):
                data = to_gpu(data, device)
                
                pred = net(data)['pred']
                preds.append(pred.detach().cpu().to(torch.float32))
#                 pred  = pred.softmax(-1)
                
                
                if j==0:
                
                    doc_ids+=[data['text_id']]*pred.shape[0]
                    tokens+=np.arange(pred.shape[0]).tolist()
                    tokens_v += data['tokens']
                    data = to_np(data)
                    gt = pd.DataFrame({
                                      "document":data['text_id'],
                                      "token":np.arange(pred.shape[0]),
                                      "label":data["gt_spans"][:,1],
                                      "I":data["gt_spans"][:,2],
                                     })
                    gt_df.append(gt)

        
        
        
        if predictions is not None:
            predictions = torch.cat([torch.max(predictions[:, :-1], torch.cat(preds,dim=0)[:, :-1]),
                                     torch.min(predictions[:, -1:], torch.cat(preds,dim=0)[:, -1:])],dim=-1)
            
#             predictions+= torch.cat(preds,dim=0)*weight
        else:
            predictions = torch.cat(preds,dim=0)#*weight
            
#         if predictions is not None:
# #             predictions = torch.max(predictions,torch.cat(preds,dim=0))
#             predictions+= torch.cat(preds,dim=0)*weight
#         else:
#             predictions = torch.cat(preds,dim=0)*weight
#             predictions+= torch.cat(preds,dim=0)*weight
#         print(predictions.shape)
    
    predictions = predictions.softmax(-1)
    s,i = predictions.max(-1)
    pred_df = pd.DataFrame({"document":doc_ids,
                                 "token" : tokens,
                                 "tokens":tokens_v,
                                 "label" : i.numpy() ,
                                 "score" : s.numpy() ,
#                                  "o_score":predictions[:,-1].numpy()
                                 })
    
    # ==== Loop Inference =========== #
    del valid_dataset
    del val_loader
    del net
    del s,i
    del predictions

    gc.collect()
#     torch.cuda.empty_cache()
    
    # ==== Loop Inference =========== #
#     pred_df = pred_df[(pred_df.label!=7) & (pred_df.score>0.5)].reset_index(drop=True)
#     pred_df["I"] = ((pred_df.groupby('document')['label'].transform(lambda x:x.diff())==0) & (pred_df.groupby('document')['token'].transform(lambda x:x.diff())==1))*1
#     pred_df['labels'] = pred_df['label'].astype(str)+'-'+pred_df['I'].astype(str)
#     pred_df["label_pred"] = pred_df["labels"].map(ID_TYPE).fillna(0).astype(int)
#     pred_df['row_id'] = np.arange(len(pred_df))
    

    gt_df = pd.concat(gt_df,axis=0).reset_index(drop=True)
    gt_df = gt_df[gt_df.label!=7].reset_index(drop=True)
    gt_df['labels'] = gt_df['label'].astype(str)+'-'+gt_df['I'].astype(str)
    gt_df["label_gt"] = gt_df["labels"].map(ID_TYPE).fillna(0).astype(int)
    gt_df['row_id'] = np.arange(len(gt_df))

    
    
    return pred_df , gt_df

In [38]:
FOLD_NAME = "fold_msk_5_seed_42"
model_name = "deberta-v3-xsmall"
exp_name = "2024-02-03--v2_5fold"
folder = str(CHECKPOINT_PATH/Path(fr'{FOLD_NAME}/{model_name}/{exp_name}')) 
os.listdir(folder)

['pii-200-ms-2.csv',
 'pii-200-ms-gt.csv',
 'pii-200-ms-fold-3.csv',
 'fold_1 _epoch_10 _step_4397 _valid_loss_0.0249 _f5_micro_0.9943 _train_loss_0.0001.pth',
 'fold_2 _epoch_9 _step_6596 _valid_loss_0.0142 _f5_micro_0.9967 _train_loss_0.0002.pth',
 'pii-200-ms-fold-0.csv',
 'tokenizer',
 'params.json',
 'fold_3 _epoch_10 _step_5863 _valid_loss_0.0175 _f5_micro_0.9977 _train_loss_0.0005.pth',
 'pii-200-ms-4.csv',
 'fold_0 _epoch_10 _step_4395 _valid_loss_0.0144 _f5_micro_0.9972 _train_loss_0.0002.pth',
 'pii-200-ms-fold-4.csv',
 'pii-200-ms-fold-1.csv',
 'pii-200-ms-0.csv',
 'pii-200-ms-3.csv',
 'config.pth',
 'pii-200-ms-fold-2.csv',
 'pii-200-ms-1.csv',
 'fold_4 _epoch_11 _step_6591 _valid_loss_0.0117 _f5_micro_0.9970 _train_loss_0.0001.pth',
 'pii-200-ms-blend.csv',
 'tokenizer.zip']

In [34]:
# df[FOLD_NAME].value_counts()

In [20]:
pred = []
gt = []
for FOLD in [2]:
    pred_df_dv3,gt_df = inference_steps(df[df[FOLD_NAME]==FOLD],folder,bs=1,folds=[FOLD])
    pred.append(pred_df_dv3)
    gt.append(gt_df)

Loaded 1362 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-large/2024-02-04--vsub1/fold_2 _epoch_9 _step_7331 _valid_loss_0.0072 _f5_micro_0.9984 _train_loss_0.0001.pth']
Pooling: MeanPooling


  0%|          | 0/1362 [00:00<?, ?it/s]

In [21]:
gt_df

Unnamed: 0,document,token,label,I,labels,label_gt,row_id
0,93,0,0,0,0-0,0,0
1,93,1,0,1,0-1,1,1
2,333,20,0,0,0-0,0,2
3,333,21,0,1,0-1,1,3
4,375,5,0,0,0-0,0,4
...,...,...,...,...,...,...,...
502,13315,585,5,0,5-0,10,502
503,13342,0,0,0,0-0,0,503
504,13342,1,0,1,0-1,1,504
505,13342,523,0,0,0-0,0,505


In [23]:
gt_df = pd.concat(gt,axis=0)

In [24]:
gt_df.shape

(507, 7)

In [25]:
gt_df.head()

Unnamed: 0,document,token,label,I,labels,label_gt,row_id
0,93,0,0,0,0-0,0,0
1,93,1,0,1,0-1,1,1
2,333,20,0,0,0-0,0,2
3,333,21,0,1,0-1,1,3
4,375,5,0,0,0-0,0,4


In [26]:
g  = gt_df.copy()

In [27]:
pred_df = pd.concat(pred,axis=0)

In [28]:
pred_df.shape

(995975, 5)

In [29]:
gt_df['label'] = gt_df['labels'].map(ID_NAME)

In [51]:
pred_df = pred_df_dv3.copy()#pred_df_dv3[(pred_df_dv3.label!=7) & (pred_df_dv3.score>0.15)].reset_index(drop=True)

In [39]:
def make_pred_df(pred_df,threshold=0.15):
    
    pred_df["label_next_e_prev"] = pred_df.groupby('document')['label'].transform(lambda x: (x.shift(1)==x.shift(-1))*1)
    pred_df["label_next"] = pred_df.groupby('document')['label'].transform(lambda x: x.shift(1))
    pred_df["label_next_e_prev"] = ((pred_df["label_next_e_prev"]==1) & (pred_df["label_next"]==6))*1
    pred_df["score_next"] = pred_df.groupby('document')['score'].transform(lambda x: x.shift(1))
    pred_df.loc[pred_df["label_next_e_prev"]==1,"label"] = pred_df.loc[pred_df["label_next_e_prev"]==1,"label_next"]
    pred_df.loc[pred_df["label_next_e_prev"]==1,"score"] = pred_df.loc[pred_df["label_next_e_prev"]==1,"score_next"]
    
    
    pred_df = pred_df[(pred_df.label!=7) & ((pred_df.score>threshold))].reset_index(drop=True)
    
#     pred_df['token_size'] = pred_df['tokens'].transform(len)
#     pred_df = pred_df[~((pred_df.label==0) & ((pred_df.token_size<=1)))].reset_index(drop=True)
    
    pred_df["I"] = ((pred_df.groupby('document')['label'].transform(lambda x:x.diff())==0) & (pred_df.groupby('document')['token'].transform(lambda x:x.diff())==1))*1
    pred_df['labels'] = pred_df['label'].astype(str)+'-'+pred_df['I'].astype(str)
    pred_df["label_pred"] = pred_df["labels"].map(ID_TYPE).fillna(0).astype(int)
    pred_df['row_id'] = np.arange(len(pred_df))
    
    pred_df['label'] = pred_df['labels'].map(ID_NAME)
    return pred_df

In [31]:
pred_df = make_pred_df(pred_df,threshold=0.15)

In [32]:
pred_df.shape

(673, 12)

In [40]:
pred = []
gt = []
for FOLD in [0,1,2,3,4]:
    pred_df_dv3,gt_df = inference_steps(df,folder,bs=1,folds=[FOLD]) #[df[FOLD_NAME]==FOLD]
    pred.append(pred_df_dv3)
    gt.append(gt_df)
    
    gt_df['label'] = gt_df['labels'].map(ID_NAME)
    pred_df_dv3 = make_pred_df(pred_df_dv3,threshold=0.15)
    s = compute_metrics(pred_df_dv3, gt_df)
    
    print(f"Fold {FOLD}")
    print(s)

Loaded 2355 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-xsmall/2024-02-03--v2_5fold/fold_0 _epoch_10 _step_4395 _valid_loss_0.0144 _f5_micro_0.9972 _train_loss_0.0002.pth']
Pooling: MeanPooling


  0%|          | 0/2355 [00:01<?, ?it/s]

Fold 0
{'f5_prec': 0.9869876835293299, 'f5_rec': 0.9948797082135091, 'f5_micro': 0.9945738357479583, 'ents_per_type': {'NAME_STUDENT': 0.9952281615940178, 'STREET_ADDRESS': 0.9915109875515733, 'EMAIL': 0.9995757985200081, 'PHONE_NUM': 0.99739087511459, 'URL_PERSONAL': 0.9996850320217444, 'USERNAME': 0.9869478723576314, 'ID_NUM': 0.9996976856154336}}
Loaded 2355 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-xsmall/2024-02-03--v2_5fold/fold_1 _epoch_10 _step_4397 _valid_loss_0.0249 _f5_micro_0.9943 _train_loss_0.0001.pth']
Pooling: MeanPooling


  0%|          | 0/2355 [00:01<?, ?it/s]

Fold 1
{'f5_prec': 0.9863051838446413, 'f5_rec': 0.993897734446237, 'f5_micro': 0.9936035523983476, 'ents_per_type': {'NAME_STUDENT': 0.9845122906261924, 'STREET_ADDRESS': 0.9938566867193573, 'EMAIL': 0.9999685781618224, 'PHONE_NUM': 0.9989310132361346, 'URL_PERSONAL': 1.0, 'USERNAME': 0.9981872329405672, 'ID_NUM': 0.9975705057154693}}
Loaded 2355 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-xsmall/2024-02-03--v2_5fold/fold_2 _epoch_9 _step_6596 _valid_loss_0.0142 _f5_micro_0.9967 _train_loss_0.0002.pth']
Pooling: MeanPooling


  0%|          | 0/2355 [00:01<?, ?it/s]

Fold 2
{'f5_prec': 0.9895113438045375, 'f5_rec': 0.9942309041172758, 'f5_micro': 0.9940485502360081, 'ents_per_type': {'NAME_STUDENT': 0.9914615742148722, 'STREET_ADDRESS': 0.9907534028186179, 'EMAIL': 1.0, 'PHONE_NUM': 0.9990683103009246, 'URL_PERSONAL': 1.0, 'USERNAME': 0.9976220902958638, 'ID_NUM': 0.9974030044551905}}
Loaded 2355 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-xsmall/2024-02-03--v2_5fold/fold_3 _epoch_10 _step_5863 _valid_loss_0.0175 _f5_micro_0.9977 _train_loss_0.0005.pth']
Pooling: MeanPooling


  0%|          | 0/2355 [00:01<?, ?it/s]

Fold 3
{'f5_prec': 0.9945403958213029, 'f5_rec': 0.9966156975520797, 'f5_micro': 0.9965357181152202, 'ents_per_type': {'NAME_STUDENT': 0.9995092792143875, 'STREET_ADDRESS': 0.9917486838282706, 'EMAIL': 1.0, 'PHONE_NUM': 0.9998106981526367, 'URL_PERSONAL': 1.0, 'USERNAME': 0.9976200883967167, 'ID_NUM': 0.9987684177526981}}
Loaded 2355 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-xsmall/2024-02-03--v2_5fold/fold_4 _epoch_11 _step_6591 _valid_loss_0.0117 _f5_micro_0.9970 _train_loss_0.0001.pth']
Pooling: MeanPooling


  0%|          | 0/2355 [00:01<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7efd4d215fc0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1443, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/opt/conda/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/opt/conda/lib/python3.10/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/opt/conda/lib/python3.10/selectors.py", line 416, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt: 

KeyboardInterrupt



In [41]:
# s = compute_metrics(pred_df_dv3, gt_df)
# s

In [38]:
s

{'f5_prec': 0.0,
 'f5_rec': 0.0,
 'f5_micro': 0.0,
 'ents_per_type': {'NAME_STUDENT': 0.0,
  'URL_PERSONAL': 0.0,
  'PHONE_NUM': 0.0,
  'ID_NUM': 0.0,
  'STREET_ADDRESS': 0.0,
  'EMAIL': 0.0,
  'USERNAME': 0.0}}

In [33]:
s = compute_metrics(pred_df, gt_df)
s

{'f5_prec': 0.7384843982169391,
 'f5_rec': 0.980276134122288,
 'f5_micro': 0.9680851063829788,
 'ents_per_type': {'NAME_STUDENT': 0.9674318388994778,
  'URL_PERSONAL': 0.988593155893536,
  'PHONE_NUM': 0.5777777777777778,
  'ID_NUM': 0.9873417721518988,
  'STREET_ADDRESS': 0.9862068965517242,
  'EMAIL': 0.9936305732484076,
  'USERNAME': 1.0}}

In [30]:
s = compute_metrics(pred_df, gt_df)
s

{'f5_prec': 0.7285513361462729,
 'f5_rec': 0.9592592592592593,
 'f5_micro': 0.9477165575962282,
 'ents_per_type': {'NAME_STUDENT': 0.9465371554923795,
  'ID_NUM': 0.8965517241379312,
  'EMAIL': 1.0,
  'URL_PERSONAL': 0.989345509893455,
  'USERNAME': 0.9285714285714286,
  'PHONE_NUM': 1.0}}

In [55]:
s = compute_metrics(pred_df, gt_df)
s

{'f5_prec': 0.5807692307692308,
 'f5_rec': 0.8388888888888889,
 'f5_micro': 0.8247899159663866,
 'ents_per_type': {'NAME_STUDENT': 0.8326382878275238,
  'USERNAME': 0.896551724137931,
  'URL_PERSONAL': 0.7384155455904335,
  'ID_NUM': 0.6511627906976745,
  'EMAIL': 1.0,
  'STREET_ADDRESS': 0.0,
  'PHONE_NUM': 1.0}}

In [31]:
gt_df

Unnamed: 0,document,token,label,I,labels,label_gt,row_id
0,7,9,B-NAME_STUDENT,0,0-0,0,0
1,7,10,I-NAME_STUDENT,1,0-1,1,1
2,7,482,B-NAME_STUDENT,0,0-0,0,2
3,7,483,I-NAME_STUDENT,1,0-1,1,3
4,7,741,B-NAME_STUDENT,0,0-0,0,4
...,...,...,...,...,...,...,...
2734,15717,365,B-ID_NUM,0,3-0,6,2734
2735,15717,964,B-ID_NUM,0,3-0,6,2735
2736,19280,54,B-ID_NUM,0,3-0,6,2736
2737,19280,55,I-ID_NUM,1,3-1,7,2737


In [32]:
pred_df

Unnamed: 0,document,token,tokens,label,score,label_next_e_prev,label_next,score_next,I,labels,label_pred,row_id
0,7,9,Nathalie,B-NAME_STUDENT,0.9999,0,7.0000,1.0000,0,0-0,0,0
1,7,10,Sylla,I-NAME_STUDENT,0.9999,0,0.0000,0.9999,1,0-1,1,1
2,7,482,Nathalie,B-NAME_STUDENT,0.9999,0,7.0000,1.0000,0,0-0,0,2
3,7,483,Sylla,I-NAME_STUDENT,0.9999,0,0.0000,0.9999,1,0-1,1,3
4,7,741,Nathalie,B-NAME_STUDENT,1.0000,0,7.0000,1.0000,0,0-0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...
3629,22106,682,LAURA,I-NAME_STUDENT,0.9991,0,0.0000,0.9991,1,0-1,1,3629
3630,22106,683,SOFÍA,I-NAME_STUDENT,0.9893,0,0.0000,0.9991,1,0-1,1,3630
3631,22106,684,CARMONA,I-NAME_STUDENT,0.9604,0,0.0000,0.9893,1,0-1,1,3631
3632,22109,8,Hamzah,B-NAME_STUDENT,0.9995,0,7.0000,0.9999,0,0-0,0,3632


In [33]:
s = compute_metrics(pred_df, gt_df)
s

{'f5_prec': 0.6323610346725371,
 'f5_rec': 0.8389923329682366,
 'f5_micro': 0.8285789568569806,
 'ents_per_type': {'NAME_STUDENT': 0.8311162611046737,
  'ID_NUM': 0.8003848003848004,
  'USERNAME': 0.6011560693641618,
  'EMAIL': 0.9990147783251231,
  'URL_PERSONAL': 0.6862679585699967,
  'STREET_ADDRESS': 0.9930555555555556,
  'PHONE_NUM': 0.9963503649635036}}

In [34]:
# pred_df['token_size'].value_counts()

In [35]:
# pred_df.shape,pred_df[pred_df.token_size==1].shape

In [36]:
# pred_df[pred_df.token_size==1]

In [37]:
# s = compute_metrics(pred_df, gt_df)
# s

In [38]:
# s['ents_per_type']

In [39]:
# pd.DataFrame({x:[y] for x,y in s['ents_per_type'].items()})

In [32]:
documents = df[df[FOLD_NAME]==FOLD].document.unique() #[df[FOLD_NAME]==FOLD]
len(documents)

1361

In [33]:
df_score = []
for doc in tqdm(documents):
    p = pred_df[pred_df.document==doc].reset_index(drop=True)
    gp = gt_df[gt_df.document==doc].reset_index(drop=True)
    
    s = compute_metrics(p, gp)
    
    d = pd.DataFrame({x:[y] for x,y in s['ents_per_type'].items()})
    d["f5_micro"] = s['f5_micro']
    d['document'] = doc
    df_score.append(d)

  0%|          | 0/1361 [00:00<?, ?it/s]

In [34]:
df_score = pd.concat(df_score).reset_index(drop=True)

In [35]:
df_score[df_score.f5_micro==0]

Unnamed: 0,NAME_STUDENT,f5_micro,document,URL_PERSONAL,ID_NUM,PHONE_NUM,EMAIL,USERNAME
4,0.0,0.0,204,,,,,
122,0.0,0.0,8758,,,,,
127,0.0,0.0,9031,,,,,
132,0.0,0.0,9313,,,,,
134,0.0,0.0,9399,,,,,
146,0.0,0.0,9961,,,,,
150,0.0,0.0,10070,,,,,
171,0.0,0.0,11856,,,,,
174,0.0,0.0,11896,,,,,
175,0.0,0.0,11901,,,,,0.0


In [39]:
pdf = pd.concat(pred,axis=0)

In [41]:
doc = 5001

In [40]:
pdf = make_pred_df(pdf,threshold=0.15)

In [41]:
dx = pdf.groupby("document")['label'].agg(list).reset_index()

In [42]:
dx

Unnamed: 0,document,label
0,16,"[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
1,56,"[O, O, O, O, O, O, O, O, O, O, O, B-NAME_STUDE..."
2,112,"[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, I..."
3,166,"[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
4,204,"[O, O, O, B-NAME_STUDENT, O, O, O, B-NAME_STUD..."
...,...,...
1356,22637,"[O, O, O, O, O, O, O, O, O, O, O, B-NAME_STUDE..."
1357,22646,"[O, O, O, O, B-NAME_STUDENT, O, B-NAME_STUDENT..."
1358,22652,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1359,22664,"[O, O, O, O, O, B-NAME_STUDENT, O, O, O, O, O,..."


In [43]:
dfold = df[df[FOLD_NAME]==FOLD].copy().reset_index(drop=True)#[df[FOLD_NAME]==FOLD].reset_index(drop=True)
dfold.shape

(1361, 13)

In [44]:
dfold = dfold.merge(dx,how='left',on='document')

In [49]:
dfold.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,fold_msk_5_seed_42,label,NAME_STUDENT_s,f5_micro,URL_PERSONAL_s,ID_NUM_s,PHONE_NUM_s,EMAIL_s,USERNAME_s
0,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...",1,0,0,0,0,0,0,0,"[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...",1.0,1.0,,,,,
1,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...",1,0,0,0,0,0,0,0,"[O, O, O, O, O, O, O, O, O, O, O, B-NAME_STUDE...",0.4906,0.4906,,,,,
2,112,Reflection – Learning Launch\n\nFrancisco Ferr...,"[Reflection, –, Learning, Launch, \n\n, Franci...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...",1,0,0,0,0,0,0,0,"[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, I...",0.4906,0.4906,,,,,
3,166,Pepa Medrano\n\nDesign Thinking for Innovation...,"[Pepa, Medrano, \n\n, Design, Thinking, for, I...","[True, False, False, True, True, True, True, F...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",1,0,0,0,0,0,0,0,"[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",1.0,1.0,,,,,
4,204,Reflection – Visualization Deiby\n\nChalleng...,"[Reflection, –, Visualization, , Deiby, \n\n...","[True, True, True, False, False, False, True, ...","[O, O, O, O, B-NAME_STUDENT, O, O, O, O, O, O,...",1,0,0,0,0,0,0,0,"[O, O, O, B-NAME_STUDENT, O, O, O, B-NAME_STUD...",0.0,0.0,,,,,


In [46]:
dfold = dfold.merge(df_score,how='left',on='document',suffixes=('','_s'))

In [61]:
dfold[(dfold.URL_PERSONAL>0)&(dfold.URL_PERSONAL_s.isna())].index

Int64Index([], dtype='int64')

In [63]:
dfold[(dfold.URL_PERSONAL_s==0)]

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,fold_msk_5_seed_42,label,NAME_STUDENT_s,f5_micro,URL_PERSONAL_s,ID_NUM_s,PHONE_NUM_s,EMAIL_s,USERNAME_s,len_tok,len_tok_p
41,4381,"WRITING CENTRE Level 3 East, Hub Central Nor...","[WRITING, CENTRE, , Level, 3, East, ,, Hub, C...","[True, True, False, True, True, False, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,1,0,0,1,0,0,0,"[O, O, B-NAME_STUDENT, O, O, O, O, O, O, B-NAM...",,0.9924,0.0,,1.0,1.0,,1222,1222
74,6393,Student Name: Rania Mohammed\n\nTool: Story Te...,"[Student, Name, :, Rania, Mohammed, \n\n, Tool...","[True, False, True, True, False, False, False,...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O...",1,0,0,0,0,0,0,0,"[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O...",1.0,0.9811,0.0,,,,,1280,1280
101,7713,WHY PEOPLE ARE NOT COMMITTED TO RECYCLING IN L...,"[WHY, PEOPLE, ARE, NOT, COMMITTED, TO, RECYCLI...","[True, True, True, True, True, True, True, Tru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1,0,0,0,0,0,0,0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1.0,0.9811,0.0,,,,,582,582
629,14027,Visualization\n\nChallenge:\n\nDesigning a new...,"[Visualization, \n\n, Challenge, :, \n\n, Desi...","[False, False, False, False, False, True, True...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,0,0,0,0,0,0,0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",,0.0,0.0,,,,,695,695
682,14664,Challenge I am a product manager of a tech Sa...,"[Challenge, , I, am, a, product, manager, of,...","[True, False, True, True, True, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,0,0,0,0,0,0,0,"[O, B-NAME_STUDENT, O, O, O, O, O, O, O, O, O,...",,0.0,0.0,,,,,1090,1090
1209,21071,Visualization\n\nChallenge\n\nThe business of ...,"[Visualization, \n\n, Challenge, \n\n, The, bu...","[False, False, False, False, True, True, True,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0,0,0,0,0,0,0,0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",,0.0,0.0,,,,,833,833


In [51]:
dfold['len_tok'] = dfold['labels'].transform(lambda x:len(x))
dfold['len_tok_p'] = dfold['label'].transform(lambda x:len(x))

In [52]:
(dfold['len_tok']==dfold['len_tok_p']).value_counts()

True    1361
dtype: int64

In [53]:
from data.data_utils import get_offset_mapping

In [54]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': 'd4dd80',
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        ents.append({
                        'start': int(offset[0]), 
                         'end': int(offset[1]), 
                         'label': str(lab.split('-')[1]) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
#         "title": "idx"
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [91]:
idx = random.choice(dfold[(dfold.URL_PERSONAL_s==0)].index)
# idx = dfold[dfold.document==doc].index[0]
# Example usage:
# idx = "uzvemcjmik"
full_text_ds = dfold.iloc[idx]['full_text']
tokens_ds = dfold.iloc[idx]['tokens']
labels_ds = dfold.iloc[idx]['labels']
labels = dfold.iloc[idx]['label']
idx,dfold.iloc[idx]['document'],doc

(101, 7713, 22677)

In [92]:
# dfold.iloc[idx]

In [93]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels_ds) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]
labels_

['B-NAME_STUDENT', 'I-NAME_STUDENT']

In [94]:
visualize(full_text_ds,offset_mapping_,labels_)

In [97]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [98]:
visualize(full_text_ds,offset_mapping_,labels_)

In [96]:
# doc = 204
x = pred_df[pred_df.document==doc]

In [98]:
x['tokens'] = tokens_ds

In [57]:
x['label_gt'] = labels_ds

In [131]:
x['label_e'] = x['label']

In [123]:
x.loc[x.label_e.shift(-1)==x.label_e.shift(1),'label_e'] = x.loc[x.label_e.shift(-1)==x.label_e.shift(1),'label_e'].shift(1)

In [132]:
x.label.value_counts()

O                 851
B-NAME_STUDENT     61
Name: label, dtype: int64

In [61]:
x[x.label!="O"]

Unnamed: 0,document,token,tokens,label,score,label_next_e_prev,label_next,score_next,I,labels,label_pred,row_id,label_gt
42619,5001,34,Rya,B-NAME_STUDENT,0.9701,0,7.0,1.0,0,0-0,0,42619,O
42643,5001,58,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42643,O
42666,5001,81,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42666,O
42681,5001,96,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42681,O
42704,5001,119,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42704,O
42725,5001,140,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42725,O
42767,5001,182,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42767,O
42794,5001,209,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42794,O
42839,5001,254,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42839,O
42863,5001,278,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42863,O


In [60]:
x[x(.score>0.125) & (x.label!="O")]

SyntaxError: invalid syntax (2834079789.py, line 1)

In [1]:
pip install faker-schema

Defaulting to user installation because normal site-packages is not writeable
Collecting faker-schema
  Downloading faker-schema-0.1.4.tar.gz (6.0 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting Faker>=0.7.17
  Downloading Faker-23.2.1-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
Building wheels for collected packages: faker-schema
  Building wheel for faker-schema (setup.py) ... [?25ldone
[?25h  Created wheel for faker-schema: filename=faker_schema-0.1.4-py3-none-any.whl size=6013 sha256=3e6563773e0030f3e8e8a35181b6a9c3c8492d633862a05b51e9fedddd369c9d
  Stored in directory: /home/jovyan/.cache/pip/wheels/d9/34/1e/1e6be31918d66a34f1f4901302688380f33d1dfae7ededd6ad
Successfully built faker-schema
Installing collected packages: Faker, faker-schema
Successfully installed Faker-23.2.1 faker-schema-0.1.4
Note: you may need to restart the kernel to use updated pa

In [2]:
from faker_schema.faker_schema import FakerSchema

schema = {
    'name': 'name',
    'address': 'address',
    'phone_number': 'phone_number',
}

fake = FakerSchema(schema=schema)
print(fake.generate())


TypeError: FakerSchema.__init__() got an unexpected keyword argument 'schema'

In [3]:
from faker_schema.faker_schema import FakerSchema

schema = {'employee_id': 'uuid4', 'employee_name': 'name', 'employee address': 'address',
          'email_address': 'email'}
faker = FakerSchema()
data = faker.generate_fake(schema)
print(data)
# {'employee_id': '956f0cf3-a954-5bff-0aaf-ee0e1b7e1e1b', 'employee_name': 'Adam Wells',
#  'employee address': '189 Kyle Springs Suite 110\nNorth Robin, OR 73512',
#  'email_address': 'jmcgee@gmail.com'}

{'employee_id': '44438f12-3228-488d-8922-ef9dc99f5574', 'employee_name': 'Thomas Jones', 'employee address': '03264 Martin Falls\nLopezville, SC 12733', 'email_address': 'carralexander@example.org'}
