In [4]:
cd ../src

/workspace/Script/NLP/PII/src


In [5]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [6]:
import re
import os
import gc
import math
import time
import json
import random
import numpy as np
import pandas as pd
import wandb

from pathlib import Path

import torch 
import torch.nn as nn
from torch.cuda import amp
import torch.optim as optim
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel, AutoConfig
 
from data.data_utils import to_gpu,to_np
from data.dataset import FeedbackDataset,CustomCollator
from torch.utils.data import DataLoader

from model_zoo.models import FeedbackModel,span_nms,aggregate_tokens_to_words
from metrics_loss.metrics import score_feedback,score,pii_fbeta_score_v2,compute_metrics
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup,get_cosine_with_hard_restarts_schedule_with_warmup

from sklearn.metrics import log_loss 
from tqdm.auto import tqdm

from utils.utils import count_parameters
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [7]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Params

In [8]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json',
 'dubai-ar.zip',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'openaddr-collected-global.zip',
 'lecture2.pptx',
 'openaddr-collected-us_west-sa.zip',
 'test.json',
 'moredata_dataset_fixed.csv',
 'sample_submission.csv']

In [9]:
CHECKPOINT_PATH = Path(r"/database/kaggle/PII/checkpoint")

In [22]:
df = pd.read_json(data_path/'train.json')
df.shape

(6807, 5)

In [23]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."


In [24]:
from sklearn.model_selection import GroupKFold,StratifiedGroupKFold,KFold,StratifiedKFold

In [25]:
# df['has_label'] = (df['labels'].transform(lambda x:len([i for i in x if i!="O" ]))>0)*1
# seeds = [42]
# folds_names = []
# for K in [5]:  
#     for seed in seeds:
#         mskf = StratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
#         name = f"fold_sk_{K}_seed_{seed}"
#         df[name] = -1
#         for fold, (trn_, val_) in enumerate(mskf.split(df,df['has_label'])):
#             df.loc[val_, name] = fold

In [26]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [27]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
for name in LABEL2TYPE[:-1]:
    df[name] = ((df['labels'].transform(lambda x:len([i for i in x if i.split('-')[-1]==name ])))>0)*1

seeds = [42]
folds_names = []
for K in [5]:  
    for seed in seeds:
        mskf = MultilabelStratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
        name = f"fold_msk_{K}_seed_{seed}"
        df[name] = -1
        for fold, (trn_, val_) in enumerate(mskf.split(df,df[list(LABEL2TYPE)[:-1]])):
            df.loc[val_, name] = fold

In [16]:
external_data = True

In [17]:
if external_data:
    print("Using external data")
    dx = pd.read_json(data_path/f'mixtral-8x7b-v1.json')
    LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
    for name in LABEL2TYPE[:-1]:
        dx[name] = ((dx['labels'].transform(lambda x:len([i for i in x if i.split('-')[-1]==name ])))>0)*1

    seeds = [42]
    folds_names = []
    for K in [5]:  
        for seed in seeds:
            mskf = MultilabelStratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
            name = f"fold_msk_{K}_seed_{seed}"
            dx[name] = -1
            for fold, (trn_, val_) in enumerate(mskf.split(dx,dx[list(LABEL2TYPE)[:-1]])):
                dx.loc[val_, name] = fold

    # dx[name] = -1
    df = pd.concat([df,dx],axis=0).reset_index(drop=True)

Using external data


# Data

In [28]:
from train_utils import inference_step
from types import SimpleNamespace

In [29]:
ID_TYPE = {"0-0":0,"0-1":1,
           "1-0":2,"1-1":3,
           "2-0":4,"2-1":5,
           "3-0":6,"3-1":7,
           "4-0":8,"4-1":9,
           "5-0":10,"5-1":11,
           "6-0":12,"6-1":13
          }
ID_NAME = {"0-0":"B-NAME_STUDENT","0-1":"I-NAME_STUDENT",
           "1-0":"B-EMAIL","1-1":"I-EMAIL",
           "2-0":"B-USERNAME","2-1":"I-USERNAME",
           "3-0":"B-ID_NUM","3-1":"I-ID_NUM",
           "4-0":"B-PHONE_NUM","4-1":"I-PHONE_NUM",
           "5-0":"B-URL_PERSONAL","5-1":"I-URL_PERSONAL",
           "6-0":"B-STREET_ADDRESS","6-1":"I-STREET_ADDRESS",
           "7-0":"O","7-1":"O"
          }

def inference_steps(df,folder,bs=1,folds=[0]):
    
    # ==== Loading Args =========== #
    f = open(f'{folder}/params.json')
    args = json.load(f)
    args = SimpleNamespace(**args)
    args.val_loader['batch_size'] = bs
    args.model['pretrained_tokenizer'] = f"{folder}/tokenizer"
    args.model['model_params']['config_path'] = f"{folder}/config.pth"
    args.model['pretrained_weights'] = None
    args.model["model_params"]['pretrained_path'] = None
#     args.model["model_params"]['max_len'] = 3048
    
    args.device = 1
    f.close()
    device = torch.device(f"cuda:{args.device}" if torch.cuda.is_available() else "cpu")
    
    # ==== Loading dataset =========== #
    tokenizer = AutoTokenizer.from_pretrained(args.model["model_params"]['model_name'])
    valid_dataset = eval(args.dataset)(df,tokenizer,**args.data["params_valid"])
    
    
    
    # ==== Loading checkpoints =========== #
    checkpoints = [x.as_posix() for x in (Path(folder)).glob("*.pth") if f"config" not in x.as_posix()]
    checkpoints = [ x for x in checkpoints if any([f"fold_{fold}" in x for fold in folds])]
    print(checkpoints)
    weights = [1/len(checkpoints)]*len(checkpoints)
    
    
    # ==== Loop Inference =========== #
    doc_ids = []
    tokens = []
    tokens_v = []
    predictions = None
    gt_df = []
    for j,(checkpoint,weight) in enumerate(zip(checkpoints,weights)):
        net = FeedbackModel(**args.model["model_params"])
        net.load_state_dict(torch.load(checkpoint, map_location=lambda storage, loc: storage))
        net = net.to(device)
        net.eval()
        
        collator = CustomCollator(tokenizer,net)
        val_loader = DataLoader(valid_dataset,**args.val_loader,collate_fn=collator)
    

        
        preds = []
        with torch.no_grad():
            for data in tqdm(val_loader):
                data = to_gpu(data, device)
                
                pred = net(data)['pred']
                preds.append(pred.detach().cpu().to(torch.float32))
#                 pred  = pred.softmax(-1)
                
                
                if j==0:
                
                    doc_ids+=[data['text_id']]*pred.shape[0]
                    tokens+=np.arange(pred.shape[0]).tolist()
                    tokens_v += data['tokens']
                    data = to_np(data)
                    gt = pd.DataFrame({
                                      "document":data['text_id'],
                                      "token":np.arange(pred.shape[0]),
                                      "label":data["gt_spans"][:,1],
                                      "I":data["gt_spans"][:,2],
                                     })
                    gt_df.append(gt)

        
        
        
        if predictions is not None:
            predictions = torch.cat([torch.max(predictions[:, :-1], torch.cat(preds,dim=0)[:, :-1]),
                                     torch.min(predictions[:, -1:], torch.cat(preds,dim=0)[:, -1:])],dim=-1)
            
#             predictions+= torch.cat(preds,dim=0)*weight
        else:
            predictions = torch.cat(preds,dim=0)#*weight
            
#         if predictions is not None:
# #             predictions = torch.max(predictions,torch.cat(preds,dim=0))
#             predictions+= torch.cat(preds,dim=0)*weight
#         else:
#             predictions = torch.cat(preds,dim=0)*weight
#             predictions+= torch.cat(preds,dim=0)*weight
#         print(predictions.shape)
    
    predictions = predictions.softmax(-1)
    s,i = predictions.max(-1)
    pred_df = pd.DataFrame({"document":doc_ids,
                                 "token" : tokens,
                                 "tokens":tokens_v,
                                 "label" : i.numpy() ,
                                 "score" : s.numpy() ,
#                                  "o_score":predictions[:,-1].numpy()
                                 })
    
    # ==== Loop Inference =========== #
    del valid_dataset
    del val_loader
    del net
    del s,i
    del predictions

    gc.collect()
#     torch.cuda.empty_cache()
    
    # ==== Loop Inference =========== #
#     pred_df = pred_df[(pred_df.label!=7) & (pred_df.score>0.5)].reset_index(drop=True)
#     pred_df["I"] = ((pred_df.groupby('document')['label'].transform(lambda x:x.diff())==0) & (pred_df.groupby('document')['token'].transform(lambda x:x.diff())==1))*1
#     pred_df['labels'] = pred_df['label'].astype(str)+'-'+pred_df['I'].astype(str)
#     pred_df["label_pred"] = pred_df["labels"].map(ID_TYPE).fillna(0).astype(int)
#     pred_df['row_id'] = np.arange(len(pred_df))
    

    gt_df = pd.concat(gt_df,axis=0).reset_index(drop=True)
    gt_df = gt_df[gt_df.label!=7].reset_index(drop=True)
    gt_df['labels'] = gt_df['label'].astype(str)+'-'+gt_df['I'].astype(str)
    gt_df["label_gt"] = gt_df["labels"].map(ID_TYPE).fillna(0).astype(int)
    gt_df['row_id'] = np.arange(len(gt_df))

    
    
    return pred_df , gt_df

In [30]:
FOLD_NAME = "fold_msk_5_seed_42"
model_name = "deberta-v3-large"
exp_name = "2024-02-04--vsub1"
folder = str(CHECKPOINT_PATH/Path(fr'{FOLD_NAME}/{model_name}/{exp_name}')) 
os.listdir(folder)

['fold_3 _epoch_5 _step_5856 _valid_loss_0.0071 _f5_micro_0.9985 _train_loss_0.0006.pth',
 'tokenizer',
 'params.json',
 'fold_2 _epoch_9 _step_7331 _valid_loss_0.0072 _f5_micro_0.9984 _train_loss_0.0001.pth',
 'fold_1 _epoch_5 _step_4397 _valid_loss_0.0060 _f5_micro_0.9982 _train_loss_0.0012.pth',
 'fold_4 _epoch_5 _step_7322 _valid_loss_0.0051 _f5_micro_0.9978 _train_loss_0.0011.pth',
 'config.pth',
 'fold_0 _epoch_8 _step_7331 _valid_loss_0.0080 _f5_micro_0.9981 _train_loss_0.0004.pth',
 'tokenizer.zip']

In [31]:
df[FOLD_NAME].value_counts()

4    1362
2    1362
3    1361
0    1361
1    1361
Name: fold_msk_5_seed_42, dtype: int64

In [32]:
pred = []
gt = []
for FOLD in range(5):
    pred_df_dv3,gt_df = inference_steps(df[df[FOLD_NAME]==FOLD],folder,bs=1,folds=[FOLD])
    pred.append(pred_df_dv3)
    gt.append(gt_df)

Loaded 1361 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-large/2024-02-04--vsub1/fold_0 _epoch_8 _step_7331 _valid_loss_0.0080 _f5_micro_0.9981 _train_loss_0.0004.pth']
Pooling: MeanPooling


  0%|          | 0/1361 [00:00<?, ?it/s]

Loaded 1361 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-large/2024-02-04--vsub1/fold_1 _epoch_5 _step_4397 _valid_loss_0.0060 _f5_micro_0.9982 _train_loss_0.0012.pth']
Pooling: MeanPooling


  0%|          | 0/1361 [00:00<?, ?it/s]

Loaded 1362 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-large/2024-02-04--vsub1/fold_2 _epoch_9 _step_7331 _valid_loss_0.0072 _f5_micro_0.9984 _train_loss_0.0001.pth']
Pooling: MeanPooling


  0%|          | 0/1362 [00:00<?, ?it/s]

Loaded 1361 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-large/2024-02-04--vsub1/fold_3 _epoch_5 _step_5856 _valid_loss_0.0071 _f5_micro_0.9985 _train_loss_0.0006.pth']
Pooling: MeanPooling


  0%|          | 0/1361 [00:01<?, ?it/s]

Loaded 1362 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-large/2024-02-04--vsub1/fold_4 _epoch_5 _step_7322 _valid_loss_0.0051 _f5_micro_0.9978 _train_loss_0.0011.pth']
Pooling: MeanPooling


  0%|          | 0/1362 [00:01<?, ?it/s]

In [33]:
gt_df = pd.concat(gt,axis=0)

In [34]:
gt_df.shape

(2739, 7)

In [35]:
gt_df.head()

Unnamed: 0,document,token,label,I,labels,label_gt,row_id
0,16,4,0,0,0-0,0,0
1,16,5,0,1,0-1,1,1
2,56,12,0,0,0-0,0,2
3,56,13,0,1,0-1,1,3
4,112,5,0,0,0-0,0,4


In [21]:
g  = gt_df.copy()

In [36]:
pred_df = pd.concat(pred,axis=0)

In [37]:
pred_df.shape

(4992533, 5)

In [38]:
gt_df['label'] = gt_df['labels'].map(ID_NAME)

In [23]:
pred_df = pred_df_dv3.copy()#pred_df_dv3[(pred_df_dv3.label!=7) & (pred_df_dv3.score>0.15)].reset_index(drop=True)

In [60]:
def make_pred_df(pred_df,threshold=0.15):
    
    pred_df["label_next_e_prev"] = pred_df.groupby('document')['label'].transform(lambda x: (x.shift(1)==x.shift(-1))*1)
    pred_df["label_next"] = pred_df.groupby('document')['label'].transform(lambda x: x.shift(1))
    pred_df["label_next_e_prev"] = ((pred_df["label_next_e_prev"]==1) & (pred_df["label_next"]==6))*1
    pred_df["score_next"] = pred_df.groupby('document')['score'].transform(lambda x: x.shift(1))
    pred_df.loc[pred_df["label_next_e_prev"]==1,"label"] = pred_df.loc[pred_df["label_next_e_prev"]==1,"label_next"]
    pred_df.loc[pred_df["label_next_e_prev"]==1,"score"] = pred_df.loc[pred_df["label_next_e_prev"]==1,"score_next"]
    
    
#     pred_df = pred_df[(pred_df.label!=7) & ((pred_df.score>threshold))].reset_index(drop=True)
    
#     pred_df['token_size'] = pred_df['tokens'].transform(len)
#     pred_df = pred_df[~((pred_df.label==0) & ((pred_df.token_size<=1)))].reset_index(drop=True)
    
    pred_df["I"] = ((pred_df.groupby('document')['label'].transform(lambda x:x.diff())==0) & (pred_df.groupby('document')['token'].transform(lambda x:x.diff())==1))*1
    pred_df['labels'] = pred_df['label'].astype(str)+'-'+pred_df['I'].astype(str)
    pred_df["label_pred"] = pred_df["labels"].map(ID_TYPE).fillna(0).astype(int)
    pred_df['row_id'] = np.arange(len(pred_df))
    
    pred_df['label'] = pred_df['labels'].map(ID_NAME)
    return pred_df

In [40]:
pred_df = make_pred_df(pred_df,threshold=0.15)

In [41]:
pred_df.shape

(3734, 12)

In [42]:
s = compute_metrics(pred_df, gt_df)
s

{'f5_prec': 0.7241564006427423,
 'f5_rec': 0.9872216137276378,
 'f5_micro': 0.9736182470329185,
 'ents_per_type': {'NAME_STUDENT': 0.974754406797958,
  'URL_PERSONAL': 0.9694915254237287,
  'STREET_ADDRESS': 0.9930555555555556,
  'ID_NUM': 0.9298998569384834,
  'EMAIL': 0.9990147783251231,
  'PHONE_NUM': 0.9646643109540636,
  'USERNAME': 0.9629629629629629}}

In [26]:
s = compute_metrics(pred_df, gt_df)
s

{'f5_prec': 0.9671957671957672,
 'f5_rec': 0.9985713085133204,
 'f5_micro': 0.9973269628099173,
 'ents_per_type': {'PHONE_NUM': 0.9998883679392722,
  'STREET_ADDRESS': 0.9998934942753173,
  'ID_NUM': 0.9783760864956541,
  'NAME_STUDENT': 0.9960412504188332,
  'EMAIL': 1.0,
  'URL_PERSONAL': 0.9990062934746605,
  'USERNAME': 0.9948293691830404}}

In [44]:
# pred_df['token_size'].value_counts()

In [45]:
# pred_df.shape,pred_df[pred_df.token_size==1].shape

In [46]:
# pred_df[pred_df.token_size==1]

In [47]:
# s = compute_metrics(pred_df, gt_df)
# s

In [48]:
# s['ents_per_type']

In [176]:
# pd.DataFrame({x:[y] for x,y in s['ents_per_type'].items()})

In [51]:
documents = df.document.unique() #[df[FOLD_NAME]==FOLD]
len(documents)

6807

In [52]:
df_score = []
for doc in tqdm(documents):
    p = pred_df[pred_df.document==doc].reset_index(drop=True)
    gp = gt_df[gt_df.document==doc].reset_index(drop=True)
    
    s = compute_metrics(p, gp)
    
    d = pd.DataFrame({x:[y] for x,y in s['ents_per_type'].items()})
    d["f5_micro"] = s['f5_micro']
    d['document'] = doc
    df_score.append(d)

  0%|          | 0/6807 [00:00<?, ?it/s]

In [54]:
df_score = pd.concat(df_score).reset_index(drop=True)

In [55]:
df_score[df_score.f5_micro==0]

Unnamed: 0,NAME_STUDENT,f5_micro,document,URL_PERSONAL,EMAIL,ID_NUM,USERNAME,PHONE_NUM,STREET_ADDRESS
12,0.0,0.0,204,,,,,,
105,0.0,0.0,3214,,,,,,
107,0.0,0.0,3241,,,,,,
564,0.0,0.0,8758,,,,,,
574,0.0,0.0,8875,,,,,,
580,0.0,0.0,8935,,,,,,
581,0.0,0.0,8936,,,,,,
583,0.0,0.0,8960,,,,,,
589,0.0,0.0,9031,,,,,,
592,0.0,0.0,9098,,,,,,


In [57]:
pdf = pd.concat(pred,axis=0)

In [41]:
doc = 5001

In [61]:
pdf = make_pred_df(pdf,threshold=0.15)

In [62]:
dx = pdf.groupby("document")['label'].agg(list).reset_index()

In [63]:
dx

Unnamed: 0,document,label
0,7,"[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,"[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,"[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,"[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,"[O, O, B-NAME_STUDENT, O, B-NAME_STUDENT, O, B..."
...,...,...
6802,22678,"[O, O, O, O, O, O, O, B-NAME_STUDENT, O, O, O,..."
6803,22679,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6804,22681,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6805,22684,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [81]:
dfold = df.copy().reset_index(drop=True)#[df[FOLD_NAME]==FOLD].reset_index(drop=True)
dfold.shape

(6807, 13)

In [82]:
dfold = dfold.merge(dx,how='left',on='document')

In [83]:
dfold.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,fold_msk_5_seed_42,label
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",1,0,0,0,0,0,0,3,"[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",1,0,0,0,0,0,0,4,"[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...",1,0,0,0,0,0,0,0,"[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...",1,0,0,0,0,0,0,3,"[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...",1,0,0,0,0,0,0,0,"[O, O, B-NAME_STUDENT, O, B-NAME_STUDENT, O, B..."


In [84]:
dfold = dfold.merge(df_score,how='left',on='document',suffixes=('','_s'))

In [184]:
dfold[(dfold.PHONE_NUM>0)&(dfold.f5_micro==0)].index

Int64Index([], dtype='int64')

In [85]:
dfold['len_tok'] = dfold['labels'].transform(lambda x:len(x))
dfold['len_tok_p'] = dfold['label'].transform(lambda x:len(x))

In [86]:
(dfold['len_tok']==dfold['len_tok_p']).value_counts()

True    6807
dtype: int64

In [87]:
from data.data_utils import get_offset_mapping

In [177]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': 'd4dd80',
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        ents.append({
                        'start': int(offset[0]), 
                         'end': int(offset[1]), 
                         'label': str(lab.split('-')[1]) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
#         "title": "idx"
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [178]:
idx = random.choice(dfold[(dfold.ID_NUM>0)&(dfold.f5_micro==0)].index)
# idx = dfold[dfold.document==doc].index[0]
# Example usage:
# idx = "uzvemcjmik"
full_text_ds = dfold.iloc[idx]['full_text']
tokens_ds = dfold.iloc[idx]['tokens']
labels_ds = dfold.iloc[idx]['labels']
labels = dfold.iloc[idx]['label']
idx,dfold.iloc[idx]['document'],doc

(3769, 15717, 22687)

In [179]:
# dfold.iloc[idx]

In [180]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels_ds) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]
labels_

['B-ID_NUM', 'B-ID_NUM']

In [181]:
visualize(full_text_ds,offset_mapping_,labels_)

In [182]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]

In [183]:
visualize(full_text_ds,offset_mapping_,labels_)

In [136]:
# doc = 204
x = pred_df[pred_df.document==doc]

In [98]:
x['tokens'] = tokens_ds

In [57]:
x['label_gt'] = labels_ds

In [131]:
x['label_e'] = x['label']

In [123]:
x.loc[x.label_e.shift(-1)==x.label_e.shift(1),'label_e'] = x.loc[x.label_e.shift(-1)==x.label_e.shift(1),'label_e'].shift(1)

In [132]:
x.label.value_counts()

O                 851
B-NAME_STUDENT     61
Name: label, dtype: int64

In [61]:
x[x.label!="O"]

Unnamed: 0,document,token,tokens,label,score,label_next_e_prev,label_next,score_next,I,labels,label_pred,row_id,label_gt
42619,5001,34,Rya,B-NAME_STUDENT,0.9701,0,7.0,1.0,0,0-0,0,42619,O
42643,5001,58,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42643,O
42666,5001,81,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42666,O
42681,5001,96,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42681,O
42704,5001,119,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42704,O
42725,5001,140,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42725,O
42767,5001,182,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42767,O
42794,5001,209,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42794,O
42839,5001,254,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42839,O
42863,5001,278,,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,42863,O


In [60]:
x[x(.score>0.125) & (x.label!="O")]

SyntaxError: invalid syntax (2834079789.py, line 1)

In [1]:
pip install faker-schema

Defaulting to user installation because normal site-packages is not writeable
Collecting faker-schema
  Downloading faker-schema-0.1.4.tar.gz (6.0 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting Faker>=0.7.17
  Downloading Faker-23.2.1-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
Building wheels for collected packages: faker-schema
  Building wheel for faker-schema (setup.py) ... [?25ldone
[?25h  Created wheel for faker-schema: filename=faker_schema-0.1.4-py3-none-any.whl size=6013 sha256=3e6563773e0030f3e8e8a35181b6a9c3c8492d633862a05b51e9fedddd369c9d
  Stored in directory: /home/jovyan/.cache/pip/wheels/d9/34/1e/1e6be31918d66a34f1f4901302688380f33d1dfae7ededd6ad
Successfully built faker-schema
Installing collected packages: Faker, faker-schema
Successfully installed Faker-23.2.1 faker-schema-0.1.4
Note: you may need to restart the kernel to use updated pa

In [2]:
from faker_schema.faker_schema import FakerSchema

schema = {
    'name': 'name',
    'address': 'address',
    'phone_number': 'phone_number',
}

fake = FakerSchema(schema=schema)
print(fake.generate())


TypeError: FakerSchema.__init__() got an unexpected keyword argument 'schema'

In [3]:
from faker_schema.faker_schema import FakerSchema

schema = {'employee_id': 'uuid4', 'employee_name': 'name', 'employee address': 'address',
          'email_address': 'email'}
faker = FakerSchema()
data = faker.generate_fake(schema)
print(data)
# {'employee_id': '956f0cf3-a954-5bff-0aaf-ee0e1b7e1e1b', 'employee_name': 'Adam Wells',
#  'employee address': '189 Kyle Springs Suite 110\nNorth Robin, OR 73512',
#  'email_address': 'jmcgee@gmail.com'}

{'employee_id': '44438f12-3228-488d-8922-ef9dc99f5574', 'employee_name': 'Thomas Jones', 'employee address': '03264 Martin Falls\nLopezville, SC 12733', 'email_address': 'carralexander@example.org'}
