In [2]:
cd ../src

/workspace/Script/NLP/PII/src


In [3]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

# Notebook-wide pandas display settings: generous row/column/width limits and
# floats rendered with four decimals.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.float_format', lambda x: '%.4f' % x),
):
    pd.set_option(_option_name, _option_value)

In [4]:
import re
import os
import gc
import math
import time
import json
import random
import numpy as np
import pandas as pd
import wandb

from pathlib import Path

import torch 
import torch.nn as nn
from torch.cuda import amp
import torch.optim as optim
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel, AutoConfig
 
from data.data_utils import to_gpu,to_np
from data.dataset import FeedbackDataset,CustomCollator
from torch.utils.data import DataLoader

from model_zoo.models import FeedbackModel,span_nms,aggregate_tokens_to_words
from metrics_loss.metrics import score_feedback,score,pii_fbeta_score_v2,compute_metrics
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup,get_cosine_with_hard_restarts_schedule_with_warmup

from sklearn.metrics import log_loss 
from tqdm.auto import tqdm

from utils.utils import count_parameters
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter (no category/module given), so deprecation
# and numerical warnings are hidden too — consider narrowing it.
warnings.filterwarnings("ignore")

In [5]:
# Set the TOKENIZERS_PARALLELISM environment variable for this process.
# NOTE(review): presumably read by the Hugging Face tokenizers backend to keep its
# internal parallelism enabled — confirm it is still needed for this analysis notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [6]:
# pip install deepspeed

In [7]:
from tqdm.auto import tqdm
import re
from difflib import SequenceMatcher

import codecs
import os
from collections import Counter
from typing import Dict, List, Tuple

from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from text_unidecode import unidecode
import joblib
import torch

# ======================================================================================== #
def get_text_start_end(txt, s, search_from=0):
    """Locate `s` inside `txt`, starting the search at offset `search_from`.

    The lookup runs in two stages:

    1. An exact, literal substring search.
    2. If that fails, a fuzzy pass: `s` is patched using difflib opcodes computed
       against the remaining text ('replace' spans take the text's characters,
       'delete' spans are dropped, 'insert' spans are ignored) and the patched
       string is searched for literally.

    The token is always treated as literal text. It is never compiled as a regular
    expression, so tokens such as "(" or "*" cannot raise `re.error`.

    Args:
        txt: Text to search in.
        s: Token / substring to locate.
        search_from: Character offset in `txt` where the search starts
            (converted with `int()`).

    Returns:
        A `(start, end)` tuple of character offsets relative to the full `txt`.
        When nothing matches, both values equal `search_from`, i.e. an empty span
        at the search position.
    """
    offset = int(search_from)
    haystack = txt[offset:]

    # Stage 1: plain literal search.
    pos = haystack.find(s)

    if pos < 0:
        # Stage 2: rebuild the token from the opcodes instead of mutating `s`
        # in place. The opcode indices refer to the ORIGINAL `s`, so editing `s`
        # while iterating would shift every later span.
        pieces = []
        for tag, i1, i2, j1, j2 in SequenceMatcher(None, s, haystack).get_opcodes():
            if tag == 'equal':
                pieces.append(s[i1:i2])
            elif tag == 'replace':
                pieces.append(haystack[j1:j2])
            # 'delete' drops s[i1:i2]; 'insert' (text-only characters) is ignored.
        s = ''.join(pieces)
        pos = haystack.find(s)

    if pos < 0:
        # Nothing found even after patching: empty span at the search position.
        return offset, offset
    return pos + offset, pos + len(s) + offset
    
def get_offset_mapping(full_text, tokens):
    """Return one `(start, end)` character span per token, in token order.

    Tokens are located left to right with `get_text_start_end`; each search
    starts where the previous token's span ended.
    """
    spans = []
    cursor = 0
    for tok in tokens:
        span = get_text_start_end(full_text, tok, search_from=cursor)
        spans.append((span[0], span[1]))
        # The next token is searched for after the end of this one.
        cursor = span[1]
    return spans


import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

# Entity label -> highlight colour passed to displaCy via the "colors" option in
# `visualize`. Every value is a '#RRGGBB' hex string; PHONE_NUM previously lacked
# the leading '#', which is not a valid hex colour.
colors = {
    'NAME_STUDENT': '#8000ff',
    'EMAIL': '#2b7ff6',
    'USERNAME': '#2adddd',
    'ID_NUM': '#80ffb4',
    'PHONE_NUM': '#d4dd80',
    'URL_PERSONAL': '#ff8042',
    'STREET_ADDRESS': '#ff0000',
}


def visualize(full_text,offset_mapping,labels):
    """Render `full_text` inline with the given spans highlighted by displaCy.

    `offset_mapping` holds `(start, end)` character spans and `labels` the
    matching tags; the part after the first '-' of each tag is used as the
    entity label (so every tag must contain a '-').
    """
    entity_spans = [
        {
            'start': int(span[0]),
            'end': int(span[1]),
            'label': str(tag.split('-')[1]),
        }
        for span, tag in zip(offset_mapping, labels)
    ]

    # displaCy "manual" mode takes a plain dict instead of a spaCy Doc.
    manual_doc = {"text": full_text, "ents": entity_spans}
    render_options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(manual_doc, style="ent", options=render_options, manual=True, jupyter=True)


# Params

In [8]:
# Directory holding the raw data files; the listing is shown as the cell output.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json',
 'mpware_mixtral8x7b_v1.1-no-i-username.json',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'Fake_data_1850_218.json',
 'test.json',
 'archive.zip',
 'archive',
 'pii-masking-200k.csv',
 'sample_submission.csv',
 'mpware_mixtral8x7b_v1.1.json']

In [9]:
# Root directory under which the experiment folders are looked up.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
CHECKPOINT_PATH = Path(r"/database/kaggle/PII/checkpoint")

In [341]:
# List the entries available under one fold-scheme / model directory.
# NOTE(review): the two path components are hard-coded here and repeated as
# FOLD_NAME / model_name in a later cell — keep them in sync.
os.listdir(CHECKPOINT_PATH/"fold_msk_5_seed_42"/'deberta-v3-large')

['2024-04-04--dv3l_cp_nbrod_add05_rep_05_v1',
 '2024-04-06--dv3l_cp_nbrod_add00_rep_00_v1',
 '2024-02-04--vsub1',
 '2024-04-03--dv3l_cp_nbrod_add05_rep_05_v1']

# Data

In [395]:
# Experiment selector: the three values are the path components below CHECKPOINT_PATH.
FOLD_NAME = "fold_msk_5_seed_42"
model_name = "deberta-v3-large"
exp_name = "2024-02-04--vsub1"
# <CHECKPOINT_PATH>/<FOLD_NAME>/<model_name>/<exp_name>, kept as a plain string
# because later cells build file paths from it as a string.
folder = str(CHECKPOINT_PATH / FOLD_NAME / model_name / exp_name)
os.listdir(folder)

['fold_3 _epoch_5 _step_5856 _valid_loss_0.0071 _f5_micro_0.9985 _train_loss_0.0006.pth',
 'tokenizer',
 'params.json',
 'oof_blend.csv',
 'fold_2 _epoch_9 _step_7331 _valid_loss_0.0072 _f5_micro_0.9984 _train_loss_0.0001.pth',
 'fold_1 _epoch_5 _step_4397 _valid_loss_0.0060 _f5_micro_0.9982 _train_loss_0.0012.pth',
 'fold_4 _epoch_5 _step_7322 _valid_loss_0.0051 _f5_micro_0.9978 _train_loss_0.0011.pth',
 'config.pth',
 'fold_0 _epoch_8 _step_7331 _valid_loss_0.0080 _f5_micro_0.9981 _train_loss_0.0004.pth',
 'oof.csv',
 'tokenizer.zip']

In [369]:
# Load oof.csv from the selected experiment folder (presumably the out-of-fold
# predictions — TODO confirm) and show its shape.
df = pd.read_csv(f"{folder}/oof.csv")
df.shape

(6807, 23)

In [396]:
# Load oof_blend.csv from the same experiment folder (presumably predictions of a
# blend of models — TODO confirm) and show its shape.
df_blend = pd.read_csv(f"{folder}/oof_blend.csv")
df_blend.shape

(6807, 23)

In [361]:
# Snapshot of the current frame, merged back later with the `_m2` suffix.
# NOTE(review): hidden state. The execution counts (this cell ran as In [361], the
# oof.csv load as In [369], the merge as In [370]) suggest `df1` was captured while
# `df` still held a different experiment's predictions. On a top-to-bottom run `df1`
# is just a copy of the frame loaded above, so the `_m2` columns would duplicate the
# unsuffixed ones — load the second experiment explicitly. TODO confirm which one.
df1 = df.copy()

In [370]:
# Left-join the `df1` snapshot on `document`; its overlapping columns get the `_m2` suffix.
# NOTE(review): `df` is overwritten in place, so re-running this cell merges again.
df = df.merge(df1,how='left',on='document',suffixes=('','_m2'))

In [397]:
# Left-join the blend frame on `document`; its overlapping columns get the `_mbl` suffix.
# NOTE(review): `df` is overwritten in place, so re-running this cell merges again.
df = df.merge(df_blend,how='left',on='document',suffixes=('','_mbl'))

In [398]:
# Preview the first two rows of the merged frame.
# NOTE(review): the stored output shows raw essay text, including a person's name —
# clear or redact the output before sharing the notebook.
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,fold_msk_5_seed_42,NAME_STUDENT_s,f5_micro,URL_PERSONAL_s,ID_NUM_s,EMAIL_s,USERNAME_s,PHONE_NUM_s,STREET_ADDRESS_s,label,score,full_text_m2,tokens_m2,trailing_whitespace_m2,labels_m2,NAME_STUDENT_m2,EMAIL_m2,USERNAME_m2,ID_NUM_m2,PHONE_NUM_m2,URL_PERSONAL_m2,STREET_ADDRESS_m2,fold_msk_5_seed_42_m2,NAME_STUDENT_s_m2,f5_micro_m2,URL_PERSONAL_s_m2,EMAIL_s_m2,ID_NUM_s_m2,USERNAME_s_m2,PHONE_NUM_s_m2,STREET_ADDRESS_s_m2,label_m2,score_m2,full_text_size,full_text_mbl,tokens_mbl,trailing_whitespace_mbl,labels_mbl,NAME_STUDENT_mbl,EMAIL_mbl,USERNAME_mbl,ID_NUM_mbl,PHONE_NUM_mbl,URL_PERSONAL_mbl,STREET_ADDRESS_mbl,fold_msk_5_seed_42_mbl,NAME_STUDENT_s_mbl,f5_micro_mbl,URL_PERSONAL_s_mbl,ID_NUM_s_mbl,EMAIL_s_mbl,USERNAME_s_mbl,PHONE_NUM_s_mbl,STREET_ADDRESS_s_mbl,label_mbl,score_mbl
0,7,Design Thinking for innovation reflexion-Avril...,"['Design', 'Thinking', 'for', 'innovation', 'r...","[True, True, True, True, False, False, True, F...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",1,0,0,0,0,0,0,3,0.9811,0.9811,,,,,,,"['O', 'O', 'O', 'O', 'O', 'O', 'B-NAME_STUDENT...","[0.9989989399909973, 0.9996565580368042, 0.999...",Design Thinking for innovation reflexion-Avril...,"['Design', 'Thinking', 'for', 'innovation', 'r...","[True, True, True, True, False, False, True, F...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",1,0,0,0,0,0,0,3,1.0,1.0,,,,,,,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[0.9999632835388184, 0.9999855756759644, 0.999...",3709,Design Thinking for innovation reflexion-Avril...,"['Design', 'Thinking', 'for', 'innovation', 'r...","[True, True, True, True, False, False, True, F...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",1,0,0,0,0,0,0,3,0.9811,0.9811,,,,,,,"['O', 'O', 'O', 'O', 'O', 'O', 'B-NAME_STUDENT...","[0.9989393353462219, 0.9996445178985596, 0.999..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"['Diego', 'Estrada', '\n\n', 'Design', 'Thinki...","[True, False, False, True, True, False, False,...","['B-NAME_STUDENT', 'I-NAME_STUDENT', 'O', 'O',...",1,0,0,0,0,0,0,4,1.0,1.0,,,,,,,"['B-NAME_STUDENT', 'I-NAME_STUDENT', 'O', 'O',...","[0.999984622001648, 0.9999905824661255, 0.9999...",Diego Estrada\n\nDesign Thinking Assignment\n\...,"['Diego', 'Estrada', '\n\n', 'Design', 'Thinki...","[True, False, False, True, True, False, False,...","['B-NAME_STUDENT', 'I-NAME_STUDENT', 'O', 'O',...",1,0,0,0,0,0,0,4,1.0,1.0,,,,,,,"['B-NAME_STUDENT', 'I-NAME_STUDENT', 'O', 'O',...","[0.9999055862426758, 0.9999181032180786, 0.999...",2915,Diego Estrada\n\nDesign Thinking Assignment\n\...,"['Diego', 'Estrada', '\n\n', 'Design', 'Thinki...","[True, False, False, True, True, False, False,...","['B-NAME_STUDENT', 'I-NAME_STUDENT', 'O', 'O',...",1,0,0,0,0,0,0,4,1.0,1.0,,,,,,,"['B-NAME_STUDENT', 'I-NAME_STUDENT', 'O', 'O',...","[0.9999833106994629, 0.9999881982803345, 0.999..."


In [372]:
# Character length of each document's text, used in the correlation table below.
df['full_text_size'] = df['full_text'].str.len()

In [215]:
# Column names of the seven entity types; the same names are the keys of `colors`.
LABELS = ['NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS']

In [216]:
# Per-label column sums over the rows whose `f5_micro` is missing.
df[df['f5_micro'].isna()][LABELS].sum()

NAME_STUDENT      0
EMAIL             0
USERNAME          0
ID_NUM            0
PHONE_NUM         0
URL_PERSONAL      0
STREET_ADDRESS    0
dtype: int64

In [217]:
# Pairwise correlations between the LABELS columns, their `_s` counterparts
# (presumably per-label scores — TODO confirm) and the text length, restricted to
# rows with a non-missing `f5_micro`.
df[df['f5_micro'].notna()][LABELS+[x+'_s' for x in LABELS]+['full_text_size']].corr()

Unnamed: 0,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,NAME_STUDENT_s,EMAIL_s,USERNAME_s,ID_NUM_s,PHONE_NUM_s,URL_PERSONAL_s,STREET_ADDRESS_s,full_text_size
NAME_STUDENT,1.0,0.0259,-0.0542,0.0244,-0.0009,-0.1937,-0.0246,0.9691,0.2254,0.0328,0.5615,0.0913,0.1566,0.5,-0.0386
EMAIL,0.0259,1.0,0.083,0.0119,0.3015,0.0886,-0.0059,0.0625,1.0,0.1771,0.1299,0.4167,0.0092,-1.0,0.0176
USERNAME,-0.0542,0.083,1.0,-0.0111,-0.0038,-0.0167,-0.0027,0.0212,0.0577,1.0,,,,,-0.0171
ID_NUM,0.0244,0.0119,-0.0111,1.0,-0.0099,-0.0433,-0.007,0.0664,0.0577,-0.0892,0.8434,-0.7303,,,-0.1186
PHONE_NUM,-0.0009,0.3015,-0.0038,-0.0099,1.0,0.046,0.352,0.026,-0.277,-0.0622,,1.0,-0.0406,-0.5,-0.009
URL_PERSONAL,-0.1937,0.0886,-0.0167,-0.0433,0.046,1.0,0.0756,0.0271,-0.1845,-0.0622,-0.2223,-0.0913,0.9782,0.5,0.0649
STREET_ADDRESS,-0.0246,-0.0059,-0.0027,-0.007,0.352,0.0756,1.0,-0.0347,-0.6928,,,0.3536,0.0719,1.0,0.0091
NAME_STUDENT_s,0.9691,0.0625,0.0212,0.0664,0.026,0.0271,-0.0347,1.0,-0.0634,0.1968,0.4112,0.6124,0.0526,,-0.0475
EMAIL_s,0.2254,1.0,0.0577,0.0577,-0.277,-0.1845,-0.6928,-0.0634,1.0,,,-0.25,-0.2648,-1.0,-0.0625
USERNAME_s,0.0328,0.1771,1.0,-0.0892,-0.0622,-0.0622,,0.1968,,1.0,,,,,-0.1093


In [324]:
# Scratch cell: pick one random row whose PHONE_NUM_s is below 1 and unpack its fields.
idx = random.choice(df[(df.PHONE_NUM_s<1)].index)
# `idx` is an index *label* (it was drawn from `.index`), so rows are fetched with
# .loc; .iloc would only agree with it on a default 0..n-1 index.
# NOTE(review): eval() executes whatever the CSV cell contains — these columns hold
# stringified lists, so ast.literal_eval would be the safe equivalent.
full_text_ds = df.loc[idx, 'full_text']
tokens_ds = eval(df.loc[idx, 'tokens'])
labels_ds = eval(df.loc[idx, 'labels'])
labels = eval(df.loc[idx, 'label'])
scores = eval(df.loc[idx, 'score'])
idx,df.loc[idx, 'document']

(374, 6537)

In [399]:
def visualize_preds(df, mask, threshold=0.15):
    """Render ground truth and three prediction sets for one random row of `df`.

    A row is drawn at random among those selected by `mask`. Its text is then
    rendered four times with `visualize`: once with the ground-truth tags
    (`labels` column) and once per prediction set (`label`/`score`,
    `label_m2`/`score_m2`, `label_mbl`/`score_mbl`). "O" tags are never shown;
    predicted tags are shown only when their score is strictly above `threshold`.

    Args:
        df: Frame with the columns `full_text`, `tokens`, `labels` and the three
            label/score column pairs listed above; list-valued columns are
            stored as their string representation.
        mask: Boolean selector applied as `df[mask]`.
        threshold: Minimum (exclusive) score for a predicted tag to be displayed.

    Raises:
        IndexError: If `mask` selects no row.
    """
    # Local import: only needed here, to parse the stringified list columns without
    # executing them (the columns come from CSV files, so eval() is avoided).
    import ast

    candidates = df[mask].index
    if len(candidates) == 0:
        raise IndexError("visualize_preds: `mask` does not select any row")
    # The drawn value is an index *label*, so the row is fetched with .loc
    # (.iloc would only agree with it on a default 0..n-1 index).
    row = df.loc[random.choice(candidates)]

    full_text_ds = row['full_text']
    tokens_ds = ast.literal_eval(row['tokens'])
    labels_ds = ast.literal_eval(row['labels'])

    # Parse every prediction set up front so a missing/malformed column fails
    # before anything is rendered.
    prediction_sets = [
        (banner, ast.literal_eval(row[label_col]), ast.literal_eval(row[score_col]))
        for banner, label_col, score_col in (
            (" -------------------------------------- Pred ---------------------------------------", 'label', 'score'),
            (" -------------------------------------- Pred m2---------------------------------------", 'label_m2', 'score_m2'),
            (" -------------------------------------- Pred blend---------------------------------------", 'label_mbl', 'score_mbl'),
        )
    ]

    # Token -> character spans; identical for every section, so computed once.
    offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)

    def _render(tags, keep_flags):
        """Show the non-"O" tags whose keep flag is true."""
        kept = [
            (span, tag)
            for span, tag, keep in zip(offset_mapping, tags, keep_flags)
            if tag != "O" and keep
        ]
        visualize(full_text_ds, [span for span, _ in kept], [tag for _, tag in kept])

    print(" -------------------------------------- GT ---------------------------------------")
    print(" --------------------------------------   ---------------------------------------")
    _render(labels_ds, [True] * len(labels_ds))
    print(" --------------------------------------   ---------------------------------------\n")

    for banner, tags, tag_scores in prediction_sets:
        print(banner)
        print(" --------------------------------------   ---------------------------------------\n")
        _render(tags, [score > threshold for score in tag_scores])

In [380]:
# doc test 9674

In [381]:
# Per-label column sums over rows where the two models' `f5_micro` values differ.
# NaN != NaN evaluates to True, so rows where both values are missing are counted
# here as well.
df[(df.f5_micro!=df.f5_micro_m2)][LABELS].sum()

NAME_STUDENT      105
EMAIL               4
USERNAME            0
ID_NUM              8
PHONE_NUM           1
URL_PERSONAL       11
STREET_ADDRESS      1
dtype: int64

In [413]:
# Show one random document on which the two models' `f5_micro` differ and the
# second model's value is present.
visualize_preds(df,((df.f5_micro!=df.f5_micro_m2) & (df.f5_micro_m2.notna())))

 -------------------------------------- GT ---------------------------------------
 --------------------------------------   ---------------------------------------


 --------------------------------------   ---------------------------------------

 -------------------------------------- Pred ---------------------------------------
 --------------------------------------   ---------------------------------------



 -------------------------------------- Pred m2---------------------------------------
 --------------------------------------   ---------------------------------------



 -------------------------------------- Pred blend---------------------------------------
 --------------------------------------   ---------------------------------------

