In [1]:
cd ../src

/workspace/Script/NLP/PII/src


In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [3]:
import re
import os
import gc
import math
import time
import json
import random
import numpy as np
import pandas as pd
import wandb

from pathlib import Path

import torch 
import torch.nn as nn
from torch.cuda import amp
import torch.optim as optim
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel, AutoConfig
 
from data.data_utils import to_gpu,to_np
from data.dataset import FeedbackDataset,CustomCollator
from torch.utils.data import DataLoader

from model_zoo.models import FeedbackModel,span_nms,aggregate_tokens_to_words
from metrics_loss.metrics import score_feedback,score,pii_fbeta_score_v2,compute_metrics
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup,get_cosine_with_hard_restarts_schedule_with_warmup

from sklearn.metrics import log_loss 
from tqdm.auto import tqdm

from utils.utils import count_parameters
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Params

In [5]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json',
 'dubai-ar.zip',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'openaddr-collected-global.zip',
 'lecture2.pptx',
 'openaddr-collected-us_west-sa.zip',
 'test.json',
 'moredata_dataset_fixed.csv',
 'sample_submission.csv']

In [6]:
CHECKPOINT_PATH = Path(r"/database/kaggle/PII/checkpoint")

In [7]:
df = pd.read_json(data_path/'train.json')
df.shape

(6807, 5)

In [8]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."


In [9]:
external_data = True

In [10]:
if external_data:
    print("Using external data")
    dx = pd.read_json(data_path/f'mixtral-8x7b-v1.json')
    # dx[name] = -1
    df = pd.concat([df,dx],axis=0).reset_index(drop=True)

Using external data


In [11]:
df.shape

(9162, 5)

In [12]:
from sklearn.model_selection import GroupKFold,StratifiedGroupKFold,KFold,StratifiedKFold

In [13]:
df['has_label'] = (df['labels'].transform(lambda x:len([i for i in x if i!="O" ]))>0)*1
seeds = [42]
folds_names = []
for K in [5]:  
    for seed in seeds:
        mskf = StratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
        name = f"fold_sk_{K}_seed_{seed}"
        df[name] = -1
        for fold, (trn_, val_) in enumerate(mskf.split(df,df['has_label'])):
            df.loc[val_, name] = fold

In [14]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [15]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
for name in LABEL2TYPE[:-1]:
    df[name] = ((df['labels'].transform(lambda x:len([i for i in x if i.split('-')[-1]==name ])))>0)*1

seeds = [42]
folds_names = []
for K in [5]:  
    for seed in seeds:
        mskf = MultilabelStratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
        name = f"fold_msk_{K}_seed_{seed}"
        df[name] = -1
        for fold, (trn_, val_) in enumerate(mskf.split(df,df[list(LABEL2TYPE)[:-1]])):
            df.loc[val_, name] = fold

# Data

In [16]:
from train_utils import inference_step
from types import SimpleNamespace

In [17]:
ID_TYPE = {"0-0":0,"0-1":1,
           "1-0":2,"1-1":3,
           "2-0":4,"2-1":5,
           "3-0":6,"3-1":7,
           "4-0":8,"4-1":9,
           "5-0":10,"5-1":11,
           "6-0":12,"6-1":13
          }
ID_NAME = {"0-0":"B-NAME_STUDENT","0-1":"I-NAME_STUDENT",
           "1-0":"B-EMAIL","1-1":"I-EMAIL",
           "2-0":"B-USERNAME","2-1":"I-USERNAME",
           "3-0":"B-ID_NUM","3-1":"I-ID_NUM",
           "4-0":"B-PHONE_NUM","4-1":"I-PHONE_NUM",
           "5-0":"B-URL_PERSONAL","5-1":"I-URL_PERSONAL",
           "6-0":"B-STREET_ADDRESS","6-1":"I-STREET_ADDRESS",
           "7-0":"O","7-1":"O"
          }

def inference_steps(df,folder,bs=1,fold=0):
    
    # ==== Loading Args =========== #
    f = open(f'{folder}/params.json')
    args = json.load(f)
    args = SimpleNamespace(**args)
    args.val_loader['batch_size'] = bs
    args.model['pretrained_tokenizer'] = f"{folder}/tokenizer"
    args.model['model_params']['config_path'] = f"{folder}/config.pth"
    args.model['pretrained_weights'] = None
    args.model["model_params"]['pretrained_path'] = None
#     args.model["model_params"]['max_len'] = 3048
    
    args.device = 1
    f.close()
    device = torch.device(f"cuda:{args.device}" if torch.cuda.is_available() else "cpu")
    
    # ==== Loading dataset =========== #
    tokenizer = AutoTokenizer.from_pretrained(args.model["model_params"]['model_name'])
    valid_dataset = eval(args.dataset)(df,tokenizer,**args.data["params_valid"])
    
    
    
    # ==== Loading checkpoints =========== #
    checkpoints = [x.as_posix() for x in (Path(folder)).glob("*.pth") if f"fold_{fold}" in x.as_posix()]
    print(checkpoints)
    weights = [1/len(checkpoints)]*len(checkpoints)
    
    
    # ==== Loop Inference =========== #
    doc_ids = []
    tokens = []
    predictions = None
    gt_df = []
    for j,(checkpoint,weight) in enumerate(zip(checkpoints,weights)):
        net = FeedbackModel(**args.model["model_params"])
        net.load_state_dict(torch.load(checkpoint, map_location=lambda storage, loc: storage))
        net = net.to(device)
        net.eval()
        
        collator = CustomCollator(tokenizer,net)
        val_loader = DataLoader(valid_dataset,**args.val_loader,collate_fn=collator)
    

        
        preds = []
        with torch.no_grad():
            for data in tqdm(val_loader):
                data = to_gpu(data, device)
                
                pred = net(data)['pred']
                preds.append(pred.detach().cpu().to(torch.float32))
#                 pred  = pred.softmax(-1)
                
                
                if j==0:
                
                    doc_ids+=[data['text_id']]*pred.shape[0]
                    tokens+=np.arange(pred.shape[0]).tolist()
                    
                    data = to_np(data)
                    gt = pd.DataFrame({
                                      "document":data['text_id'],
                                      "token":np.arange(pred.shape[0]),
                                      "label":data["gt_spans"][:,1],
                                      "I":data["gt_spans"][:,2],
                                     })
                    gt_df.append(gt)

        
        if predictions is not None:
#             predictions = torch.max(predictions,torch.cat(preds,dim=0))
            predictions+= torch.cat(preds,dim=0)*weight
        else:
            predictions = torch.cat(preds,dim=0)*weight
#             predictions+= torch.cat(preds,dim=0)*weight
        print(predictions.shape)
    
    predictions = predictions.softmax(-1)
    s,i = predictions.max(-1)
    pred_df = pd.DataFrame({"document":doc_ids,
                                 "token" : tokens,
                                 "label" : i.numpy() ,
                                 "score" : s.numpy() ,
#                                  "o_score":predictions[:,-1].numpy()
                                 })
    
    # ==== Loop Inference =========== #
    del valid_dataset
    del val_loader
    del net
    del s,i
    del predictions

    gc.collect()
#     torch.cuda.empty_cache()
    
    # ==== Loop Inference =========== #
#     pred_df = pred_df[(pred_df.label!=7) & (pred_df.score>0.5)].reset_index(drop=True)
#     pred_df["I"] = ((pred_df.groupby('document')['label'].transform(lambda x:x.diff())==0) & (pred_df.groupby('document')['token'].transform(lambda x:x.diff())==1))*1
#     pred_df['labels'] = pred_df['label'].astype(str)+'-'+pred_df['I'].astype(str)
#     pred_df["label_pred"] = pred_df["labels"].map(ID_TYPE).fillna(0).astype(int)
#     pred_df['row_id'] = np.arange(len(pred_df))
    

    gt_df = pd.concat(gt_df,axis=0).reset_index(drop=True)
    gt_df = gt_df[gt_df.label!=7].reset_index(drop=True)
    gt_df['labels'] = gt_df['label'].astype(str)+'-'+gt_df['I'].astype(str)
    gt_df["label_gt"] = gt_df["labels"].map(ID_TYPE).fillna(0).astype(int)
    gt_df['row_id'] = np.arange(len(gt_df))

    
    
    return pred_df , gt_df

In [18]:
FOLD_NAME = "fold_msk_5_seed_42"
model_name = "deberta-v3-xsmall"
exp_name = "2024-02-03--v1"
folder = str(CHECKPOINT_PATH/Path(fr'{FOLD_NAME}/{model_name}/{exp_name}')) 
os.listdir(folder)

['fold_1 _epoch_8 _step_4397 _valid_loss_0.0240 _f5_micro_0.9189 _train_loss_0.0003.pth',
 'tokenizer',
 'params.json',
 'config.pth',
 'fold_0 _epoch_9 _step_5127 _valid_loss_0.0137 _f5_micro_0.9212 _train_loss_0.0002.pth']

In [23]:
df[FOLD_NAME].value_counts()

4    1833
0    1833
3    1832
2    1832
1    1832
Name: fold_msk_5_seed_42, dtype: int64

In [19]:
FOLD = 1

In [20]:
pred_df_dv3,gt_df = inference_steps(df[df[FOLD_NAME]==FOLD],folder,bs=1,fold=FOLD)

Loaded 1832 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-xsmall/2024-02-03--v1/fold_1 _epoch_8 _step_4397 _valid_loss_0.0240 _f5_micro_0.9189 _train_loss_0.0003.pth']
Pooling: MeanPooling


  0%|          | 0/1832 [00:01<?, ?it/s]

torch.Size([1403405, 8])


In [21]:
pred_df_dv3.shape

(1403405, 4)

In [22]:
pred_df_dv3.head()

Unnamed: 0,document,token,label,score
0,104,0,7,1.0
1,104,1,0,0.125
2,104,2,7,1.0
3,104,3,7,1.0
4,104,4,7,1.0


In [23]:
gt_df.shape

(11476, 7)

In [24]:
g  = gt_df.copy()

In [25]:
gt_df.columns

Index(['document', 'token', 'label', 'I', 'labels', 'label_gt', 'row_id'], dtype='object')

In [26]:
gt_df.head(10)

Unnamed: 0,document,token,label,I,labels,label_gt,row_id
0,104,8,0,0,0-0,0,0
1,104,9,0,1,0-1,1,1
2,204,4,0,0,0-0,0,2
3,269,783,0,0,0-0,0,3
4,269,784,0,1,0-1,1,4
5,288,0,0,0,0-0,0,5
6,288,1,0,1,0-1,1,6
7,375,5,0,0,0-0,0,7
8,375,6,0,1,0-1,1,8
9,470,0,0,0,0-0,0,9


In [27]:
pred_df_dv3.head(10)

Unnamed: 0,document,token,label,score
0,104,0,7,1.0
1,104,1,0,0.125
2,104,2,7,1.0
3,104,3,7,1.0
4,104,4,7,1.0
5,104,5,7,1.0
6,104,6,7,1.0
7,104,7,0,0.7145
8,104,8,0,0.9999
9,104,9,0,0.9999


In [266]:
# p = pred_df_dv3.copy()

In [267]:
# pred_df = p.copy()

In [36]:
pred_df = pred_df_dv3.copy()#pred_df_dv3[(pred_df_dv3.label!=7) & (pred_df_dv3.score>0.15)].reset_index(drop=True)

In [37]:
pred_df["label_next_e_prev"] = pred_df.groupby('document')['label'].transform(lambda x: (x.shift(1)==x.shift(-1))*1)
pred_df["label_next"] = pred_df.groupby('document')['label'].transform(lambda x: x.shift(1))
pred_df["label_next_e_prev"] = ((pred_df["label_next_e_prev"]==1) & (pred_df["label_next"]==6))*1
pred_df["score_next"] = pred_df.groupby('document')['score'].transform(lambda x: x.shift(1))
pred_df.loc[pred_df["label_next_e_prev"]==1,"label"] = pred_df.loc[pred_df["label_next_e_prev"]==1,"label_next"]
pred_df.loc[pred_df["label_next_e_prev"]==1,"score"] = pred_df.loc[pred_df["label_next_e_prev"]==1,"score_next"]

In [38]:
# pred_df = pred_df_dv3.copy()#pred_df_dv3[(pred_df_dv3.label!=7) & (pred_df_dv3.score>0.15)].reset_index(drop=True)
pred_df = pred_df[(pred_df.label!=7) & ((pred_df.score>0.15))].reset_index(drop=True)
pred_df["I"] = ((pred_df.groupby('document')['label'].transform(lambda x:x.diff())==0) & (pred_df.groupby('document')['token'].transform(lambda x:x.diff())==1))*1
pred_df['labels'] = pred_df['label'].astype(str)+'-'+pred_df['I'].astype(str)
pred_df["label_pred"] = pred_df["labels"].map(ID_TYPE).fillna(0).astype(int)
pred_df['row_id'] = np.arange(len(pred_df))
pred_df.shape

(11650, 11)

In [39]:
pred_df.head()

Unnamed: 0,document,token,label,score,label_next_e_prev,label_next,score_next,I,labels,label_pred,row_id
0,104,7,0,0.7145,0,7.0,1.0,0,0-0,0,0
1,104,8,0,0.9999,0,0.0,0.7145,1,0-1,1,1
2,104,9,0,0.9999,0,0.0,0.9999,1,0-1,1,2
3,204,4,0,0.7271,0,0.0,0.125,0,0-0,0,3
4,269,783,0,0.9998,0,7.0,1.0,0,0-0,0,4


In [40]:
pred_df['label'] = pred_df['labels'].map(ID_NAME)

In [41]:
gt_df['label'] = gt_df['labels'].map(ID_NAME)

In [42]:
pred_df.head()

Unnamed: 0,document,token,label,score,label_next_e_prev,label_next,score_next,I,labels,label_pred,row_id
0,104,7,B-NAME_STUDENT,0.7145,0,7.0,1.0,0,0-0,0,0
1,104,8,I-NAME_STUDENT,0.9999,0,0.0,0.7145,1,0-1,1,1
2,104,9,I-NAME_STUDENT,0.9999,0,0.0,0.9999,1,0-1,1,2
3,204,4,B-NAME_STUDENT,0.7271,0,0.0,0.125,0,0-0,0,3
4,269,783,B-NAME_STUDENT,0.9998,0,7.0,1.0,0,0-0,0,4


In [43]:
COL_SUBS = ["row_id",'document', 'token', 'label']

In [44]:
compute_metrics(pred_df, gt_df)

{'f5_prec': 0.9792274678111588,
 'f5_rec': 0.994074590449634,
 'f5_micro': 0.9934952269301625,
 'ents_per_type': {'STREET_ADDRESS': 0.999972257414206,
  'ID_NUM': 0.9908927431985614,
  'NAME_STUDENT': 0.979500320307495,
  'PHONE_NUM': 0.9999803168979432,
  'EMAIL': 0.9999190348959599,
  'USERNAME': 0.9933952010701448,
  'URL_PERSONAL': 0.9978087059520284}}

In [47]:
dx = pred_df.groupby("document")['label'].agg(list).reset_index()

In [215]:
dx

Unnamed: 0,document,label
0,356,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,607,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,1175,"[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
3,1210,"[O, O, O, O, B-NAME_STUDENT, O, O, O, B-NAME_S..."
4,1325,"[O, O, B-NAME_STUDENT, O, O, O, O, O, B-NAME_S..."
...,...,...
1828,zqugxseyet,"[B-NAME_STUDENT, I-NAME_STUDENT, O, B-URL_PERS..."
1829,zrclapokyt,"[B-NAME_STUDENT, I-NAME_STUDENT, O, B-USERNAME..."
1830,ztopzlcozg,"[B-NAME_STUDENT, I-NAME_STUDENT, O, B-EMAIL, O..."
1831,zxyfihlfes,"[B-NAME_STUDENT, I-NAME_STUDENT, O, B-EMAIL, O..."


In [216]:
dfold = df[df[FOLD_NAME]==0].reset_index(drop=True)
dfold.shape

(1833, 13)

In [217]:
dfold = dfold.merge(dx,how='left',on='document')

In [218]:
dfold.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,fold_msk_5_seed_42,label
0,356,Final reflection on Design thinking. Agile met...,"[Final, reflection, on, Design, thinking, ., A...","[True, True, True, True, False, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1,0,0,0,0,0,0,0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,607,Reflection – Storytelling\n\nChallenge & Selec...,"[Reflection, –, Storytelling, \n\n, Challenge,...","[True, True, False, False, True, True, False, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1,0,0,0,0,0,0,0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,1175,Jose Torres\n\nTool: Learning Launch\n\nChalle...,"[Jose, Torres, \n\n, Tool, :, Learning, Launch...","[True, False, False, False, True, True, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",1,0,0,0,0,0,0,0,"[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
3,1210,NGO funding - Storytelling Challenge & Sele...,"[NGO, funding, -, Storytelling, , Challenge...","[True, True, True, True, False, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...",1,0,0,0,0,0,0,0,"[O, O, O, O, B-NAME_STUDENT, O, O, O, B-NAME_S..."
4,1325,Reflection Assignment Design Thinking for Inn...,"[Reflection, Assignment, , Design, Thinking, ...","[True, True, False, True, True, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-N...",1,0,0,0,0,0,0,0,"[O, O, B-NAME_STUDENT, O, O, O, O, O, B-NAME_S..."


In [219]:
dfold['len_tok'] = dfold['labels'].transform(lambda x:len(x))
dfold['len_tok_p'] = dfold['label'].transform(lambda x:len(x))

In [220]:
(dfold['len_tok']==dfold['len_tok_p']).value_counts()

True    1833
dtype: int64

In [221]:
from data.data_utils import get_offset_mapping

In [244]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': 'd4dd80',
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        ents.append({
                        'start': int(offset[0]), 
                         'end': int(offset[1]), 
                         'label': str(lab.split('-')[1]) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
#         "title": "idx"
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [223]:
idx = random.choice(dfold[dfold.STREET_ADDRESS>0].index)
# Example usage:
# idx = "uzvemcjmik"
full_text_ds = dfold.iloc[idx]['full_text']
tokens_ds = dfold.iloc[idx]['tokens']
labels_ds = dfold.iloc[idx]['labels']
labels = dfold.iloc[idx]['label']
idx,dfold.iloc[idx]['document']

(1469, 'ahsmdepqts')

In [224]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels_ds) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]

In [225]:
visualize(full_text_ds,offset_mapping_,labels_)

In [226]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]

In [227]:
visualize(full_text_ds,offset_mapping_,labels_)

In [289]:
x = pred_df[pred_df.document==dfold.iloc[idx]['document']]

In [290]:
x['tokens'] = tokens_ds

In [291]:
x['label_gt'] = labels_ds

In [292]:
x['label_e'] = x['label']

In [249]:
x.loc[x.label_e.shift(-1)==x.label_e.shift(1),'label_e'] = x.loc[x.label_e.shift(-1)==x.label_e.shift(1),'label_e'].shift(1)

In [293]:
x.iloc[:50]

Unnamed: 0,document,token,label,score,label_next_e_prev,label_next,score_next,tokens,label_gt,label_e
1085982,ahsmdepqts,0,0,1.0,0,,,Amir,B-NAME_STUDENT,0
1085983,ahsmdepqts,1,0,0.9999,0,0.0,1.0,Ogrady,I-NAME_STUDENT,0
1085984,ahsmdepqts,2,7,0.9999,0,0.0,0.9999,;,O,7
1085985,ahsmdepqts,3,5,1.0,0,7.0,0.9999,https://www.linkedin.com/ogrady31,B-URL_PERSONAL,5
1085986,ahsmdepqts,4,7,1.0,0,5.0,1.0,;,O,7
1085987,ahsmdepqts,5,1,1.0,0,7.0,1.0,amirogrady86@outlook.com,B-EMAIL,1
1085988,ahsmdepqts,6,7,0.9999,0,1.0,1.0,;,O,7
1085989,ahsmdepqts,7,4,1.0,0,7.0,0.9999,(,B-PHONE_NUM,4
1085990,ahsmdepqts,8,4,1.0,0,4.0,1.0,823)607,I-PHONE_NUM,4
1085991,ahsmdepqts,9,4,1.0,0,4.0,1.0,-,I-PHONE_NUM,4
