In [1]:
cd ../src

/workspace/Script/NLP/PII/src


In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [3]:
import re
import os
import gc
import math
import time
import json
import random
import numpy as np
import pandas as pd
import wandb

from pathlib import Path

import torch 
import torch.nn as nn
from torch.cuda import amp
import torch.optim as optim
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel, AutoConfig
 
from data.data_utils import to_gpu,to_np
from data.dataset import FeedbackDataset,CustomCollator
from torch.utils.data import DataLoader

from model_zoo.models import FeedbackModel,span_nms,aggregate_tokens_to_words
from metrics_loss.metrics import score_feedback,score,pii_fbeta_score_v2,compute_metrics
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup,get_cosine_with_hard_restarts_schedule_with_warmup

from sklearn.metrics import log_loss 
from tqdm.auto import tqdm

from utils.utils import count_parameters
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Params

In [5]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json',
 'dubai-ar.zip',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'openaddr-collected-global.zip',
 'lecture2.pptx',
 'openaddr-collected-us_west-sa.zip',
 'test.json',
 'moredata_dataset_fixed.csv',
 'sample_submission.csv']

In [6]:
CHECKPOINT_PATH = Path(r"/database/kaggle/PII/checkpoint")

In [7]:
df = pd.read_json(data_path/'train.json')
df.shape

(6807, 5)

In [8]:
df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."


In [9]:
external_data = True

In [10]:
if external_data:
    print("Using external data")
    dx = pd.read_json(data_path/f'mixtral-8x7b-v1.json')
    # dx[name] = -1
    df = pd.concat([df,dx],axis=0).reset_index(drop=True)

Using external data


In [11]:
df.shape

(9162, 5)

In [12]:
from sklearn.model_selection import GroupKFold,StratifiedGroupKFold,KFold,StratifiedKFold

In [13]:
df['has_label'] = (df['labels'].transform(lambda x:len([i for i in x if i!="O" ]))>0)*1
seeds = [42]
folds_names = []
for K in [5]:  
    for seed in seeds:
        mskf = StratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
        name = f"fold_sk_{K}_seed_{seed}"
        df[name] = -1
        for fold, (trn_, val_) in enumerate(mskf.split(df,df['has_label'])):
            df.loc[val_, name] = fold

In [9]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [10]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
for name in LABEL2TYPE[:-1]:
    df[name] = ((df['labels'].transform(lambda x:len([i for i in x if i.split('-')[-1]==name ])))>0)*1

seeds = [42]
folds_names = []
for K in [5]:  
    for seed in seeds:
        mskf = MultilabelStratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
        name = f"fold_msk_{K}_seed_{seed}"
        df[name] = -1
        for fold, (trn_, val_) in enumerate(mskf.split(df,df[list(LABEL2TYPE)[:-1]])):
            df.loc[val_, name] = fold

# Data

In [11]:
from train_utils import inference_step
from types import SimpleNamespace

In [12]:
ID_TYPE = {"0-0":0,"0-1":1,
           "1-0":2,"1-1":3,
           "2-0":4,"2-1":5,
           "3-0":6,"3-1":7,
           "4-0":8,"4-1":9,
           "5-0":10,"5-1":11,
           "6-0":12,"6-1":13
          }
ID_NAME = {"0-0":"B-NAME_STUDENT","0-1":"I-NAME_STUDENT",
           "1-0":"B-EMAIL","1-1":"I-EMAIL",
           "2-0":"B-USERNAME","2-1":"I-USERNAME",
           "3-0":"B-ID_NUM","3-1":"I-ID_NUM",
           "4-0":"B-PHONE_NUM","4-1":"I-PHONE_NUM",
           "5-0":"B-URL_PERSONAL","5-1":"I-URL_PERSONAL",
           "6-0":"B-STREET_ADDRESS","6-1":"I-STREET_ADDRESS",
           "7-0":"O","7-1":"O"
          }

def inference_steps(df,folder,bs=1,fold=0):
    
    # ==== Loading Args =========== #
    f = open(f'{folder}/params.json')
    args = json.load(f)
    args = SimpleNamespace(**args)
    args.val_loader['batch_size'] = bs
    args.model['pretrained_tokenizer'] = f"{folder}/tokenizer"
    args.model['model_params']['config_path'] = f"{folder}/config.pth"
    args.model['pretrained_weights'] = None
    args.model["model_params"]['pretrained_path'] = None
#     args.model["model_params"]['max_len'] = 3048
    
    args.device = 1
    f.close()
    device = torch.device(f"cuda:{args.device}" if torch.cuda.is_available() else "cpu")
    
    # ==== Loading dataset =========== #
    tokenizer = AutoTokenizer.from_pretrained(args.model["model_params"]['model_name'])
    valid_dataset = eval(args.dataset)(df,tokenizer,**args.data["params_valid"])
    
    
    
    # ==== Loading checkpoints =========== #
    checkpoints = [x.as_posix() for x in (Path(folder)).glob("*.pth") if f"fold_{fold}" in x.as_posix()]
    print(checkpoints)
    weights = [1/len(checkpoints)]*len(checkpoints)
    
    
    # ==== Loop Inference =========== #
    doc_ids = []
    tokens = []
    predictions = None
    gt_df = []
    for j,(checkpoint,weight) in enumerate(zip(checkpoints,weights)):
        net = FeedbackModel(**args.model["model_params"])
        net.load_state_dict(torch.load(checkpoint, map_location=lambda storage, loc: storage))
        net = net.to(device)
        net.eval()
        
        collator = CustomCollator(tokenizer,net)
        val_loader = DataLoader(valid_dataset,**args.val_loader,collate_fn=collator)
    

        
        preds = []
        with torch.no_grad():
            for data in tqdm(val_loader):
                data = to_gpu(data, device)
                
                pred = net(data)['pred']
                preds.append(pred.detach().cpu().to(torch.float32))
#                 pred  = pred.softmax(-1)
                
                
                if j==0:
                
                    doc_ids+=[data['text_id']]*pred.shape[0]
                    tokens+=np.arange(pred.shape[0]).tolist()
                    
                    data = to_np(data)
                    gt = pd.DataFrame({
                                      "document":data['text_id'],
                                      "token":np.arange(pred.shape[0]),
                                      "label":data["gt_spans"][:,1],
                                      "I":data["gt_spans"][:,2],
                                     })
                    gt_df.append(gt)

        
        if predictions is not None:
#             predictions = torch.max(predictions,torch.cat(preds,dim=0))
            predictions+= torch.cat(preds,dim=0)*weight
        else:
            predictions = torch.cat(preds,dim=0)*weight
#             predictions+= torch.cat(preds,dim=0)*weight
        print(predictions.shape)
    
    predictions = predictions.softmax(-1)
    s,i = predictions.max(-1)
    pred_df = pd.DataFrame({"document":doc_ids,
                                 "token" : tokens,
                                 "label" : i.numpy() ,
                                 "score" : s.numpy() ,
#                                  "o_score":predictions[:,-1].numpy()
                                 })
    
    # ==== Loop Inference =========== #
    del valid_dataset
    del val_loader
    del net
    del s,i
    del predictions

    gc.collect()
#     torch.cuda.empty_cache()
    
    # ==== Loop Inference =========== #
#     pred_df = pred_df[(pred_df.label!=7) & (pred_df.score>0.5)].reset_index(drop=True)
#     pred_df["I"] = ((pred_df.groupby('document')['label'].transform(lambda x:x.diff())==0) & (pred_df.groupby('document')['token'].transform(lambda x:x.diff())==1))*1
#     pred_df['labels'] = pred_df['label'].astype(str)+'-'+pred_df['I'].astype(str)
#     pred_df["label_pred"] = pred_df["labels"].map(ID_TYPE).fillna(0).astype(int)
#     pred_df['row_id'] = np.arange(len(pred_df))
    

    gt_df = pd.concat(gt_df,axis=0).reset_index(drop=True)
    gt_df = gt_df[gt_df.label!=7].reset_index(drop=True)
    gt_df['labels'] = gt_df['label'].astype(str)+'-'+gt_df['I'].astype(str)
    gt_df["label_gt"] = gt_df["labels"].map(ID_TYPE).fillna(0).astype(int)
    gt_df['row_id'] = np.arange(len(gt_df))

    
    
    return pred_df , gt_df

In [13]:
FOLD_NAME = "fold_msk_5_seed_42"
model_name = "deberta-v3-large"
exp_name = "2024-02-04--vsub1"
folder = str(CHECKPOINT_PATH/Path(fr'{FOLD_NAME}/{model_name}/{exp_name}')) 
os.listdir(folder)

['fold_3 _epoch_5 _step_5856 _valid_loss_0.0071 _f5_micro_0.9985 _train_loss_0.0006.pth',
 'tokenizer',
 'params.json',
 'fold_2 _epoch_9 _step_7331 _valid_loss_0.0072 _f5_micro_0.9984 _train_loss_0.0001.pth',
 'fold_1 _epoch_5 _step_4397 _valid_loss_0.0060 _f5_micro_0.9982 _train_loss_0.0012.pth',
 'fold_4 _epoch_5 _step_7322 _valid_loss_0.0051 _f5_micro_0.9978 _train_loss_0.0011.pth',
 'config.pth',
 'fold_0 _epoch_8 _step_7331 _valid_loss_0.0080 _f5_micro_0.9981 _train_loss_0.0004.pth']

In [14]:
df[FOLD_NAME].value_counts()

4    1362
2    1362
3    1361
0    1361
1    1361
Name: fold_msk_5_seed_42, dtype: int64

In [15]:
FOLD = 0

In [16]:
pred_df_dv3,gt_df = inference_steps(df[df[FOLD_NAME]==FOLD],folder,bs=1,fold=FOLD)

Loaded 1361 samples.
['/database/kaggle/PII/checkpoint/fold_msk_5_seed_42/deberta-v3-large/2024-02-04--vsub1/fold_0 _epoch_8 _step_7331 _valid_loss_0.0080 _f5_micro_0.9981 _train_loss_0.0004.pth']
Pooling: MeanPooling


  0%|          | 0/1361 [00:01<?, ?it/s]

torch.Size([997332, 8])


In [17]:
g  = gt_df.copy()

In [18]:
gt_df['label'] = gt_df['labels'].map(ID_NAME)

In [53]:
pred_df = pred_df_dv3.copy()#pred_df_dv3[(pred_df_dv3.label!=7) & (pred_df_dv3.score>0.15)].reset_index(drop=True)

In [54]:
def make_pred_df(pred_df,threshold=0.15):
    
    pred_df["label_next_e_prev"] = pred_df.groupby('document')['label'].transform(lambda x: (x.shift(1)==x.shift(-1))*1)
    pred_df["label_next"] = pred_df.groupby('document')['label'].transform(lambda x: x.shift(1))
    pred_df["label_next_e_prev"] = ((pred_df["label_next_e_prev"]==1) & (pred_df["label_next"]==6))*1
    pred_df["score_next"] = pred_df.groupby('document')['score'].transform(lambda x: x.shift(1))
    pred_df.loc[pred_df["label_next_e_prev"]==1,"label"] = pred_df.loc[pred_df["label_next_e_prev"]==1,"label_next"]
    pred_df.loc[pred_df["label_next_e_prev"]==1,"score"] = pred_df.loc[pred_df["label_next_e_prev"]==1,"score_next"]
    
    
#     pred_df = pred_df[(pred_df.label!=7) & ((pred_df.score>threshold))].reset_index(drop=True)
    
    pred_df["I"] = ((pred_df.groupby('document')['label'].transform(lambda x:x.diff())==0) & (pred_df.groupby('document')['token'].transform(lambda x:x.diff())==1))*1
    pred_df['labels'] = pred_df['label'].astype(str)+'-'+pred_df['I'].astype(str)
    pred_df["label_pred"] = pred_df["labels"].map(ID_TYPE).fillna(0).astype(int)
    pred_df['row_id'] = np.arange(len(pred_df))
    
    pred_df['label'] = pred_df['labels'].map(ID_NAME)
    return pred_df

In [55]:
pred_df = make_pred_df(pred_df,threshold=0.15)

In [36]:
s = compute_metrics(pred_df, gt_df)
s

{'f5_prec': 0.7753623188405797,
 'f5_rec': 0.9907407407407407,
 'f5_micro': 0.9802677942212826,
 'ents_per_type': {'NAME_STUDENT': 0.9830722986869167,
  'ID_NUM': 0.8965517241379312,
  'URL_PERSONAL': 0.989345509893455,
  'EMAIL': 1.0,
  'USERNAME': 0.9285714285714286,
  'PHONE_NUM': 1.0}}

In [37]:
s['ents_per_type']

{'NAME_STUDENT': 0.9830722986869167,
 'ID_NUM': 0.8965517241379312,
 'URL_PERSONAL': 0.989345509893455,
 'EMAIL': 1.0,
 'USERNAME': 0.9285714285714286,
 'PHONE_NUM': 1.0}

In [38]:
pd.DataFrame({x:[y] for x,y in s['ents_per_type'].items()})

Unnamed: 0,NAME_STUDENT,ID_NUM,URL_PERSONAL,EMAIL,USERNAME,PHONE_NUM
0,0.9831,0.8966,0.9893,1.0,0.9286,1.0


In [39]:
documents = df[df[FOLD_NAME]==FOLD].document.unique()
len(documents)

1361

In [47]:
df_score = []
for doc in tqdm(documents):
    p = pred_df[pred_df.document==doc].reset_index(drop=True)
    gp = gt_df[gt_df.document==doc].reset_index(drop=True)
    
    s = compute_metrics(p, gp)
    
    d = pd.DataFrame({x:[y] for x,y in s['ents_per_type'].items()})
    d["f5_micro"] = s['f5_micro']
    d['document'] = doc
    df_score.append(d)

  0%|          | 0/1361 [00:00<?, ?it/s]

In [48]:
gp

Unnamed: 0,document,token,label,I,labels,label_gt,row_id


In [50]:
df_score = pd.concat(df_score).reset_index(drop=True)

In [52]:
df_score[df_score.f5_micro==0]

Unnamed: 0,NAME_STUDENT,f5_micro,document,URL_PERSONAL,ID_NUM,EMAIL,PHONE_NUM,USERNAME
4,0.0,0.0,204,,,,,
122,0.0,0.0,8758,,,,,
127,0.0,0.0,9031,,,,,
132,0.0,0.0,9313,,,,,
134,0.0,0.0,9399,,,,,
146,0.0,0.0,9961,,,,,
150,0.0,0.0,10070,,,,,
171,0.0,0.0,11856,,,,,
174,0.0,0.0,11896,,,,,
175,0.0,0.0,11901,,,,,0.0


In [104]:
doc = 204

In [105]:
dx = pred_df.groupby("document")['label'].agg(list).reset_index()

In [106]:
dx

Unnamed: 0,document,label
0,16,"[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
1,56,"[O, O, B-NAME_STUDENT, O, B-NAME_STUDENT, O, B..."
2,112,"[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
3,166,"[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
4,204,"[O, O, O, B-NAME_STUDENT, O, O, O, B-NAME_STUD..."
...,...,...
1356,22637,"[O, O, O, O, O, O, O, O, O, O, O, B-NAME_STUDE..."
1357,22646,"[O, O, O, O, B-NAME_STUDENT, O, B-NAME_STUDENT..."
1358,22652,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1359,22664,"[O, O, O, O, O, B-NAME_STUDENT, O, O, O, O, O,..."


In [107]:
dfold = df[df[FOLD_NAME]==FOLD].reset_index(drop=True)
dfold.shape

(1361, 13)

In [108]:
dfold = dfold.merge(dx,how='left',on='document')

In [109]:
dfold.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,fold_msk_5_seed_42,label
0,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...",1,0,0,0,0,0,0,0,"[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
1,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...",1,0,0,0,0,0,0,0,"[O, O, B-NAME_STUDENT, O, B-NAME_STUDENT, O, B..."
2,112,Reflection – Learning Launch\n\nFrancisco Ferr...,"[Reflection, –, Learning, Launch, \n\n, Franci...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...",1,0,0,0,0,0,0,0,"[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
3,166,Pepa Medrano\n\nDesign Thinking for Innovation...,"[Pepa, Medrano, \n\n, Design, Thinking, for, I...","[True, False, False, True, True, True, True, F...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",1,0,0,0,0,0,0,0,"[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
4,204,Reflection – Visualization Deiby\n\nChalleng...,"[Reflection, –, Visualization, , Deiby, \n\n...","[True, True, True, False, False, False, True, ...","[O, O, O, O, B-NAME_STUDENT, O, O, O, O, O, O,...",1,0,0,0,0,0,0,0,"[O, O, O, B-NAME_STUDENT, O, O, O, B-NAME_STUD..."


In [110]:
dfold['len_tok'] = dfold['labels'].transform(lambda x:len(x))
dfold['len_tok_p'] = dfold['label'].transform(lambda x:len(x))

In [111]:
(dfold['len_tok']==dfold['len_tok_p']).value_counts()

True    1361
dtype: int64

In [112]:
from data.data_utils import get_offset_mapping

In [113]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': 'd4dd80',
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        ents.append({
                        'start': int(offset[0]), 
                         'end': int(offset[1]), 
                         'label': str(lab.split('-')[1]) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
#         "title": "idx"
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [114]:
idx = random.choice(dfold[dfold.NAME_STUDENT>0].index)
idx = dfold[dfold.document==doc].index[0]
# Example usage:
# idx = "uzvemcjmik"
full_text_ds = dfold.iloc[idx]['full_text']
tokens_ds = dfold.iloc[idx]['tokens']
labels_ds = dfold.iloc[idx]['labels']
labels = dfold.iloc[idx]['label']
idx,dfold.iloc[idx]['document']

(4, 204)

In [115]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels_ds) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]

In [116]:
visualize(full_text_ds,offset_mapping_,labels_)

In [117]:
offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]

In [118]:
visualize(full_text_ds,offset_mapping_,labels_)

In [119]:
# doc = 204
x = pred_df[pred_df.document==doc]

In [120]:
x['tokens'] = tokens_ds

In [121]:
x['label_gt'] = labels_ds

In [122]:
x['label_e'] = x['label']

In [123]:
x.loc[x.label_e.shift(-1)==x.label_e.shift(1),'label_e'] = x.loc[x.label_e.shift(-1)==x.label_e.shift(1),'label_e'].shift(1)

In [124]:
x.label.value_counts()

O                 851
B-NAME_STUDENT     61
Name: label, dtype: int64

In [125]:
x[x.label_gt!="O"]

Unnamed: 0,document,token,label,score,label_next_e_prev,label_next,score_next,I,labels,label_pred,row_id,tokens,label_gt,label_e
4179,204,4,O,0.9991,0,0.0,0.125,0,7-0,0,4179,Deiby,B-NAME_STUDENT,O


In [127]:
x.head(10) #[x.score>0.125]

Unnamed: 0,document,token,label,score,label_next_e_prev,label_next,score_next,I,labels,label_pred,row_id,tokens,label_gt,label_e
4175,204,0,O,1.0,0,,,0,7-0,0,4175,Reflection,O,O
4176,204,1,O,1.0,0,7.0,1.0,1,7-1,0,4176,–,O,
4177,204,2,O,0.9999,0,7.0,1.0,1,7-1,0,4177,Visualization,O,O
4178,204,3,B-NAME_STUDENT,0.125,0,7.0,0.9999,0,0-0,0,4178,,O,O
4179,204,4,O,0.9991,0,0.0,0.125,0,7-0,0,4179,Deiby,B-NAME_STUDENT,O
4180,204,5,O,1.0,0,7.0,0.9991,1,7-1,0,4180,\n\n,O,B-NAME_STUDENT
4181,204,6,O,1.0,0,7.0,1.0,1,7-1,0,4181,Challenge,O,O
4182,204,7,B-NAME_STUDENT,0.125,0,7.0,1.0,0,0-0,0,4182,,O,O
4183,204,8,O,1.0,0,0.0,0.125,0,7-0,0,4183,For,O,O
4184,204,9,O,1.0,0,7.0,1.0,1,7-1,0,4184,some,O,B-NAME_STUDENT
