In [1]:
cd ../src

/workspace/Script/NLP/PII/src


In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [49]:
import re
import os
import gc
import math
import time
import json
import random
import numpy as np
import pandas as pd
import wandb

from pathlib import Path

import torch 
import torch.nn as nn
from torch.cuda import amp
import torch.optim as optim
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel, AutoConfig
 
from data.data_utils import to_gpu,to_np
from data.dataset import FeedbackDataset,CustomCollator
from torch.utils.data import DataLoader

from model_zoo.models import FeedbackModel,span_nms,aggregate_tokens_to_words
from metrics_loss.metrics import score_feedback,score,pii_fbeta_score_v2,compute_metrics,compute_metrics_new
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup,get_polynomial_decay_schedule_with_warmup,get_cosine_with_hard_restarts_schedule_with_warmup

from sklearn.metrics import log_loss 
from tqdm.auto import tqdm

from utils.utils import count_parameters
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Params

In [5]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'Fake_data_1850_218.json',
 'test.json',
 'archive.zip',
 'archive',
 'pii-masking-200k.csv',
 'moredata_dataset_fixed.csv',
 'sample_submission.csv',
 'mpware_mixtral8x7b_v1.1.json']

In [6]:
CHECKPOINT_PATH = Path(r"/database/kaggle/PII/checkpoint")

In [7]:
df = pd.read_csv(data_path/'pii-masking-200k.csv')
df.shape

(165750, 5)

In [8]:
df["offset_mapping"] = df["offset_mapping"].transform(lambda x:eval(x))
df["labels"] = df["labels"].transform(lambda x:eval(x))
df["tokens"] = df["tokens"].transform(lambda x:eval(x))

In [10]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
for name in LABEL2TYPE[:-1]:
    df[name] = ((df['labels'].transform(lambda x:len([i for i in x if i==name ])))>0)*1

In [130]:
FOLD_NAME = "fold_msk_5_seed_42"
model_name = "deberta-v3-xsmall"
exp_name = "2024-02-03--v2_5fold"
FOLD = 0
df_pred = pd.read_csv(CHECKPOINT_PATH/Path(fr'{FOLD_NAME}/{model_name}/{exp_name}')/f'pii-200-ms-blend.csv')
df_pred.shape

(5393041, 5)

In [131]:
gt_df = pd.read_csv(CHECKPOINT_PATH/Path(fr'{FOLD_NAME}/{model_name}/{exp_name}')/'pii-200-ms-gt.csv')
gt_df.shape

(475354, 7)

In [132]:
df_pred.head(10)

Unnamed: 0,document,token,tokens,label,score
0,0.0,0,A,7.0,0.9999
1,0.0,1,student,7.0,0.9999
2,0.0,2,'s,7.0,0.9983
3,0.0,3,assessment,7.0,0.9996
4,0.0,4,was,7.0,0.9999
5,0.0,5,found,7.0,0.9999
6,0.0,6,on,7.0,0.9998
7,0.0,7,device,7.0,0.9996
8,0.0,8,bearing,7.0,0.9996
9,0.0,9,IMEI,7.0,0.9949


In [133]:
LABEL2TYPE = ('NAME_STUDENT','EMAIL','USERNAME','ID_NUM', 'PHONE_NUM','URL_PERSONAL','STREET_ADDRESS','O')
TYPE2LABEL = {t: l for l, t in enumerate(LABEL2TYPE)}
LABEL2TYPE = {l: t for l, t in enumerate(LABEL2TYPE)}
    
def make_pred_df(pred_df,threshold=0.15):
    
    pred_df["label_next_e_prev"] = pred_df.groupby('document')['label'].transform(lambda x: (x.shift(1)==x.shift(-1))*1)
    pred_df["label_next"] = pred_df.groupby('document')['label'].transform(lambda x: x.shift(1))
    pred_df["label_next_e_prev"] = ((pred_df["label_next_e_prev"]==1) & (pred_df["label_next"]==6))*1
    pred_df["score_next"] = pred_df.groupby('document')['score'].transform(lambda x: x.shift(1))
    pred_df.loc[pred_df["label_next_e_prev"]==1,"label"] = pred_df.loc[pred_df["label_next_e_prev"]==1,"label_next"]
    pred_df.loc[pred_df["label_next_e_prev"]==1,"score"] = pred_df.loc[pred_df["label_next_e_prev"]==1,"score_next"]
    
    
    if threshold>0:
        pred_df = pred_df[(pred_df.label!=7) & ((pred_df.score>threshold))].reset_index(drop=True)
    
#     pred_df['token_size'] = pred_df['tokens'].transform(len)
#     pred_df = pred_df[~((pred_df.label==0) & ((pred_df.token_size<=1)))].reset_index(drop=True)
    
#     pred_df["I"] = ((pred_df.groupby('document')['label'].transform(lambda x:x.diff())==0) & (pred_df.groupby('document')['token'].transform(lambda x:x.diff())==1))*1
#     pred_df['labels'] = pred_df['label'].astype(str)+'-'+pred_df['I'].astype(str)
#     pred_df["label_pred"] = pred_df["labels"].map(ID_TYPE).fillna(0).astype(int)
    pred_df['row_id'] = np.arange(len(pred_df))
    
    pred_df['label'] = pred_df['label'].map(LABEL2TYPE)
    return pred_df

In [134]:
pred_df = make_pred_df(df_pred.copy(),threshold=0.15)

In [135]:
pred_df.shape

(558266, 9)

In [138]:
gt_df.sample(20)

Unnamed: 0,document,token,label,I,labels,label_gt,row_id
165046,57345,18,3,1,3-1,7,165046
359512,125290,27,4,1,4-1,9,359512
424800,147978,39,4,1,4-1,9,424800
52780,18369,6,6,1,6-1,13,52780
466668,162702,17,0,1,0-1,1,466668
425459,148190,2,0,1,0-1,1,425459
200003,69537,30,3,1,3-1,7,200003
204738,71184,18,4,1,4-1,9,204738
19472,6788,20,3,1,3-1,7,19472
62937,21818,37,4,1,4-1,9,62937


In [139]:
gt_df['label'] = gt_df['label'].map(LABEL2TYPE)

In [140]:
s = compute_metrics_new(pred_df, gt_df)
s

{'f5_prec': 0.6343714286737864,
 'f5_rec': 0.7450195012559061,
 'f5_micro': 0.740054826687036,
 'ents_per_type': {'NAME_STUDENT': 0.9161869519044439,
  'PHONE_NUM': 0.8196143689770669,
  'ID_NUM': 0.6773444304848816,
  'EMAIL': 0.9979601644684145,
  'URL_PERSONAL': 0.0,
  'STREET_ADDRESS': 0.5094447680377986,
  'USERNAME': 0.04618525732792002}}

In [147]:
gt_df

Unnamed: 0,document,token,label,I,labels,label_gt,row_id
0,0,11,PHONE_NUM,1,4-1,9,0
1,0,12,PHONE_NUM,1,4-1,9,1
2,0,13,PHONE_NUM,1,4-1,9,2
3,0,14,PHONE_NUM,1,4-1,9,3
4,0,15,PHONE_NUM,1,4-1,9,4
...,...,...,...,...,...,...,...
475349,165749,55,ID_NUM,1,3-1,7,475349
475350,165749,56,ID_NUM,1,3-1,7,475350
475351,165749,57,ID_NUM,1,3-1,7,475351
475352,165749,58,ID_NUM,1,3-1,7,475352


In [142]:
pred_df

Unnamed: 0,document,token,tokens,label,score,label_next_e_prev,label_next,score_next,row_id
0,0.0000,11,06,ID_NUM,0.5595,0,7.0000,0.9993,0
1,0.0000,12,-,PHONE_NUM,0.9366,0,3.0000,0.5595,1
2,0.0000,13,184755,ID_NUM,0.6575,0,4.0000,0.9366,2
3,0.0000,14,-,PHONE_NUM,0.9981,0,3.0000,0.6575,3
4,0.0000,15,866851,PHONE_NUM,0.9660,0,4.0000,0.9981,4
...,...,...,...,...,...,...,...,...,...
558261,165748.0000,20,01T15:45:44.147Z,ID_NUM,0.9864,0,7.0000,0.8685,558261
558262,165748.0000,22,Lourdes,NAME_STUDENT,0.8713,0,7.0000,0.9602,558262
558263,165749.0000,16,https://bruised-snob.name,URL_PERSONAL,0.9991,0,7.0000,0.9994,558263
558264,165749.0000,38,10_9_8,ID_NUM,0.8942,0,7.0000,0.9398,558264


In [143]:
documents = df.document.unique() #[df[FOLD_NAME]==FOLD]
len(documents)

165750

In [144]:
df_score = []
for doc in tqdm(documents):
    p = pred_df[pred_df.document==doc].reset_index(drop=True)
    gp = gt_df[gt_df.document==doc].reset_index(drop=True)
    
    s = compute_metrics_new(p, gp)
    
    d = pd.DataFrame({x:[y] for x,y in s['ents_per_type'].items()})
    d["f5_micro"] = s['f5_micro']
    d['document'] = doc
    df_score.append(d)

  0%|          | 0/165750 [00:00<?, ?it/s]

In [145]:
df_score = pd.concat(df_score).reset_index(drop=True)

In [148]:
df_score[df_score.f5_micro==0]

Unnamed: 0,PHONE_NUM,ID_NUM,f5_micro,document,NAME_STUDENT,STREET_ADDRESS,EMAIL,USERNAME,URL_PERSONAL
27,0.0000,0.0000,0.0000,27,,,,,
61,,,0.0000,61,0.0000,,,0.0000,
62,,,0.0000,62,0.0000,,,0.0000,
82,,0.0000,0.0000,82,0.0000,,,0.0000,
90,,0.0000,0.0000,90,,,,,
...,...,...,...,...,...,...,...,...,...
165709,0.0000,0.0000,0.0000,165709,,,,,
165712,,,0.0000,165712,0.0000,,,,
165730,,,0.0000,165730,,,0.0000,0.0000,
165743,,0.0000,0.0000,165743,0.0000,,,,0.0000


In [149]:
pdf = make_pred_df(df_pred.copy(),threshold=-1)

In [150]:
dx = pdf.groupby("document")['label'].agg(list).reset_index()

In [57]:
dx

Unnamed: 0,document,label
0,0.0000,"[O, O, O, O, O, O, O, O, O, O, O, ID_NUM, ID_N..."
1,1.0000,"[O, NAME_STUDENT, O, O, O, O, O, O, O, O, ID_N..."
2,2.0000,"[NAME_STUDENT, O, O, O, O, O, O, O, O, O, O, O..."
3,3.0000,"[O, O, O, O, O, O, O, O, O, ID_NUM, O, O, O, O..."
4,4.0000,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
165745,165745.0000,"[O, NAME_STUDENT, O, O, O, O, O, O, O, O, O, O..."
165746,165746.0000,"[NAME_STUDENT, NAME_STUDENT, O, O, O, O, O, O,..."
165747,165747.0000,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
165748,165748.0000,"[O, O, O, O, O, O, O, O, O, O, O, ID_NUM, ID_N..."


In [151]:
dfold = df.copy().reset_index(drop=True)#[df[FOLD_NAME]==FOLD].reset_index(drop=True)
dfold.shape

(165750, 12)

In [152]:
dfold = dfold.merge(dx,how='left',on='document')

In [153]:
dfold.head()

Unnamed: 0,document,full_text,tokens,labels,offset_mapping,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,label
0,0,A student's assessment was found on device bea...,"[A, student, 's, assessment, was, found, on, d...","[O, O, O, O, O, O, O, O, O, O, O, PHONE_NUM, P...","[(0, 1), (2, 9), (9, 11), (12, 22), (23, 26), ...",0,0,0,0,1,0,0,"[O, O, O, O, O, O, O, O, O, O, O, ID_NUM, PHON..."
1,1,"Dear Omer, as per our records, your license 78...","[Dear, Omer, ,, as, per, our, records, ,, your...","[O, NAME_STUDENT, O, O, O, O, O, O, O, O, ID_N...","[(0, 4), (5, 9), (9, 10), (11, 13), (14, 17), ...",1,0,0,1,0,0,0,"[O, NAME_STUDENT, O, O, O, O, O, O, O, O, ID_N..."
2,2,Kattie could you please share your recomndatio...,"[Kattie, could, you, please, share, your, reco...","[NAME_STUDENT, O, O, O, O, O, O, O, O, O, O, O...","[(0, 6), (7, 12), (13, 16), (17, 23), (24, 29)...",1,0,0,0,0,0,0,"[NAME_STUDENT, O, O, O, O, O, O, O, O, O, O, O..."
3,3,Emergency supplies in 16356 need a refill. Use...,"[Emergency, supplies, in, 16356, need, a, refi...","[O, O, O, O, O, O, O, O, O, ID_NUM, O, O, O, O...","[(0, 9), (10, 18), (19, 21), (22, 27), (28, 32...",0,0,0,1,0,0,0,"[O, O, O, NAME_STUDENT, O, O, O, O, O, ID_NUM,..."
4,4,"The 88 old child at 5862, has showcased an unu...","[The, 88, old, child, at, 5862, ,, has, showca...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(0, 3), (4, 6), (7, 10), (11, 16), (17, 19), ...",0,0,0,1,0,0,0,"[O, O, O, O, O, STREET_ADDRESS, O, O, O, O, O,..."


In [154]:
dfold = dfold.merge(df_score,how='left',on='document',suffixes=('','_s'))

In [62]:
dfold[(dfold.STREET_ADDRESS>0)&(dfold.f5_micro<1)].index

Int64Index([    11,     86,    132,    199,    264,    268,    298,    326,    335,    359,
            ...
            165073, 165182, 165189, 165272, 165406, 165517, 165519, 165561, 165563, 165735], dtype='int64', length=4057)

In [155]:
dfold['len_tok'] = dfold['labels'].transform(lambda x:len(x))
dfold['len_tok_p'] = dfold['label'].transform(lambda x:len(x))

In [156]:
(dfold['len_tok']==dfold['len_tok_p']).value_counts()

True    165750
dtype: int64

In [157]:
from data.data_utils import get_offset_mapping

In [158]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': 'd4dd80',
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        ents.append({
                        'start': int(offset[0]), 
                         'end': int(offset[1]), 
                         'label': str(lab) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
#         "title": "idx"
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [190]:
dfold = dfold.reset_index(drop=True)
dfold.shape

(165750, 23)

In [1038]:
idx = random.choice(dfold[(dfold.PHONE_NUM>0) & (dfold.PHONE_NUM_s<0.9)].index)
# idx = dfold[dfold.document==doc].index[0]
# Example usage:
# idx = "uzvemcjmik"
full_text_ds = dfold.iloc[idx]['full_text']
tokens_ds = dfold.iloc[idx]['tokens']
labels_ds = dfold.iloc[idx]['labels']
labels = dfold.iloc[idx]['label']
doc = dfold.iloc[idx]['document']
offset_mapping = dfold.iloc[idx]['offset_mapping']
idx,dfold.iloc[idx]['document'],doc

(53167, 53167, 53167)

In [1039]:
# dfold.iloc[idx]

In [1040]:
# offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels_ds) if y!="O"]
labels_ = [x for x in labels_ds if x!="O"]
labels_

['PHONE_NUM', 'PHONE_NUM', 'PHONE_NUM', 'PHONE_NUM']

In [1041]:
visualize(full_text_ds,offset_mapping_,labels_)

In [1042]:
full_text_ds

'Chers parents, notre excursion annuelle est prévue pour le May 27, 1951 et nous nous dirigerons vers le Northwest. Veuillez appeler le +425.71-055 3960 si vous avez des questions.'

In [1043]:
# offset_mapping = get_offset_mapping(full_text_ds, tokens_ds)
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [1044]:
visualize(full_text_ds,offset_mapping_,labels_)

In [1045]:
# RE_ID_PHONE = r"""(\b\d{1,}-\d{1,}|
#                    \b\d{1,}\s*\d{1,}\s*\d{1,}|
#                    \b\d{2,}-\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|
#                    \b\d{2,}-\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|
#                    \b\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|
#                    \b\d{2,}\-\d{2,}\-\d{2,}\b|
#                    \b\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|
#                    \b\d{2,}\.\d{2,}\.\d{2,}\b|
#                    \(\s*\d{3}\s*\)\s*\d{3}\s*\-\s*\d{4}\s*\w{0,3}(\s*\d{1,8}\s*)?|
#                    \d{3}\s*\.\s*\d{3}\s*\.\s*\d{1,5}|
#                    \d{3}\s*\-\s*\d{3}\s*\-\s*\d{1,5}|
#                    \d{3}\s*x\s*\d{3}\s*x\s*\d{1,5}|
#                    \(?\+\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,3}\s*[\.\-x]?\d{1,3}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|
#                    \d{1,3}\s{0,2}\d{1,}\s{0,2}\d{1,})"""

In [1046]:
RE_ID_PHONE = r"""(\(?\+\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(?\s*\d{1,4}\s*\)?\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s*[\.\-x]?\d{1,5}\s{0,2}\d{0,5}|\(\s*\d{3}\s*\)\s*\d{3}\s*\-\s*\d{4}\s*\w{0,3}(\s*\d{1,8}\s*)?|\b\d{2,}-\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{2,}-\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\-\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\.\d{2,}\b|\d{3}\s*\.\s*\d{3}\s*\.\s*\d{1,5}|\d{3}\s*\-\s*\d{3}\s*\-\s*\d{1,5}|\d{3}\s*x\s*\d{3}\s*x\s*\d{1,5}|\d{1,3}\s{0,2}\d{1,}\s{0,2}\d{1,}|\b\d{1,}\s*\d{1,}\s*\d{1,}|\b\d{2,}\-\d{2,}\-\d{2,}\b|\b\d{2,}\.\d{2,}\.\d{2,}\b|\b\d{1,}-\d{1,})"""

In [1047]:
# Compile the regex pattern
regex = re.compile(RE_ID_PHONE)

In [1049]:
def strip_offset_mapping(text, offset_mapping):
    ret = []
    for start, end in offset_mapping:
        match = list(re.finditer('\S+', text[start:end]))
        
        if len(match) == 0:
            ret.append((start, end))
        else:
            span_start, span_end = match[0].span()
            ret.append((start + span_start, start + span_end))
    return np.array(ret)

In [1050]:
def find_patterns(text,regex):
    matches = [(match.group(0), match.start(), match.end()) for match in regex.finditer(text)]
    offsets = strip_offset_mapping(text,[(m[1],m[2]) for m in matches])
    return [m[0].strip() for m in matches],offsets

In [1051]:
find_patterns(full_text_ds,regex)

(['1951', '+425.71-055 3960'],
 array([[ 67,  71],
        [135, 146]]))

In [305]:
dfold[(dfold.document==doc)]

Unnamed: 0,document,full_text,tokens,labels,offset_mapping,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,label,PHONE_NUM_s,ID_NUM_s,f5_micro,NAME_STUDENT_s,STREET_ADDRESS_s,EMAIL_s,USERNAME_s,URL_PERSONAL_s,len_tok,len_tok_p
103630,103630,Der Angiogramm von Lila zeigt einen akuten Han...,"[Der, Angiogramm, von, Lila, zeigt, einen, aku...","[O, O, O, NAME_STUDENT, O, O, O, O, O, O, O, O...","[(0, 3), (4, 14), (15, 18), (19, 23), (24, 29)...",1,0,0,0,1,0,0,"[O, O, O, NAME_STUDENT, O, O, O, O, O, O, O, O...",0.0,0.0,0.25,1.0,,,,,25,25


In [306]:
dfold[(dfold.document==doc)].label.values[-1]

['O',
 'O',
 'O',
 'NAME_STUDENT',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'ID_NUM',
 'ID_NUM',
 'ID_NUM',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [307]:
pdf[(pdf.document==doc)]

Unnamed: 0,document,token,tokens,label,score,label_next_e_prev,label_next,score_next,row_id
3402557,103630.0,0,Der,O,0.9895,0,,,3402557
3402558,103630.0,1,Angiogramm,O,0.9886,0,7.0,0.9895,3402558
3402559,103630.0,2,von,O,0.9918,0,7.0,0.9886,3402559
3402560,103630.0,3,Lila,NAME_STUDENT,0.7391,0,7.0,0.9918,3402560
3402561,103630.0,4,zeigt,O,0.9986,0,0.0,0.7391,3402561
3402562,103630.0,5,einen,O,0.9991,0,7.0,0.9986,3402562
3402563,103630.0,6,akuten,O,0.9961,0,7.0,0.9991,3402563
3402564,103630.0,7,Handlungsbedarf,O,0.9959,0,7.0,0.9961,3402564
3402565,103630.0,8,.,O,0.9987,0,7.0,0.9959,3402565
3402566,103630.0,9,Bitte,O,0.9986,0,7.0,0.9987,3402566


In [206]:
labels

['NAME_STUDENT',
 'NAME_STUDENT',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'ID_NUM',
 'ID_NUM',
 'ID_NUM',
 'PHONE_NUM',
 'PHONE_NUM',
 'PHONE_NUM',
 'ID_NUM',
 'O']