Descriptions

[GitHub code: Hugging Face run_ner](https://github.com/huggingface/transformers/blob/master/examples/pytorch/token-classification/run_ner.py)

[Hugging Face README: Named Entity Recognition](https://huggingface.co/transformers/task_summary.html#named-entity-recognition)

In [1]:
from IPython.display import clear_output
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pytorch_lightning
!pip install transformers
!pip install sentencepiece
clear_output()

In [3]:
# https://github.com/huggingface/datasets
!pip install datasets
!pip install seqeval
clear_output()

In [None]:
!pip install wandb -qqq
clear_output()

In [None]:
# Log in to Weights & Biases (the recorded output shows an existing session being reused).
# NOTE(review): the training command in this notebook passes --report_to 'none',
# so this login does not look necessary for that run — confirm before removing.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbeifa[0m (use `wandb login --relogin` to force relogin)


True

In [None]:
import json
import torch
import datetime
from tqdm import tqdm
import torch.nn as nn
import os, glob, re
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt

import transformers

from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
# Fix the global seed so shuffles/sampling are repeatable (output: "Global seed set to 13").
pl.seed_everything(13)
# Record the PyTorch version this notebook was run with.
print(torch.__version__)
# Input-data folder on the Google Drive mounted under /content/drive.
PATH = '/content/drive/MyDrive/Coleridge_Initiative/input'

Global seed set to 13


1.8.1+cu101


In [None]:
!nvidia-smi

## Make data for NER

1. Full text split into chunks of 64 words
2. My 512 texts, not split
3. My 512 texts, split

In [None]:
# Load the question/answer table from the shared input folder (PATH is set in the
# imports cell) instead of repeating the absolute Drive path here.
df = pd.read_csv(os.path.join(PATH, 'v6_data_qa.csv'))
# Sanity check: list rows that have no answer (the recorded output shows none).
df[df.answer.isna()]

Unnamed: 0,question,text,answer,answer_start,answer_end,origin_text,origin_answer_start,origin_answer_end,len text,id


In [None]:
def check_len(data:list, len_skip:int = 10)->list:
    """
    Keep only the texts that contain at least `len_skip` whitespace-separated words.

    data: list of str   texts to filter
    len_skip: int       minimum number of words a text needs in order to be kept
    return: list        the kept texts, in their original order
    """
    return [text for text in data if len(text.split()) >= len_skip]


def make_split_txt(txt:str, split:bool = None, len_split:int = 64, crossing:int = 12)->list:
    """
    Split a text into overlapping chunks of at most `len_split` words.

    txt: str          text to split (words are separated on whitespace)
    split: bool       truthy -> split the text; None/False -> return the text as one piece
    len_split: int    maximum number of words per chunk
    crossing: int     number of words shared by two consecutive chunks
                      (must satisfy 0 <= crossing < len_split)

        txt =  is simply dummy text of the printing and typesetting industry
            Lorem Ipsum has been the industry's standard dummy text ever since the 1500s

            len_split = 5, crossing= 2
            1. is, simply, dummy, text, of,
            2.                    text, of, the, printing, and
            3.                                   printing, and, typesetting, industry, Lorem

    return: list of str
        [txt] unchanged when `split` is falsy or the text has at most `len_split` words,
        otherwise the chunks, each re-joined with single spaces.

    raises: ValueError when the text has to be chunked and `crossing` is not in
        [0, len_split) - the window could not advance (or would skip words).
    """
    # `split=False` must behave like the default: no splitting at all.
    if not split:
        return [txt]

    words = txt.split()
    if len(words) <= len_split:
        return [txt]

    if not 0 <= crossing < len_split:
        raise ValueError(
            f'crossing must satisfy 0 <= crossing < len_split, '
            f'got crossing={crossing}, len_split={len_split}'
        )

    step = len_split - crossing
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + len_split]))
        # This window already reached the last word: a further window would
        # only repeat overlap words that are covered here.
        if start + len_split >= len(words):
            break
    return chunks


def find_idx_eq_words(txt_w:np.array, label_w: np.array)->list:
    """
    Find every position where the whole label occurs inside the text.

    txt_w: sequence of words of the text
    label_w: sequence of words of the label

        ['trends', 'in', 'international', 'mathematics', 'and', 'science', 'study']
                    == ['trends', 'in', 'international', 'mathematics', 'and', 'science', 'study']

    return: list of int
        start index in `txt_w` of each full match of `label_w`, in ascending order;
        empty when the label is empty, longer than the text, or not present.
    """
    # Work on plain lists so slice comparison yields a single bool.
    txt_w = list(txt_w)
    label_w = list(label_w)
    size = len(label_w)
    if size == 0:
        # An empty label would otherwise "match" at every position.
        return []
    # +1 so the last window (label at the very end of the text) is checked too.
    return [i for i in range(len(txt_w) - size + 1) if txt_w[i:i + size] == label_w]


def make_tag_ner(txt: str, label: str )->list:
    """
    Tag every word of `txt` with a B/I/O tag for the occurrences of `label`.

    txt: str current text
    label: str label; a missing label (None, NaN or any non-string) or an
        empty/blank label means "nothing to tag" and yields only 'O' tags
    return: list of (word, tag) tuples, one per whitespace-separated word
        [
        ...
        ('and', 'O'),
        ('the', 'O'),

        ('trends', 'B'),
        ('in', 'I'),
        ('international', 'I'),
        ('mathematics', 'I'),
        ('and', 'I'),
        ('science', 'I'),
        ('study', 'I'),

        ('timss', 'O'),
        ('show', 'O'),
        ...
        ]

    """
    words_txt = txt.split()
    tags = ['O'] * len(words_txt)

    # Missing answers come out of pandas as NaN (a float), not as a string.
    words_label = label.split() if isinstance(label, str) else []
    if not words_label:
        return list(zip(words_txt, tags))

    for start in find_idx_eq_words(words_txt, words_label):
        tags[start] = 'B'
        for inside in range(start + 1, start + len(words_label)):
            tags[inside] = 'I'
    return list(zip(words_txt, tags))

In [None]:
# Smoke test of make_split_txt / check_len on the first row of the raw table.
t = df.origin_text[0]
# NOTE(review): `l` is assigned but not used by the assertions below.
l = df.answer[0]

# crossing=0 -> consecutive chunks do not overlap.
o = make_split_txt(t, split = True, len_split=64 ,crossing= 0)
# Re-joining non-overlapping chunks must give a string as long as the original
# (assumes `t` is already single-space separated with no edge whitespace — TODO confirm).
assert len(t) == len(' '.join(o)), 'woops!!'
# Expected counts are specific to this row: chunks with >= 10 words (default len_skip) ...
assert len(check_len(o)) == 20, 'mot eq 20 realy why?'
# ... and chunks with >= 46 words.
assert len(check_len(o, len_skip=46)) == 19, 'watss...'

In [None]:
# One row per distinct question, rows shuffled with a fixed seed, index rebuilt,
# and only the two columns the NER builder needs are kept.
# NOTE: this overwrites `df`, so the cell cannot be re-run without reloading the CSV
# (the 'question' column is gone afterwards).
df = (
    df
    .drop_duplicates(subset=['question'])
    .sample(frac=1, random_state=13)
    .reset_index(drop=True)
    .loc[:, ['text', 'answer']]
)
df.head(2)

Unnamed: 0,text,answer
0,we were able to associate mycoplasmal conjunct...,north american breeding bird survey bbs
1,mass index and lean mass were both directly co...,baltimore longitudinal study of aging


In [None]:
# Build the tagged training rows: each text is cut into overlapping 64-word chunks,
# chunks shorter than check_len's default minimum are dropped, and every remaining
# chunk is turned into a list of (word, tag) pairs.
tmp_ner = []
# all_in=True  -> keep only the chunks that contain the answer as whole words.
# all_in=False -> keep every chunk (chunks without the answer end up tagged all 'O').
all_in = False
for i, t, a in tqdm(df.itertuples()):
    mst = make_split_txt(t, split = True, len_split=64 ,crossing= 12)
    out = check_len(mst)
    # Escape the answer so regex metacharacters in it are matched literally, and
    # compile once per row. isinstance() also rejects NaN, which `is not None` lets through.
    answer_re = re.compile(rf'\b{re.escape(a)}\b') if all_in and isinstance(a, str) else None
    for txt in out:
        if all_in:
            if answer_re is not None and answer_re.search(txt):
                ner = make_tag_ner(txt, a)
                tmp_ner.append(ner)
        else:
            ner = make_tag_ner(txt, a)
            tmp_ner.append(ner)

13902it [00:05, 2691.48it/s]


In [None]:
import random

# Shuffle the tagged chunks in place so chunks of one document are not adjacent.
random.shuffle(tmp_ner)

# Write one JSON object per line: {"tokens": [...], "tags": [...]}.
with open('/content/drive/MyDrive/Coleridge_Initiative/input/train_ner.json', 'w') as f:
    for row in tmp_ner:
        # Turn [(word, tag), ...] into two parallel tuples.
        words, nes = zip(*row)
        f.write(json.dumps({'tokens' : words, 'tags' : nes}))
        f.write('\n')

In [None]:
# Fine-tune bert-base-cased for one epoch on the JSON-lines file written above.
# Only --do_train is set; the same file is also passed as --validation_file.
# Reporting integrations are disabled (--report_to 'none').
# NOTE(review): kaggle_run_ner.py is presumably a local copy of the Hugging Face
# run_ner.py example linked at the top of the notebook — confirm.
!python /content/kaggle_run_ner.py \
--model_name_or_path 'bert-base-cased' \
--train_file '/content/drive/MyDrive/Coleridge_Initiative/input/train_ner.json' \
--validation_file '/content/drive/MyDrive/Coleridge_Initiative/input/train_ner.json' \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--save_steps 15000 \
--output_dir './output' \
--report_to 'none' \
--seed 123 \
--do_train 