## 1. load and convert data into common training format

In [1]:
import pandas
from tqdm.auto import tqdm
import spacy.gold

In [2]:
ROOT = '../../data/kaggle-ru/'
# Read the token table with pandas' default NA detection switched off.
# With the defaults, any token spelled like "NA", "null", "nan" or "N/A" is
# parsed as a missing value, and a later str cast would turn each of them into
# the literal text 'nan'. The two text columns are read as strings directly.
train_data = pandas.read_csv(
    ROOT + 'ru_train.csv',
    dtype={'before': str, 'after': str},
    keep_default_na=False,
)

In [3]:
# Cast both text columns to str, one column at a time.
for text_col in ('before', 'after'):
    train_data[text_col] = train_data[text_col].astype(str)

In [4]:
# Preview the first 15 rows of the token table.
train_data.iloc[:15]

Unnamed: 0,sentence_id,token_id,class,before,after
0,0,0,PLAIN,По,По
1,0,1,PLAIN,состоянию,состоянию
2,0,2,PLAIN,на,на
3,0,3,DATE,1862 год,тысяча восемьсот шестьдесят второй год
4,0,4,PUNCT,.,.
5,1,0,PLAIN,Оснащались,Оснащались
6,1,1,PLAIN,латными,латными
7,1,2,PLAIN,рукавицами,рукавицами
8,1,3,PLAIN,и,и
9,1,4,PLAIN,сабатонами,сабатонами


In [5]:
# For every PLAIN row, overwrite the 'after' value with the 'before' value.
fix = train_data['class'] == 'PLAIN'
train_data.loc[fix, 'after'] = train_data.loc[fix, 'before']

In [6]:
# For every LETTERS row, overwrite the 'after' value with the 'before' value.
fix = train_data['class'] == 'LETTERS'
train_data.loc[fix, 'after'] = train_data.loc[fix, 'before']

In [7]:
# Show ten randomly drawn LETTERS rows to eyeball the result of the overwrite.
train_data.loc[train_data['class'] == 'LETTERS'].sample(10)

Unnamed: 0,sentence_id,token_id,class,before,after
6042265,435799,0,LETTERS,отд,отд
9205327,663144,4,LETTERS,И.,И.
7280851,525001,0,LETTERS,Д. И.,Д. И.
2980366,215777,8,LETTERS,М.,М.
10206284,734977,12,LETTERS,др,др
9459787,681394,4,LETTERS,СПбГУ,СПбГУ
4918892,355261,6,LETTERS,П.Г.,П.Г.
5942271,428637,0,LETTERS,ISBN,ISBN
7753276,558916,1,LETTERS,Т. Д.,Т. Д.
5598248,403959,1,LETTERS,ХК,ХК


In [8]:
# Number of rows per token class, rarest class first.
class_counts = train_data.groupby('class')['class'].count()
class_counts.sort_values()

class
TIME             1945
DIGIT            2012
FRACTION         2460
MONEY            2690
ELECTRONIC       5832
DECIMAL          7297
TELEPHONE       10088
MEASURE         40534
ORDINAL         46738
VERBATIM       157912
DATE           185959
LETTERS        189528
CARDINAL       272442
PUNCT         2288640
PLAIN         7360439
Name: class, dtype: int64

In [9]:
# Maps each source token class to the entity label written to the output.
# A value of None means tokens of that class produce no entity span.
EXTRACT = dict(
    TIME='TIME',
    DIGIT='CARDINAL',
    FRACTION='CARDINAL',
    MONEY='MONEY',
    ELECTRONIC=None,
    DECIMAL='CARDINAL',
    TELEPHONE='PHONE',
    MEASURE='QUANTITY',
    ORDINAL='ORDINAL',
    VERBATIM=None,
    DATE='DATE',
    LETTERS=None,
    CARDINAL='CARDINAL',
    PUNCT=None,
    PLAIN=None,
)

#### Task:
- 1) Group the token rows into sentences (by `sentence_id`).
- 2) For each sentence, find the start and end character position of every token:
- 2.1) Drop the token classes that the EXTRACT map sends to `None`; rename the remaining classes to their EXTRACT labels.
- 2.2) Join the token list into plain text.
- 2.3) Record the start and end character position of each kept token in that text.
- 2.4) Save the text and its entity spans in the target format.

#### Source format:
```
sentence_id | token_id | class_name | before | after
----------------------------------------------------
 NNN        | NNN      | CARDINAL   | 123    | сто двадцать три
...
```

#### Internal representation:
```
TRAIN_DATA = [
    {"raw": "Who is Shaka Khan?", "entities": [(7, 17, "PERSON")]},
    {"raw": "I like London and Berlin.", "entities": [(7, 13, "LOC"), (18, 24, "LOC")]}
]
```
#### Output format:
One line per sentence, .jsonl.

In [10]:
# Hand-written conversion cases: "tokens" and "tags" are the converter input,
# "text" and "entities" are the expected plain text and character spans.
EXAMPLES = [
    dict(
        tokens=['По', 'состоянию', 'на', '1862 год', '-', 'часы', '«', 'отставали', '»', 'на', '5 секунд', '.'],
        tags=['PLAIN', 'PLAIN', 'PLAIN', 'DATE', 'PUNCT', 'PLAIN', 'PUNCT', 'PLAIN', 'PUNCT', 'PLAIN', 'TIME', 'PUNCT'],
        text="По состоянию на 1862 год - часы «отставали» на 5 секунд.",
        entities=[(16, 24, 'DATE'), (47, 55, 'TIME')],
    ),
    dict(
        tokens=['По', 'состоянию', 'на', 'тысяча восемьсот шестьдесят второй год', '.'],
        tags=['PLAIN', 'PLAIN', 'PLAIN', 'DATE', 'PUNCT'],
        text="По состоянию на тысяча восемьсот шестьдесят второй год.",
        entities=[(16, 54, 'DATE')],
    ),
]

# PUNCT tokens that are written with a space in front of them (dashes and
# opening quotes/brackets); every other PUNCT token is glued to the token
# before it.
_SPACE_BEFORE_PUNCT = frozenset({'-', '--', '—', '–', '«', '(', '[', '{'})
# Opening quotes/brackets: the token that follows one is glued to it.
_NO_SPACE_AFTER = frozenset({'«', '(', '[', '{'})


def convert_sentence(words, tags, extract=None):
    """Join one sentence's tokens into text and collect entity character spans.

    Args:
        words: sequence of token strings, in sentence order.
        tags: sequence of class names, parallel to `words`. Any sequence type
            works (list, tuple, ...). If the lengths differ, the extra items
            of the longer sequence are ignored.
        extract: mapping from class name to output entity label. Classes that
            are missing or mapped to None yield no entity. Defaults to the
            module-level EXTRACT map.

    Returns:
        A dict with two keys: 'raw' is the sentence text, 'entities' is a list
        of (start, end, label) tuples with character offsets into 'raw'.
    """
    if extract is None:
        extract = EXTRACT

    chunks = []
    entities = []
    pos = 0
    prev_word = None  # None until the first non-empty token has been emitted
    for word, tag in zip(words, tags):
        if not word:
            # An empty token adds no characters; skipping it avoids a stray
            # space before following punctuation and a zero-length entity.
            continue
        needs_space = (
            prev_word is not None
            and prev_word not in _NO_SPACE_AFTER
            and (tag != 'PUNCT' or word in _SPACE_BEFORE_PUNCT)
        )
        if needs_space:
            chunks.append(' ')
            pos += 1
        start = pos
        chunks.append(word)
        pos += len(word)
        label = extract.get(tag)
        if label is not None:
            entities.append((start, pos, label))
        prev_word = word

    text = ''.join(chunks).rstrip()
    return {'raw': text, 'entities': entities}

In [12]:
def test(nlp_pipe):
    """Check convert_sentence against every case in EXAMPLES.

    `nlp_pipe` is accepted but not used by the check. An AssertionError
    showing the produced and expected values is raised on the first mismatch.
    """
    for e in EXAMPLES:
        converted = convert_sentence(e['tokens'], e['tags'])
        result_text = converted['raw']
        result_entities = converted['entities']
        assert result_text == e['text'], f"'{result_text}' !=\n'{e['text']}'"
        assert result_entities == e['entities'], f"'{result_entities}' !=\n'{e['entities']}'"


# Run the check right away, passing a blank Russian spaCy pipeline.
nlp_pipe = spacy.blank('ru')
test(nlp_pipe)

In [13]:
def iter_group(group):
    """Yield (tokens, tags) variants for one sentence group.

    The first pair always uses the 'before' column as tokens. A second pair
    using the 'after' column is yielded only when it differs from 'before'.
    Both pairs share the same tag list, taken from the 'class' column.
    """
    labels = list(group['class'])
    originals = list(group['before'])
    normalized = list(group['after'])
    yield originals, labels
    if originals == normalized:
        return
    yield normalized, labels

In [14]:
# Convert every sentence group; a group contributes one or two entries,
# depending on how many token variants iter_group yields for it.
df_sentences = train_data.groupby('sentence_id')
n_sentences = len(train_data['sentence_id'].unique())  # progress-bar total
sentences = []
for sentence_id, group in tqdm(df_sentences, total=n_sentences):
    sentences.extend(
        convert_sentence(tokens, tags) for tokens, tags in iter_group(group)
    )

HBox(children=(FloatProgress(value=0.0, max=761436.0), HTML(value='')))




## 2. splitting into train/test and saving

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20000 entries as the test set; random_state fixes the shuffle.
ds_train, ds_test = train_test_split(
    sentences,
    random_state=42,
    test_size=20000,
)

print(len(ds_train), len(ds_test))

1087147 20000


In [17]:
import gzip, json
def save_entries(fn, entries):
    """Write `entries` to `fn` as gzip-compressed JSON Lines.

    Args:
        fn: path of the .jsonl.gz file to create or overwrite.
        entries: iterable of JSON-serializable objects; each one becomes a
            single line of the file.
    """
    # The encoding is pinned to UTF-8 because ensure_ascii=False writes
    # non-ASCII characters as-is and the default text encoding depends on the
    # platform. The `with` block closes the file, so no explicit close().
    with gzip.open(fn, 'wt', encoding='utf-8') as f:
        for z in tqdm(entries):
            json.dump(z, f, ensure_ascii=False)
            f.write('\n')

In [18]:
# Write the held-out split to disk.
save_entries(fn='../../data/datasets/kaggle_ru_test.jsonl.gz', entries=ds_test)

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




In [19]:
# Write the training split to disk.
save_entries(fn='../../data/datasets/kaggle_ru_train.jsonl.gz', entries=ds_train)

HBox(children=(FloatProgress(value=0.0, max=1087147.0), HTML(value='')))




## 3. Loading and checking everything went smoothly

In [20]:
import gzip, json
def load_entries(fn):
    """Read a gzip-compressed JSON Lines file into a list.

    Args:
        fn: path to the file, e.g. '../data/datasets/nerus.jsonl.gz'.

    Returns:
        A list with one decoded JSON value per line, in file order.
    """
    with gzip.open(fn, 'r') as stream:
        return [json.loads(raw_line) for raw_line in tqdm(stream)]

In [21]:
# Read the saved test split back from disk, replacing the in-memory copy.
ds_test = load_entries(fn='../../data/datasets/kaggle_ru_test.jsonl.gz')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [22]:
# Read the saved training split back from disk, replacing the in-memory copy.
ds_train = load_entries(fn='../../data/datasets/kaggle_ru_train.jsonl.gz')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


