# Classification markup

In [5]:
import os
import pandas as pd
from nltk.tokenize.punkt import PunktSentenceTokenizer
import numpy as np
from unidecode import unidecode


def sents_token_bounds(text):
    """Return the character offsets at which the sentences of *text* start.

    Sentences are located with NLTK's ``PunktSentenceTokenizer``.  The
    result holds one start offset per detected sentence followed by a
    closing sentinel equal to ``len(text)``, so the last sentence also has
    an upper bound.

    Args:
        text: The document to split into sentences.

    Returns:
        A 1-D ``numpy`` array: sentence start offsets plus the sentinel.
    """
    sents_starts = [start for start, _ in PunktSentenceTokenizer().span_tokenize(text)]
    # Close the last sentence at the real end of the text.  A fixed constant
    # would fall *inside* any document longer than that constant and break
    # the "next boundary" lookup for spans located beyond it.
    sents_starts.append(len(text))
    return np.array(sents_starts)


def clear(text):
    """Flatten a text fragment onto one line and transliterate it.

    Surrounding whitespace is stripped first, then every tab and newline
    is replaced by a single space, and the result is passed through
    ``unidecode``.
    """
    trimmed = text.strip()
    single_line = trimmed.translate({ord('\t'): ' ', ord('\n'): ' '})
    return unidecode(single_line)


def get_context(article, span_start, span_end):
    """Return the cleaned sentence context surrounding a character span.

    The context runs from the last sentence boundary at or before
    ``span_start`` to the first boundary at or after ``span_end``, as
    reported by ``sents_token_bounds``; the slice is then normalised with
    ``clear``.

    Raises:
        IndexError: If no boundary lies at or before ``span_start`` or at
            or after ``span_end``.
    """
    bounds = sents_token_bounds(article)
    # Boolean masks keep the boundaries on each side of the span; the
    # nearest one is the last on the left and the first on the right.
    left = bounds[bounds <= span_start][-1]
    right = bounds[bounds >= span_end][0]
    window = article[left:right]
    return clear(window)


def get_markup(part):
    """Build the span-classification table for one dataset split.

    Reads every ``<id>.ann`` / ``<id>.txt`` pair in ``alt_data/<part>`` and
    writes ``alt_data/<part>.tsv``: one header-less, tab-separated row per
    annotated span holding the span text, its sentence context, the label,
    the start offset, the end offset and the file id.  Rows of the
    ``train`` split are shuffled before being written.

    Args:
        part: Name of the split, i.e. of the sub-directory of ``alt_data``.

    Raises:
        AssertionError: If an annotation's offsets do not reproduce the
            annotated text in the matching ``.txt`` file.
    """
    part_dir = 'alt_data/{}'.format(part)
    # Only annotation/text files define document ids (stray directory
    # entries are ignored); sorting makes the row order reproducible.
    ids = sorted({
        name[:-4] for name in os.listdir(part_dir)
        if name.endswith(('.ann', '.txt'))
    })

    span_data = []
    for file_id in ids:
        try:
            data = pd.read_csv(os.path.join(part_dir, file_id + '.ann'), sep='\t', header=None)
        except pd.errors.EmptyDataError:
            # An empty annotation file simply contributes no spans.
            continue
        with open(os.path.join(part_dir, file_id + '.txt'), 'r') as f:
            text = f.read()
        for _, row in data.iterrows():
            try:
                label, span_start, span_end = row[1].split()
                span_start, span_end = int(span_start), int(span_end)
            except (AttributeError, KeyError, ValueError):
                # Best effort: skip rows whose second column is not exactly
                # "<label> <start> <end>" with integer offsets.
                continue
            assert text[span_start:span_end] == row[2], (
                'offsets {}-{} do not match annotated text in {}'.format(
                    span_start, span_end, file_id))
            context = get_context(text, span_start, span_end)
            span_data.append((row[2], context, label, span_start, span_end, file_id))

    df = pd.DataFrame(span_data)
    if part == 'train':
        df = df.sample(frac=1)
    df.to_csv('alt_data/{}.tsv'.format(part), header=None, sep='\t', index=False)

In [6]:
# Generate the span-classification table for every dataset split.
for part in ('train', 'dev', 'test'):
    get_markup(part)

# BIO

In [62]:
import spacy


def token_label_from_spans(pos, spans):
    """Tell whether a character position falls inside any annotated span.

    Args:
        pos: Character offset; anything accepted by ``int()``.
        spans: Iterable of indexable items whose first two entries are the
            inclusive start and the exclusive end of a span.

    Returns:
        ``"PROP"`` if ``pos`` lies in at least one span, ``'O'`` otherwise.
    """
    inside_any = any(span[0] <= int(pos) < span[1] for span in spans)
    return "PROP" if inside_any else 'O'


def _read_span_offsets(ann_path, text):
    """Return the ``(start, end)`` offsets of the spans listed in *ann_path*."""
    try:
        data = pd.read_csv(ann_path, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # A document without annotations has no spans.
        return []

    spans = []
    for _, row in data.iterrows():
        try:
            _, span_start, span_end = row[1].split()
            span_start, span_end = int(span_start), int(span_end)
        except (AttributeError, KeyError, ValueError):
            # Best effort: skip rows whose second column is not exactly
            # "<label> <start> <end>" with integer offsets.
            continue
        assert text[span_start:span_end] == row[2], (
            'offsets {}-{} do not match annotated text in {}'.format(
                span_start, span_end, ann_path))
        spans.append((span_start, span_end))
    return spans


def create_BIO_labeled(part, nlp, max_length=32):
    """Write a BIO-tagged token file for one dataset split.

    Every ``<id>.txt`` in ``alt_data/<part>`` is tokenized with ``nlp`` and
    each non-blank token is written to ``alt_data/<part>_bio.tsv`` as
    ``<token>\\t<label>``.  The label is ``B-KEY`` / ``I-KEY`` when the
    token's start offset lies inside a span of the matching ``.ann`` file
    and ``O`` otherwise.  An empty line is emitted whenever the current
    chunk already holds ``max_length - 2`` tokens; chunks are not restarted
    at document boundaries.

    Args:
        part: Name of the split, i.e. of the sub-directory of ``alt_data``.
        nlp: Callable turning a text into an iterable of tokens that
            expose ``idx`` (start offset) and ``text``.
        max_length: Chunk size budget, see above.

    Raises:
        AssertionError: If an annotation's offsets do not reproduce the
            annotated text in the matching ``.txt`` file.
    """
    part_dir = 'alt_data/{}'.format(part)
    # Only annotation/text files define document ids (stray directory
    # entries are ignored); sorting makes the output order reproducible.
    ids = sorted({
        name[:-4] for name in os.listdir(part_dir)
        if name.endswith(('.ann', '.txt'))
    })

    cur_length = 0
    with open('alt_data/{}_bio.tsv'.format(part), 'w') as f:
        for file_id in ids:
            with open(os.path.join(part_dir, file_id + '.txt'), 'r') as fin:
                text = fin.read()
            spans = _read_span_offsets(os.path.join(part_dir, file_id + '.ann'), text)

            # A span never crosses documents, so the tag state must not
            # leak from the previous file into this one.
            prev_label = 'O'
            for token in nlp(text):
                tok = token.text.replace('\n', ' ').replace('\t', ' ').strip()
                if not tok or tok in ('\ufeff', '\u200f'):
                    # Whitespace and invisible marks are dropped and break
                    # any running B-/I- sequence.
                    prev_label = 'O'
                    continue

                label = token_label_from_spans(token.idx, spans)
                if label != 'O':
                    label = 'I-KEY' if prev_label != 'O' else 'B-KEY'

                # NOTE(review): the "+ 2" presumably reserves room for two
                # special tokens added downstream - confirm.
                if cur_length + 1 + 2 > max_length:
                    f.write('\n')
                    cur_length = 0
                f.write(tok + '\t' + label + '\n')
                cur_length += 1
                prev_label = label

In [47]:
# Load the English pipeline by its package name; the bare "en" shortcut
# link is kept as a fallback for installations that only provide the link.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = spacy.load("en")

In [63]:
# Produce the BIO-tagged token file for every dataset split.
for part in ('train', 'dev', 'test'):
    create_BIO_labeled(part, nlp)