In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import *

from overrides import overrides
import warnings

from allennlp.data import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.nn import util as nn_util


# Base directory, relative to the notebook's working directory, for every file
# this notebook reads or writes (input CSVs, fastText binary, embedding text file).
DATA_ROOT = Path("../data/jigsaw")

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Names of the six labels. The list is not referenced by any other cell visible
# in this notebook; presumably it matches the label columns of the CSVs -- confirm.
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [3]:
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, SequenceLabelField, LabelField, MetadataField, ArrayField
import csv

# Default value of the `max_seq_len` constructor argument of JigsawDatasetReader.
max_seq_len = 512

class JigsawDatasetReader(DatasetReader):
    """Reads a comment-classification CSV into AllenNLP ``Instance`` objects.

    Every instance carries four fields:

    * ``tokens`` -- the tokens after ``proc``, wrapped in a
      ``MemoryOptimizedTextField``;
    * ``word_level_features`` -- one row per token, one column per function in
      ``word_level_features``;
    * ``sentence_level_features`` -- one value per function in
      ``sentence_level_features``;
    * ``label`` -- the label values as an integer array.

    ``proc``, ``word_level_features``, ``sentence_level_features`` and
    ``MemoryOptimizedTextField`` are module-level names resolved when
    ``text_to_instance`` runs, so they must be defined before ``read`` is called.

    Parameters
    ----------
    tokenizer:
        Callable turning the raw text into a list of token strings
        (default: whitespace split).
    token_indexers:
        Indexers handed to the text field; defaults to a single
        ``SingleIdTokenIndexer`` under the key ``"tokens"``.
    max_seq_len:
        Maximum number of tokens kept per instance; ``None`` disables truncation.
    """

    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None, # TODO: Handle mapping from BERT
                 max_seq_len: Optional[int]=max_seq_len) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[str], id: str,
                         labels: np.ndarray) -> Instance:
        """Build one ``Instance`` from a tokenized text and its labels.

        ``id`` is accepted for interface compatibility but is not stored in the
        instance. ``tokens`` is cut to ``self.max_seq_len`` entries (when that is
        not ``None``) before any field is built.
        """
        # Truncate first so the text field and the per-token features stay aligned.
        if self.max_seq_len is not None:
            tokens = tokens[:self.max_seq_len]

        sentence_field = MemoryOptimizedTextField([proc(x) for x in tokens],
                                   self.token_indexers)
        fields = {"tokens": sentence_field}

        wl_feats = np.array([[func(w) for func in word_level_features] for w in tokens])
        # Force (num_tokens, num_features): without this an empty token list
        # yields shape (0,), whose rank differs from every non-empty instance.
        wl_feats = wl_feats.reshape(len(tokens), len(word_level_features))
        fields["word_level_features"] = ArrayField(array=wl_feats)

        sl_feats = np.array([func(tokens) for func in sentence_level_features])
        fields["sentence_level_features"] = ArrayField(array=sl_feats)

        label_field = ArrayField(array=labels)
        fields["label"] = label_field

        return Instance(fields)

    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield one ``Instance`` per data row of the CSV at ``file_path``.

        The first row is treated as a header and skipped. Rows must have either
        8 values (id, text, labels...) or 9 values (one extra leading value that
        is discarded, then id, text, labels...).

        Raises
        ------
        ValueError
            If a row has any other number of values.
        """
        # newline="" is what the csv module expects so that it can handle line
        # breaks inside quoted fields itself; the encoding is pinned so reading
        # does not depend on the platform default.
        with open(file_path, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            # Skip the header; the default avoids StopIteration leaking out of
            # this generator when the file is empty.
            next(reader, None)
            for line in reader:
                if len(line) == 9:
                    _, id_, text, *labels = line
                elif len(line) == 8:
                    id_, text, *labels = line
                else: raise ValueError(f"line has {len(line)} values")
                yield self.text_to_instance(
                    self.tokenizer(text),
                    id_, np.array([int(x) for x in labels]),
                )

In [13]:
#from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from pytorch_pretrained_bert.tokenization import BasicTokenizer
from allennlp.data.token_indexers import WordpieceIndexer, SingleIdTokenIndexer

#_spacy_tok = SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).split_words
# BasicTokenizer instance (constructed with do_lower_case=True) that the
# `tokenizer` function in this cell uses to split raw text.
_bert_tok = BasicTokenizer(do_lower_case=True)

from allennlp.data.token_indexers import SingleIdTokenIndexer
# Indexer constructed with lowercase_tokens=True; it is handed to the dataset
# reader under the "tokens" key.
token_indexer = SingleIdTokenIndexer(
    lowercase_tokens=True,
)

from itertools import groupby

def remove_extra_chars(s, max_qty=2):
    """Shorten every run of one repeated character in ``s`` to at most ``max_qty``.

    For example, with the default ``max_qty=2``, ``"hellooooo"`` becomes
    ``"helloo"``. Runs that are already short enough are left unchanged.
    """
    pieces = []
    for char, run in groupby(s):
        run_length = sum(1 for _ in run)
        pieces.append(char * min(max_qty, run_length))
    return ''.join(pieces)

def tokenizer(x: str):
    """Tokenize ``x`` with ``_bert_tok`` and squeeze repeated characters.

    Returns a list holding ``remove_extra_chars(token)`` (default ``max_qty``)
    for every token produced by ``_bert_tok.tokenize(x)``.
    """
    raw_tokens = _bert_tok.tokenize(x)
    return list(map(remove_extra_chars, raw_tokens))
    # Earlier spaCy-based alternative, kept for reference:
    #return [w.text for w in _spacy_tok(x.lower())]

In [14]:
# Dataset reader using the `tokenizer` function and `token_indexer` defined above.
reader = JigsawDatasetReader(tokenizer=tokenizer, token_indexers={"tokens": token_indexer})

In [15]:
from allennlp.data.fields import TextField, SequenceLabelField, LabelField, MetadataField, ArrayField
import string
# Lowercase ASCII letters; every other character counts as non-alphabetic below.
alphabet = set(string.ascii_lowercase)

# Functions mapping the token list of one text to a single float.
# Currently empty: the only candidate is commented out.
sentence_level_features: List[Callable[[List[str]], float]] = [
#     lambda x: (np.log1p(len(x)) - 3.628) / 1.065, # stat computed on train set
]

# Functions mapping a single token to a single number.
word_level_features: List[Callable[[str], float]] = [
    # 1 when lowercasing leaves the token unchanged, else 0.
    lambda x: 1 if (x.lower() == x) else 0,
    # Fraction of the token's characters that are not in `alphabet`; an empty
    # token yields 0.0 instead of raising ZeroDivisionError.
    lambda x: len([c for c in x.lower() if c not in alphabet]) / len(x) if x else 0.0,
]

def proc(x: str) -> str:
    """Return the lowercased form of the token ``x``."""
    lowered = x.lower()
    return lowered

class MemoryOptimizedTextField(TextField):
    """``TextField`` variant that drops its token list once it has been indexed.

    The constructor assigns the attributes itself and never calls
    ``TextField.__init__``. After ``index()`` has run, ``self.tokens`` is
    ``None``, so any code that reads ``self.tokens`` afterwards sees ``None``.

    NOTE(review): tokens are annotated as plain ``str`` here; confirm that the
    configured token indexers accept strings rather than ``Token`` objects.
    """
    @overrides
    def __init__(self, tokens: List[str], token_indexers: Dict[str, TokenIndexer]) -> None:
        """Store ``tokens`` and ``token_indexers`` without running the parent constructor."""
        self.tokens = tokens
        self._token_indexers = token_indexers
        # Both start as None; presumably filled in by the parent's index() --
        # confirm against the installed TextField.
        # NOTE(review): `TokenList` is not imported in the visible cells. This is
        # harmless at runtime because annotations inside a function body are
        # not evaluated.
        self._indexed_tokens: Optional[Dict[str, TokenList]] = None
        self._indexer_name_to_indexed_token: Optional[Dict[str, List[str]]] = None
        # super().__init__ is intentionally not called: skip checks for tokens
    @overrides
    def index(self, vocab):
        """Index through the parent implementation, then discard the token list."""
        super().index(vocab)
        self.tokens = None # release the token list; only the indexed form is kept

In [18]:
# Read the train and test CSVs with `reader`, then show how many items each holds.
train_ds = reader.read(DATA_ROOT / "train.csv")
test_ds = reader.read(DATA_ROOT / "test_proced.csv")
len(train_ds), len(test_ds)



0it [00:00, ?it/s][A[A

42it [00:00, 419.71it/s][A[A

79it [00:00, 402.99it/s][A[A

125it [00:00, 417.87it/s][A[A

161it [00:00, 374.30it/s][A[A

207it [00:00, 394.70it/s][A[A

246it [00:00, 392.89it/s][A[A

303it [00:00, 432.97it/s][A[A

355it [00:00, 451.94it/s][A[A

407it [00:00, 468.33it/s][A[A

454it [00:01, 456.51it/s][A[A

500it [00:01, 437.27it/s][A[A

549it [00:01, 449.22it/s][A[A

594it [00:01, 377.13it/s][A[A

636it [00:01, 388.29it/s][A[A

677it [00:01, 371.84it/s][A[A

717it [00:01, 379.45it/s][A[A

762it [00:01, 395.50it/s][A[A

803it [00:01, 377.09it/s][A[A

847it [00:02, 392.49it/s][A[A

893it [00:02, 408.42it/s][A[A

941it [00:02, 422.59it/s][A[A

984it [00:02, 419.03it/s][A[A

1031it [00:02, 431.63it/s][A[A

1076it [00:02, 421.65it/s][A[A

1137it [00:02, 459.67it/s][A[A

1185it [00:02, 457.10it/s][A[A

1232it [00:02, 426.80it/s][A[A

1285it [00:03, 449.16it/s][A[A

1331it [00:03, 425.34it/s][A[A

1380it [00

([<allennlp.data.instance.Instance at 0x7f4440917e48>,
  <allennlp.data.instance.Instance at 0x7f4491c68048>,
  <allennlp.data.instance.Instance at 0x7f443f4ed358>,
  <allennlp.data.instance.Instance at 0x7f4491c720b8>,
  <allennlp.data.instance.Instance at 0x7f443d9ae2b0>,
  <allennlp.data.instance.Instance at 0x7f443d9ae470>,
  <allennlp.data.instance.Instance at 0x7f443d9aee10>,
  <allennlp.data.instance.Instance at 0x7f4491c72320>,
  <allennlp.data.instance.Instance at 0x7f443d25b128>,
  <allennlp.data.instance.Instance at 0x7f443d27eac8>,
  <allennlp.data.instance.Instance at 0x7f4491c7e9b0>,
  <allennlp.data.instance.Instance at 0x7f443bffa0f0>,
  <allennlp.data.instance.Instance at 0x7f443c00b1d0>,
  <allennlp.data.instance.Instance at 0x7f443c00e3c8>,
  <allennlp.data.instance.Instance at 0x7f4491cd3588>,
  <allennlp.data.instance.Instance at 0x7f443c00ecf8>,
  <allennlp.data.instance.Instance at 0x7f4491c78208>,
  <allennlp.data.instance.Instance at 0x7f4491c78748>,
  <allennl

In [22]:
# Concatenate both splits; the vocabulary is built from `full_ds`, so it covers
# tokens from the train and the test instances.
full_ds = train_ds + test_ds

In [24]:
from allennlp.data.vocabulary import Vocabulary
# Build the vocabulary from every instance in `full_ds`.
vocab = Vocabulary.from_instances(full_ds)

100%|██████████| 223549/223549 [00:19<00:00, 11318.24it/s]


In [25]:
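# Save the vocabulary before restarting the kernel (uncomment to run); the
# reload cell below expects to find it at DATA_ROOT / "vocab":
#vocab.save_to_files(DATA_ROOT / "vocab")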

To avoid memory errors, restart the kernel here, reload the saved vocabulary, and then build the embedding matrix.

In [None]:
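# Uncomment after a kernel restart to reload the saved vocabulary
# (re-run the first cell as well, so that DATA_ROOT is defined):
#from allennlp.data.vocabulary import Vocabulary
#vocab = Vocabulary.from_files(DATA_ROOT / "vocab")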

In [20]:
import fastText
# Load the fastText model stored as wiki.en.bin under DATA_ROOT; the Path is
# converted to str before being passed to load_model.
ft_model = fastText.load_model(str(DATA_ROOT / "wiki.en.bin"))

In [25]:
# Write one line per vocabulary entry: the token, then its fastText vector as
# space-separated components formatted with four decimals.
# The encoding is pinned to UTF-8 so that non-ASCII tokens are written the same
# way on every platform instead of depending on the locale default.
with (DATA_ROOT / "ft_model_bert_basic_tok.txt").open("wt", encoding="utf-8") as f:
    for idx, token in vocab.get_index_to_token_vocabulary().items():
        emb = ft_model.get_word_vector(token)
        emb_as_str = " ".join("%.4f" % x for x in emb)
        f.write(f"{token} {emb_as_str}\n")

In [30]:
# Spot check: display the vocabulary index assigned to the token "cano".
vocab.get_token_index("cano")

150336