In [1]:
from overrides import overrides

import numpy as np

import pandas as pd

import torch
import torch.nn as nn

from spacy.tokenizer import Tokenizer

from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.fields import TextField, MetadataField, ArrayField
from allennlp.nn.util import get_text_field_mask

In [2]:
# Names of the six binary target columns. The dataset reader uses this list to
# pull labels out of each CSV row, and its length is the model's default
# output size.
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [3]:
class Config(dict):
    """A dict whose entries are mirrored as instance attributes.

    Values passed to the constructor, or added later through ``set``, can be
    read either as ``cfg["key"]`` or as ``cfg.key``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Mirror every initial entry onto the instance as an attribute.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def set(self, key, val):
        """Store ``val`` under ``key`` as both a dict item and an attribute."""
        self.update({key: val})
        self.__setattr__(key, val)
        
# Notebook-wide settings, readable as config.<name> or config["<name>"].
config = Config(**{
    "testing": True,          # when set, the dataset reader keeps only the first 1000 rows
    "seed": 1,
    "batch_size": 64,
    "lr": 3e-4,
    "epochs": 2,
    "hidden_sz": 64,          # hidden size of the LSTM encoder
    "max_seq_len": 100,       # necessary to limit memory usage
    "max_vocab_size": 100000,
})

class JigsawDatasetReader(DatasetReader):
    """Dataset reader that turns rows of a toxic-comment CSV into AllenNLP instances.

    Each row must provide ``comment_text``, ``id`` and the columns named in the
    module-level ``label_cols``.

    Parameters
    ----------
    tokenizer : callable
        Maps a raw comment string to an iterable of token strings. Defaults to
        whitespace splitting.
    token_indexers : dict, optional
        Token indexers handed to every ``TextField``. Defaults to a single
        ``SingleIdTokenIndexer`` under the key ``"tokens"``.
    max_seq_len : int or None
        Maximum number of tokens kept per comment when reading a file.
        ``None`` disables truncation.
    """

    def __init__(self, tokenizer=lambda x: x.split(),
                 token_indexers=None,
                 max_seq_len=config.max_seq_len):
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens, id=None, labels=None):
        """Build an ``Instance`` with ``tokens``, ``id`` and ``label`` fields.

        ``tokens`` is a list of ``Token`` objects and is used as given (no
        truncation is applied here). When ``labels`` is omitted, an all-zero
        vector with one entry per label column is used as a placeholder so the
        instance always carries a ``label`` field.
        """
        if labels is None:
            labels = np.zeros(len(label_cols))

        fields = {
            "tokens": TextField(tokens, self.token_indexers),
            "id": MetadataField(id),
            "label": ArrayField(array=labels),
        }
        return Instance(fields)

    @overrides
    def _read(self, file_path):
        """Yield one instance per row of the CSV at ``file_path``.

        When ``config.testing`` is set only the first 1000 rows are read.
        """
        df = pd.read_csv(file_path)
        if config.testing:
            df = df.head(1000)
        for _, row in df.iterrows():
            # Enforce max_seq_len here: the constructor stores it, but it was
            # previously never applied, so tokenizers that do not truncate
            # themselves (e.g. the default whitespace split) produced
            # unbounded sequences. Slicing with None keeps everything.
            words = list(self.tokenizer(row["comment_text"]))[:self.max_seq_len]
            yield self.text_to_instance(
                [Token(w) for w in words],
                row["id"], row[label_cols].values,
            )

In [4]:
# Build the spaCy word splitter once and reuse it for every comment instead of
# constructing a new splitter object on each tokenizer call.
_word_splitter = SpacyWordSplitter(language='en_core_web_lg', pos_tags=False)

def tokenizer(x: str):
    """Split ``x`` into word strings, keeping at most ``config.max_seq_len`` tokens."""
    return [w.text for w in _word_splitter.split_words(x)[:config.max_seq_len]]

# the token indexer is responsible for mapping tokens to integers
token_indexer = SingleIdTokenIndexer()

reader = JigsawDatasetReader(
    tokenizer=tokenizer,
    token_indexers={"tokens": token_indexer}
)

# Note: this csv contains the same information as toxic-train, but the comment text has been cleaned.
train = reader.read("toxic-train-clean.csv")

1000it [00:10, 97.78it/s]


In [5]:
# Build the vocabulary from the training instances read above, passing the
# configured max_vocab_size as the size limit.
vocab = Vocabulary.from_instances(train, max_vocab_size=config.max_vocab_size)

100%|██████████| 1000/1000 [00:00<00:00, 24457.01it/s]


In [6]:
class BaselineModel(Model):
    """Multi-label classifier: embed tokens, encode to one vector, project to logits.

    Parameters
    ----------
    word_embeddings : text field embedder applied to the ``tokens`` input.
    encoder : sequence-to-vector encoder; its ``get_output_dim()`` sets the
        input width of the projection layer.
    out_sz : number of output logits; defaults to one per label column.

    NOTE: the vocabulary is taken from the module-level ``vocab``, so that
    variable must be defined before the model is constructed.
    """

    def __init__(self, word_embeddings,
                 encoder,
                 out_sz=len(label_cols)):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, tokens,
                id=None, label=None):
        """Run the model on a batch.

        Returns a dict with ``class_logits`` (projection output) and ``state``
        (encoder output). ``loss`` is added only when ``label`` is provided,
        so the model can also be called on unlabeled input; callers that pass
        labels get exactly the same output as before. ``id`` is accepted
        because the dataset reader emits it, but it is not used here.
        """
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        state = self.encoder(embeddings, mask)
        class_logits = self.projection(state)

        output = {"class_logits": class_logits, "state": state}
        if label is not None:
            output["loss"] = self.loss(class_logits, label)

        return output

In [7]:
# Embedding table sized from the configured vocabulary limit plus two extra
# rows (presumably for the padding and unknown-token entries -- confirm
# against the Vocabulary), with index 0 treated as padding.
token_embedding = Embedding(
    num_embeddings=config.max_vocab_size + 2,
    embedding_dim=300,
    padding_index=0,
)

# Route the "tokens" indexer output through the embedding above.
word_embeddings = BasicTextFieldEmbedder(token_embedders={"tokens": token_embedding})

In [8]:
# Bidirectional LSTM over the 300-dimensional token embeddings, wrapped so it
# returns a single vector per sequence.
lstm = nn.LSTM(
    input_size=300,
    hidden_size=config.hidden_sz,
    bidirectional=True,
    batch_first=True,
)
encoder = PytorchSeq2VecWrapper(lstm)

In [9]:
# Note: model can be saved via
#    torch.save(model.state_dict(), "toxic_RNN.pt")

model = BaselineModel(word_embeddings, encoder)
# map_location='cpu' lets a checkpoint that was saved from a GPU be restored on
# a CPU-only machine; load_state_dict then copies the values into the model's
# existing parameters.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load('toxic_RNN.pt', map_location='cpu'))

<All keys matched successfully>

In [10]:
from allennlp.predictors.sentence_tagger import SentenceTaggerPredictor

# Predictor that pairs the loaded model with the dataset reader so that raw
# strings can be passed to tagger.predict(...).
tagger = SentenceTaggerPredictor(model=model, dataset_reader=reader)

In [14]:
# The result has two entries of interest:
#   class_logits -- the per-label predictions,
#   state        -- the encoding of the sentence.
sample_prediction = tagger.predict("This thing is pretty cool.")
sample_prediction

{'class_logits': [-5.918819427490234,
  -8.09105396270752,
  -8.28278636932373,
  -7.772166728973389,
  -8.256393432617188,
  -8.967432022094727],
 'state': [0.9399575591087341,
  -0.9350807070732117,
  0.956086277961731,
  -0.9590500593185425,
  -0.13831201195716858,
  -0.7496212124824524,
  0.9502302408218384,
  -0.8554535508155823,
  0.9269254803657532,
  0.9353356957435608,
  0.9392806887626648,
  0.817321240901947,
  0.9371330142021179,
  -0.9617658853530884,
  0.9054552316665649,
  -0.9566709995269775,
  0.9256322979927063,
  -0.9423669576644897,
  0.7850815653800964,
  0.8274006247520447,
  -0.9281306266784668,
  -0.931768536567688,
  -0.9299911260604858,
  -0.8938233256340027,
  -0.95037442445755,
  -0.8536920547485352,
  -0.9124063849449158,
  -0.8764339089393616,
  0.8114773035049438,
  -0.8715393543243408,
  0.9086158275604248,
  -0.9413347840309143,
  0.9197977185249329,
  0.8901531100273132,
  0.8719948530197144,
  -0.9166281223297119,
  0.871108889579773,
  -0.92228144407

In [15]:
# Load the cleaned training CSV directly with pandas. Unlike the dataset
# reader, this is not limited to the first 1000 rows when config.testing is set.
df = pd.read_csv("toxic-train-clean.csv")

In [38]:
def get_vector(text, fallback_dim=None):
    """Return the encoder ``state`` vector that ``tagger`` predicts for ``text``.

    Parameters
    ----------
    text : the comment to encode. Non-string values (for example the NaN that
        pandas uses for an empty cell) are not sent to the predictor.
    fallback_dim : int, optional
        Length of the all-zero vector returned when no prediction is
        available. Defaults to ``encoder.get_output_dim()`` so it always
        matches the width of real predictions instead of a hard-coded 128.

    Returns
    -------
    list
        The predicted ``state``, or a list of zeros when ``text`` is not a
        string or prediction fails. Failures are reported with a warning
        rather than being swallowed silently.
    """
    import warnings

    def _zeros():
        dim = encoder.get_output_dim() if fallback_dim is None else fallback_dim
        return [0] * dim

    if not isinstance(text, str):
        return _zeros()

    try:
        return tagger.predict(text)["state"]
    except Exception as exc:
        # Best effort: keep going with a zero vector, but make the failure
        # visible. The message deliberately omits the text so that identical
        # failures are de-duplicated by the warnings machinery.
        warnings.warn(
            "get_vector: prediction failed with {}; returning a zero vector".format(
                type(exc).__name__
            )
        )
        return _zeros()

In [39]:
# Encode every comment and stack the per-comment vectors into one 2-D array
# (one row per comment).
embeddings = np.stack([get_vector(comment) for comment in df["comment_text"]])

In [40]:
# Cache the encodings on disk as comma-separated text so they can be reloaded
# later without re-running prediction over every comment.
np.savetxt('toxic_rnn_matrix.out', embeddings, delimiter=',')

In [42]:
# Select the targets by column name (the same label_cols list the dataset
# reader uses) rather than by position, so the selection does not depend on
# the CSV's column order.
y = df[label_cols]
# Reload the cached encodings written with np.savetxt.
x_matrix = np.loadtxt('toxic_rnn_matrix.out', delimiter=',')
# Guard against a stale cache file that no longer lines up with the CSV.
assert x_matrix.shape[0] == len(y), "cached encodings and labels have different row counts"

In [49]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is seeded from config.seed
# so that re-running the notebook reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    x_matrix, y, test_size=0.2, random_state=config.seed
)