In [16]:

id_label = 'id'
text_label = 'comment_text'
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_file = 'train_pp.csv'
val_file = 'val_pp.csv'
test_file = 'test_pp.csv'

extern_cols = ['toxicity', 'aggression', 'attack']
extern_mask_cols = ['mask_tox', 'mask_agg', 'mask_att']
extern_text_label = 'comment'
train_ext_file = 'train_external.csv'
val_ext_file = 'val_external.csv'

In [17]:
import torch
from torchtext import data

# some iterators produce StopIteration, which is no longer a warning, we don't need to hear about it
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

from torchtext.vocab import Vectors
import re
import io,os,csv

class ToxicDataset(data.Dataset):
    """Defines a Dataset of columns stored in CSV format."""

    def __init__(self, path, fields, skip_header=True, **kwargs):
        with io.open(os.path.expanduser(path), encoding="utf8") as f:
            reader = csv.reader(f)
                
            if skip_header:
                next(reader)

            examples = [data.Example.fromlist(line, fields) for line in reader]

        super(ToxicDataset, self).__init__(examples, fields, **kwargs)
        
re_symbol = re.compile(r'[^\w ]')
def tokenize(x):
    x = re_symbol.sub('',str(x))
    a = x.split()
    if len(a) <= 0:
        a = ['<empty>']
    return a

class ToxData:
    def __init__(self, path):

        # Define all the types of fields
        # pip install spacy for the tokenizer to work (or remove to use default)
        self.TEXT = data.Field(lower=True, include_lengths=True, tokenize=tokenize)
        # import dill as pickle
        # TEXT = pickle.load(open(f'{path}TEXT_wlm.pkl','rb'))
        self.LABEL = data.Field(sequential=False, use_vocab=False)

        # we use the index field to re-sort test data after processing
        self.INDEX = data.Field(sequential=False, use_vocab=False)

        train_fields=[
            (id_label, self.INDEX),
            ('nid', None),
            (text_label, self.TEXT)
        ]
        for label in label_cols:
            train_fields.append((label,self.LABEL))

        self.train_data, self.val_data = ToxicDataset.splits(
                    path=path, train=train_file, validation=val_file,
                    fields=train_fields
                )

        test_fields=[
            (id_label, self.INDEX),
            ('nid', None),
            (text_label, self.TEXT)
        ]
        self.test_data = ToxicDataset(
                    path=f'{path}{test_file}',
                    fields=test_fields
                )
        self.LABEL_FLOAT = data.Field(sequential=False, use_vocab=False, tensor_type=torch.cuda.FloatTensor)
        extern_fields=[
            (id_label, None),
            (extern_text_label, self.TEXT)
        ]
        for label in extern_cols:
            extern_fields.append((label,self.LABEL_FLOAT))
        for label in extern_mask_cols:
            extern_fields.append((label,self.LABEL))
        self.train_ext_data, self.val_ext_data = ToxicDataset.splits(
                    path=path, train=train_ext_file, validation=val_ext_file,
                    fields=extern_fields
                )
        
    def build_vocab(self,vocab):
        # This will download the glove vectors, see torchtext source for other options
        max_size = 500000
        self.TEXT.build_vocab(self.train_data, self.val_data, self.test_data, vectors=Vectors(vocab), max_size=max_size)
        self.INDEX.build_vocab()

        # print vocab information
        self.ntokens = len(self.TEXT.vocab)
        return self.ntokens
    
    def make_iter(self):
        self.train = data.BucketIterator(self.train_data, batch_size=32,
                                sort_key=lambda x: len(x.comment_text),
                                sort_within_batch=True, repeat=False)
        self.val = data.BucketIterator(self.val_data, batch_size=32,
                                        sort_key=lambda x: len(x.comment_text),
                                        sort_within_batch=True, train=False, repeat=False)
        self.test = data.BucketIterator(self.test_data, batch_size=128,
                                        sort_key=lambda x: len(x.comment_text),
                                        sort_within_batch=True, train=False, repeat=False)
        self.train_ext = data.BucketIterator(self.train_ext_data, batch_size=64,
                                sort_key=lambda x: len(x.comment),
                                sort_within_batch=True, repeat=True)
        self.val_ext = data.BucketIterator(self.val_ext_data, batch_size=128,
                                        sort_key=lambda x: len(x.comment),
                                        sort_within_batch=True, train=False, repeat=False)

The BucketIterator will shuffle the data and produce batches with sequences of roughly the same length. If we didn't want to split into epochs, we could set repeat=True and run for a set number of batches (rather than epochs). Must have `sort_within_batch=True` to use the lengths we picked up earlier.

We also define convenience methods to access the comment text and labels from the batch

In [18]:


def get_text(batch):
    return getattr(batch, text_label)
def get_labels(batch):
    # Get the labels as one tensor from the batch object
    return torch.cat([getattr(batch, label).unsqueeze(1) for label in label_cols], dim=1).float()


In [19]:


def get_ext_text(batch):
    return getattr(batch, extern_text_label)
def get_ext_labels(batch):
    # Get the labels as one tensor from the batch object
    return torch.cat([getattr(batch, label).unsqueeze(1) for label in extern_cols], dim=1).float()
def get_ext_mask(batch):
    # Get the labels as one tensor from the batch object
    return torch.cat([getattr(batch, label).unsqueeze(1) for label in extern_mask_cols], dim=1).float()

In [20]:
dl=ToxData('./toxic-data/')
dl.build_vocab('crawl-300d-2M.vec')
dl.make_iter()

In [21]:
batch=next(iter(dl.train_ext))
get_ext_text(batch)
y=get_ext_labels(batch)
w=get_ext_mask(batch)
l=torch.nn.L1Loss(reduce=False)(y,torch.ones_like(w))
torch.sum(l.mul(w))

#train_ext_data[0].__dict__

Variable containing:
 96.2396
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Here is the meat of the model. A few points to notice in `__init__`:
- `Dropout2d` is a spatial dropout function, which will drop entire layers (rather than just individial connections). It doesn't necessarily require 2d data
- We define `self.rnns` as a ModuleList so that all of the sub-components will be discovered properly
- The pools require an argument that is number of output segments, but we just want a global one for each avg/max

and in `forward`:
- We move to/from a packed sequence for the rnn section if we have the lengths
- We need to rearrange the output of the rnn to have sequence last for pooling layers
- We don't have a sigmoid output because we will later use a special loss function that takes the logit output directly

In [74]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, ndense, nout, nlayers, dropemb=0.2, droprnn=0.0, droplin=0.0, bidirectional=True):
        super(RNNModel, self).__init__()
        self.encoder = nn.Embedding(ntoken, ninp)
        self.dropemb = nn.Dropout2d(dropemb)
        self.ndir = 2 if bidirectional else 1
        assert rnn_type in ['LSTM', 'GRU'], 'RNN type is not supported'
        if rnn_type == 'LSTM':
            self.rnns = [torch.nn.LSTM(ninp if l == 0 else nhid*self.ndir, nhid, 1, dropout=droprnn, bidirectional=bidirectional) for l in range(nlayers)]
        if rnn_type == 'GRU':
            self.rnns = [torch.nn.GRU(ninp if l == 0 else nhid*self.ndir, nhid, 1, dropout=droprnn, bidirectional=bidirectional) for l in range(nlayers)]
        
        self.rnns = torch.nn.ModuleList(self.rnns)
        self.avg_pool = torch.nn.AdaptiveAvgPool1d(1)
        self.max_pool = torch.nn.AdaptiveMaxPool1d(1)
        
        self.droplin = nn.Dropout(droplin)
        self.dense = nn.Linear(nhid*self.ndir, ndense) #*2 if pooling
        self.decoder_three = nn.Linear(ndense, 3) 
        self.decoder_six = nn.Linear(ndense, 6) 
        

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def forward(self, input, lengths=None):
        emb = self.encoder(input)
        
        raw_output = self.dropemb(emb)
        
        if lengths is not None:
            lengths = lengths.view(-1).tolist()
            raw_output = nn.utils.rnn.pack_padded_sequence(raw_output, lengths)
            
        for rnn in self.rnns:
            raw_output,_ = rnn(raw_output)
        
        if lengths is not None:
            raw_output, lengths = nn.utils.rnn.pad_packed_sequence(raw_output)
            
        bsz = raw_output.size(1)
#         rnn_avg = self.avg_pool(raw_output.permute(1,2,0))
#         rnn_max = self.max_pool(raw_output.permute(1,2,0))
#         rnn_out = torch.cat([rnn_avg.view(bsz,-1),rnn_max.view(bsz,-1)], dim=1)
        
        rnn_out = raw_output[0].view(bsz,-1)
        if (self.ndir == 2):
            rnn_fwd = raw_output[-1,:,:self.nhid]
            rnn_rev = raw_output[0,:,self.nhid:]
            rnn_out = torch.cat([rnn_fwd,rnn_rev], dim=1)
            
        dense_out=self.dense(self.droplin(rnn_out))
        
        return dense_out

In [76]:
import torchnet
class MultiAUCMeter(object):
    def __init__(self, n):
        self.meters = []
        self.n = n
        for i in range(n):
            self.meters.append(torchnet.meter.AUCMeter())
    def reset(self):
        for meter in self.meters:
            meter.reset()
    def add(self, preds, targets):
        targets = targets.data.cpu().numpy()
        preds = preds.data.cpu().numpy()
        for i in range(self.n):
            self.meters[i].add(preds[:,i], targets[:,i])
    def avg(self):
        total = 0.0
        for meter in self.meters:
            value, _, _ = meter.value()
            total += value
        return total/self.n
    def values(self):
        return [m.value()[0] for m in self.meters]

These are parameters from other example kernels--not necessarily optimized yet.

In [77]:
use_cuda = torch.cuda.is_available()
nhidden=160
emsize=300
nlayers = 2
dropemb = 0.0
droprnn = 0.2
droplin = 0.0
model = RNNModel('GRU', ntokens, emsize, nhidden, 16, 3, nlayers, dropemb=dropemb, droprnn=droprnn, droplin=droplin, bidirectional=True)
model.encoder.weight.data.copy_(TEXT.vocab.vectors)
#model.encoder.load_state_dict(torch.load(f'{path}encoder_wlm.pt'))

import torch.optim as optim

#optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.7, 0.99))
model.encoder.weight.requires_grad=False
optimizer = optim.Adam([p for p in model.parameters()][1:], lr=3e-4, betas=(0.7, 0.99))
if use_cuda:
    model=model.cuda()

In [78]:
criterion=nn.BCEWithLogitsLoss()

This is the main pytorch training loop!

In [79]:
criterion_ext = torch.nn.MSELoss(reduce=False)
criterion_reg=nn.BCEWithLogitsLoss()
def get_opt(model):
    optim_reg = optim.Adam([p for p in model.parameters()][1:], lr=3e-4, betas=(0.7, 0.99))
    optim_ext = optim.Adam([p for p in model.parameters()][1:], lr=3e-4, betas=(0.7, 0.99))    
    return optim_reg,optim_ext
    
def  train_batch(model, batch, opts):
    opt_reg,opt_ext=opts
    ext_data=False
    if hasattr(batch,'comment'):
        ext_data=True
    if ext_data:
        (x,xl) = get_ext_text(batch)
        y = get_ext_labels(batch)
        w = get_ext_mask(batch)
        
        opt_ext.zero_grad()
        
        model_out = model(x, lengths=xl)
        preds = F.sigmoid(model.decoder_three(model_out))
        loss_mat = criterion_ext(preds, y).mul_(w)
        loss = torch.sum(loss_mat)
        loss.backward()
        opt_ext.step()
        return 0
    else:
        (x,xl) = get_text(batch)
        y = get_labels(batch)
        
        
        opt_reg.zero_grad()
        
        model_out = model(x, lengths=xl)
        preds = model.decoder_six(model_out)
        
        loss = criterion_reg(preds, y)
        loss.backward()
        opt_reg.step()
        return loss.data[0]

class combine_iters:
    def __init__(self, iter1, iter2):
        self.iter1 = iter1
        self.iter2 = iter2
    def __iter__(self):
        self.i1 = iter(self.iter1)
        self.i2 = iter(self.iter2)
        return self
    def __next__(self):
        return next(np.random.choice([self.i1,self.i2]))

In [None]:
c=combine_iters(train,train_ext)
batch=next(iter(c))

In [None]:
from tqdm import tqdm_notebook as tqdm

epochs = 4
train_both=combine_iters(train,train_ext)
opts=get_opt(model)
if use_cuda:
    criterion=criterion.cuda()
val_meter = MultiAUCMeter(6)
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_count = 0
    model.train() 
    t = (tqdm(train_both))
    for i,batch in enumerate(t):
        loss_data = train_batch(model, batch, opts)
        if (loss_data > 0):
            running_loss += loss_data
            running_count += 1
            t.set_postfix(loss=(running_loss/running_count))
        
        
    
    model.eval()
    val_loss = 0.0
    val_count = 0
    val_meter.reset()
    for batch in val:
        (x,xl) = get_text(batch)
        y = get_labels(batch)
        model_out = model(x, lengths=xl)
        preds = model.decoder_six(model_out)
        
        loss = criterion(preds, y)
        
        val_loss += loss.data[0]*len(x)
        val_count += len(x)
        
        val_meter.add(preds,y)
        
    epoch_loss = running_loss / running_count

    print('Epoch: {}, Train Loss: {:.5f}, Val Loss: {:.5f}, Val AUC: {:.5f}'.format(epoch, epoch_loss, val_loss/val_count, val_meter.avg()))

A Jupyter Widget

In [60]:
val_meter.values()

[0.98491247519009295,
 0.99142120234843145,
 0.9929592848428539,
 0.99335421016005565,
 0.98737421728431163,
 0.98692318461323092]

In [30]:
from tqdm import tqdm_notebook as tqdm

epochs = 10
criterion = torch.nn.MSELoss(reduce=False)
if use_cuda:
    criterion=criterion.cuda()
val_meter = MultiAUCMeter(6)
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_count = 0
    model.train() 
    t = tqdm(train_ext)
    for batch in t:
        (x,xl) = get_ext_text(batch)
        y = get_ext_labels(batch)
        w = get_ext_mask(batch)
        
        optimizer.zero_grad()
        
        model_out = model(x, lengths=xl)
        preds = F.sigmoid(model.decoder_three(model_out))
        loss_mat = criterion(preds, y).mul_(w)
        loss = torch.sum(loss_mat)
        loss.backward()
        
        optimizer.step()

        running_loss += loss.data[0]*len(x)
        running_count += len(x)
        t.set_postfix(loss=(running_loss/running_count))
    
    model.eval()
    val_loss = 0.0
    val_count = 0
    for batch in val_ext:
        (x,xl) = get_ext_text(batch)
        y = get_ext_labels(batch)
        w = get_ext_mask(batch)
        model_out = model(x, lengths=xl)
        preds = F.sigmoid(model.decoder_three(model_out))
        
        loss_mat = criterion(preds, y).mul_(w)
        loss = torch.sum(loss_mat)
        
        val_loss += loss.data[0]*len(x)
        val_count += len(x)
        
    tox_val = validate_tox(model)
    epoch_loss = running_loss / running_count

    print('Epoch: {}, Train Loss: {:.5f}, Val Loss: {:.5f}, Tox AUC: {:.5f}'.format(epoch, epoch_loss, val_loss/val_count, tox_val))

A Jupyter Widget


Epoch: 1, Train Loss: 2.40372, Val Loss: 1.60996, Tox AUC: 0.98158


A Jupyter Widget


Epoch: 2, Train Loss: 1.85265, Val Loss: 1.54917, Tox AUC: 0.98232


A Jupyter Widget


Epoch: 3, Train Loss: 1.73819, Val Loss: 1.55202, Tox AUC: 0.98277


A Jupyter Widget


Epoch: 4, Train Loss: 1.64156, Val Loss: 1.47239, Tox AUC: 0.98339


A Jupyter Widget


Epoch: 5, Train Loss: 1.57524, Val Loss: 1.48783, Tox AUC: 0.98412


A Jupyter Widget


Epoch: 6, Train Loss: 1.50035, Val Loss: 1.49340, Tox AUC: 0.98464


A Jupyter Widget


Epoch: 7, Train Loss: 1.43389, Val Loss: 1.62279, Tox AUC: 0.98446


A Jupyter Widget

KeyboardInterrupt: 

In [16]:
from tqdm import tqdm_notebook as tqdm

epochs = 5
criterion=nn.BCEWithLogitsLoss()
if use_cuda:
    criterion=criterion.cuda()
val_meter = MultiAUCMeter(6)
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_count = 0
    model.train() 
    t = tqdm(train)
    for batch in t:
        (x,xl) = get_text(batch)
        y = get_labels(batch)
        
        optimizer.zero_grad()

        model_out = model(x, lengths=xl)
        preds = model.decoder_six(model_out)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        running_loss += loss.data[0]*len(x)
        running_count += len(x)
        t.set_postfix(loss=(running_loss/running_count))
    
    model.eval()
    val_meter.reset()
    val_loss = 0.0
    val_count = 0
    for batch in val:
        (x,xl) = get_text(batch)
        y = get_labels(batch)
        model_out = model(x, lengths=xl)
        preds = model.decoder_six(model_out)
        
        loss = criterion(preds, y)
        
        val_loss += loss.data[0]*len(x)
        val_count += len(x)
        
        val_meter.add(preds,y)
        
    epoch_loss = running_loss / running_count

    print('Epoch: {}, Train Loss: {:.5f}, Val Loss: {:.5f}, Val AUC: {:.5f}'.format(epoch, epoch_loss, val_loss/val_count, val_meter.avg()))

A Jupyter Widget


Epoch: 1, Train Loss: 0.04972, Val Loss: 0.04207, Val AUC: 0.98212


A Jupyter Widget


Epoch: 2, Train Loss: 0.03863, Val Loss: 0.03744, Val AUC: 0.98812


A Jupyter Widget


Epoch: 3, Train Loss: 0.03425, Val Loss: 0.03842, Val AUC: 0.98851


A Jupyter Widget

KeyboardInterrupt: 

In [29]:
val_meter.values()

[0.9842980150327072,
 0.99155026575566363,
 0.99304883423953738,
 0.99339808175244326,
 0.98700536570157626,
 0.99110479328874956]

Great! Now we define a quick convenience function to access the ids from the test data

In [19]:
test_fields=[
    (id_label, INDEX),
    ('nid', None),
    (text_label, TEXT)
]
test_data = ToxicDataset(
            path=f'{path}{test_file}',
            fields=test_fields
        )
INDEX.build_vocab(test_data)
test = data.BucketIterator(test_data, batch_size=128,
                                sort_key=lambda x: len(x.comment_text),
                                sort_within_batch=True, train=False, repeat=False)

def get_ids(batch):
    return getattr(batch, id_label).data.cpu().numpy().astype(int)

In [22]:
from sklearn.metrics import roc_auc_score
import pandas as pd
val_raw = pd.read_csv(f'{path}val.csv')
def score_val(val_raw, val_preds):
    avg_auc = 0 
    for i,label in enumerate(label_cols):
        auc_score = roc_auc_score(val_raw[label].values,val_preds[:,i])
        avg_auc += auc_score
        print('{}: {}'.format(label, auc_score))
    print('avg: {}'.format(avg_auc/6))

NameError: name 'path' is not defined

In [15]:
score_val(val_preds)

NameError: name 'val_preds' is not defined

In [16]:
import numpy as np
def update_val(model, val_preds):
    model.eval() # turn on evaluation mode
    for batch in val:
        (x,xl) = get_text(batch)
        ids = get_ids(batch)
        preds=model.decoder_six(model(x,lengths=xl))
        preds = preds.data.cpu().numpy()
        preds = 1/(1+np.exp(-np.clip(preds,-10,10)))
        val_preds[ids]+=preds
    return val_preds
def update_test(model, test_preds):
    model.eval()
    for batch in test:
        (x,xl) = get_text(batch)
        ids = get_ids(batch)
        preds=model.decoder_six(model(x,lengths=xl))
        preds = preds.data.cpu().numpy()
        preds = 1/(1+np.exp(-np.clip(preds,-10,10)))
        test_preds[ids]+=preds

In [143]:
val_preds = []
val_targets = []
model.eval() # turn on evaluation mode
for batch in val:
    (x,xl) = get_text(batch)
    y = get_labels(batch)
    preds=model(x,lengths=xl)
    preds = preds.data.cpu().numpy()
    targets = y.data.cpu().numpy()
    val_preds.append(preds[:,0])
    val_targets.append(targets[:,0])

In [144]:
import numpy as np
from sklearn.metrics import roc_auc_score
preds_flat=np.hstack(val_preds)
targets_flat=np.hstack(val_targets)
roc_auc_score(targets_flat,preds_flat)

0.98500197729892547

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score
def validate_tox(three_model):
    val_preds = []
    val_targets = []
    model.eval() # turn on evaluation mode
    for batch in val:
        (x,xl) = get_text(batch)
        y = get_labels(batch)
        preds=model(x,lengths=xl)
        preds = preds.data.cpu().numpy()
        targets = y.data.cpu().numpy()
        val_preds.append(preds[:,0])
        val_targets.append(targets[:,0])
    preds_flat=np.hstack(val_preds)
    targets_flat=np.hstack(val_targets)
    return roc_auc_score(targets_flat,preds_flat)

In [132]:
val_targets[0][:,0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)

In [24]:
from tqdm import tqdm

import torch
import torch.nn as nn
    
def make_model(dl):
    use_cuda = torch.cuda.is_available()
    nhidden=160
    emsize=300
    nlayers = 2
    dropemb = 0.0
    droprnn = 0.2
    droplin = 0.0
    model = RNNModel('GRU', dl.ntokens, emsize, nhidden, 16, 3, nlayers, dropemb=dropemb, droprnn=droprnn, droplin=droplin, bidirectional=True)
    model.encoder.weight.data.copy_(dl.TEXT.vocab.vectors)
    model.encoder.weight.requires_grad=False
    if use_cuda:
        model=model.cuda()
    return model

def run_iters(dl, model, epochs=5):
    opts=get_opt(model)
    criterion=nn.BCEWithLogitsLoss()
    val_meter = MultiAUCMeter(6)
    for epoch in range(1, epochs + 1):
        running_loss = 0.0
        running_count = 0
        model.train() 
        t = tqdm(dl.train_both)
        for batch in t:
            loss_data = train_batch(model, batch, opts)
            if (loss_data > 0):
                running_loss += loss_data
                running_count += 1
                t.set_postfix(loss=(running_loss/running_count))

        model.eval()
        val_loss = 0.0
        val_count = 0
        val_meter.reset()
        for batch in val:
            (x,xl) = get_text(batch)
            y = get_labels(batch)
            model_out = model(x, lengths=xl)
            preds = model.decoder_six(model_out)

            loss = criterion(preds, y)

            val_loss += loss.data[0]*len(x)
            val_count += len(x)

            val_meter.add(preds,y)

        epoch_loss = running_loss / running_count

        print('Epoch: {}, Train Loss: {:.5f}, Val Loss: {:.5f}, Val AUC: {:.5f}'.format(epoch, epoch_loss, val_loss/val_count, val_meter.avg()))

In [25]:
test_preds = np.zeros((len(test_data),6))
val_preds = np.zeros((len(val_data),6))
for i in range(5):
    model = make_model()
    run_iters(model, epochs=1)
    update_val(model, val_preds)
    score_val(val_preds)
    update_test(model, test_preds)

A Jupyter Widget


Epoch: 1, Train Loss: 0.05951, Val Loss: 0.04520, Val AUC: 0.97821
toxic: 0.9827326813307198
severe_toxic: 0.9909718317807407
obscene: 0.9900863439246017
threat: 0.9544718767964661
insult: 0.9837074805070043
identity_hate: 0.967298269419227
avg: 0.9782114139597932


A Jupyter Widget


Epoch: 1, Train Loss: 0.06231, Val Loss: 0.04583, Val AUC: 0.98012
toxic: 0.9831562624486331
severe_toxic: 0.9905621730306595
obscene: 0.9900159499836896
threat: 0.961447459986082
insult: 0.984004331942557
identity_hate: 0.9701644428649889
avg: 0.9798917700427682


A Jupyter Widget


Epoch: 1, Train Loss: 0.06117, Val Loss: 0.04376, Val AUC: 0.97925
toxic: 0.9834003211747316
severe_toxic: 0.9910104985505187
obscene: 0.9904072685801552
threat: 0.9606078484765967
insult: 0.9840482878335165
identity_hate: 0.9713953413239428
avg: 0.9801449276565769


A Jupyter Widget


Epoch: 1, Train Loss: 0.06104, Val Loss: 0.04529, Val AUC: 0.98008
toxic: 0.9835877058610786
severe_toxic: 0.9910136336940141
obscene: 0.9907012057462972
threat: 0.9605972587818826
insult: 0.9843196806534908
identity_hate: 0.9720500974595893
avg: 0.9803782636993921


A Jupyter Widget


Epoch: 1, Train Loss: 0.05887, Val Loss: 0.04592, Val AUC: 0.97796
toxic: 0.9835267730784552
severe_toxic: 0.9909300298674669
obscene: 0.9907799978919563
threat: 0.9603597470575777
insult: 0.9843585647108781
identity_hate: 0.9719429457564694
avg: 0.9803163430604673


And go ahead and store the data in a matrix. Because we get the comments out of order, the ids help us reorder them later

In [45]:
import numpy as np
test_preds = np.zeros((len(test_data), 6))
model.eval()
for batch in test:
    (x,xl) = get_text(batch)
    ids = get_ids(batch)
    model_out=model(x,lengths=xl)
    preds=model.decoder_six(model_out)
    preds = preds.data.cpu().numpy()
    preds = 1/(1+np.exp(-np.clip(preds,-10,10)))
    test_preds[ids]=preds

Great, now reread the test file with pandas and write the output!

In [46]:
import pandas as pd
df = pd.read_csv(f'{path}test.csv')
for i, col in enumerate(label_cols):   
    df[col] = test_preds[:, i]
df.drop(text_label,axis=1).to_csv("submission_ext_gru_short.csv",index=False)
df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,0.999079,0.440328,0.987529,0.116246,0.915932,0.163502
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0.000187,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0.000261,4.5e-05,6.2e-05,4.5e-05,0.000138,0.000116
3,00017563c3f7919a,":If you have a look back at the source, the in...",7.2e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05
4,00017695ad8997eb,I don't anonymously edit articles at all.,0.000784,4.5e-05,5.5e-05,8e-05,0.000135,5.7e-05
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0.000188,4.5e-05,4.5e-05,4.6e-05,8.6e-05,4.5e-05
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...,0.002832,4.5e-05,0.0001,4.5e-05,0.000124,4.5e-05
7,000247e83dcc1211,:Dear god this site is horrible.,0.361152,0.000344,0.002454,0.000458,0.008306,0.001681
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ...",0.035903,4.6e-05,0.000354,0.000118,0.001758,8.1e-05
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...,0.000232,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05


In [39]:
import pandas as pd
df = pd.read_csv(f'{path}val.csv')
for i, col in enumerate(label_cols):   
    df[col] = val_preds[:, i]
df.drop(text_label,axis=1).to_csv("val_gru_2layer.csv",index=False)
df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,180a2e56f6a7c517,GA Review\n:This review is transcluded from Ta...,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05
1,180abdd1373fd099,"""\n Marx? that's just historic ignorance. Stal...",0.025586,4.5e-05,0.002081,8.6e-05,0.003901,0.001035
2,180b31f596d885b8,@ Good! Just tell me how delete my account so ...,0.147984,0.000462,0.003227,0.035831,0.00571,0.000593
3,180c9b29c0d8c8e2,Sabata's Counterattack vs. Sabata's Revenge.\n...,5.8e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05
4,180fcc239f7ebfc3,"Pmanderson|PMAnderson]] 23:42, 16 December",0.008826,8.3e-05,0.002341,0.000149,0.001971,0.000663
5,1819829ecc4e6c5e,Re: Question at my RfA\nWhile I am admittedly ...,4.6e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05
6,181cf62d39ad3686,"""\n\nPlease stop. If you continue to vandalize...",0.002493,4.5e-05,0.000552,8.4e-05,0.000594,4.5e-05
7,181cfafb023c1891,"Because you're a DUMBASS, MastCell, that's why.",0.951092,0.005244,0.568394,0.000678,0.73746,0.021998
8,1820bae92b22e1f8,"Dear Pedant,\n\nI'll sign what I want to sign ...",0.011701,4.5e-05,0.000808,4.5e-05,0.001083,4.5e-05
9,182510a56c64a5cd,Alphabetize track list \n\nThe track list shou...,5.9e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05


In [57]:
f=open('vocab.txt','w')
for word in TEXT.vocab.itos:
    f.write(f'{word}\n')
f.close()