In [1]:
import pandas as pd
import numpy as np
import torch
import re
import spacy
import logging
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
from torch.autograd import Variable


import torchtext
from torchtext.data import Field
from torchtext import vocab, data
from torchtext.datasets import language_modeling
from torchtext.data import Iterator, BucketIterator

PATH='data/comments/'

### TO DO

- Use other vectors (GLOVE 10B)


In [2]:
VAL_RATIO = 0.2

def prepare_csv(seed=999):
    df_train = pd.read_csv(f'{PATH}train.csv')
    df_test = pd.read_csv(f'{PATH}test.csv')
    
    # Remove newline characters, torchtext dosent handle it well
    df_train["comment_text"] = df_train.comment_text.str.replace("\n", " ")
    df_test["comment_text"] = df_test.comment_text.str.replace("\n", " ")
    
    # Split to train and val sets
    idx = np.arange(df_train.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idx)
    val_size = int(len(idx) * VAL_RATIO)
    
    # Save new datasets
    df_train.iloc[idx[val_size:],:].to_csv(f'{PATH}dataset_train.csv', index=False)
    df_train.iloc[idx[:val_size],:].to_csv(f'{PATH}dataset_val.csv', index=False)
    df_test.to_csv(f'{PATH}dataset_test.csv', index=False)

In [None]:
# Run it once to create datasets
# prepare_csv()

In [10]:
NLP = spacy.load('en')
MAX_CHARS = 20000

def tokenizer(comment):
    comment = re.sub(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(comment))
    comment = re.sub(r"[ ]+", " ", comment)
    comment = re.sub(r"\!+", "!", comment)
    comment = re.sub(r"\,+", ",", comment)
    comment = re.sub(r"\?+", "?", comment)
    
    # Cut very long comments
    if (len(comment)> MAX_CHARS):
        comment = comment[:MAX_CHARS]
        
    return [x.text for x in NLP.tokenizer(comment) if x.text != " "]

In [23]:
# Creates 3 datasets + encoding for vocabulary (comment) we need it to reuse
def get_dataset(fix_length=100, lower=False, vectors=None):
    if vectors is not None:
        # pretrain vectors only supports all lower cases
        lower = True

    comment = data.Field(
        sequential=True,
        fix_length=fix_length,
        tokenize=tokenizer,
        pad_first=True,
        tensor_type=torch.cuda.LongTensor,
        lower=lower)
    
    field = data.Field(
        use_vocab=False, 
        sequential=False, 
        tensor_type=torch.cuda.ByteTensor)

    train, val = data.TabularDataset.splits(
        path=f'{PATH}', format='csv', skip_header=True,
        train='dataset_train.csv', validation='dataset_val.csv',
        fields=[
            ('id', None),
            ('comment_text', comment),
            ('toxic', field),
            ('severe_toxic', field),
            ('obscene', field),
            ('threat', field),
            ('insult', field),
            ('identity_hate', field),
        ])

    test = data.TabularDataset(
        path=f'{PATH}dataset_test.csv', 
        format='csv', 
        skip_header=True,
        fields=[
            ('id', None),
            ('comment_text', comment)
        ])

    comment.build_vocab(
        train, val, test,
        max_size=20000,
        min_freq=50,
        vectors=vectors
    )

    return train, val, test, comment

In [24]:
train_dataset, validation_dataset, test_dataset, TEXT = get_dataset(vectors="glove.6B.100d")

In [25]:
TEXT.vocab.freqs.most_common(10)

[('.', 952754),
 ('the', 917994),
 (',', 872399),
 ('to', 538875),
 ('i', 431924),
 ('of', 409903),
 ('and', 408699),
 ('a', 406372),
 ('you', 393590),
 ('is', 342373)]

In [26]:
train_dataset.__dict__.keys()

dict_keys(['examples', 'fields'])

In [27]:
" ".join(train_dataset[0].comment_text)

'i made those hiphop edits my brother was on this computer earlier so he is probably the culprit i will tell him to stop the vandalism .'

#### Constructing our iterators (previously DataLoaders) using BucketIterator
It automaticlly shuffles and buckets the input sequences into sequences of similar lenght.

In [28]:
train_iter, val_iter = BucketIterator.splits((train_dataset, validation_dataset),
                                            batch_sizes=(64,64),
                                            device=None, # using GPU
                                            # A key to use for sorting examples in order to batch together
                                            # examples with similar lengths and minimize padding.
                                            sort_key=lambda x: len(x.comment_text), 
                                            sort_within_batch=False,
                                            repeat=False)

test_iter = Iterator(test_dataset, 
                     batch_size=64,
                     device=None, # using GPU
                     sort=False,
                     sort_within_batch=False,
                     repeat=False)

#### Training our Model

In [31]:
class SimpleLSTMBaseline(nn.Module):
    def __init__ (self, input_size, hidden_size, num_layers):
        super(SimpleLSTMBaseline, self).__init__()
        
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(input_size, hidden_size, num_layers)
        self.linear_layers= []
        for _ in range(num_layers - 1):
            self.linear_layers.append(nn.Linear(hidden_size, hidden_size))
            self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_size, 6)
        
    def forwrad(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1,:,:]
        for layer in self.linear_layers:
            feature = layer(feature)
            preds = self.predictior(feature)
        return preds

hidden_dim = 500    # Size of hidden layer
emb_dim = 300 # Embedding size
nl = 3      # Number of layers

model = SimpleLSTMBaseline(emb_dim, hidden_dim, nl)
model.cuda()

TypeError: ModuleList.extend should be called with a list, but got ModuleList

Training loop

In [None]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()

epochs = 2

    
# %%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x, y in tqdm.tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()
        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        
        running_loss += loss.data[0] * x.size(0)
        
    epoch_loss = running_loss / len(train_dataset)
    
    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.data[0] * x.size(0)

    val_loss /= len(validation_dataset)
    
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))