In [2]:
import pandas as pd
import numpy as np
import torch

# Loading the data

First, let's examine what the data looks like.

(Note: This repo does not contain the full data. To get the full data, go to the [Kaggle competition page](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) and download the data for yourself.)

In [3]:
# Peek at the raw training data: load the CSV and show its first two rows.
df = pd.read_csv("data/train.csv")
df.head(2)

FileNotFoundError: [Errno 2] File b'data/train.csv' does not exist: b'data/train.csv'

### Declaring Fields

The Field class determines how the data is preprocessed and converted into a numeric format

In [4]:
from torchtext.data import Field

In [5]:
def tokenize(x):
    """Split ``x`` on runs of whitespace and return the list of tokens.

    Written as a ``def`` instead of a lambda bound to a name so that the
    function carries its own name in tracebacks and can be pickled.
    """
    return x.split()
# Field for the comment text: sequential data, split with `tokenize`,
# with lower-casing requested.
TEXT = Field(
    sequential=True,
    tokenize=tokenize,
    lower=True,
)

That was simple. Preprocessing the labels is even easier, since they are already converted into a binary encoding.
All we need to do is tell the Field class that the labels are already processed. We do this by passing the `use_vocab=False` keyword to the constructor.

In [6]:
# Field for the labels: non-sequential, and no vocabulary is built for it
# (use_vocab=False) because the labels are already processed.
LABEL = Field(use_vocab=False, sequential=False)

### Creating the Dataset

We'll use the TabularDataset class to read our data, since it is in csv format (TabularDataset handles csv, tsv, and json files as of now)

In [7]:
from torchtext.data import TabularDataset

For the train and validation data, we need to process the labels. The fields we pass in must be in the same order as the columns. For fields we don't use, we pass in a tuple where the second element is None

In [8]:
%%time
# (column name, Field) pairs; they must be listed in the same order as the
# columns of the CSV files.
tv_datafields = [
    ("text", TEXT),
    ("stars", LABEL),
]

# Build the training and validation datasets from files under `data/`.
trn, vld = TabularDataset.splits(
    path="data",  # the root directory where the data lies
    train="train.csv",
    validation="valid.csv",
    format="csv",
    skip_header=True,  # if the CSV has a header row, this keeps it from being processed as data
    fields=tv_datafields,
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/train.csv'

For the test data, we don't have any labels

In [9]:
%%time
# Field layout for the test CSV: the same two (column name, Field) pairs
# that are used for the train/validation files.
tst_datafields = [("text", TEXT), ("stars", LABEL)]

tst = TabularDataset(
    path="data/test.csv",  # the file path
    format="csv",
    skip_header=True,  # if the CSV has a header row, this keeps it from being processed as data
    fields=tst_datafields,
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/test.csv'

For the TEXT field to convert words into integers, it needs to be told what the entire vocabulary is. To do this, we run TEXT.build_vocab, passing in the dataset to build the vocabulary on.

In [10]:
%%time
# Build the vocabulary for the TEXT field from the training dataset only.
TEXT.build_vocab(trn)

NameError: name 'trn' is not defined

Let's take a look at what the vocab looks like.

The vocab.freqs is a collections.Counter object, so we can take a look at the most frequent words.

In [11]:
# Show the ten most frequent tokens together with their counts.
TEXT.vocab.freqs.most_common(10)

AttributeError: 'Field' object has no attribute 'vocab'

It is also instructive to take a look inside the Dataset. Datasets can be indexed like normal lists, so we'll look at the first element.

In [12]:
# Display the first element of the training dataset.
trn[0]

NameError: name 'trn' is not defined

Each element of the dataset is an Example object that bundles the attributes of a single data point together.

In [13]:
# List the attribute names stored on the first training example.
vars(trn[0]).keys()

NameError: name 'trn' is not defined

We see that the comment text is already tokenized for us.

In [14]:
# Display the `stars` attribute of the first training example.
trn[0].stars

NameError: name 'trn' is not defined

### Creating the Iterator

In [15]:
from torchtext.data import Iterator, BucketIterator

During training, we'll be using a special kind of Iterator, called the **BucketIterator**.

When we pass data into a neural network, we want the data to be padded to be the same length so that we can process them in batch:

e.g.
\[ 
\[3, 15, 2, 7\],
\[4, 1\], 
\[5, 5, 6, 8, 1\] 
\] -> \[ 
\[3, 15, 2, 7, **0**\],
\[4, 1, **0**, **0**, **0**\], 
\[5, 5, 6, 8, 1\] 
\] 

If the sequences differ greatly in length, the padding will waste a lot of memory and time.

The BucketIterator groups sequences of similar lengths together for each batch to minimize padding. Handy, right?

In [16]:
# Bucketed iterators over the training and validation datasets.
train_iter, val_iter = BucketIterator.splits(
    (trn, vld),  # datasets the iterators draw from, in the same order as the outputs
    batch_sizes=(64, 64),
    sort_key=lambda example: len(example.text),  # key the BucketIterator uses to group the data
    sort_within_batch=False,
    repeat=False,  # repeat is off because this iterator gets wrapped by another layer
    device=-1,  # if you want to use the GPU, specify the GPU number here
)

NameError: name 'trn' is not defined

Let's take a look at what the output of the BucketIterator looks like

In [17]:
# Pull a single batch from the training iterator and display it.
batch = next(iter(train_iter))
batch

NameError: name 'train_iter' is not defined

The batch has all the fields we passed to the Dataset as attributes. The batch data can be accessed through the attribute with the same name.

In [18]:
# List the attribute names available on the batch object.
vars(batch).keys()

NameError: name 'batch' is not defined

For the test set, we don't want the data to be shuffled. This is why we'll be using a standard Iterator.

In [None]:
# Standard Iterator over the test dataset, with sorting switched off.
test_iter = Iterator(
    tst,
    batch_size=64,
    device=-1,
    sort=False,
    sort_within_batch=False,
    repeat=False,
)

### Wrapping the Iterator

Currently, the iterator returns a custom datatype called torchtext.data.Batch. This makes code reuse difficult (since each time the column names change, we need to modify the code), and makes torchtext hard to use with other libraries for some use cases (like torchsample and fastai). 

I hope this will be dealt with in the future (I'm considering filing a PR if I can decide what the API should look like), but in the meantime, we'll hack on a simple wrapper to make the batches easy to use. 

Concretely, we'll convert the batch to a tuple in the form (x, y) where x is the independent variable (the input to the model) and y is the dependent variable (the supervision data).

In [None]:
class BatchWrapper:
    """Adapt an iterable of batch objects into a stream of ``(x, y)`` tuples.

    ``dl`` is the wrapped iterable. ``x_var`` is the name of the batch
    attribute yielded as ``x`` and ``y_var`` the name of the attribute
    yielded as ``y``. When ``y_var`` is ``None`` a one-element zero tensor
    is yielded in place of ``y``.
    """

    def __init__(self, dl, x_var, y_var):
        self.dl = dl
        self.x_var = x_var
        self.y_var = y_var

    def __iter__(self):
        """Yield one ``(x, y)`` tuple per batch produced by ``self.dl``."""
        for batch in self.dl:
            inputs = getattr(batch, self.x_var)  # only a single input attribute is supported
            if self.y_var is None:
                target = torch.zeros((1))  # placeholder when no target attribute is configured
            else:
                target = getattr(batch, self.y_var)
            yield (inputs, target)

    def __len__(self):
        """Return the length reported by the wrapped iterable."""
        return len(self.dl)

We'll use this to wrap the BucketIterator

In [None]:
# Wrap every iterator so that it yields plain (x, y) tuples.
train_dl = BatchWrapper(dl=train_iter, x_var="text", y_var="stars")
valid_dl = BatchWrapper(dl=val_iter, x_var="text", y_var="stars")
test_dl = BatchWrapper(dl=test_iter, x_var="text", y_var=None)  # no target attribute requested
len(test_dl)

In [None]:
# Size of the input tensor in the first wrapped test batch.
next(iter(test_dl))[0].size()

Now we're ready to start training a model!

# Training a Text Classifier

We'll use a simple LSTM as a baseline example.

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [1]:
class SimpleBiLSTMBaseline(nn.Module):
    """Embedding -> single-layer LSTM -> optional hidden linear layers -> linear predictor.

    NOTE(review): despite the class name, the encoder is a unidirectional
    LSTM (``bidirectional`` is never set). The name is kept so existing
    callers and saved state dicts keep working.

    Args:
        hidden_dim: size of the LSTM hidden state and of every hidden linear layer.
        emb_dim: size of the token embeddings.
        spatial_dropout: accepted for backward compatibility; not used.
        recurrent_dropout: accepted for backward compatibility; not used.
            ``nn.LSTM`` applies its ``dropout`` only between stacked layers,
            so with the single layer built here it had no effect and only
            triggered a warning.
        num_linear: ``num_linear - 1`` hidden ``hidden_dim -> hidden_dim``
            linear layers are created in front of the predictor.
        vocab_size: number of rows of the embedding table. When omitted it
            falls back to ``len(TEXT.vocab)``, i.e. the notebook-level field.
        num_classes: number of scores produced per example.
        dropout: probability of the dropout applied to the embeddings and
            after every hidden linear layer.
    """

    def __init__(self, hidden_dim, emb_dim=300,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1,
                 vocab_size=None, num_classes=6, dropout=0.8):
        super().__init__()  # required before registering sub-modules
        if vocab_size is None:
            # Original behaviour: size the embedding from the global TEXT field.
            vocab_size = len(TEXT.vocab)
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        # No `dropout=` here: it is a no-op for num_layers=1 and emits a UserWarning.
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1)
        self.linear_layers = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_linear - 1)]
        )
        self.predictor = nn.Linear(hidden_dim, num_classes)
        self.nnDropout = nn.Dropout(dropout)

    def forward(self, seq):
        """Return one row of ``num_classes`` scores per sequence in ``seq``.

        The LSTM is built with the default ``batch_first=False``, so ``seq``
        is indexed as (time step, batch). The encoder output at the last
        time step is used as the feature vector.
        NOTE(review): for padded batches the last time step may be padding.
        """
        hdn, _ = self.encoder(self.nnDropout(self.embedding(seq)))
        feature = hdn[-1, :, :]  # encoder output at the final time step
        for layer in self.linear_layers:
            feature = self.nnDropout(F.relu(layer(feature)))
        return self.predictor(feature)

NameError: name 'nn' is not defined

In [None]:
# Hyperparameters for the baseline model.
em_sz = 32  # passed as the embedding size
nh = 32  # passed as the hidden size
nl = 2  # NOTE(review): defined here but not passed to the model below
model = SimpleBiLSTMBaseline(hidden_dim=nh, emb_dim=em_sz)

If you're using a GPU, remember to call model.cuda() to move your model to the GPU.

In [None]:
# model.cuda()

### The training loop

In [None]:
import tqdm

In [None]:
# Cross-entropy objective, optimised with Adam over all model parameters.
loss_func = nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Epoch count read by the training loop below.
epochs = 2

In [None]:
%%time
# Train for exactly `epochs` passes over the training data, reporting the
# per-example training and validation loss after each pass.
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    model.train()  # turn on training mode
    for x, y in tqdm.tqdm(train_dl):  # the wrapper yields plain (x, y) tuples
        opt.zero_grad()

        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()

        # The loss is a per-batch mean (default reduction of the loss above),
        # so weight it by the batch size before dividing by len(trn) below.
        # Assumes y holds one entry per example along dim 0.
        running_loss += loss.item() * y.size(0)

    epoch_loss = running_loss / len(trn)

    # Validation loss for this epoch; no gradients are needed here.
    val_loss = 0.0
    model.eval()  # turn on evaluation mode
    with torch.no_grad():
        for x, y in valid_dl:
            preds = model(x)
            loss = loss_func(preds, y)
            val_loss += loss.item() * y.size(0)

    val_loss /= len(vld)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

In [None]:
# Write the model's state dict to a file in the current working directory.
save_name = "baseline_model.pth"
torch.save(model.state_dict(), save_name)

# Writing Predictions

Finally, we output the data in the format required by the competition

In [None]:
def get_accuracy(scores: torch.Tensor, labels: torch.Tensor):
    """Count how many rows of ``scores`` have their arg-max equal to the label.

    The arg-max is taken along dim 1 of ``scores`` and compared element-wise
    with ``labels``. The return value is the number of matches as a float
    tensor; it is a count, not a ratio, so it is not divided by the number
    of examples.
    """
    return (scores.argmax(dim=1) == labels).sum().float()

In [None]:
# Collect one tensor of sigmoid-transformed model outputs per test batch.
preds_list = []
model.eval()  # make sure dropout is disabled for inference
with torch.no_grad():  # stored predictions do not need autograd graphs
    for x, _ in tqdm.tqdm(test_dl):  # the y slot of the test wrapper is only a placeholder
        # NOTE(review): the model is trained with CrossEntropyLoss, so a
        # softmax may be the intended transform here; confirm before use.
        preds = torch.sigmoid(model(x))

        # if your data is on the GPU, you need to move the data back to the cpu
        # preds = preds.data.cpu().numpy()

        preds_list.append(preds)



In [None]:
# Display the collected per-batch predictions.
preds_list