In [1]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

from tokenizers import CharBPETokenizer

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

In [14]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [10]:
encoded = tokenizer.encode_plus("I can feel the magic, can you?")
encoded['input_ids']

[101, 1045, 2064, 2514, 1996, 3894, 1010, 2064, 2017, 1029, 102]

In [11]:
encoded['attention_mask']


[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [12]:
encoded["token_type_ids"]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [13]:
df = pd.read_csv("./data/train.csv")
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['comment_text', 'list']].copy()
new_df.head()

Unnamed: 0,comment_text,list
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [15]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [16]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (159571, 2)
TRAIN Dataset: (127657, 2)
TEST Dataset: (31914, 2)


In [23]:
train_dataset.iloc[0,0]

"Goal scored for Portugal \n\nThis could be miles off the mark but did Ricardo actually score a goal while playing for Portugal? I sincerely hope this was no referring to the penalty he scored against England which, of course, was in a penalty shootout. The page about 'goalscoring goalkeepers' only claims he scored a goal for one of his club sides and a quick internet search turned up nothing."

In [18]:
training_set.__getitem__(0)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'ids': tensor([  101,  3125,  3195,  2005,  5978,  2023,  2071,  2022,  2661,  2125,
          1996,  2928,  2021,  2106, 13559,  2941,  3556,  1037,  3125,  2096,
          2652,  2005,  5978,  1029,  1045, 25664,  3246,  2023,  2001,  2053,
          7727,  2000,  1996,  6531,  2002,  3195,  2114,  2563,  2029,  1010,
          1997,  2607,  1010,  2001,  1999,  1037,  6531, 18297,  1012,  1996,
          3931,  2055,  1005,  3289, 27108,  2075,  9653,  2015,  1005,  2069,
          4447,  2002,  3195,  1037,  3125,  2005,  2028,  1997,  2010,  2252,
          3903,  1998,  1037,  4248,  4274,  3945,  2357,  2039,  2498,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  