In [90]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import re
import string
from torch.utils.data import DataLoader
from torch import nn
import time
from torch.utils.data.dataset import random_split

In [99]:
data_dir = "./data/train.csv"
data_csv = pd.read_csv(data_dir)
data_csv

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0


In [102]:
train_csv = data_csv[:1_000_000]
test_csv = data_csv[1_000_000:]

In [30]:
glove_dir = "./data/embeddings/glove.840B.300d/glove.840B.300d.txt"

glove = {}
vocab = set()

with open(glove_dir, encoding = "utf8") as f:
    for line in tqdm(f):
        values = line.split(" ")
        word = values[0]
        vector = np.asarray(values[1:], dtype = 'float32')
        glove[word] = vector
        vocab.add(word)

print('Found %s word vectors.' % len(glove))
print(f'Vocab length: {len(vocab)}')

2196017it [01:00, 36221.78it/s]

Found 2196016 word vectors.
Vocab length: 2196016





In [63]:
def text_to_list(text, vocab = vocab):
    text = text.replace('\n', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.replace('/', ' ').split()
    return [word for word in text if word in vocab]

test_str = "Why can some birds fly, but others can't?"

text_to_list(test_str)

['Why', 'can', 'some', 'birds', 'fly', 'but', 'others', 'cant']

In [73]:
def tokenizer(text_list):
    # TODO: tokenizer shouldn't embed text.
    # simply return index of word in vocab.
    return np.array([glove.get(word) for word in text_list])

tokenizer(text_to_list(test_str)).shape # np.array of (num_words X 300)

(8, 300)

In [79]:
text_pipeline = lambda x : tokenizer(text_to_list(x))
label_pipeline = lambda x : x

In [127]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]

    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.float32)
        text_list.append(processed_text)

        offsets.append(processed_text.size(0)) # num_words

    label_list = torch.tensor(label_list, dtype = torch.int16)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim = 0)
    text_list = torch.cat(text_list)

    return label_list.to(device), text_list.to(device), offsets.to(device)

In [116]:
mini_csv = train_csv[:50]
mini_iter = ((row['target'], row['question_text'])
             for _, row in mini_csv.drop(columns = ["qid"]).iterrows())
mini_dataset = to_map_style_dataset(mini_iter)

In [131]:
a, b, c = collate_batch(mini_dataset)
c

tensor([  0,  13,  29,  39,  48,  63,  73,  91, 105, 123, 166, 172, 178, 190,
        209, 234, 245, 260, 268, 289, 298, 311, 328, 339, 366, 375, 385, 400,
        408, 420, 430, 445, 457, 469, 491, 497, 522, 535, 540, 553, 560, 573,
        583, 592, 603, 619, 642, 658, 688, 729], device='mps:0')

In [135]:
b[c[2]:c[3]].size()

torch.Size([10, 300])

In [85]:
class QuestionClassifier(nn.Module):
    
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(QuestionClassifier, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse = False)
        self.fc = nn.Linear(embed_dim, num_classes)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [86]:
num_classes = 2
vocab_size = len(vocab)
emsize = 300
model = QuestionClassifier(vocab_size, emsize, num_classes).to(device)

In [88]:
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predictied_label = model(text, offsets)
        loss = criterion(predictied_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predictied_label.argmax(1) == label()).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += lable.size(0)
    return total_acc / total_count

In [94]:
def to_map_style_dataset(iter_data):
    r"""Convert iterable-style dataset to map-style dataset.

    args:
        iter_data: An iterator type object. Examples include Iterable datasets, string list, text io, generators etc.


    Examples:
        >>> from torchtext.datasets import IMDB
        >>> from torchtext.data import to_map_style_dataset
        >>> train_iter = IMDB(split='train')
        >>> train_dataset = to_map_style_dataset(train_iter)
        >>> file_name = '.data/EnWik9/enwik9'
        >>> data_iter = to_map_style_dataset(open(file_name,'r'))
    """

    # Inner class to convert iterable-style to map-style dataset
    class _MapStyleDataset(torch.utils.data.Dataset):
        def __init__(self, iter_data) -> None:
            # TODO Avoid list issue #1296
            self._data = list(iter_data)

        def __len__(self):
            return len(self._data)

        def __getitem__(self, idx):
            return self._data[idx]

    return _MapStyleDataset(iter_data)

In [103]:
# Hyperparameters
EPOCHS = 10
LR = 5
BATCH_SIZE = 512

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma = 0.1)
total_accu = None
train_iter = ((row['target'], row['question_text'])
              for _, row in train_csv.drop(columns = ["qid"]).iterrows())
test_iter = ((row['target'], row['question_text'])
             for _, row in test_csv.drop(columns = ["qid"]).iterrows())
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train]
)

train_dataloader = DataLoader(
    split_train_, batch_size = BATCH_SIZE, 
    shuffle = True, collate_fn = collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size = BATCH_SIZE, 
    shuffle = True, collate_fn = collate_batch
)
train_dataloader = DataLoader(
    test_dataset, batch_size = BATCH_SIZE, 
    shuffle = True, collate_fn = collate_batch
)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

ValueError: if input is 2D, then offsets has to be None, as input is treated is a mini-batch of fixed length sequences. However, found offsets of type <class 'torch.Tensor'>