In [1]:
# For tips on running notebooks in Google Colab, see
# https://pytorch.org/tutorials/beginner/colab
%matplotlib inline

In [2]:
from torch import nn
from torchtext.data.functional import to_map_style_dataset
import time
import torchtext.transforms as T

In [3]:
import torch
from torchtext.datasets import AG_NEWS

# NOTE(review): this iterator is rebound by `train_iter = AG_NEWS(split="train")`
# in the next cell before anything consumes it.
train_iter = iter(AG_NEWS(split="train"))

In [4]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Tokenizer requested from torchtext under the name "basic_english".
tokenizer = get_tokenizer("basic_english")
# Training split; streamed once below to build the vocabulary.
train_iter = AG_NEWS(split="train")


def yield_tokens(data_iter):
    """Lazily yield the tokenized text of every sample in ``data_iter``.

    Each sample must unpack into exactly two items; the first is ignored
    and the second is passed to the module-level ``tokenizer``.
    """
    yield from (tokenizer(raw_text) for _unused, raw_text in data_iter)


# Build the vocabulary from the tokenized training texts, with "<unk>" as the
# only special token.
# NOTE(review): "<unk>" presumably receives index 0 (torchtext inserts specials
# first by default — confirm). The collate function pads with 0 and the model
# uses padding_idx=0, so unknown tokens and padding share one index.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
# Tokens missing from the vocabulary map to the "<unk>" index instead of raising.
vocab.set_default_index(vocab["<unk>"])

In [5]:
def text_pipeline(x):
    """Tokenize the raw string ``x`` and map its tokens through ``vocab``."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Cast ``x`` to ``int`` and subtract 1.

    Shifts labels down by one so that a label of 1 becomes class index 0
    (assumes the dataset's labels start at 1 — confirm against the data).
    """
    return int(x) - 1

In [6]:
# from torch.utils.data import DataLoader

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# def collate_batch(batch):
#     label_list, text_list, offsets = [], [], [0]
#     for _label, _text in batch:
#         label_list.append(label_pipeline(_label))
#         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
#         text_list.append(processed_text)
#         offsets.append(processed_text.size(0))
#     label_list = torch.tensor(label_list, dtype=torch.int64)
#     offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
#     text_list = torch.cat(text_list)
#     return label_list.to(device), text_list.to(device), offsets.to(device)


In [7]:
# from torch import nn
# from torchtext.data.functional import to_map_style_dataset
# import time

# class TextClassificationModel(nn.Module):
#     def __init__(self, vocab_size, embed_dim, num_class):
#         super(TextClassificationModel, self).__init__()
#         self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
#         self.fc = nn.Linear(embed_dim, num_class)
#         self.init_weights()

#     def init_weights(self):
#         initrange = 0.5
#         self.embedding.weight.data.uniform_(-initrange, initrange)
#         self.fc.weight.data.uniform_(-initrange, initrange)
#         self.fc.bias.data.zero_()

#     def forward(self, text, offsets):
#         embedded = self.embedding(text, offsets)
#         return self.fc(embedded)

In [8]:
# train_iter = AG_NEWS(split="train")
# train_dataset = to_map_style_dataset(train_iter)
# num_class = len(set([label for (label, text) in train_iter]))
# dataloader = DataLoader(
#     train_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch
# )
# vocab_size = len(vocab)
# emsize = 64
# model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [9]:

# # Hyperparameters
# EPOCHS = 10  # epoch
# LR = 5  # learning rate
# BATCH_SIZE = 64  # batch size for training
# epoch=1

# criterion = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

In [10]:
# model.train()
# total_acc, total_count = 0, 0
# log_interval = 500
# start_time = time.time()

# for idx, (label, text, offsets) in enumerate(dataloader):
#     optimizer.zero_grad()
#     predicted_label = model(text, offsets)
#     loss = criterion(predicted_label, label)
#     loss.backward()
#     torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
#     optimizer.step()
#     total_acc += (predicted_label.argmax(1) == label).sum().item()
#     total_count += label.size(0)
#     if idx % log_interval == 0 and idx > 0:
#         elapsed = time.time() - start_time
#         print(
#             "| epoch {:3d} | {:5d}/{:5d} batches "
#             "| accuracy {:8.3f}".format(
#                 epoch, idx, len(dataloader), total_acc / total_count
#             )
#         )
#         total_acc, total_count = 0, 0
#         start_time = time.time()

In [11]:
# Abi's collate
from torch.utils.data import DataLoader

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def collate_batch(batch):
    """Collate raw ``(label, text)`` samples into a label tensor and a padded id tensor.

    Args:
        batch: iterable of ``(label, text)`` pairs.

    Returns:
        ``(label_list, text_list)`` where ``label_list`` is an int64 tensor
        holding ``label_pipeline(label)`` for every sample and ``text_list``
        is the tensor built by ``T.ToTensor(0)`` from the per-sample token-id
        lists (0 is the padding value, matching the ``padding_idx=0`` used by
        the model's ``EmbeddingBag``).
    """
    label_list, text_list = [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        # Keep the ids as a plain list: T.ToTensor pads list-of-lists input
        # itself, so building a tensor per sample and converting it back with
        # .tolist() was a redundant round-trip.
        text_list.append(text_pipeline(_text))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = T.ToTensor(0)(text_list)
    return label_list, text_list


In [12]:
# Smoke check: collate two hand-written (label, text) samples and display
# the resulting (labels, token ids) tuple.
collate_batch(
    [
        [1,"i am sports"],
        [2,"i am news"]
    ]
)

(tensor([0, 1]),
 tensor([[ 282, 1913,  262],
         [ 282, 1913,  136]]))

In [13]:
class EmbeddingBagClassifier(nn.Module):
    """Text classifier: an ``nn.EmbeddingBag`` followed by one linear layer.

    Index 0 is registered as ``padding_idx`` on the embedding bag.
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)
        self.init_weights()

    def init_weights(self):
        """Draw embedding and linear weights from U(-0.5, 0.5); zero the linear bias."""
        bound = 0.5
        # Same order as before (embedding first, then fc) so seeded runs match.
        for weight in (self.embedding.weight, self.fc.weight):
            nn.init.uniform_(weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x):
        """Apply the embedding bag to ``x`` and feed the result to the linear layer."""
        return self.fc(self.embedding(x))
        

In [14]:
# Materialize the training split into a map-style dataset so that the
# DataLoader built from it supports len().
train_iter = AG_NEWS(split="train")
train_dataset = to_map_style_dataset(train_iter)
# Count the distinct labels from the already-materialized dataset rather than
# streaming the whole datapipe a second time.
num_class = len({label for label, _text in train_dataset})
# NOTE(review): shuffle=False feeds samples in dataset order every epoch —
# confirm that this is intended for training.
dataloader = DataLoader(
    train_dataset, batch_size=64, shuffle=False, collate_fn=collate_batch
)
vocab_size = len(vocab)
emsize = 64  # embedding dimension
model = EmbeddingBagClassifier(vocab_size, emsize, num_class).to(device)

In [15]:

# Hyperparameters
EPOCHS = 10  # number of passes over the training data
LR = 5  # learning rate handed to SGD
# NOTE(review): the DataLoader built in the previous cell hard-codes
# batch_size=64 and does not read this constant.
BATCH_SIZE = 64  # batch size for training


# Cross-entropy loss computed on the model's raw outputs.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# StepLR with step_size=1.0 and gamma=0.1.
# NOTE(review): the training loop in the next cell never calls
# scheduler.step(), so this schedule does not take effect there.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

In [16]:
model.train()
log_interval = 500  # report running accuracy every `log_interval` batches

for epoch in range(1, EPOCHS + 1):
    # Start a fresh accuracy window each epoch so the first report of an
    # epoch does not include batches left over from the previous epoch.
    total_acc, total_count = 0, 0
    start_time = time.time()
    for idx, (label, text) in enumerate(dataloader):
        # The model was moved to `device`; the batch has to live there too
        # (this is a no-op when the tensors are already on that device).
        label, text = label.to(device), text.to(device)
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Clip the gradient norm to 0.1 between backward() and step().
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time  # measured but not printed
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()
    # NOTE(review): `scheduler` is created with the optimizer but is never
    # stepped here, so the learning rate stays at LR for every epoch —
    # confirm whether a per-epoch scheduler.step() is wanted.

| epoch   1 |   500/ 1875 batches | accuracy    0.669
| epoch   1 |  1000/ 1875 batches | accuracy    0.843
| epoch   1 |  1500/ 1875 batches | accuracy    0.879
| epoch   2 |   500/ 1875 batches | accuracy    0.880
| epoch   2 |  1000/ 1875 batches | accuracy    0.893
| epoch   2 |  1500/ 1875 batches | accuracy    0.914
| epoch   3 |   500/ 1875 batches | accuracy    0.904
| epoch   3 |  1000/ 1875 batches | accuracy    0.908
| epoch   3 |  1500/ 1875 batches | accuracy    0.926
| epoch   4 |   500/ 1875 batches | accuracy    0.915
| epoch   4 |  1000/ 1875 batches | accuracy    0.916
| epoch   4 |  1500/ 1875 batches | accuracy    0.934
| epoch   5 |   500/ 1875 batches | accuracy    0.923
| epoch   5 |  1000/ 1875 batches | accuracy    0.923
| epoch   5 |  1500/ 1875 batches | accuracy    0.941
| epoch   6 |   500/ 1875 batches | accuracy    0.929
| epoch   6 |  1000/ 1875 batches | accuracy    0.928
| epoch   6 |  1500/ 1875 batches | accuracy    0.945
| epoch   7 |   500/ 1875 ba

# Important Observations 
## 1. Train accuracy jumped from a stuck 25% to about 78% after 10 epochs once the learning-rate changes were applied
```
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
```
Also insert gradient-norm clipping between `loss.backward()` and `optimizer.step()`:
```
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
```
## 2. Train accuracy jumped from 78% to 96% after passing `padding_idx` to `EmbeddingBag`.
When the embedding bag sums or averages the embeddings of a sequence,
we want it to ignore the embedding stored at the padding index:
padding is only a convenience that lets PyTorch batch sequences of different lengths during training.
```
self.embedding=nn.EmbeddingBag(vocab_size,embed_dim,padding_idx=0)
```

In [None]:
# The RNN has no explicitly initialized weights,
# but it does have learning-rate scheduling and gradient clipping.

In [21]:
# NOTE(review): this re-declares the classifier already defined in an earlier
# cell of this notebook; the later definition shadows the earlier one.
class EmbeddingBagClassifier(nn.Module):
    """Text classifier: an ``nn.EmbeddingBag`` followed by one linear layer.

    Index 0 is registered as ``padding_idx`` on the embedding bag.
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)
        self.init_weights()

    def init_weights(self):
        """Draw embedding and linear weights from U(-0.5, 0.5); zero the linear bias."""
        bound = 0.5
        # Same order as before (embedding first, then fc) so seeded runs match.
        for weight in (self.embedding.weight, self.fc.weight):
            nn.init.uniform_(weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x):
        """Apply the embedding bag to ``x`` and feed the result to the linear layer."""
        return self.fc(self.embedding(x))

vocab_size = len(vocab)
embed_dim = 300
num_classes = 4
# Keep the model on the same device the batches are moved to below.
embedbagmodel = EmbeddingBagClassifier(vocab_size, embed_dim, num_classes).to(device)

# Hyperparameters
EPOCHS = 10  # number of passes over the training data
LR = 5  # learning rate handed to SGD

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(embedbagmodel.parameters(), lr=LR, momentum=0.1)
# NOTE(review): the scheduler is created but never stepped in the loop below,
# so the learning rate stays at LR — confirm whether that is intended.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

#### TRAIN #####
embedbagmodel.train()
epochs = EPOCHS
log_interval = 500  # report running accuracy every `log_interval` batches
for epoch in range(1, epochs + 1):
    # Start a fresh accuracy window each epoch so the first report of an
    # epoch does not include batches left over from the previous epoch.
    correct_preds, total_count = 0, 0
    start_time = time.time()
    # Iterate the DataLoader, not the raw `train_iter` datapipe: the datapipe
    # yields un-collated (label, text) samples and has no len().
    # collate_batch returns (labels, token ids), in that order.
    for idx, (targets, tensors) in enumerate(dataloader):
        tensors, targets = tensors.to(device), targets.to(device)
        optimizer.zero_grad()
        predicted_labels = embedbagmodel(tensors)
        loss = criterion(predicted_labels, targets)
        loss.backward()
        # Clip the gradient norm to 0.1 between backward() and step().
        torch.nn.utils.clip_grad_norm_(embedbagmodel.parameters(), 0.1)
        optimizer.step()
        correct_preds += (predicted_labels.argmax(1) == targets).sum().item()
        total_count += targets.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time  # measured but not printed
            # len() only works here because the DataLoader wraps a dataset
            # converted with to_map_style_dataset. After that conversion
            # data_pipe.map(func) is no longer available, so every transform
            # has to live inside the collate_fn instead.
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), correct_preds / total_count
                )
            )
            correct_preds, total_count = 0, 0
            start_time = time.time()