# Creating dataloaders

First, you should generate our vocab from the train set.

For that, use `torchtext.vocab.build_vocab_from_iterator`.

In [274]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(df):
    for _, sample in df.iterrows():
        yield sample.to_list()[2]


# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

vocab = build_vocab_from_iterator(yield_tokens(train), 
                                  min_freq=1,
                                  specials=special_symbols,
                                  special_first=True)
vocab.set_default_index(UNK_IDX)


And then use our vocab to encode the tokenized sequence

In [276]:
sample = train['Text'][2]
print(sample)
encoded = vocab(sample)
print(encoded)
print(len(sample))

['west', 'game', 'first', 'year', 'teacher', 'teach', 'th', 'grade', 'special', 'read', 'class', 'high', 'comprehens', 'level', 'read', 'book', 'one', 'best', 'thing', 'taught', 'year', 'expand', 'mind', 'allow', 'put', 'charact', 'place', 'easi', 'student', 'make', 'mind', 'movi', 'even', 'use', 'whole', 'read', 'class', 'time', 'order', 'finish', 'book', 'student', 'wait', 'hear', 'end', 'excel', 'book', 'read', 'everi', 'year', 'student']
[2556, 43, 33, 14, 2751, 807, 860, 1724, 728, 131, 1895, 191, 6981, 583, 131, 515, 5, 59, 46, 3505, 14, 2954, 528, 450, 40, 1125, 165, 50, 1924, 22, 528, 945, 30, 4, 271, 131, 1895, 13, 68, 623, 515, 1924, 426, 600, 180, 311, 515, 131, 85, 14, 1924]
51


Now we can define our collate function and create dataloaders

In [319]:
import torch
from torch.utils.data import DataLoader

torch.manual_seed(420)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list= [], []
    maxi = 0
    for _text, _label in batch:
        maxi = max(maxi, len(_text))
    for _text, _label in batch:
        text_indices = [BOS_IDX] + [vocab[token] for token in _text] + [EOS_IDX] + [PAD_IDX] * (maxi - len(_text))
        # Append labels, scores, and helpfulness to their respective lists
        label_list.append(_label)
        text_list.append(text_indices)
    
    label_list = torch.tensor(label_list, dtype=torch.float32)
    text_list = torch.tensor(text_list, dtype=torch.int64)

    return text_list.to(device), label_list.to(device)

train = train[['Text', 'Category']]
val = val[['Text', 'Category']]

train_dataloader = DataLoader(
    train.to_numpy(), batch_size=20, shuffle=True, collate_fn=collate_batch, drop_last=True
)

val_dataloader = DataLoader(
    val.to_numpy(), batch_size=20, shuffle=True, collate_fn=collate_batch, drop_last=True
)


# Defining Network


For writing a network you can use `torch.nn.Embedding` or `torch.nn.EmbeddingBag`. This will allow your netorwk to learn embedding vector for your tokens.

As for the other modules in your network, consider these options:
* Simple Linear layers, activations, basic stuff that goes into the network
* There is a possible of not using the offsets (indices of sequences) in the formard, put use predefined sequence length (maximum length, some value, etc.). If this is an option for you, change the `collate_batch` function according to your architecture.
* You could use all this recurrent stuff (RNN, GRU, LSTM, even Transformer, all up to you), but remembder about the dimentions and hidden states
* If you have any quiestions - google it

In [329]:
import torch.nn as nn

class TextClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(len(vocab), 100)
        self.fc1 = nn.Linear(100, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes)


    def forward(self, text):
        embedded = self.embedding(text)
        pooled = embedded.mean(dim=1)
        x = self.fc1(pooled)
        x = self.relu(x)
        x = self.fc2(x) 
        return x

In [330]:
from tqdm.autonotebook import tqdm

def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch_num=-1
):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch_num}: train",
        leave=True,
    )
    model.train()
    train_loss = 0.0
    for i, batch in loop:
        text, labels = batch
        # zero the parameter gradients
        optimizer.zero_grad()
        # print(text.shape)
        # forward pass
        outputs = model(text)
        # print(text.shape)
       
        # loss calculation
        loss = loss_fn(outputs, labels.to(torch.long))
        
        # backward pass
        loss.backward()

        # optimizer run
        optimizer.step()

        train_loss += loss.item()
        loop.set_postfix({"loss": train_loss/(i * len(labels))})

In [331]:

def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch_num=-1,
    best_so_far=0.0,
    ckpt_path='best.pt'
):
    
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Epoch {epoch_num}: val",
        leave=True,
    )
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        model.eval()  # evaluation mode
        for i, batch in loop:
            texts, labels = batch
            outputs = model(texts)
           
            loss = loss_fn(outputs, labels.to(torch.long))
            outputs.argmax(0)
            predicted = torch.argmax(outputs, dim=-1)
            total += labels.size(0)
            # print(predicted.shape)
            # print(labels.shape)
            correct +=  (predicted.to(torch.long) == labels.to(torch.long)).sum().item()

            val_loss += loss.item()
            loop.set_postfix({"loss": val_loss/total, "acc": correct / total})

        if correct / total > best_so_far:
            torch.save(model.state_dict(), ckpt_path)
            return correct / total

    return best_so_far

In [332]:
epochs = 10
model = TextClassificationModel(6).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)
loss_fn = nn.CrossEntropyLoss() 

In [333]:
best = -float('inf')
for epoch in range(epochs):
    train_one_epoch(model, train_dataloader, optimizer, loss_fn, epoch_num=epoch)
    best = val_one_epoch(model, val_dataloader, loss_fn, epoch, best_so_far=best)

Epoch 0: train: 100%|██████████| 1600/1600 [00:52<00:00, 30.43it/s, loss=0.0431]
Epoch 0: val: 100%|██████████| 400/400 [00:01<00:00, 256.38it/s, loss=0.0203, acc=0.867]
Epoch 1: train: 100%|██████████| 1600/1600 [00:50<00:00, 31.53it/s, loss=0.0162]
Epoch 1: val: 100%|██████████| 400/400 [00:01<00:00, 251.27it/s, loss=0.0151, acc=0.903]
Epoch 2: train: 100%|██████████| 1600/1600 [00:53<00:00, 29.85it/s, loss=0.0109]
Epoch 2: val: 100%|██████████| 400/400 [00:02<00:00, 176.70it/s, loss=0.0158, acc=0.903]
Epoch 3: train: 100%|██████████| 1600/1600 [00:55<00:00, 28.64it/s, loss=0.0079] 
Epoch 3: val: 100%|██████████| 400/400 [00:01<00:00, 276.17it/s, loss=0.0136, acc=0.925]
Epoch 4: train: 100%|██████████| 1600/1600 [00:51<00:00, 30.95it/s, loss=0.00565]
Epoch 4: val: 100%|██████████| 400/400 [00:01<00:00, 267.68it/s, loss=0.0231, acc=0.88] 
Epoch 5: train: 100%|██████████| 1600/1600 [00:55<00:00, 29.07it/s, loss=0.00426]
Epoch 5: val: 100%|██████████| 400/400 [00:01<00:00, 274.57it/s, l

# Predictions

In [407]:
def predict(
    model,
    loader,
):
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc="Predictions:",
        leave=True,
    )
    predictions = []
    with torch.no_grad():
        model.eval()  # evaluation mode
        for  i,batch in loop:
            texts = batch
            print(texts)
            # forward pass and loss calculation
            outputs = model(texts)
            
            predicted = torch.argmax(outputs, dim=-1)
            predictions += predicted.detach().cpu().tolist()

    return predictions

In [402]:
def collate_batch2(batch):
    text_list= []
    maxi = 0
    for _text in batch:
        maxi = max(maxi, len(_text[0]))
    for _text in batch:
        text_indices = [BOS_IDX] + [vocab[token] for token in _text[0]] + [EOS_IDX] + [PAD_IDX] * (maxi-len(_text[0]))
        # Append labels, scores, and helpfulness to their respective lists
        text_list.append(text_indices)
    
    text_list = torch.tensor(text_list, dtype=torch.int64)

    return text_list.to(device)


In [403]:
test_preprocessed = preprocess(test_dataframe)

AttributeError: Can only use .str accessor with string values!

In [408]:
ckpt = torch.load("best.pt")
model.load_state_dict(ckpt)

test_preprocessed = test_preprocessed[['Text']]
test_dataloader = DataLoader(
    test_preprocessed.to_numpy(), batch_size=20, shuffle=False, collate_fn=collate_batch2
)

predictions = predict(model, test_dataloader)



Predictions::   0%|          | 0/500 [00:00<?, ?it/s]

Predictions::   6%|▌         | 29/500 [00:00<00:01, 289.93it/s]

tensor([[    2,  1358,  9768,  ...,     1,     1,     1],
        [    2, 10342,  9817,  ...,     1,     1,     1],
        [    2,   189,    44,  ...,     1,     1,     1],
        ...,
        [    2,  5076,  2538,  ...,     1,     1,     1],
        [    2,  8896,  4216,  ...,     1,     1,     1],
        [    2,   448,   834,  ...,     1,     1,     1]])
tensor([[   2, 2722, 1335,  ...,   34,  116,    3],
        [   2, 5196, 2855,  ...,    1,    1,    1],
        [   2, 5599, 3416,  ...,    1,    1,    1],
        ...,
        [   2,  276,   37,  ...,    1,    1,    1],
        [   2, 1491,  291,  ...,    1,    1,    1],
        [   2,  385,   32,  ...,    1,    1,    1]])
tensor([[    2,   684, 12076,  ...,   121,    61,     3],
        [    2,   276,    37,  ...,     1,     1,     1],
        [    2,   222,  1627,  ...,     1,     1,     1],
        ...,
        [    2,  5095,   741,  ...,     1,     1,     1],
        [    2, 17579,  4903,  ...,     1,     1,     1],
        [

Predictions::  17%|█▋        | 87/500 [00:00<00:01, 283.68it/s]

tensor([[    2,  8623,  1127,  ...,     1,     1,     1],
        [    2,   749,  2682,  ...,    19,    82,     3],
        [    2,  2241,   120,  ...,     1,     1,     1],
        ...,
        [    2,  2140,  2848,  ...,     1,     1,     1],
        [    2, 10342,  4560,  ...,     1,     1,     1],
        [    2,  9211,   167,  ...,     1,     1,     1]])
tensor([[    2,  1152,  1442,  ...,     1,     1,     1],
        [    2,  5654,  3620,  ...,     1,     1,     1],
        [    2,  4582,  6823,  ...,     1,     1,     1],
        ...,
        [    2,  5943, 15145,  ...,     1,     1,     1],
        [    2,   575,   757,  ...,     1,     1,     1],
        [    2,   760,   717,  ...,     1,     1,     1]])
tensor([[    2,   145,  4559,  ...,     1,     1,     1],
        [    2,     0,   246,  ...,     1,     1,     1],
        [    2,   374,  1552,  ...,     1,     1,     1],
        ...,
        [    2, 18564,  5621,  ...,     1,     1,     1],
        [    2,  4983,  3269,  

Predictions::  30%|███       | 150/500 [00:00<00:01, 300.99it/s]

tensor([[   2,  717, 4000,  ...,    1,    1,    1],
        [   2, 4348, 9128,  ...,    1,    1,    1],
        [   2, 4095,    0,  ...,    1,    1,    1],
        ...,
        [   2, 1026,  741,  ...,    1,    1,    1],
        [   2, 1279, 1565,  ...,    1,    1,    1],
        [   2, 5196, 2320,  ...,    1,    1,    1]])
tensor([[   2, 7000,  884,  ...,    1,    1,    1],
        [   2, 2225,  946,  ...,    1,    1,    1],
        [   2, 5571, 5629,  ...,    1,    1,    1],
        ...,
        [   2,  212,  321,  ...,    1,    1,    1],
        [   2, 3701,  860,  ...,    1,    1,    1],
        [   2,    0,    0,  ...,    1,    1,    1]])
tensor([[   2, 2225,  946,  ...,    1,    1,    1],
        [   2,    0, 6598,  ...,    1,    1,    1],
        [   2, 1094,  121,  ...,    1,    1,    1],
        ...,
        [   2,    0, 2560,  ...,    1,    1,    1],
        [   2, 5065, 4805,  ...,    1,    1,    1],
        [   2,  831, 1715,  ...,    1,    1,    1]])
tensor([[    2,  1343,

Predictions::  49%|████▉     | 247/500 [00:00<00:00, 305.88it/s]

tensor([[    2,  1382,    61,  ...,     1,     1,     1],
        [    2,  1685,   264,  ...,     1,     1,     1],
        [    2,   744,   838,  ...,     1,     1,     1],
        ...,
        [    2,   572,   566,  ...,     1,     1,     1],
        [    2, 30689,  6019,  ...,     1,     1,     1],
        [    2,     0,  4349,  ...,     1,     1,     1]])
tensor([[    2,   604,  1175,  ...,     1,     1,     1],
        [    2, 17457,   817,  ...,   225,   808,     3],
        [    2,  3745,   530,  ...,     1,     1,     1],
        ...,
        [    2, 13480,  2501,  ...,     1,     1,     1],
        [    2,  4517,  3490,  ...,     1,     1,     1],
        [    2,  2389,  5626,  ...,     1,     1,     1]])
tensor([[    2,  7124,    38,  ...,     1,     1,     1],
        [    2,  4095,    33,  ...,     1,     1,     1],
        [    2,  1685,  5657,  ...,     1,     1,     1],
        ...,
        [    2,   453,   964,  ...,     1,     1,     1],
        [    2, 19468,  1678,  

Predictions::  63%|██████▎   | 314/500 [00:01<00:00, 311.57it/s]

tensor([[    2, 11663,     0,   231,   397,   696,  3193,  5830,  1563,     5,
           470,     0,   919,  1117,   676,   450,   443,   115,     0,   766,
           513,   269,   448,     3,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1],
        [    2,  2915,    15,   296,   900,    49,   568,  1191,   157,   153,
           194,   117,    20,   637,   333,  1157,    24,   124,   539,     3,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1],
        [    2,  3015,   907,   968,   206,     5,   360,   503,    67,   387,
           968,   200,    23,     9,  2071,  6829,    13,  1113,     6,   220,
           100,   103,   316,   170,   605,  1154,    25,   421,     4,     5,
            13,     3,     1,     1,     1,     1,     1,     1,     1],
        [    2,   385,    32,   100,   106,   153,   136,   422,  3121

Predictions::  69%|██████▉   | 346/500 [00:01<00:00, 305.94it/s]

tensor([[    2,  6944,  6638,  ...,     1,     1,     1],
        [    2,  1071,  1504,  ...,     1,     1,     1],
        [    2,  3016,  2407,  ...,     1,     1,     1],
        ...,
        [    2,   276,    37,  ...,     1,     1,     1],
        [    2, 29276,  1971,  ...,     1,     1,     1],
        [    2,    58,  3570,  ...,     1,     1,     1]])
tensor([[    2,   102,    98,  ...,   285,  1749,     3],
        [    2,  9858,    35,  ...,     1,     1,     1],
        [    2,    31,   591,  ...,     1,     1,     1],
        ...,
        [    2,   890,  1026,  ...,     1,     1,     1],
        [    2, 21859,  1009,  ...,     1,     1,     1],
        [    2,  2634,  4752,  ...,     1,     1,     1]])
tensor([[    2,  3209,  2513,  ...,     1,     1,     1],
        [    2,  1702,  4296,  ...,     1,     1,     1],
        [    2,   143, 17727,  ...,     1,     1,     1],
        ...,
        [    2,  2187,   625,  ...,     1,     1,     1],
        [    2,   121,    62,  

Predictions::  81%|████████▏ | 407/500 [00:01<00:00, 270.00it/s]

tensor([[    2,  1039,  3072,  ...,     1,     1,     1],
        [    2,   612,  4361,  ...,     1,     1,     1],
        [    2,  5571,   283,  ...,     1,     1,     1],
        ...,
        [    2,   118,  7894,  ...,     1,     1,     1],
        [    2, 22914,  1613,  ...,     1,     1,     1],
        [    2,  2226,  2569,  ...,     1,     1,     1]])
tensor([[    2,  5022,  6434,  ...,     1,     1,     1],
        [    2, 10984, 17177,  ...,  5324,   322,     3],
        [    2, 20499,  3078,  ...,     1,     1,     1],
        ...,
        [    2,  1294,  7697,  ...,     1,     1,     1],
        [    2,  2760,  4850,  ...,     1,     1,     1],
        [    2,  2723,  7250,  ...,     1,     1,     1]])
tensor([[   2, 1638, 2137,  ...,    1,    1,    1],
        [   2, 3415,  870,  ...,    1,    1,    1],
        [   2, 2892, 5219,  ...,    1,    1,    1],
        ...,
        [   2, 2991, 3356,  ...,    1,    1,    1],
        [   2,   31,   43,  ...,  187,   56,    3],
   

Predictions::  94%|█████████▍| 472/500 [00:01<00:00, 295.59it/s]

tensor([[    2,   572,   566,  2547,  3502,    28,    15,  1275,   468,    52,
           739,   350,    78,   657,     8,  4727,   641,   775,  6636,   444,
           162,  1275,     8,     3,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1],
        [    2,   686,   537,   361,    28,    31,   890,  1536,  4794,  1259,
          1018,    78,    55,   860,   423,   165,  4546,    86,  1018,   464,
           124,   539,   431,    96,   176,   556,     8,   101,     7,   380,
          3160,   136,   211,   885,  3157,     3,     1,     1],
        [    2,    21,   705,  8767,  1871,   545,   552,    35,   212,  1665,
            35,     8,     7,   121,     7,   687,    89,     5,  1861,    67,
            44,   193,    90,     9,    73,   476,  1232,     4,  4167,    89,
           124,  1114,   277,    67,    74,    13,   152,     3],
        [    2,  1184,    32,   299,  4758,   197,  9278,   123,  2238,   172,
           2

Predictions:: 100%|██████████| 500/500 [00:01<00:00, 297.87it/s]


In [409]:
results = pd.Series(predictions).apply(lambda x: idx2cat[x])
results.to_csv('submission.csv', index_label='id')