In [10]:
import re
import torch
import numpy as np
import pandas as pd
import torch.nn as nn

from tqdm import tqdm

from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from typing import Iterable

In [11]:
data_path = "data"

train_df = pd.read_csv(f"{data_path}/train.csv")
train_df.dropna(inplace=True)
train_df.head()

Unnamed: 0,class,text
0,12,Rules Changed Up is the debut studio album by...
1,14,Back is a novel written by British writer Hen...
2,14,Love and Glory (ISBN 0-385-29261-9) is a 1983...
3,13,Max Manus: Man of War is a 2008 Norwegian bio...
4,7,The former Ahavas Sholem Synagogue building w...


In [12]:
tokenizer = get_tokenizer("spacy")


def preprocess_text(s):
    s = s.strip()
    s = s.lower()
    s = re.sub(r"[^a-zA-Z.,!?]+", " ", s)
    s = re.sub(r"\s{2,}", " ", s)
    s = s.strip()
    return s



In [19]:
def build_vocab(dataset):
    for text in tqdm(dataset, desc="Building vocabulary"):
        yield tokenizer(preprocess_text(str(text)))


vocab = build_vocab_from_iterator(
    build_vocab(train_df["text"].values),
    max_tokens=200000,
    specials=["<UNK>", "<PAD>"],
    special_first=True,
)
vocab.set_default_index(vocab["<UNK>"])

VOCAB_SIZE = len(vocab)
print("Vocabulary size: ", VOCAB_SIZE)

Building vocabulary: 100%|██████████| 100800/100800 [00:14<00:00, 7007.29it/s]


Vocabulary size:  200000


In [20]:
BATCH_SIZE = 16
SEQUENCE_LENGTH = 100


def text_pipeline(__text: str):
    return vocab(tokenizer(preprocess_text(__text)))


def collate_fn(__batch: Iterable):
    texts, labels = [], []
    for text, label in __batch:
        text_tokens_ids = text_pipeline(text)
        if len(text_tokens_ids) > SEQUENCE_LENGTH:
            text_tokens_ids = text_tokens_ids[:SEQUENCE_LENGTH]
        elif len(text_tokens_ids) < SEQUENCE_LENGTH:
            text_tokens_ids.extend(vocab(["<PAD>" for _ in range(SEQUENCE_LENGTH - len(text_tokens_ids))]))

        texts.append(text_tokens_ids)
        labels.append(label - 1)
    texts = torch.tensor(texts, dtype=torch.int)
    labels = torch.tensor(labels, dtype=torch.float)
    return texts, labels


data = np.column_stack((train_df["text"].values, train_df["class"].values))
train_dataloader = DataLoader(data, BATCH_SIZE, True, collate_fn=collate_fn)

In [21]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers=1):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, h0, c0):
        embedded = self.embedding(text)

        output, (hidden, cell) = self.lstm(embedded, (h0, c0))
        return self.fc(hidden[-1, :, :])

In [22]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
EMBEDDING_DIM = 64
HIDDEN_DIM = 256
OUTPUT_DIM = 14
N_LAYERS = 5

model = RNN(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss().to(DEVICE)

In [23]:
def train(model, optimizer, loss_fn):
    global train_dataloader, DEVICE, N_LAYERS, HIDDEN_DIM

    epoch_loss = 0
    model.train()

    for text, labels in tqdm(train_dataloader):
        text = text.to(DEVICE)
        labels = labels.to(DEVICE).long()

        optimizer.zero_grad()

        h0 = torch.randn(N_LAYERS, text.shape[0], HIDDEN_DIM, device=DEVICE)
        c0 = torch.randn(N_LAYERS, text.shape[0], HIDDEN_DIM, device=DEVICE)
        predictions = model(text, h0, c0)
        loss = loss_fn(predictions, labels)
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()

    return epoch_loss / len(train_dataloader)

In [24]:
EPOCHS = 10

for epoch in range(1, EPOCHS + 1):
    train_loss = train(model, optimizer, loss_fn)
    print(f"Epoch: {epoch}, Train Loss: {train_loss}")

100%|██████████| 6300/6300 [02:26<00:00, 42.98it/s]


Epoch: 1, Train Loss: 2.6355843584499663


100%|██████████| 6300/6300 [02:33<00:00, 40.95it/s]


Epoch: 2, Train Loss: 2.639563256483229


100%|██████████| 6300/6300 [02:30<00:00, 41.76it/s]


Epoch: 3, Train Loss: 2.48703523660463


100%|██████████| 6300/6300 [02:29<00:00, 42.08it/s]


Epoch: 4, Train Loss: 1.4151550195803717


100%|██████████| 6300/6300 [02:32<00:00, 41.42it/s]


Epoch: 5, Train Loss: 0.46779079507932897


100%|██████████| 6300/6300 [02:32<00:00, 41.42it/s]


Epoch: 6, Train Loss: 0.18303175392738055


100%|██████████| 6300/6300 [02:32<00:00, 41.22it/s]


Epoch: 7, Train Loss: 0.10602147062330337


100%|██████████| 6300/6300 [02:31<00:00, 41.54it/s]


Epoch: 8, Train Loss: 0.07425418831260194


100%|██████████| 6300/6300 [02:32<00:00, 41.21it/s]


Epoch: 9, Train Loss: 0.05457411956789512


100%|██████████| 6300/6300 [02:45<00:00, 38.11it/s]

Epoch: 10, Train Loss: 0.042264396312999046





In [30]:
test_df = pd.read_csv(f"{data_path}/test.csv")
test_data = np.column_stack((test_df["text"].values, np.zeros(test_df["text"].shape)))
test_dataloader = DataLoader(test_data, BATCH_SIZE, collate_fn=collate_fn)

In [31]:
def predict(model):
    global test_dataloader

    model.eval()
    predictions = []

    with torch.no_grad():
        for text, _ in tqdm(test_dataloader):
            text = text.to(DEVICE)

            h0 = torch.zeros(N_LAYERS, text.size(0), HIDDEN_DIM).to(DEVICE)
            c0 = torch.zeros(N_LAYERS, text.size(0), HIDDEN_DIM).to(DEVICE)

            output = model(text, h0, c0)
            _, predicted = torch.max(output, 1)
            predicted += 1
            predictions.extend(predicted.tolist())

    return predictions

In [32]:
predictions = predict(model)

submission_df = pd.DataFrame({"id": test_df["id"], "class_id": predictions})
submission_df.to_csv("submission.csv", index=False)

100%|██████████| 700/700 [00:07<00:00, 88.50it/s]
