# RNN. ELMO


### Text classification with RNN


In [1]:
import pandas as pd

rating2sentiment = {0.0: 0, 1.0: 0, 2.0: 0, 3.0: 1, 4.0: 2, 5.0: 2}

df = pd.read_csv(
    "1429_1.csv", low_memory=False
)
df = df[["reviews.text", "reviews.rating"]]
df.dropna(inplace=True)

df["sentiment"] = df["reviews.rating"].apply(lambda x: rating2sentiment[x])
df.head()

Unnamed: 0,reviews.text,reviews.rating,sentiment
0,This product so far has not disappointed. My c...,5.0,2
1,great for beginner or experienced person. Boug...,5.0,2
2,Inexpensive tablet for him to use and learn on...,5.0,2
3,I've had my Fire HD 8 two weeks now and I love...,4.0,2
4,I bought this for my grand daughter when she c...,5.0,2


In [2]:
import re
from tqdm import tqdm
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


tokenizer = get_tokenizer("basic_english")


def preprocess_text(s):
    s = s.strip()
    s = s.lower()
    s = re.sub(r"[^a-zA-Z.,!?]+", " ", s)
    s = re.sub(r"\s{2,}", " ", s)
    s = s.strip()
    return s


def build_vocab(dataset):
    for text in tqdm(dataset, desc="Building vocabulary"):
        yield tokenizer(preprocess_text(str(text)))


vocab = build_vocab_from_iterator(
    build_vocab(df["reviews.text"].values),
    max_tokens=25000,
    specials=["<UNK>", "<PAD>"],
    special_first=True,
)
vocab.set_default_index(vocab["<UNK>"])

VOCAB_SIZE = len(vocab)
print("Vocabulary size: ", VOCAB_SIZE)

Building vocabulary: 100%|██████████| 34626/34626 [00:01<00:00, 19072.46it/s]


Vocabulary size:  13457


In [3]:
import torch
import numpy as np
from torch.utils.data import DataLoader

BATCH_SIZE = 16
SEQUENCE_LENGTH = 100


def text_pipeline(text):
    return vocab(tokenizer(preprocess_text(text)))


def collate_fn(batch):
    texts, labels = [], []
    for text, label in batch:
        text_tokens_ids = text_pipeline(text)
        if len(text_tokens_ids) > SEQUENCE_LENGTH:
            text_tokens_ids = text_tokens_ids[:SEQUENCE_LENGTH]
        elif len(text_tokens_ids) < SEQUENCE_LENGTH:
            text_tokens_ids.extend(
                vocab(["<PAD>" for _ in range(SEQUENCE_LENGTH - len(text_tokens_ids))])
            )

        texts.append(text_tokens_ids)
        labels.append(label)
    texts = torch.tensor(texts, dtype=torch.int)
    labels = torch.tensor(labels, dtype=torch.float)
    return texts, labels


data = np.column_stack((df["reviews.text"].values, df["sentiment"].values))
print(data.shape)
dataloader = DataLoader(
    data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)

(34626, 2)


In [4]:
import torch.nn as nn


class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers=1):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, h0, c0):
        embedded = self.embedding(text)

        output, (hidden, cell) = self.lstm(embedded, (h0, c0))
        return self.fc(hidden[-1, :, :])

In [5]:
input_dim = VOCAB_SIZE
embedding_dim = 64
hidden_dim = 16
output_dim = 3
n_layers = 5
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim, n_layers=n_layers)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

In [7]:
from tqdm import tqdm


def train(model, dataloader, optimizer, criterion, device):
    epoch_loss = 0

    model.train()

    for text, labels in tqdm(dataloader):
        text = text.to(device)
        labels = labels.to(device).long()

        optimizer.zero_grad()

        h0 = torch.randn(n_layers, text.shape[0], hidden_dim, device=device)
        c0 = torch.randn(n_layers, text.shape[0], hidden_dim, device=device)
        predictions = model(text, h0, c0)
        loss = criterion(predictions, labels)

        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()

    return epoch_loss / len(dataloader)

In [8]:
epochs = 3

for epoch in range(epochs):
    train_loss = train(model, dataloader, optimizer, criterion, device)
    print(f"Epoch: {epoch}, Train Loss:  {train_loss} ")

100%|██████████| 2165/2165 [00:10<00:00, 212.91it/s]


Epoch: 0, Train Loss:  0.6185697166391113 


100%|██████████| 2165/2165 [00:09<00:00, 225.94it/s]


Epoch: 1, Train Loss:  0.3488606919592578 


100%|██████████| 2165/2165 [00:09<00:00, 221.95it/s]

Epoch: 2, Train Loss:  0.3032767735955071 





In [9]:
text = "This product is so cool"
tokens = text_pipeline(text)
tokens.extend(vocab(["<PAD>" for _ in range(SEQUENCE_LENGTH - len(tokens))]))
tokens = torch.tensor([tokens]).to(device)
h0 = torch.randn(n_layers, tokens.shape[0], hidden_dim, device=device)
c0 = torch.randn(n_layers, tokens.shape[0], hidden_dim, device=device)
predictions = model(tokens, h0, c0)
predictions.argmax(axis=1).item()

2

## ELMO (Embeddings from Language Models)


## Task


Task is to classify descriptions into categories with use of rnn-based models


In [10]:
SEED = 42

torch.manual_seed(SEED)
torch.backends.cuda.deterministic = True

In [11]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,class,text
0,12,Rules Changed Up is the debut studio album by...
1,14,Back is a novel written by British writer Hen...
2,14,Love and Glory (ISBN 0-385-29261-9) is a 1983...
3,13,Max Manus: Man of War is a 2008 Norwegian bio...
4,7,The former Ahavas Sholem Synagogue building w...


### Data Preprocessing


In [12]:
vocab = build_vocab_from_iterator(
    build_vocab(df["text"].values),
    max_tokens=25000,
    specials=["<UNK>", "<PAD>"],
    special_first=True,
)
vocab.set_default_index(vocab["<UNK>"])

VOCAB_SIZE = len(vocab)
VOCAB_SIZE

Building vocabulary: 100%|██████████| 100800/100800 [00:08<00:00, 12240.92it/s]


25000

In [13]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(df)

In [14]:
batch_size = 64
train_data = np.column_stack(
    (train_data["text"].values, train_data["class"].values - 1)
)
train_dataloader = DataLoader(
    train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)

val_data = np.column_stack((val_data["text"].values, val_data["class"].values - 1))
val_dataloader = DataLoader(
    val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)

In [15]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### RNN Network


In [16]:
class RNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        n_layers,
        bidirectional,
        dropout,
    ):

        super(RNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True,
        )

        self.fc = nn.Linear(hidden_dim * n_layers, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        embedded = self.embedding(text)

        packed_output, (hidden_state, cell_state) = self.lstm(embedded)

        hidden = torch.cat((hidden_state[-2, :, :], hidden_state[-1, :, :]), dim=1)

        dense_outputs = self.fc(hidden)

        outputs = self.sigmoid(dense_outputs)
        return outputs

### Training


In [17]:
EMBEDDING_DIM = 100
HIDDEN_DIM = 64
OUTPUT_DIM = 14
NUM_LAYERS = 2
BIDIRECTION = True
DROPOUT = 0.2
model = RNN(
    VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, NUM_LAYERS, BIDIRECTION, DROPOUT
)

In [18]:
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [19]:
def train(model, dataloader, optimizer, criterion):

    epoch_loss = 0.0
    model.train()

    for texts, labels in tqdm(dataloader, desc="Train"):
        texts = texts.to(device)
        labels = labels.to(device).long()
        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        loss.backward()

        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

In [20]:
def evaluate(model, dataloader, criterion):

    epoch_loss = 0.0

    model.eval()
    with torch.no_grad():
        for texts, labels in tqdm(dataloader, desc="Valid"):
            texts = texts.to(device)
            labels = labels.to(device).long()
            predictions = model(texts)
            loss = criterion(predictions, labels.long())
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

In [21]:
EPOCHS = 20
for epoch in range(EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion)
    valid_loss = evaluate(model, val_dataloader, criterion)
    print(f"train loss: {train_loss}, valid loss: {valid_loss}")

Train: 100%|██████████| 1182/1182 [00:14<00:00, 79.83it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 114.68it/s]


train loss: 2.0024117780014143, valid loss: 1.9027152612124603


Train: 100%|██████████| 1182/1182 [00:14<00:00, 80.90it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 115.06it/s]


train loss: 1.8922993199877731, valid loss: 1.884994240279125


Train: 100%|██████████| 1182/1182 [00:14<00:00, 82.03it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 111.53it/s]


train loss: 1.879814177902822, valid loss: 1.8795358711087764


Train: 100%|██████████| 1182/1182 [00:14<00:00, 81.84it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 112.63it/s]


train loss: 1.8681329416139476, valid loss: 1.8687883391598155


Train: 100%|██████████| 1182/1182 [00:14<00:00, 82.01it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 114.34it/s]


train loss: 1.8605137997875763, valid loss: 1.8660776551604876


Train: 100%|██████████| 1182/1182 [00:14<00:00, 82.02it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 111.12it/s]


train loss: 1.8510790682081038, valid loss: 1.8534734361062801


Train: 100%|██████████| 1182/1182 [00:14<00:00, 81.81it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 113.37it/s]


train loss: 1.844316228795576, valid loss: 1.8542714046342723


Train: 100%|██████████| 1182/1182 [00:14<00:00, 82.88it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 115.72it/s]


train loss: 1.8429934104081942, valid loss: 1.8535343590121585


Train: 100%|██████████| 1182/1182 [00:14<00:00, 82.62it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 115.83it/s]


train loss: 1.8398821449723541, valid loss: 1.8447956687907883


Train: 100%|██████████| 1182/1182 [00:14<00:00, 83.25it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 116.26it/s]


train loss: 1.8342228103208462, valid loss: 1.8453956185863707


Train: 100%|██████████| 1182/1182 [00:14<00:00, 82.50it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 115.93it/s]


train loss: 1.8329573194992723, valid loss: 1.8401279900279748


Train: 100%|██████████| 1182/1182 [00:14<00:00, 83.05it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 112.79it/s]


train loss: 1.8263573699997968, valid loss: 1.8360928393862574


Train: 100%|██████████| 1182/1182 [00:14<00:00, 83.25it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 116.86it/s]


train loss: 1.8258725182860838, valid loss: 1.8367155914379256


Train: 100%|██████████| 1182/1182 [00:14<00:00, 83.12it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 114.58it/s]


train loss: 1.824906956927627, valid loss: 1.8363080315178422


Train: 100%|██████████| 1182/1182 [00:14<00:00, 83.40it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 116.69it/s]


train loss: 1.8245481420087735, valid loss: 1.8366485232024023


Train: 100%|██████████| 1182/1182 [00:14<00:00, 83.04it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 116.27it/s]


train loss: 1.82475457911564, valid loss: 1.8362478281035641


Train: 100%|██████████| 1182/1182 [00:14<00:00, 82.77it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 112.31it/s]


train loss: 1.8218622728047638, valid loss: 1.8301015744354518


Train: 100%|██████████| 1182/1182 [00:14<00:00, 81.87it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 116.46it/s]


train loss: 1.8174771555587523, valid loss: 1.8310612627697476


Train: 100%|██████████| 1182/1182 [00:14<00:00, 82.64it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 113.02it/s]


train loss: 1.8170392601018024, valid loss: 1.8294448746642487


Train: 100%|██████████| 1182/1182 [00:14<00:00, 81.79it/s]

Valid: 100%|██████████| 394/394 [00:03<00:00, 111.87it/s]

train loss: 1.816553943754251, valid loss: 1.8242907036984632





### Prediction


In [22]:
df = pd.read_csv("test.csv")
df.head()

Unnamed: 0,id,text
0,0,The Valea Cândii River is a tributary of the ...
1,1,Etiene Pires de Medeiros (born May 24 1991 in...
2,2,Bromelia balansae is a species of the genus B...
3,3,Pilsbryspira loxospira is a species of sea sn...
4,4,Wellington Management Company is one of the l...


In [23]:
test_data = np.column_stack((df["text"].values, np.zeros(len(df["text"]))))
test_dataloader = DataLoader(
    test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)

In [24]:
def predict(model, dataloader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for texts, _ in tqdm(dataloader):
            texts = texts.to(device)
            preds = model(texts)
            predictions.extend(preds.argmax(axis=1).cpu().tolist())
    return predictions

In [25]:
predictions = predict(model, test_dataloader)

100%|██████████| 175/175 [00:01<00:00, 113.85it/s]


In [26]:
converted_predictions = [pred + 1 for pred in predictions]

In [27]:
answer = pd.DataFrame(converted_predictions, columns=["class_id"])
answer.head()

Unnamed: 0,class_id
0,8
1,4
2,11
3,11
4,1
