In [1]:
from abc import ABC

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange

# LSTM for intent classification

In [2]:
class ReviewsDataSet(Dataset, ABC):
    def __init__(self, sentences, sentences_lens, y):
        self.X = sentences
        self.X_lens = sentences_lens
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, item):
        return self.X[item], self.X_lens[item], self.y[item]

In [3]:
def tokenize(x_train, x_val):
    word2idx = {"[PAD]": 0, "[UNK]": 1}
    idx2word = ["[PAD]", "[UNK]"]
    for sent in x_train:
        for word in sent.split():
            if word not in word2idx:
                word2idx[word] = len(word2idx)
                idx2word.append(word)

    final_list_train, final_list_test = [], []
    for sent in x_train:
        final_list_train.append([word2idx[word] for word in sent.split()])
    for sent in x_val:
        final_list_test.append([word2idx[word] if word in word2idx else word2idx['UNK'] for word in sent.split()])
    return final_list_train, final_list_test, word2idx, idx2word

In [4]:
def padding_(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, :len(review)] = np.array(review)[:seq_len]
    return features

In [5]:
class MyNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim=30, hidden_dim=50, tag_dim=2):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.word_embedding = nn.Embedding(vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, batch_first=True)
        self.hidden2tag = nn.Sequential(nn.ReLU(),
                                        nn.Linear(self.hidden_dim, tag_dim))
        self.loss_fn = nn.NLLLoss()

    def forward(self, sentence, sentence_len, tags=None):
        embeds = self.word_embedding(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), -1, self.embedding_dim))
        tag_space = self.hidden2tag(lstm_out[range(len(sentence)), sentence_len - 1, :])
        tag_score = F.softmax(tag_space, dim=1)
        if tags is None:
            return tag_score, None
        loss = self.loss_fn(tag_score, tags)
        return tag_score, loss

In [6]:
def train(model, device, optimizer, train_dataset, val_dataset):
    accuracies = []
    for phase in ["train", "validation"]:
        if phase == "train":
            model.train(True)
        else:
            model.train(False) #or model.evel()
        correct = 0.0
        count = 0
        accuracy = None
        dataset = train_dataset if phase == "train" else val_dataset
        t_bar = tqdm(dataset)
        for sentence, lens, tags in t_bar:
            if phase == "train":
                tag_scores, loss = model(sentence.to(device), lens.to(device), tags.to(device))
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            else:
                with torch.no_grad():
                    tag_scores, _ = model(sentence.to(device), lens.to(device), tags.to(device))
            correct += (tag_scores.argmax(1).to("cpu") == tags).sum()
            count += len(tags)
            accuracy = correct/count
            t_bar.set_description(f"{phase} accuracy: {accuracy:.2f}")
        accuracies += [accuracy]
    return accuracies

In [7]:
%% time
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device:", device)

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
x_train, y_train = train_data["sentence"].values, train_data["tag"].values
x_test, y_test = test_data["sentence"].values, test_data["tag"].values
n_classes = max(y_test) + 1

# x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y)
x_train, x_test, word2idx, idx2word = tokenize(x_train, x_test)
vocab_size = len(word2idx)
train_sentence_lens = [min(len(s), 500) for s in x_train]
test_sentence_lens = [min(len(s), 500) for s in x_test]

x_train_pad = padding_(x_train, 500)
x_test_pad = padding_(x_test, 500)

print(x_train_pad.shape, x_test_pad.shape)

train_dataset = ReviewsDataSet(x_train_pad, train_sentence_lens, y_train)
test_dataset = ReviewsDataSet(x_test_pad, test_sentence_lens, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=64)
test_dataloader = DataLoader(test_dataset, batch_size=64)

model = MyNet(vocab_size, tag_dim=n_classes)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=8e-3)

best_accuracy = 0
best_epoch = None
for epoch in range(1000):
    print(f"\n -- Epoch {epoch} --")
    train_accuracy, val_accuracy = train(model, device, optimizer, train_dataloader, test_dataloader)
    if val_accuracy>best_accuracy:
        best_accuracy = val_accuracy
        best_epoch = epoch
    if epoch - best_epoch == 3:
        break
print(f"best accuracy: {best_accuracy:.2f} in epoch {best_epoch}")

device: cpu


FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

# LSTM for Generation

In [None]:
import torch
from torch import nn

class Model(nn.Module):
    def __init__(self, dataset):
        super(Model, self).__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.lstm_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, sequence_length, self.lstm_size),
                torch.zeros(self.num_layers, sequence_length, self.lstm_size))

In [None]:
import torch
import pandas as pd
from collections import Counter

class Dataset(torch.utils.data.Dataset):
    def __init__(
        self,
        sequence_length,
    ):
        self.sequence_length = sequence_length
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        train_df = pd.read_csv('reddit-cleanjokes.csv')
        text = train_df['Joke'].str.cat(sep=' [STOP] [START] ')
        return text.split(' ')

    def get_stop_start(self):
        return self.word_to_index["[STOP]"], self.word_to_index["[START]"]

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )


In [None]:
import argparse
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm, trange

def train(dataset, model, max_epochs=10, batch_size=256, sequence_length=4):
    model.train()

    dataloader = DataLoader(dataset)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        state_h, state_c = model.init_state(sequence_length)
        t_bar = tqdm(enumerate(dataloader))
        for batch, (x, y) in t_bar:
            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            loss = criterion(y_pred.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()

            t_bar.set_description(f"epoch {epoch} batch {batch} loss {loss.item()}")

In [None]:
def predict(dataset, model, text, next_words=100):
    model.eval()

    words = text.split(' ')
    state_h, state_c = model.init_state(len(words))

    STOP, _ = dataset.get_stop_start()

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]])
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(dataset.index_to_word[word_index])
        if word_index == STOP:
            return words
    return words

In [None]:
dataset = Dataset(sequence_length=3)
model = Model(dataset)

train(dataset, model, max_epochs=10, batch_size=256, sequence_length=3)
print(predict(dataset, model, text='[START] Knock knock.'))

epoch 0 batch 27152 loss 14.284713745117188: : 27153it [12:38, 35.80it/s]
epoch 1 batch 27152 loss 12.78635311126709: : 27153it [14:20, 31.54it/s]
epoch 2 batch 27152 loss 11.708107948303223: : 27153it [13:53, 32.60it/s]
epoch 3 batch 27152 loss 13.02394962310791: : 27153it [13:41, 33.04it/s]
epoch 4 batch 27152 loss 12.384133338928223: : 27153it [13:20, 33.93it/s]
epoch 5 batch 27152 loss 12.444621086120605: : 27153it [13:19, 33.95it/s]
epoch 6 batch 27152 loss 11.304375648498535: : 27153it [13:08, 34.43it/s]
epoch 7 batch 27152 loss 10.294144630432129: : 27153it [13:06, 34.53it/s]
epoch 8 batch 27152 loss 8.282746315002441: : 27153it [13:13, 34.21it/s]
epoch 9 batch 27152 loss 6.325592041015625: : 27153it [13:31, 33.48it/s]


['[START]', 'Knock', 'knock.', 'walks', 'into', 'a', 'bar...', 'joke', 'about', 'his', 'fish', 'Because', 'they', "don't", 'C#', '[STOP]']


In [3]:
import numpy as np
(np.exp(0.0281)) / (np.exp(0.0281) + np.exp(-0.0902))


0.5295405566296691