In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
%load_ext autoreload
%autoreload 2

import data_handling as data
import preprocess as pp

In [4]:
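# Original (un-preprocessed) data, kept for reference only.
# Note: the next cell assigns IDs and BDs from the cleaned file, so it would
# overwrite whatever this cell loads.
# DATA_RAW_PATH = "./data/bds_1.txt"
# IDs, BDs = data.load_raw(DATA_RAW_PATH)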

In [5]:
# Load the data that has already been preprocessed.
# The file was generated by applying pp.preprocess_text() to each BD,
# then saving the result to a TSV.
DATA_CLEAN_PATH = "./data/bds_1_clean.txt"
# load_raw's result is unpacked into two names; the cells below treat every
# element of BDs as one string of tokens joined by single spaces.
# NOTE(review): IDs and BDs are presumably parallel sequences - confirm in data_handling.
IDs, BDs = data.load_raw(DATA_CLEAN_PATH)

The data setup below (vocabulary, dataset, collate function) follows the official PyTorch torchtext tutorial:
https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

In [6]:
# Token frequency table over the whole corpus.
# (In the cleaned data each description is one string with its tokens joined by a space.)
counter = Counter(token for desc in BDs for token in desc.split(" "))

In [7]:
# Build the torchtext vocabulary from the frequency table.
# It converts tokens to integer indices (vocab[token] / vocab.stoi) and back (vocab.itos).
# '<unk>' is the only special token and stands in for out-of-vocabulary words
# (might be useful later).
# NOTE(review): max_size=None / min_freq=1 presumably keep every counted token -
# confirm against the documentation of the installed torchtext version.
vocab = Vocab(counter,
              max_size=None,
              min_freq=1,
              specials=['<unk>'])

In [8]:
# Example usage of the vocabulary, one print per line of output:
#   1. an unknown word looked up through vocab.stoi, next to vocab.unk_index
#      (the recorded output shows both as 0);
#   2. token -> index via vocab[...] and vocab.stoi[...], then index -> token via vocab.itos;
#   3. a whole list of tokens converted to indices in a single call.
print(vocab.stoi["thisworddoesntexist"], vocab.unk_index)
print(vocab["commercial"], vocab.stoi["commercial"], vocab.itos[vocab["commercial"]])
print(vocab.lookup_indices(["commercial", "fact", "data", "tech"]))

0 0
52244 52244 commercial
[52244, 1411, 76, 4061]


In [9]:
class BDDataset(Dataset):
    """Minimal dataset that wraps an in-memory sequence of passages.

    It adds no logic of its own; it exists only so that the passages can be
    handed to a PyTorch DataLoader.
    """

    def __init__(self, data):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.data = data

    def __getitem__(self, idx):
        """Return the passage stored at position ``idx``."""
        return self.data[idx]

    def __len__(self):
        """Return how many passages are stored."""
        return len(self.data)

In [10]:
# "Preprocessing" function
def text_pipeline(text):
    """Turn one cleaned text into a list of vocabulary indices.

    The text is split on single spaces (the separator used by the cleaned
    data) and every resulting token is looked up in the module-level ``vocab``.
    """
    tokens = text.split(" ")
    return [vocab[tok] for tok in tokens]

def collate_batch(batch):
    """Collate a batch of raw texts into the tensors an EmbeddingBag model consumes.

    Adapted from https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html,
    minus the labels, which we don't need.

    Args:
        batch: the texts of one batch; each is passed through ``text_pipeline``.

    Returns:
        A ``(text_list, offsets)`` pair, both moved to the module-level ``device``:
        ``text_list`` is the 1D int64 concatenation of the token ids of every
        text, and ``offsets[k]`` is the position in ``text_list`` where text
        ``k`` starts.
    """
    encoded = [torch.tensor(text_pipeline(doc), dtype=torch.int64) for doc in batch]
    lengths = [ids.size(0) for ids in encoded]

    # Text k starts after all the tokens of texts 0..k-1, so prepend a 0,
    # drop the final length and take the running sum.
    offsets = torch.tensor(([0] + lengths)[:-1]).cumsum(dim=0)
    text_list = torch.cat(encoded)
    return text_list.to(device), offsets.to(device)

In [11]:
# Wrap the texts in a Dataset and serve them in batches of 8.
# shuffle=False keeps the original order, so the first item of the first batch
# is BDs[0] (the iteration example further down asserts exactly that).
# collate_fn=collate_batch converts each batch of raw strings into the
# (token ids, offsets) tensor pair.
dataset = BDDataset(BDs)
data_loader = DataLoader(dataset, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [15]:
# How to iterate through the data batchwise.
# Every batch is a (text, offsets) pair as produced by collate_batch:
# `text` is the concatenation of the token ids of all documents in the batch
# and `offsets[k]` is the position in `text` where document k starts.
for idx, (text, offsets) in enumerate(data_loader):

    # Just have a look at the first item in the first batch.
    start = offsets[0]
    # The first document ends where the second one starts. A batch holding a
    # single document has no offsets[1], so the document runs to the end of `text`.
    end = offsets[1] if offsets.numel() > 1 else text.numel()
    indices = text[start:end]

    # Should be equal to the 1st original BD text.
    # .tolist() yields plain Python ints in one go instead of indexing
    # vocab.itos with one 0-dim tensor per token.
    tmp = " ".join(vocab.itos[i] for i in indices.tolist())
    print(tmp[:500])
    assert tmp == BDs[0]

    break

    # TODO: training stuff (model(), loss, backward, optimizer.step etc.)

We medallion financi compani financ compani organ delawar corpor includ medallion bank primari oper subsidiari In recent year strateg growth medallion bank origin consum loan purchas recreat vehicl boat trailer financ home improv We histor lead posit origin acquir servic loan financ taxi medallion variou type commerci busi sinc medallion bank acquir consum loan portfolio began origin consum loan 2004 increas consum loan portfolio compound annual growth rate 16 19 loan sale 2016 2017 2018 In janu


In [16]:
# Example model
# TODO: try denoising autoencoder
# example https://github.com/shentianxiao/text-autoencoders/blob/master/model.py
class Model(nn.Module):
    """Bag-of-words embedding model (skeleton: the layers after the embedding are still TODO).

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # EmbeddingBag is basically Embedding followed by an aggregation over the
        # words of each bag (i.e. bag-of-words); the aggregation is chosen by `mode`.
        # Quoted from the official documentation: with ``mode="sum"`` it is equivalent
        # to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=1)``.
        # It also takes an extra 1D "offsets" array indicating where each sentence
        # starts in the batch. For now a whole document is treated as one sentence
        # (list of tokens).
        self.embedding = nn.EmbeddingBag(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            mode="sum",  # can be sum, mean or max (defaults to mean)
            sparse=True,
        )

        # TODO: layers

    def forward(self, text, offsets):
        """Embed a batch of texts.

        ``text`` holds the concatenated token ids of the batch and ``offsets``
        is the 1D array telling us where each sentence starts; the result of
        the EmbeddingBag lookup is returned unchanged for now.
        """
        pooled = self.embedding(text, offsets)

        # TODO: forward pass

        return pooled
