In [16]:
import numpy as np
import matplotlib.pyplot as plt
import tqdm

In [2]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

# Current model gives RuntimeError: CUDA error: CUBLAS_STATUS_INTERNAL_ERROR when calling `cublasCreate(handle)`
# during training when running on GPU. Disable for now.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

In [3]:
%load_ext autoreload
%autoreload 2

import data_handling as data
import preprocess as pp

In [4]:
# # Original data
# DATA_RAW_PATH = "./data/bds_1.txt"
# IDs, BDs = data.load_raw(DATA_RAW_PATH)

In [5]:
# Data that has already been preprocessed
# Generated by applying pp.preprocess_text() to each BD,
# then saved to a TSV
DATA_CLEAN_PATH = "./data/bds_1_clean.txt"
IDs_raw, BDs_raw = data.load_raw(DATA_CLEAN_PATH)

In [6]:
# Some entries have empty BDs, so filter those out
IDs = []
BDs = []
for iid, bd in zip(IDs_raw, BDs_raw):
    if len(bd) > 0:
        IDs.append(iid)
        BDs.append(bd)

print(len(IDs), len(BDs))

2034 2034


Following PyTorch's tutorial for data setup.
https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

In [7]:
# Build frequency table
# (cleaned data joins tokens by space)
counter = Counter()
for desc in BDs:
    counter.update(desc.split(" "))

In [8]:
# PyTorch torchtext vocabulary converts tokens to indices and vice versa.
# Also has an '<unk>' for OOV words (might be useful later).
vocab = Vocab(counter,
              max_size=None,
              min_freq=1,
              specials=['<unk>'])
print(len(vocab))
# actual is 70770 without max_size restriction

70770


In [9]:
# Example usage: unknown word, convert token to int ID, convert a whole list of tokens to IDs.
print(vocab.stoi["thisworddoesntexist"], vocab.unk_index)
print(vocab["commercial"], vocab.stoi["commercial"], vocab.itos[vocab["commercial"]])
print(vocab.lookup_indices(["commercial", "fact", "data", "tech"]))

0 0
52243 52243 commercial
[52243, 1410, 76, 4060]


In [10]:
class BDDataset(Dataset):
    """ Very simple dataset object. Stores all the passages.
    
    This is just for compatibility with PyTorch DataLoader.
    """
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

In [11]:
# "Preprocessing" function
def text_pipeline(text):
    return [vocab[token] for token in text.split(" ")]

def collate_batch(batch):
    """ Convert a batch of text (each a list of tokens) into appropriate torch tensors.
    
    Modification of https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html.
    We don't need labels.
    """
    # Offsets tells the model (which will use EmbeddingBag) where each text starts.
    text_list, offsets = [], [0]
    for _text in batch:
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return text_list.to(device), offsets.to(device)

In [12]:
# TODO: try autoencoder models from here:
# example https://github.com/shentianxiao/text-autoencoders/blob/master/model.py

# The one below includes built-in embeddings that can be trained.

# class BasicAutoencoder(nn.Module):
#     def __init__(self, vocab_size, embed_dim, batch_size):
#         super(BasicAutoencoder, self).__init__()
#         # EmbeddingBag is basically Embedding but aggregates words (i.e. bag-of-words).
#         # See the "mode" argument. Quoted from official documentation:
#         # with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=1)``
#         # Also takes an additional "offsets" 1D array, which indicates where each sentence starts in the batch.
#         # For now we treat a whole document as a sentence (list of tokens).
#         self.embedding = nn.EmbeddingBag(
#             vocab_size,
#             embed_dim,
#             sparse=False, # optimizer complains when sparse=True
#             mode="mean", # can be sum, mean or max (defaults to mean)
#         )
        
#         self.batch_size = batch_size
#         hidden_dim = 64
        
#         # Very simple encoder/decoder at first
#         self.encoder = nn.Linear(embed_dim, hidden_dim)
#         self.decoder = nn.Linear(hidden_dim, vocab_size)
    
#     def forward(self, text, offsets):
#         """ Takes a batch of texts and a 1D array telling us where each sentence starts. """
#         out = self.embedding(text, offsets)
        
#         out = self.encoder(out)
#         out = self.decoder(out)

#         return out


In [13]:
class BasicAutoencoder(nn.Module):
    def __init__(self, vocab_size, hidden_dim=64):
        super(BasicAutoencoder, self).__init__()        
        
        # Very simple encoder/decoder at first
        self.encoder = nn.Sequential(
            nn.Linear(vocab_size, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )
        self.decoder = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, text):
        # Here text is (batch_size, vocab_size) BoW representation.
        out = self.encoder(text)
        out = self.decoder(out)

        return out

In [33]:
def to_bow(text, vocab_size):
    bow = torch.zeros(vocab_size, dtype=torch.float)
    bow_counter = Counter(text.tolist())
    for k, v in bow_counter.items():
        bow[k] = v
    return bow

In [34]:
# Create data loader to iterate over dataset in batches during training/evaluation
dataset = BDDataset(BDs)
batch_size = 32
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [95]:
total_epochs = 20
# embed_size = 300
model = BasicAutoencoder(len(vocab), hidden_dim=64).to(device)
model.train()

# For simplicity
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
criterion = torch.nn.MSELoss()

for epoch in range(total_epochs):
    
    epoch_loss = 0.0
    
    for idx, (text, offsets) in enumerate(data_loader):

        text = text.to(device)
        offsets = offsets.to(device)
        
        optimizer.zero_grad()

        # Construct BOW representations
        # Last batch could be smaller than batch_size #data is not a multiple.
        actual_batch_size = offsets.size(0)
        bow = torch.zeros((actual_batch_size, len(vocab))).to(device)
        for i in range(len(offsets) - 1):
            # Extract i^th sentence in the batch and compute BOW
            bow[i] = to_bow(text[offsets[i]:offsets[i+1]], len(vocab))

        # Predict BOW representation directly
        
        #
        # GPU ERROR HERE
        #
        bow_pred = model(bow)
        
        loss = criterion(bow_pred, bow)
        epoch_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    print("Epoch {}, Loss={}".format(epoch + 1, epoch_loss / len(data_loader)))

5.7144904136657715
5.486662881448865
5.222834336571395
5.0932224141433835
5.0470630610361695
5.0302523002028465
5.023749152198434
5.0210719387978315
5.019853452220559
5.019221466965973
5.018844919279218
5.0185893746092916
5.018398659303784
5.018245497718453
5.018116380088031
5.0180038549005985
5.017904122360051
5.01781468745321
5.017732351087034
5.017656886018813


In [96]:
# Look at the model's output
model.eval()

# Store the "hidden representations" (i.e. encoder output) for all passages
# and see if they are meaningful (probably not at this stage because the model is too simple,
# not enough data etc, not enough training epochs etc.)
hidden_vecs = torch.zeros((len(BDs), 64))
with torch.no_grad():
    for i, text_raw in enumerate(BDs):
        text = torch.tensor(text_pipeline(text_raw), dtype=torch.int64)
        bow = to_bow(text, len(vocab))
        h = model.encoder(bow.unsqueeze(0))
        hidden_vecs[i] = h

In [92]:
# Take an example company
# Group 1 Automotive, Inc. is an international Fortune 500 automotive retailer.
# ID is GPI:1031203
GPI_ID = 79
target_h = hidden_vecs[GPI_ID]
similarity = nn.CosineSimilarity(dim=1)

# TODO: compare BOW cosine similarity directly

# Show top 10 most similar (based on cosine distance)
sims = similarity(target_h.unsqueeze(0), hidden_vecs)
idxs = torch.topk(sims, 10, largest=False).indices

for idx in idxs:
    print("{}, {}".format(IDs[idx], BDs[idx][:100]))

WAT:1000697, gener water corpor compani us specialti measur compani oper fundament underli purpos advanc scienc e
DISH:1001082, 1 item 1A risk factor 24 item 1B unresolv staff comment 62 item 2 properti 62 item 3 legal proceed 6
KTYB:1000232, gener kentucki bancshar compani kentucki us bank hold compani headquart pari kentucki the compani or
NVAX:1000694, overview novavax togeth swedish subsidiari novavax AB biotechnolog compani focus discoveri develop c
IMH:1000298, impac mortgag hold sometim refer herein compani us maryland corpor incorpor august 1995 includ follo
OCC:1000230, overview optic cabl corpor incorpor commonwealth virginia 1983 We headquart 5290 concours drive roan
HSIC:1000228, gener henri schein solut compani health care profession power network peopl technolog We believ worl
MFIN:1000209, We medallion financi compani financ compani organ delawar corpor includ medallion bank primari oper 
CLB:1000229, gener core laboratori netherland limit liabil compani We establish 193

In [None]:
# # How to iterate through the data batchwise
# for idx, (text, offsets) in enumerate(data_loader):
    
#     # Just have a look at the first item in the first batch.
#     start = offsets[0]
#     end = offsets[1]
#     indices = text[start:end]
    
#     # Should be equal to the 1st original BD text.
#     tmp = " ".join(vocab.itos[i] for i in indices)
#     print(tmp[:500])
#     assert tmp == BDs[0]
    
#     break
    
#     # TODO: training stuff (model(), loss, backward, optimizer.step etc.)