# Data Utils

The purpose of this notebook is to:

1. Create utility operations for preparing and cleaning data that will be used to train the different models.
2. Test out different utility methods to make sure they work as intended.
3. Create a custom torch `Dataset` that can be plugged into PyTorch models.

**NOTE: This code uses pseudo-random operations (e.g. the downsampling step), so running this notebook will generate different data on each run.**

In [65]:
import math
import numpy as np
import pandas as pd
import time
import torch

from torch.utils.data import Dataset


## Load the Data

In [2]:
def load_embeddings(path, embedding_dim):
    """Load whitespace-separated word vectors (GloVe text format) into a DataFrame.

    Each non-blank line is expected to look like ``<token> <v1> <v2> ... <vN>``
    with single spaces between fields.

    Args:
        path: Path of the embeddings text file.
        embedding_dim: Number of vector components expected on every line.

    Returns:
        A ``pd.DataFrame`` of shape ``(n_tokens, embedding_dim)`` holding float
        values, indexed by token, in file order.

    Raises:
        ValueError: If a line does not carry exactly ``embedding_dim`` values
            or a value cannot be parsed as a float.
    """
    # Be explicit about the encoding: the platform default is not UTF-8
    # everywhere, and the vocabulary contains non-ASCII tokens.
    with open(path, encoding='utf-8') as file:
        # Skip blank lines (e.g. a trailing empty line) instead of failing on them.
        lines = [line for line in file if line.strip()]

    index = []
    embeddings = np.zeros((len(lines), embedding_dim))

    for i, line in enumerate(lines):
        tokens = line.rstrip('\n').split(' ')
        values = tokens[1:]

        # Check the width explicitly; otherwise a single stray value would be
        # silently broadcast across the whole row.
        if len(values) != embedding_dim:
            raise ValueError(
                'Line %d of %s has %d values, expected %d'
                % (i + 1, path, len(values), embedding_dim))

        index.append(tokens[0])
        embeddings[i, :] = np.asarray(values, dtype=float)

    return pd.DataFrame(embeddings, index=index)
    

In [3]:
# Load the 100-dimensional GloVe 6B vectors. The path is relative to the
# notebook's working directory.
embeddings = load_embeddings('./data/glove.6B/glove.6B.100d.txt', embedding_dim=100)

# Preview the first rows: one row per token, one column per vector component.
embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,-0.038194,-0.24487,0.72812,-0.39961,0.083172,0.043953,-0.39141,0.3344,-0.57545,0.087459,...,0.016215,-0.017099,-0.38984,0.87424,-0.72569,-0.51058,-0.52028,-0.1459,0.8278,0.27062
",",-0.10767,0.11053,0.59812,-0.54361,0.67396,0.10663,0.038867,0.35481,0.06351,-0.094189,...,0.34951,-0.7226,0.37549,0.4441,-0.99059,0.61214,-0.35111,-0.83155,0.45293,0.082577
.,-0.33979,0.20941,0.46348,-0.64792,-0.38377,0.038034,0.17127,0.15978,0.46619,-0.019169,...,-0.063351,-0.67412,-0.068895,0.53604,-0.87773,0.31802,-0.39242,-0.23394,0.47298,-0.028803
of,-0.1529,-0.24279,0.89837,0.16996,0.53516,0.48784,-0.58826,-0.17982,-1.3581,0.42541,...,0.18712,-0.018488,-0.26757,0.727,-0.59363,-0.34839,-0.56094,-0.591,1.0039,0.20664
to,-0.1897,0.050024,0.19084,-0.049184,-0.089737,0.21006,-0.54952,0.098377,-0.20135,0.34241,...,-0.13134,0.058617,-0.31869,-0.61419,-0.62393,-0.41548,-0.038175,-0.39804,0.47647,-0.15983


In [4]:
# Load the training records into a DataFrame.
# NOTE(review): later cells read the 'headline', 'short_description' and
# 'category' columns — presumably all present in this file; verify.
data = pd.read_json('./data/train_data.json', orient='records')


## Downsampling

In [5]:
def downsample(df, c):
    """Randomly drop rows of over-represented categories.

    A row of category ``k`` is kept with probability
    ``(min_count / count_k) ** (1 / c)``, where ``min_count`` is the size of
    the rarest category. Rows of the rarest category are therefore always
    kept; ``c = 1`` balances the categories in expectation and larger ``c``
    downsamples less aggressively.

    Args:
        df: DataFrame with a ``category`` column.
        c: Non-zero exponent divisor controlling how strongly to downsample.

    Returns:
        A new DataFrame with the surviving rows and a fresh ``0..n-1`` index.
        The selection uses ``np.random``, so it differs between runs unless
        the global NumPy seed is fixed.
    """
    category_counts = df['category'].value_counts()
    min_count = category_counts.min()

    # Probability of keeping a row of a given category, indexed by category.
    category_probs = (min_count / category_counts) ** (1 / c)

    # Look each row's probability up by category *label*. Positional access
    # (category_probs[i]) would pick the wrong entry when the labels are
    # themselves integers, and needed one pass over the frame per category.
    keep_prob = df['category'].map(category_probs).to_numpy(dtype=float)

    keep_mask = np.random.rand(len(df)) <= keep_prob

    return df[keep_mask].reset_index(drop=True)


## Cleaning Up Data

In [6]:
import string

from nltk.tokenize.regexp import WordPunctTokenizer

In [7]:
# Shared module-level tokenizer instance; cleanup_and_tokenize_text() calls
# its tokenize() method.
tokenizer = WordPunctTokenizer()

In [8]:
# Special token for tokens that occur MIN_WORD_FREQ or fewer times in the
# entire corpus.
# NOTE(review): neither constant is referenced by the code in the portion of
# the notebook reviewed here. WordTokenDataset defines its own
# __TOKEN_LOW_FREQ__ marker and takes min_word_freq as a constructor argument
# (compared with a strict `<`) — confirm whether these are still needed.
__LOW_FREQ_TOKEN__ = '__LOW_FREQ_TOKEN__'

MIN_WORD_FREQ = 5


In [9]:
def cleanup_and_tokenize_text(text):
    """Strip punctuation from ``text``, lowercase it and split it into tokens.

    Every character contained in ``string.punctuation`` is removed outright
    (not replaced by a space). The remainder is lowercased and passed to the
    module-level ``tokenizer``.
    """
    without_punctuation = ''.join(
        filter(lambda ch: ch not in string.punctuation, text))
    return tokenizer.tokenize(without_punctuation.lower())


In [10]:
def tokenize_rows(df):
    """Return one token list per row of ``df``.

    For each row the tokens of the ``headline`` column come first, followed by
    the tokens of the ``short_description`` column. Both columns are tokenized
    with ``cleanup_and_tokenize_text``.
    """
    combined = []
    for headline, description in zip(df['headline'], df['short_description']):
        headline_tokens = cleanup_and_tokenize_text(headline)
        description_tokens = cleanup_and_tokenize_text(description)
        combined.append(headline_tokens + description_tokens)
    return combined
    

In [16]:
def create_unigram_counts(rows):
    """Count how often each token occurs across all rows.

    Args:
        rows: Iterable of token lists.

    Returns:
        A plain ``dict`` mapping token -> occurrence count, with keys in
        first-seen order.
    """
    frequency = {}

    for row in rows:
        for word in row:
            frequency[word] = frequency.get(word, 0) + 1

    return frequency
    

## Pytorch Dataset

In [120]:
class WordTokenDatasetSample():
    """Container for one or more encoded rows.

    Attributes:
        sequence: Token indices of all rows, concatenated.
        offset: Start position of each row inside ``sequence``.
        label: One encoded label per row.
        vocab_size: Size of the vocabulary the token indices refer to.
    """

    def __init__(self, sequence, offset, label, vocab_size):
        self.sequence = sequence
        self.offset = offset
        self.label = label
        self.vocab_size = vocab_size


    def __len__(self):
        """Return the number of rows held, i.e. the number of labels."""
        # The attribute is `label`; the previous `self.labels` raised
        # AttributeError on every call.
        return len(self.label)



In [170]:
class WordTokenDataset(Dataset):
    """Map-style dataset that turns text rows into token-index sequences.

    ``data`` is a DataFrame with ``headline``, ``short_description`` and
    ``category`` columns (the columns read by ``tokenize_rows`` and
    ``prepare``). ``prepare()`` must be called before indexing: it builds the
    vocabulary and label encodings and drops rows that contain no
    in-vocabulary token.
    """

    # Placeholder for tokens rejected by ``accepted_tokens`` or never counted.
    __TOKEN_UNK__ = '__TOKEN_UNK__'

    # Placeholder for tokens seen fewer than ``min_word_freq`` times.
    __TOKEN_LOW_FREQ__ = '__TOKEN_LOW_FREQ__'

    def __init__(self, data, downsample_c=None, accepted_tokens=None, min_word_freq=0):
        """Store the configuration; no processing happens until ``prepare()``.

        Args:
            data: DataFrame holding the raw rows.
            downsample_c: If not ``None``, passed as ``c`` to ``downsample``
                during ``prepare()``.
            accepted_tokens: Optional container of allowed tokens (tested with
                ``in``); any other token is encoded as the unknown placeholder.
            min_word_freq: Tokens with a count strictly below this value are
                encoded as the low-frequency placeholder.
        """
        super().__init__()
        
        self.data = data
        self.downsample_c = downsample_c
        self.accepted_tokens = accepted_tokens
        self.min_word_freq = min_word_freq

        self._is_prepared = False
        self._unigram_counts = None
        self._token_encoder = None
        self._encoded_to_idx = None
        self._label_encoder = None


    def __len__(self):
        """Return the number of rows currently held."""
        return len(self.data)


    def __getitem__(self, idx):
        """Return the encoded row(s) selected by ``idx``.

        Args:
            idx: An integer (Python or NumPy), or anything ``DataFrame.iloc``
                accepts for selecting several rows (e.g. a slice).

        Returns:
            A ``WordTokenDatasetSample``. An empty selection, including an
            out-of-range integer index, yields a sample of empty tensors.

        Raises:
            AssertionError: If ``prepare()`` has not been called.
        """
        assert self._is_prepared, 'prepare() must be called before indexing'

        if isinstance(idx, (int, np.integer)) and not isinstance(idx, bool):
            idx = int(idx)
            # slice(-1, 0) is empty, so the last row needs an open-ended stop.
            stop = idx + 1 if idx != -1 else None
            idx = slice(idx, stop)

        sub_data = self.data.iloc[idx]
        
        if len(sub_data) == 0:
            return WordTokenDatasetSample(sequence=torch.LongTensor([]),
                                          offset=torch.LongTensor([]),
                                          label=torch.LongTensor([]),
                                          vocab_size=len(self._encoded_to_idx))

        tokenized_rows = tokenize_rows(sub_data)

        offset = []
        sequence = []
        
        for tokens in tokenized_rows:
            # Each row starts where the previous rows' tokens end.
            offset.append(len(sequence))
            sequence.extend(self._encoded_to_idx[self._token_encoder[t]] for t in tokens)
        
        label = [self._label_encoder[l] for l in sub_data['category']]

        return WordTokenDatasetSample(sequence=torch.LongTensor(sequence),
                                      offset=torch.LongTensor(offset),
                                      label=torch.LongTensor(label),
                                      vocab_size=len(self._encoded_to_idx))


    def prepare(self):
        """Build the vocabulary and label encodings and filter unusable rows.

        Steps: optionally downsample, tokenize every row, count tokens, map
        each token to itself or to a placeholder, number the distinct encoded
        tokens, number the categories, then drop rows in which every token was
        replaced by a placeholder.

        NOTE: this replaces ``self.data``; calling it again re-runs the
        (random) downsampling on the already reduced data.
        """
        if self.downsample_c is not None:
            self.data = downsample(self.data, self.downsample_c)

        tokenized_rows = tokenize_rows(self.data)

        self._unigram_counts = create_unigram_counts(tokenized_rows)
        self._token_encoder = { t : self._encoded_token(t) for t in self._unigram_counts }

        # Many raw tokens share the same placeholder, so de-duplicate (keeping
        # first-seen order) before numbering. Numbering the raw value list
        # leaves gaps and produces indices >= len(self._encoded_to_idx), i.e.
        # outside the vocab_size reported to consumers.
        unique_encoded = dict.fromkeys(self._token_encoder.values())
        self._encoded_to_idx = { t:i for i,t in enumerate(unique_encoded) }
        self._label_encoder = {l:i for i,l in enumerate(self.data['category'].unique()) }
        
        # Keep only rows with at least one token that is encoded as itself
        # (i.e. not an unknown or low-frequency placeholder).
        keep_mask = np.array(
            [any(self._token_encoder[t] == t for t in ts) for ts in tokenized_rows],
            dtype=bool)

        self.data = self.data.iloc[keep_mask]
        self._is_prepared = True


    def _encoded_token(self, token):
        """Return ``token`` itself or the placeholder it is replaced by.

        Requires ``self._unigram_counts`` to be set (done by ``prepare()``).
        """
        assert(self._unigram_counts is not None)

        if self.accepted_tokens is not None and token not in self.accepted_tokens:
            return self.__TOKEN_UNK__
        elif token not in self._unigram_counts:
            return self.__TOKEN_UNK__
        elif self._unigram_counts[token] < self.min_word_freq:
            return self.__TOKEN_LOW_FREQ__
        return token

        

In [191]:
def collate_samples(samples):
    """Merge several ``WordTokenDatasetSample`` objects into one batch sample.

    Sequences and labels are concatenated in order. Each sample's offsets are
    shifted by the number of tokens contributed by the samples before it, so
    they index into the concatenated sequence.

    Args:
        samples: Sequence of samples whose ``sequence``, ``offset`` and
            ``label`` are 1-D integer tensors.

    Returns:
        A single ``WordTokenDatasetSample``. ``vocab_size`` is taken from the
        first sample, or is 0 when ``samples`` is empty.
    """
    if len(samples) == 0:
        return WordTokenDatasetSample(sequence=torch.LongTensor([]),
                                      offset=torch.LongTensor([]),
                                      label=torch.LongTensor([]),
                                      vocab_size=0)

    label = torch.cat([s.label for s in samples])
    sequence = torch.cat([s.sequence for s in samples])
    vocab_size = samples[0].vocab_size

    shifted_offsets = []
    tokens_so_far = 0

    for sample in samples:
        shifted_offsets.append(sample.offset + tokens_so_far)
        tokens_so_far += len(sample.sequence)

    offset = torch.cat(shifted_offsets)

    return WordTokenDatasetSample(sequence=sequence,
                                  offset=offset,
                                  label=label,
                                  vocab_size=vocab_size)

