# Data Utils

The purpose of this notebook is to:

1. Create utility operations for preparing and cleaning data that will be used to train the different models.
2. Test out different utility methods to make sure they work as intended.
3. Create a custom torch `Dataset` that can be plugged into PyTorch models.

**NOTE: This code uses pseudo-random operations (e.g. downsampling), so running this notebook will generate different data on each run.**

In [34]:
import math
import time
from collections import Counter

import numpy as np
import pandas as pd
from torch.utils.data import Dataset


## Load the Data

In [48]:
def load_embeddings(path, embedding_dim):
    """Load whitespace-separated text embeddings into a DataFrame.

    Each non-blank line of the file is expected to look like
    ``<token> <v_1> <v_2> ... <v_embedding_dim>`` with single spaces
    between fields.

    Args:
        path: Path to the embeddings text file.
        embedding_dim: Number of vector components stored on every line.

    Returns:
        A ``pd.DataFrame`` with one row per token (the token is the index)
        and ``embedding_dim`` float columns.

    Raises:
        ValueError: If a line does not contain a token followed by exactly
            ``embedding_dim`` numeric values.
    """
    # Decode explicitly as UTF-8 so non-ASCII tokens load identically on
    # every platform instead of depending on the locale's default encoding.
    with open(path, encoding='utf-8') as file:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no vector and are skipped.
        lines = [line.rstrip() for line in file if line.strip()]

    index = []
    embeddings = np.zeros((len(lines), embedding_dim))

    for i, line in enumerate(lines):
        # Split from the right so a token that itself contains spaces stays
        # intact: only the last `embedding_dim` fields are vector components.
        fields = line.rsplit(' ', embedding_dim)
        if len(fields) != embedding_dim + 1:
            raise ValueError(
                f'Line {i + 1} of {path!r} does not contain a token followed '
                f'by {embedding_dim} values'
            )

        index.append(fields[0])
        embeddings[i, :] = np.asarray(fields[1:], dtype=float)

    return pd.DataFrame(embeddings, index=index)
    

In [49]:
# Load the vectors from the glove.6B.100d file. `embedding_dim` has to match
# the number of values stored per line in that file.
embeddings = load_embeddings('./data/glove.6B/glove.6B.100d.txt', embedding_dim=100)

# Preview the first rows (index = token, columns = vector components).
embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,-0.038194,-0.24487,0.72812,-0.39961,0.083172,0.043953,-0.39141,0.3344,-0.57545,0.087459,...,0.016215,-0.017099,-0.38984,0.87424,-0.72569,-0.51058,-0.52028,-0.1459,0.8278,0.27062
",",-0.10767,0.11053,0.59812,-0.54361,0.67396,0.10663,0.038867,0.35481,0.06351,-0.094189,...,0.34951,-0.7226,0.37549,0.4441,-0.99059,0.61214,-0.35111,-0.83155,0.45293,0.082577
.,-0.33979,0.20941,0.46348,-0.64792,-0.38377,0.038034,0.17127,0.15978,0.46619,-0.019169,...,-0.063351,-0.67412,-0.068895,0.53604,-0.87773,0.31802,-0.39242,-0.23394,0.47298,-0.028803
of,-0.1529,-0.24279,0.89837,0.16996,0.53516,0.48784,-0.58826,-0.17982,-1.3581,0.42541,...,0.18712,-0.018488,-0.26757,0.727,-0.59363,-0.34839,-0.56094,-0.591,1.0039,0.20664
to,-0.1897,0.050024,0.19084,-0.049184,-0.089737,0.21006,-0.54952,0.098377,-0.20135,0.34241,...,-0.13134,0.058617,-0.31869,-0.61419,-0.62393,-0.41548,-0.038175,-0.39804,0.47647,-0.15983


In [44]:
# Load the training data into a DataFrame. Later cells read its
# 'category', 'headline' and 'short_description' columns.
data = pd.read_json('./data/train_data.json', orient='records')


## Downsampling

In [5]:
def downsample(df, c, random_state=None):
    """Randomly drop rows of over-represented categories.

    Every row is kept with probability ``(min_count / count) ** (1 / c)``,
    where ``count`` is the number of rows sharing the row's ``'category'``
    and ``min_count`` is the size of the smallest category. Rows of the
    smallest category are therefore always kept, and larger values of ``c``
    keep more rows of the bigger categories.

    Args:
        df: DataFrame with a ``'category'`` column.
        c: Exponent divisor controlling how aggressive the downsampling is.
        random_state: Optional seed (or ``np.random.Generator``) used to draw
            the keep/drop decisions. When ``None`` (the default) the global
            NumPy random state is used, so results differ between runs.

    Returns:
        A new DataFrame containing the kept rows, with a fresh 0..n-1 index.
    """
    category_counts = df['category'].value_counts()
    min_count = category_counts.min()

    # Probability of keeping a row, per category (indexed by category label).
    category_probs = (min_count / category_counts) ** (1 / c)

    # Look up each row's keep-probability by its category label. This avoids
    # positional integer indexing into a label-indexed Series and replaces a
    # full pass over the frame per category with a single vectorized lookup.
    # Rows whose category is missing map to NaN and are never kept.
    prob_mask = df['category'].map(category_probs).to_numpy(dtype=float)

    if random_state is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.default_rng(random_state).random(len(df))

    keep_mask = draws <= prob_mask

    return df[keep_mask].reset_index(drop=True)


## Cleaning Up Data

In [6]:
import string

from nltk.tokenize.regexp import WordPunctTokenizer

In [7]:
# Shared tokenizer instance used by cleanup_and_tokenize_text().
tokenizer = WordPunctTokenizer()

In [8]:
# Special token that replaces tokens occurring MIN_WORD_FREQ or fewer times
# in the entire corpus.
__LOW_FREQ_TOKEN__ = '__LOW_FREQ_TOKEN__'

# Frequency threshold: a token is kept only if its corpus count is strictly
# greater than this value.
MIN_WORD_FREQ = 5


In [9]:
def cleanup_and_tokenize_text(text):
    """Remove ASCII punctuation from ``text``, lowercase it and tokenize it.

    Args:
        text: The raw string to clean.

    Returns:
        The list of tokens produced by the module-level ``tokenizer``.
    """
    # Translation table that deletes every character in string.punctuation.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    normalized = text.translate(drop_punctuation).lower()
    return tokenizer.tokenize(normalized)


In [10]:
def tokenize_rows(df):
    """Tokenize the headline and short description of every row.

    Args:
        df: DataFrame with ``'headline'`` and ``'short_description'`` columns.

    Returns:
        A list with one entry per row: the headline tokens followed by the
        short-description tokens.
    """
    headline_tokens = df['headline'].apply(cleanup_and_tokenize_text).tolist()
    description_tokens = df['short_description'].apply(cleanup_and_tokenize_text).tolist()

    combined_rows = []
    for from_headline, from_description in zip(headline_tokens, description_tokens):
        combined_rows.append(from_headline + from_description)

    return combined_rows
    

In [11]:
def create_unigram_counts(rows, token_encoder=None):
    """Count how often each token occurs across all rows.

    Args:
        rows: Iterable of token lists (one list per document).
        token_encoder: Unused. Kept only so existing two-argument calls keep
            working; it now defaults to ``None`` because every caller in this
            notebook passes just ``rows``.

    Returns:
        A plain ``dict`` mapping token -> number of occurrences. Keys appear
        in first-seen order, which determines the ids assigned later by
        ``create_encoder_and_decoder``.
    """
    # Counter preserves insertion (first-seen) order; convert back to a plain
    # dict so a lookup of an unknown token still raises KeyError.
    return dict(Counter(token for tokens in rows for token in tokens))
    

In [12]:
def create_encoder_and_decoder(unigram_counts):
    """Build token <-> integer id lookup tables.

    Ids are assigned in the iteration order of ``unigram_counts.keys()``,
    starting at 0.

    Args:
        unigram_counts: Mapping whose keys are the vocabulary tokens.

    Returns:
        A tuple ``(encoder, decoder)`` where ``encoder`` maps token -> id and
        ``decoder`` maps id -> token.
    """
    encoder = {}
    for position, token in enumerate(unigram_counts.keys()):
        encoder[token] = position

    # Invert the encoder to get the reverse lookup.
    decoder = dict(zip(encoder.values(), encoder.keys()))

    return encoder, decoder
    

In [13]:
def create_bow_dataframe(encoded_token_rows, encoder, decoder):
    """Build a dense bag-of-words count table.

    Args:
        encoded_token_rows: Sequence of rows, each a sequence of integer
            token ids.
        encoder: Token -> id mapping; its length sets the number of columns.
        decoder: Id -> token mapping used to name the columns.

    Returns:
        A ``pd.DataFrame`` with one row per input row and one column per
        token, where each cell holds how often that token occurs in the row.
    """
    n_rows = len(encoded_token_rows)
    vocab_size = len(encoder)
    counts_matrix = np.zeros((n_rows, vocab_size))

    for row_idx, token_ids in enumerate(encoded_token_rows):
        for token_id in token_ids:
            counts_matrix[row_idx, token_id] = counts_matrix[row_idx, token_id] + 1

    # Column k is labelled with the token whose id is k.
    column_names = [decoder[token_id] for token_id in range(len(decoder))]

    frame = pd.DataFrame(data=counts_matrix)
    frame.columns = column_names

    return frame
    

In [14]:
def process_data(data, token_encoder=None, log=True, min_word_freq=None,
                 should_downsample=False, downsample_c=1):
    """Turn the raw DataFrame into a bag-of-words table plus labels.

    Steps: optional downsampling, label extraction, tokenization, unigram
    counting, optional low-frequency filtering, encoder/decoder creation,
    token encoding and bag-of-words construction.

    Args:
        data: DataFrame with ``'category'``, ``'headline'`` and
            ``'short_description'`` columns.
        token_encoder: Unused by the pipeline; forwarded to
            ``create_unigram_counts`` only for signature compatibility.
        log: When true, print a progress line for every step.
        min_word_freq: If not ``None``, every token whose corpus count is not
            strictly greater than this value is replaced by
            ``__LOW_FREQ_TOKEN__``. ``None`` disables the filtering.
        should_downsample: When true, rows are randomly dropped with
            ``downsample`` before any other processing.
        downsample_c: The ``c`` argument passed to ``downsample``.
            NOTE(review): the value used for the recorded run is not visible
            in this notebook; confirm the default.

    Returns:
        A tuple ``(data_bow, labels, encoder, decoder)``.
    """
    def print_if_logging(message):
        if log:
            print(message)

    print_if_logging('[1/8] Downsampling...')
    if should_downsample:
        data = downsample(data, downsample_c)
    else:
        print_if_logging('      Skipping downsampling')

    print_if_logging('[2/8] Generating labels...')
    # Labels are taken after downsampling and re-indexed 0..n-1 so they line
    # up positionally with the rows of the bag-of-words frame.
    labels = data['category'].reset_index(drop=True)

    print_if_logging('[3/8] Tokenizing rows...')
    token_rows = tokenize_rows(data)

    print_if_logging('[4/8] Generating global unigram count ...')
    unigram_counts = create_unigram_counts(token_rows, token_encoder)

    print_if_logging('[5/8] Filtering out low-frequency words (only small dataset) ...')
    if min_word_freq is not None:
        token_rows = [
            [
                token if unigram_counts[token] > min_word_freq else __LOW_FREQ_TOKEN__
                for token in tokens
            ]
            for tokens in token_rows
        ]
        # Recount so the vocabulary reflects the replaced tokens.
        unigram_counts = create_unigram_counts(token_rows, token_encoder)
    else:
        print_if_logging('      Skipping low-frequency filtering')

    print_if_logging('[6/8] Create encoder / decoder ...')
    encoder, decoder = create_encoder_and_decoder(unigram_counts)

    print_if_logging('[7/8] Encoding Token Rows ...')
    encoded_token_rows = [[encoder[t] for t in tokens] for tokens in token_rows]

    print_if_logging('[8/8] Creating Bag Of Words DataFrame ...')
    data_bow = create_bow_dataframe(encoded_token_rows, encoder, decoder)

    return data_bow, labels, encoder, decoder



In [15]:
# Time the construction of both dataset variants.
start_time = time.time()

# "Large" variant: no low-frequency filtering and no downsampling requested.
print('Processing large dataset...')
data_large, labels_large, encoder_large, decoder_large = process_data(data, min_word_freq=None, should_downsample=False)

# "Small" variant: low-frequency filtering at MIN_WORD_FREQ plus downsampling.
print('Processing small dataset...')
data_small, labels_small, encoder_small, decoder_small = process_data(data, min_word_freq=MIN_WORD_FREQ, should_downsample=True)

end_time = time.time()

print('Done!')
# Elapsed wall-clock time, reported in minutes.
print(f'Ran in {(end_time - start_time)/60:.02f}m')


Processing large dataset...
[1/7] Downsampling...
Skipping Downsampling
[2/8] Generating labels...
[3/8] Tokenizing rows...
[4/8] Generating global unigram count ...
[5/8] Filtering out low-frequency words (only small dataset) ...
Skipping low-frequency filtering
[6/8] Create encoder / decoder ...
[7/8] Encoding Token Rows ...
[8/8] Creating Bag Of Words DataFrame ...
Processing small dataset...
[1/7] Downsampling...
[2/8] Generating labels...
[3/8] Tokenizing rows...
[4/8] Generating global unigram count ...
[5/8] Filtering out low-frequency words (only small dataset) ...
[6/8] Create encoder / decoder ...
[7/8] Encoding Token Rows ...
[8/8] Creating Bag Of Words DataFrame ...
Done!
Ran in 0.45m


In [16]:
# Row counts of the two bag-of-words frames.
print(f'len(data_large) == {len(data_large)}')
print(f'len(data_small) == {len(data_small)}')
print()
# Column counts, i.e. the number of distinct tokens in each vocabulary.
print(f'len(data_large.columns) == {len(data_large.columns)}')
print(f'len(data_small.columns) == {len(data_small.columns)}')

len(data_large) == 160607
len(data_small) == 86654

len(data_large.columns) == 101082
len(data_small.columns) == 20959


## Pytorch Dataset

In [None]:
# TODO: HERE I AM CREATING TORCH DATASET!

class WorkTokenDataset(Dataset):
    """Torch ``Dataset`` wrapper around the raw DataFrame (work in progress).

    The constructor only stores its arguments; call ``prepare()`` to tokenize
    the rows and compute corpus-wide unigram counts.

    TODO: ``embeddings``, ``downsample_c`` and ``min_word_freq`` are stored
    but not applied yet, and ``__getitem__`` still returns raw rows rather
    than model-ready samples.
    """

    # Special token strings. NOTE(review): presumably placeholders for
    # unknown and low-frequency tokens; nothing in this class uses them yet.
    __TOKEN_UNK__ = '__TOKEN_UNK__'

    __TOKEN_LOW_FREQ__ = '__TOKEN_LOW_FREQ__'


    def __init__(self, data, embeddings, downsample_c=None, min_word_freq=0):
        """Store the inputs without doing any processing.

        Args:
            data: DataFrame holding the raw rows.
            embeddings: Embeddings object (stored as-is).
            downsample_c: Downsampling parameter (stored as-is).
            min_word_freq: Minimum word-frequency threshold (stored as-is).
        """
        super().__init__()

        self.data = data
        self.embeddings = embeddings
        self.downsample_c = downsample_c
        self.min_word_freq = min_word_freq

        # Filled in by prepare().
        self.tokenized_rows = None
        self.unigram_counts = None

        self._is_prepared = False


    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)


    def __getitem__(self, idx):
        """Return the row(s) of the wrapped DataFrame at position ``idx``.

        NOTE(review): the intended sample format is not shown in this
        notebook; returning the raw selection replaces the previous implicit
        ``None``.
        """
        sub_data = self.data.iloc[idx]
        return sub_data


    def prepare(self):
        """Tokenize all rows and compute corpus-wide unigram counts.

        The results are stored on ``self.tokenized_rows`` and
        ``self.unigram_counts``, and ``self._is_prepared`` is set to ``True``.
        """
        tokenized_rows = tokenize_rows(self.data)
        # Count over the tokenized rows: iterating the raw DataFrame would
        # yield column names, not tokens. The second argument is unused by
        # create_unigram_counts and is passed for signature compatibility.
        unigram_counts = create_unigram_counts(tokenized_rows, None)

        self.tokenized_rows = tokenized_rows
        self.unigram_counts = unigram_counts
        self._is_prepared = True
        
    