# Data Cleanup

The purpose of this notebook is to:

1. Clean up the data so it can be readily used by the models.
2. Split the data into training and testing sets so the different models have a consistent benchmark.
3. Create a mini version of the training set for models that are more computationally limited.

**NOTE: This code uses pseudo-random operations, so running this script will generate different data on each run.**

In [56]:
import io
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split


## Load the Data

In [57]:
# The dataset is stored as JSON Lines (one JSON object per line), which is not
# a single valid JSON document. Join the records with commas and wrap them in
# brackets so that pandas can ingest the whole thing as one JSON array.
#
# The encoding is pinned so the read does not depend on the platform default,
# and blank lines are skipped so a stray empty line cannot produce an invalid
# ",," or trailing-comma sequence in the joined string.

with open('./data/News_Category_Dataset_v2.json', encoding='utf-8') as file:
    lines = [line for line in file if line.strip()]
    json = f'[{",".join(lines)}]'


In [58]:
# Passing a literal JSON string to read_json is deprecated (pandas 2.1+), so
# the string built above is wrapped in a file-like buffer.
data = pd.read_json(io.StringIO(json), orient='records')

## Downsampling

In [59]:
def downsample(df, c):
    """Randomly thin out over-represented categories.

    Each row is kept with probability ``(min_count / category_count) ** (1 / c)``,
    where ``min_count`` is the size of the smallest category. Rows of the
    smallest category are therefore always kept; ``c=1`` targets equal category
    sizes, and larger ``c`` values downsample less aggressively.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'category'`` column.
    c : int or float
        Non-zero softening exponent (see above).

    Returns
    -------
    pandas.DataFrame
        The kept rows of ``df``, in their original order and with their
        original index labels. The selection uses ``np.random.rand``, so the
        result changes between runs unless NumPy's global seed is fixed.
    """
    category_counts = df['category'].value_counts()
    min_count = category_counts.min()

    # Probability of keeping a row of a given category, indexed by category label.
    category_probs = (min_count / category_counts) ** (1 / c)

    # Look up every row's keep probability by its category *label*. Indexing
    # category_probs with an integer position is deprecated in pandas because
    # the Series is indexed by category name, not by position. Rows whose
    # category is missing map to NaN and are never kept (comparison is False).
    keep_probs = df['category'].map(category_probs).to_numpy(dtype=float)

    keep_mask = np.random.rand(len(df)) <= keep_probs

    return df[keep_mask]


In [60]:
# Thin out over-represented categories with exponent c=3. Note that `data` is
# overwritten, so re-running this cell downsamples the already-reduced frame again.
data = downsample(data, c=3)

## Cleaning Up Data

In [61]:
import string

from nltk.tokenize.regexp import WordPunctTokenizer

In [62]:
# Shared tokenizer instance used by cleanup_and_tokenize_text.
tokenizer = WordPunctTokenizer()

In [63]:
# Placeholder that replaces every token occurring MIN_WORD_FREQ or fewer
# times in the entire corpus.
__LOW_FREQ_TOKEN__ = '__LOW_FREQ_TOKEN__'

# A token must occur strictly more than this many times to keep its own identity.
MIN_WORD_FREQ = 5


In [64]:
def cleanup_and_tokenize_text(text):
    """Strip ASCII punctuation from ``text``, lowercase it, and tokenize it.

    Returns the list of tokens produced by the module-level ``tokenizer``.
    """
    kept_chars = [ch for ch in text if ch not in string.punctuation]
    normalized = ''.join(kept_chars).lower()
    return tokenizer.tokenize(normalized)


In [65]:
def tokenize_rows(df):
    """Tokenize every row of ``df`` into a single list of tokens.

    For each row, the tokens of the ``'headline'`` column come first, followed
    by the tokens of the ``'short_description'`` column. Returns a list with
    one token list per row, in row order.
    """
    headlines = df['headline'].apply(cleanup_and_tokenize_text).tolist()
    descriptions = df['short_description'].apply(cleanup_and_tokenize_text).tolist()

    combined = []
    for headline_tokens, description_tokens in zip(headlines, descriptions):
        combined.append(headline_tokens + description_tokens)
    return combined
    

In [66]:
def create_unigram_counts(rows):
    """Count how often each token occurs across all rows.

    ``rows`` is an iterable of token lists. Returns a plain dict mapping each
    token to its total number of occurrences; keys appear in first-seen order.
    """
    frequency = {}

    for row in rows:
        for word in row:
            frequency[word] = frequency.get(word, 0) + 1

    return frequency
    

In [67]:
def create_encoder_and_decoder(unigram_counts):
    """Build token <-> integer id lookup tables.

    Ids are assigned 0, 1, 2, ... in the iteration order of
    ``unigram_counts.keys()``. Returns ``(encoder, decoder)`` where ``encoder``
    maps token -> id and ``decoder`` maps id -> token.
    """
    encoder = {}
    decoder = {}

    for token_id, token in enumerate(unigram_counts.keys()):
        encoder[token] = token_id
        decoder[token_id] = token

    return encoder, decoder
    

In [68]:
def create_bow_dataframe(encoded_token_rows, encoder, decoder):
    """Build a dense bag-of-words DataFrame from encoded token rows.

    Row ``i`` of the result holds, for every token id, how many times that id
    appears in ``encoded_token_rows[i]``. Columns are named with the decoded
    tokens, ordered by id ``0 .. len(decoder) - 1``. Counts are stored as floats.
    """
    n_rows = len(encoded_token_rows)
    vocab_size = len(encoder)
    counts = np.zeros((n_rows, vocab_size))

    for row_idx, token_ids in enumerate(encoded_token_rows):
        for token_id in token_ids:
            counts[row_idx, token_id] += 1

    column_names = [decoder[token_id] for token_id in range(len(decoder))]

    return pd.DataFrame(data=counts, columns=column_names)
    

In [69]:
# End-to-end preprocessing: raw text -> token lists -> rare-token replacement
# -> integer ids -> bag-of-words matrix. Steps depend on each other's outputs,
# so they must run in this order.
start_time = time.time()

print('[1/7] Tokenizing rows ...')
token_rows = tokenize_rows(data)

print('[2/7] Generating global unigram count ...')
unigram_counts = create_unigram_counts(token_rows)

print('[3/7] Filtering out low-frequency words ...')
# Tokens seen MIN_WORD_FREQ times or fewer are replaced by the shared placeholder.
token_rows = [[token if unigram_counts[token] > MIN_WORD_FREQ else __LOW_FREQ_TOKEN__ for token in tokens] for tokens in token_rows]

print('[4/7] Re-computing unigram counts ...')
# Recount so the vocabulary contains the placeholder and no longer the rare tokens.
unigram_counts = create_unigram_counts(token_rows)

print('[5/7] Create encoder / decoder ...')
encoder, decoder = create_encoder_and_decoder(unigram_counts)

print('[6/7] Encoding Token Rows ...')
encoded_token_rows = [[encoder[t] for t in tokens] for tokens in token_rows]

print('[7/7] Creating Bag Of Words DataFrame ...')
# One row per entry of `data`, one column per vocabulary token.
data_bow = create_bow_dataframe(encoded_token_rows, encoder, decoder)

end_time = time.time()

print('Done!')
print(f'Ran in {(end_time - start_time)/60:.02f}m')


[1/7] Tokenizing rows ...
[2/7] Generating global unigram count ...
[3/7] Filtering out low-frequency words ...
[4/7] Re-computing unigram counts ...
[5/7] Create encoder / decoder ...
[6/7] Encoding Token Rows ...
[7/7] Creating Bag Of Words DataFrame ...
Done!
Ran in 0.22m


## Generating Datasets

In [70]:
# Labels are taken in the same row order as data_bow. The index is reset so the
# labels share data_bow's 0..n-1 index instead of carrying the gaps left behind
# by downsampling; this keeps features and labels aligned by index label as
# well as by position.
labels = data['category'].reset_index(drop=True)

# 60/40 train/test split. No random_state is passed, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(data_bow, labels, test_size=0.4)


In [71]:
# Create a smaller version of the training data
# for quicker training.

# Each training row is kept independently with probability sample_rate.
sample_rate = 0.6
sample_mask = np.random.rand(len(X_train)) < sample_rate

# drop=True discards the old index. Without it, reset_index() inserts the old
# index as an extra 'index' column, which adds a bogus feature to the mini set
# (and raises if 'index' is itself a vocabulary token). Building a new frame
# rather than using inplace=True also avoids mutating a slice of X_train.
X_train_mini = X_train[sample_mask].reset_index(drop=True)

# Reset the labels the same way so features and labels share one 0..m-1 index.
y_train_mini = y_train[sample_mask].reset_index(drop=True)


In [72]:
# Persist the three datasets (features + labels each) as pickles under ./data,
# timing the whole write.
start_time = time.time()

# Full training set.
print('[1/3] Saving training data...')
X_train.to_pickle('./data/train_data.pickle')
y_train.to_pickle('./data/train_labels.pickle')

# Reduced training set for compute-limited models.
print('[2/3] Saving mini training data...')
X_train_mini.to_pickle('./data/train_data_mini.pickle')
y_train_mini.to_pickle('./data/train_labels_mini.pickle')

# Held-out test set.
print('[3/3] Saving test data...')
X_test.to_pickle('./data/test_data.pickle')
y_test.to_pickle('./data/test_labels.pickle')

end_time = time.time()

print('Done!')
print(f'Ran in {(end_time - start_time)/60:.02f}m')


[1/3] Saving training data...
[2/3] Saving mini training data...
[3/3] Saving test data...
Done!
Ran in 2.14m


In [73]:
# Report the row counts of each saved dataset as a quick sanity check.
print(f'{len(X_train)} samples in training set.')

print(f'{len(X_train_mini)} samples in mini training set.')

print(f'{len(X_test)} samples in test set')


64722 samples in training set.
39041 samples in mini training set.
43148 samples in test set
