In [46]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

import dask
from dask.distributed import Client
import dask.dataframe as dd

In [47]:
client = Client(n_workers=4)

### Loading the Data

In [6]:
# Read in data
data = './data/ag_news_csv/train.csv'

train_df = dd.read_csv(
    data,
    header=None,
    names=['class', 'title', 'description'],
)
train_df

Unnamed: 0_level_0,class,title,description
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,int64,object,object
,...,...,...


In [7]:
# concatenate column 1 and 2 as one text
train_df['text'] = train_df.title + train_df.description
train_df = train_df.drop(['title', 'description'], axis=1)

In [8]:
train_df

Unnamed: 0_level_0,class,text
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
,int64,object
,...,...


### Tokenizing the Text

In [25]:
texts = train_df.text.str.lower().to_dask_array(lengths=True).rechunk((2000,)) # text values as an array

print(f"Length of texts: {len(texts.compute())}")
print(f"First two text excerpts:\n{texts.compute()[:2]}")

Length of texts: 120000
First two text excerpts:
0    wall st. bears claw back into the black (reute...
1    carlyle looks toward commercial aerospace (reu...
Name: text, dtype: object


In [26]:
# Initialize and train Tokenizer on text
# The Tokenizer at a character level will detect all unique characters that exist on the training dataset (texts)
# When not at char_level, it does this on all unique words
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(texts.compute())

# This generates a character dictionary learned from the training data
tk.word_index

{'UNK': 1,
 ' ': 2,
 'e': 3,
 'a': 4,
 't': 5,
 'i': 6,
 's': 7,
 'o': 8,
 'n': 9,
 'r': 10,
 'l': 11,
 'd': 12,
 'h': 13,
 'c': 14,
 'u': 15,
 'p': 16,
 'm': 17,
 'g': 18,
 'f': 19,
 'y': 20,
 'w': 21,
 'b': 22,
 '.': 23,
 'v': 24,
 'k': 25,
 ',': 26,
 '-': 27,
 ';': 28,
 '3': 29,
 '0': 30,
 'x': 31,
 '9': 32,
 'j': 33,
 'q': 34,
 '#': 35,
 '1': 36,
 '(': 37,
 ')': 38,
 '2': 39,
 "'": 40,
 'z': 41,
 '\\': 42,
 '&': 43,
 ':': 44,
 '/': 45,
 '5': 46,
 '4': 47,
 '6': 48,
 '"': 49,
 '7': 50,
 '$': 51,
 '8': 52,
 '=': 53,
 '?': 54,
 '!': 55,
 '_': 56,
 '*': 57}

In [27]:
# Although we generated a vocabulary already, we already have an existing character list:
alphabet="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1
    
char_dict

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 '-': 60,
 ',': 38,
 ';': 39,
 '.': 40,
 '!': 41,
 '?': 42,
 ':': 43,
 "'": 44,
 '"': 45,
 '/': 46,
 '\\': 47,
 '|': 48,
 '_': 49,
 '@': 50,
 '#': 51,
 '$': 52,
 '%': 53,
 '^': 54,
 '&': 55,
 '*': 56,
 '~': 57,
 '`': 58,
 '+': 59,
 '=': 61,
 '<': 62,
 '>': 63,
 '(': 64,
 ')': 65,
 '[': 66,
 ']': 67,
 '{': 68,
 '}': 69}

In [28]:
# Now, we just need to add the 'UNK' character to the vocabulary

tk.word_index = char_dict   # assign Tokenizer's word index to our custom index
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1   # append 'UNK' to be the next sequential value of the char dict

### Converting Characters to Index

In this step, the goal is to represent all text by using its character index according to the char_dict we set earlier

In [33]:
# This can be accomplished using tk.texts_to_sequences()
sequences = tk.texts_to_sequences(texts)

# notice how a string has been converted to a character array, where each value is the char_dict
# value corresponding to that character key
print(texts[0])
print(sequences[0])

wall st. bears claw back into the black (reuters)reuters - short-sellers, wall street's dwindling\band of ultra-cynics, are seeing green again.
[23, 1, 12, 12, 70, 19, 20, 40, 70, 2, 5, 1, 18, 19, 70, 3, 12, 1, 23, 70, 2, 1, 3, 11, 70, 9, 14, 20, 15, 70, 20, 8, 5, 70, 2, 12, 1, 3, 11, 70, 64, 18, 5, 21, 20, 5, 18, 19, 65, 18, 5, 21, 20, 5, 18, 19, 70, 60, 70, 19, 8, 15, 18, 20, 60, 19, 5, 12, 12, 5, 18, 19, 38, 70, 23, 1, 12, 12, 70, 19, 20, 18, 5, 5, 20, 44, 19, 70, 4, 23, 9, 14, 4, 12, 9, 14, 7, 47, 2, 1, 14, 4, 70, 15, 6, 70, 21, 12, 20, 18, 1, 60, 3, 25, 14, 9, 3, 19, 38, 70, 1, 18, 5, 70, 19, 5, 5, 9, 14, 7, 70, 7, 18, 5, 5, 14, 70, 1, 7, 1, 9, 14, 40]


### Padding

As you might imagine, all the texts have different lengths, and they must be normalized so that the CNN can handle the batch data. Notice that our `char_dict` starts at 1, not 0. This is because 0 will serve as our meaningless padding value.

In [12]:
data = pad_sequences(
    sequences,                                  # sequences to be padded
    maxlen=max([len(i) for i in sequences]),    # get max length of all sequences
    padding='post'                              # pad sequences on the right end
)

In [13]:
print(sequences[0][:160], "\n")
print(data[0][:160])

[23, 1, 12, 12, 70, 19, 20, 40, 70, 2, 5, 1, 18, 19, 70, 3, 12, 1, 23, 70, 2, 1, 3, 11, 70, 9, 14, 20, 15, 70, 20, 8, 5, 70, 2, 12, 1, 3, 11, 70, 64, 18, 5, 21, 20, 5, 18, 19, 65, 18, 5, 21, 20, 5, 18, 19, 70, 60, 70, 19, 8, 15, 18, 20, 60, 19, 5, 12, 12, 5, 18, 19, 38, 70, 23, 1, 12, 12, 70, 19, 20, 18, 5, 5, 20, 44, 19, 70, 4, 23, 9, 14, 4, 12, 9, 14, 7, 47, 2, 1, 14, 4, 70, 15, 6, 70, 21, 12, 20, 18, 1, 60, 3, 25, 14, 9, 3, 19, 38, 70, 1, 18, 5, 70, 19, 5, 5, 9, 14, 7, 70, 7, 18, 5, 5, 14, 70, 1, 7, 1, 9, 14, 40] 

[23  1 12 12 70 19 20 40 70  2  5  1 18 19 70  3 12  1 23 70  2  1  3 11
 70  9 14 20 15 70 20  8  5 70  2 12  1  3 11 70 64 18  5 21 20  5 18 19
 65 18  5 21 20  5 18 19 70 60 70 19  8 15 18 20 60 19  5 12 12  5 18 19
 38 70 23  1 12 12 70 19 20 18  5  5 20 44 19 70  4 23  9 14  4 12  9 14
  7 47  2  1 14  4 70 15  6 70 21 12 20 18  1 60  3 25 14  9  3 19 38 70
  1 18  5 70 19  5  5  9 14  7 70  7 18  5  5 14 70  1  7  1  9 14 40  0
  0  0  0  0  0  0  0  0  0  0  0  0  

In [14]:
# Now that our data has been normalized, we can convert the 2D list to a numpy array
data = np.array(data)
data.shape

(120000, 1011)

### Obtaining the Labels

In this step, we're going to make our labels begin as 0-index (since they start with 1, currently). Afterwards, because this is a multiclass classifier task, we need the classes to be one-hot encoded. There are 4 classes, so 4 class columns will be created; a column value will be 1 if that column matches that sample's class, and 0 otherwise.

In [22]:
class_list = [ x-1 for x in train_df['class'].values ]    # make classes start with 0 index

In [24]:
from tensorflow.keras.utils import to_categorical

classes = to_categorical(class_list)

In [32]:
class_list[75:80]   # note the classes...

[2, 2, 2, 3, 3]

In [33]:
classes[75:80]  # now note where the 1 value is...

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

In [35]:
train_df

Unnamed: 0,class,text
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."
...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...
119996,2,Renteria signing a top-shelf dealRed Sox gener...
119997,2,Saban not going to Dolphins yetThe Miami Dolph...
119998,2,Today's NFL gamesPITTSBURGH at NY GIANTS Time:...


In [48]:
client.shutdown()

2022-06-13 16:19:11,331 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
