## Getting ready...

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator, pad_sequences, skipgrams, make_sampling_table
from tensorflow.keras.preprocessing.text import text_to_word_sequence, one_hot, hashing_trick, Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

2022-01-20 15:07:46.352544: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## How to do it...

### Sequence Processing

#### Time series generator

In [2]:
series = np.array([i for i in range(10)])
print(series)

[0 1 2 3 4 5 6 7 8 9]


In [11]:
generator = TimeseriesGenerator(data = series,
                               targets = series,
                               length=5,
                               batch_size=1,
                               shuffle=False,
                               reverse=False)

In [12]:
# number of samples
print(f'Samples: {len(generator)}')

Samples: 5


In [13]:
for i in range(len(generator)):
    x, y = generator[i]
    print(f'{x} => {y}')

[[0 1 2 3 4]] => [5]
[[1 2 3 4 5]] => [6]
[[2 3 4 5 6]] => [7]
[[3 4 5 6 7]] => [8]
[[4 5 6 7 8]] => [9]


In [14]:
model = Sequential()
model.add(Dense(10, activation='relu', input_dim=5))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

2022-01-20 15:15:15.650952: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2022-01-20 15:15:15.703441: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-20 15:15:15.703812: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:2d:00.0 name: GeForce RTX 2070 SUPER computeCapability: 7.5
coreClock: 1.785GHz coreCount: 40 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 417.29GiB/s
2022-01-20 15:15:15.703827: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2022-01-20 15:15:15.705428: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2022-01-20 15:15:15.705451: I tensorflow/stream_executor/plat

In [15]:
model.fit(generator, epochs=10)

2022-01-20 15:15:26.999346: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2022-01-20 15:15:27.019445: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 4200250000 Hz
2022-01-20 15:15:27.176421: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2022-01-20 15:15:27.445324: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11


<tensorflow.python.keras.callbacks.History at 0x7f6fe0e7ae50>

#### Padding sequences

In [16]:
sentences = [["What", "do", "you", "like", "?"],
            ["I", "like", "basket-ball", "!"],
            ["And", "you", "?"],
            ["I", "like", "coconut", "and", "apple"]]

In [17]:
text_set = set(np.concatenate(sentences))
vocab_to_int = dict(zip(text_set, range(len(text_set))))
int_to_vocab = {vocab_to_int[word]:word for word in vocab_to_int.keys()}

In [19]:
encoded_sentences = []
for sentence in sentences:
    encoded_sentence = [vocab_to_int[word] for word in sentence]
    encoded_sentences.append(encoded_sentence)
encoded_sentences

[[3, 11, 10, 0, 5], [4, 0, 6, 9], [7, 10, 5], [4, 0, 8, 1, 2]]

In [20]:
pad_sequences(encoded_sentences)

array([[ 3, 11, 10,  0,  5],
       [ 0,  4,  0,  6,  9],
       [ 0,  0,  7, 10,  5],
       [ 4,  0,  8,  1,  2]], dtype=int32)

In [22]:
pad_sequences(encoded_sentences, maxlen=7)

array([[ 0,  0,  3, 11, 10,  0,  5],
       [ 0,  0,  0,  4,  0,  6,  9],
       [ 0,  0,  0,  0,  7, 10,  5],
       [ 0,  0,  4,  0,  8,  1,  2]], dtype=int32)

In [23]:
pad_sequences(encoded_sentences, maxlen=3)

array([[10,  0,  5],
       [ 0,  6,  9],
       [ 7, 10,  5],
       [ 8,  1,  2]], dtype=int32)

In [24]:
pad_sequences(encoded_sentences, maxlen=3, truncating='post')

array([[ 3, 11, 10],
       [ 4,  0,  6],
       [ 7, 10,  5],
       [ 4,  0,  8]], dtype=int32)

#### Skip-grams

In [25]:
sentence = "I like coconut and apple"
encoded_sentence = [vocab_to_int[word] for word in sentence.split()]
vocabulary_size = len(encoded_sentence)

In [26]:
pairs, labels = skipgrams(encoded_sentence,
                         vocabulary_size,
                         window_size=1,
                         negative_samples=0)

In [27]:
for i in range(len(pairs)):
    print(f"({int_to_vocab[pairs[i][0]]}, {int_to_vocab[pairs[i][1]]}) -> {labels[i]}")

(apple, and) -> 1
(and, apple) -> 1
(coconut, and) -> 1
(and, coconut) -> 1


In [28]:
len(pairs)

4

### Text preprocessing

#### Split text to word sequence

In [29]:
sentence = "I like cocount, I like apple"

In [30]:
text_to_word_sequence(sentence, lower=False)

['I', 'like', 'cocount', 'I', 'like', 'apple']

In [31]:
text_to_word_sequence(sentence, lower=True, filters=[])

['i', 'like', 'cocount,', 'i', 'like', 'apple']

#### Tokenizer

In [32]:
sentences = [["What", "do", "you", "like", "?"],
            ["I", "like", "basket-ball", "!"],
            ["And", "you", "?"],
            ["I", "like", "coconut", "and", "apple"]]

In [34]:
# Create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(sentences)

In [35]:
print(t.word_counts)

OrderedDict([('what', 1), ('do', 1), ('you', 2), ('like', 3), ('?', 2), ('i', 2), ('basket-ball', 1), ('!', 1), ('and', 2), ('coconut', 1), ('apple', 1)])


In [36]:
print(t.document_count)

4


In [37]:
print(t.word_index)

{'like': 1, 'you': 2, '?': 3, 'i': 4, 'and': 5, 'what': 6, 'do': 7, 'basket-ball': 8, '!': 9, 'coconut': 10, 'apple': 11}


In [38]:
print(t.word_docs)

defaultdict(<class 'int'>, {'like': 3, '?': 2, 'what': 1, 'you': 2, 'do': 1, '!': 1, 'basket-ball': 1, 'i': 2, 'and': 2, 'apple': 1, 'coconut': 1})


In [39]:
t.texts_to_matrix(sentences, mode='binary')

array([[0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0.],
       [0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1.]])

In [40]:
t.texts_to_matrix(sentences, mode='count')

array([[0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0.],
       [0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1.]])

### Image preprocessing

In [41]:
# Load CIFAR10 Dataset
(x_cifar10_train, y_cifar10_train), (x_cifar10_test, y_cifar10_test) = tf.keras.datasets.cifar10.load_data()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [43]:
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=15,
    width_shift_range=3,
    height_shift_range=3,
    horizontal_flip=True)

In [44]:
it = datagen.flow(x_cifar10_train, y_cifar10_train, batch_size=32)

In [48]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding='same', activation='relu', input_shape=[32, 32, 3]),
    tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPool2D(pool_size=2),
    tf.keras.layers.Conv2D(filters=64, kernel_size=3, padding="same", activation='relu'),
    tf.keras.layers.Conv2D(filters=64, kernel_size=3, padding="same", activation='relu'),
    tf.keras.layers.MaxPool2D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy",
             optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
             metrics=['accuracy'])

In [49]:
history = model.fit(it, epochs=10,
                   steps_per_epoch=len(x_cifar10_train) / 32,
                   validation_data=(x_cifar10_test, y_cifar10_test))

Epoch 1/10


2022-01-20 16:00:14.664511: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2022-01-20 16:00:15.109255: I tensorflow/stream_executor/cuda/cuda_dnn.cc:359] Loaded cuDNN version 8100


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
