In [1]:
import datasets
import keras_nlp
import transformers
import numpy as np
import tensorflow as tf
import tqdm.notebook as tqdm
import sklearn.model_selection
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

from collections.abc import Callable

2024-09-28 14:46:12.258397: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-28 14:46:12.264303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-28 14:46:12.273148: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-28 14:46:12.275573: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-28 14:46:12.281785: I tensorflow/core/platform/cpu_feature_guar

In [2]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except:
        pass

I0000 00:00:1727498774.815794   67784 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1727498774.817850   67784 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1727498774.818880   67784 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [3]:
train = datasets.load_dataset('wangrongsheng/ag_news', split='train')
test  = datasets.load_dataset('wangrongsheng/ag_news', split='test')

In [5]:
# tokenizer = keras_nlp.models.GPT2Tokenizer.from_preset("gpt2_base_en")
tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')

In [6]:
tokenizer.pad_token = tokenizer.eos_token

In [7]:
class LSTM(tf.keras.layers.Layer):
    n_features: int = 0
    def  __init__(self, n_features: int, name: str, **kwargs):
        super().__init__(name=name, **kwargs)
        self.n_features = n_features
        self.dense_1 = tf.keras.layers.Dense(n_features, activation='sigmoid', name=f'{name}_dense1')
        self.dense_2 = tf.keras.layers.Dense(n_features, activation='sigmoid', name=f'{name}_dense2')
        self.dense_3 = tf.keras.layers.Dense(n_features, activation='sigmoid', name=f'{name}_dense3')
        self.dense_4 = tf.keras.layers.Dense(n_features, activation='tanh', name=f'{name}_dense4')

    def call(self, x: tuple[tf.Tensor, tuple[tf.Tensor, tf.Tensor]]) -> tuple[tf.Tensor, tuple[tf.Tensor, tf.Tensor]]: # x + state
        x, (h, c) = x
        x_h = tf.keras.ops.concatenate([x, h], axis=1)
        c = tf.keras.ops.add(
            tf.keras.ops.multiply(self.dense_1(x_h), c), 
            tf.keras.ops.multiply(self.dense_2(x_h), self.dense_4(x_h))
        ) # c_new
        h = tf.keras.ops.multiply(tf.keras.ops.tanh(c), self.dense_3(x_h))
        return h, (h, c)

    def get_config(self) -> dict:
        res = super().get_config()
        res['n_features'] = self.n_features
        return res

In [8]:
class GRU(tf.keras.layers.Layer):
    n_features: int = 0
    def  __init__(self, n_features: int, name: str, **kwargs):
        super().__init__(name=name, **kwargs)
        self.n_features = n_features
        self.dense_1 = tf.keras.layers.Dense(n_features, activation='sigmoid', name=f'{name}_dense1')
        self.dense_2 = tf.keras.layers.Dense(n_features, activation='sigmoid', name=f'{name}_dense2')
        self.dense_3 = tf.keras.layers.Dense(n_features, activation='tanh', name=f'{name}_dense3')

    def call(self, x: tuple[tf.Tensor, tuple[tf.Tensor]]) -> tuple[tf.Tensor, tuple[tf.Tensor]]: # x + state
        x, (h,) = x
        x_h = tf.keras.ops.concatenate([x, h], axis=1)
        r = tf.keras.ops.concatenate(
                [tf.keras.ops.multiply(self.dense_1(x_h), h), x],
                axis = 1
        )
        r = self.dense_3(r) 

        z = self.dense_2(x_h)
        h_hat = tf.keras.ops.multiply(z,r)
        z = tf.keras.ops.multiply(
                tf.keras.ops.subtract(tf.keras.ops.ones_like(z), z),
                h
            )
        h = tf.keras.ops.add(z, h_hat)

        return h, (h,)
        

    def get_config(self) -> dict:
        res = super().get_config()
        res['n_features'] = self.n_features
        return res

In [9]:
def make_get_state_LSTM(n_features: int) -> Callable[[int, tf.DType], tuple[tf.Tensor, ...]]:
    @tf.function
    def get_state(batch_size: int, dtype: tf.DType) -> tuple[tf.Tensor, tf.Tensor]:
        return tf.zeros([batch_size, n_features], dtype=dtype), tf.zeros([batch_size, n_features], dtype=dtype)
    return get_state    
    
def make_get_state_GRU(n_features: int) -> Callable[[int, tf.DType], tuple[tf.Tensor, ...]]:
    @tf.function
    def get_state(batch_size: int, dtype: tf.DType) -> tuple[tf.Tensor]:
        return tf.zeros([batch_size, n_features], dtype=dtype), 
    return get_state            

def apply_rnn(x: tf.Tensor, model: tf.keras.Model, gen_state: Callable[[int, tf.DType], tuple[tf.Tensor, ...]]) -> tf.Tensor:
    n = tf.shape(x)[1]
    c = lambda i, _: tf.less(i, n)

    state = gen_state(tf.shape(x)[0], x.dtype)
    b = lambda i, state: (i + 1, model((x[:, i], state))[1])
    i = tf.constant(0)
    return tf.while_loop(c, b, [i, state])[1]

@tf.function
def apply_rnn(x: tf.Tensor, model: tf.keras.Model, gen_state: Callable[[int, tf.DType], tuple[tf.Tensor, ...]]) -> tf.Tensor:
    i = tf.constant(0)
    state = gen_state(tf.shape(x)[0], x.dtype)
    n = tf.shape(x)[1]
    while i < n:
        state = model((x[:, i], state))[1]
        i += 1
    return state

# цикл обучения с tf gradient tape 

In [10]:
def get_accuracy(
    data: datasets.Dataset,
    embedding: tf.keras.layers.Layer,
    rnn: tf.keras.layers.Layer,
    prediction: tf.keras.layers.Layer,
    batch_size: int = 64,
    leave: bool = True
) -> float:
    accuracy = 0
    for i in tqdm.trange(0, len(data), batch_size, leave=leave):
        data_slice = data[i:i + batch_size]
        X, y = data_slice['text'], tf.constant(data_slice['label'])
        x = tokenizer(X, return_tensors='tf', padding='max_length', max_length=64, truncation=True)['input_ids']
        
        x = embedding(x)
        x = apply_rnn(x, rnn, gen_state)[0]
        x = prediction(x)
    
        y_pred = tf.keras.ops.argmax(x, axis=-1)
        accuracy += tf.keras.ops.mean(y == y_pred)
    
    return (accuracy / len(range(0, len(data), batch_size))).numpy()

In [None]:
# gen = np.random.default_rng()
# a = gen.uniform(0.0, 1.0, (2, 8, 64)).astype(np.float32)
# a = tf.constant(a)

# #model  = LSTM(64, 'first_lstm')
# #apply_rnn(a, model, make_get_state_LSTM(64))

# model  = GRU(64, 'first_lstm')
# apply_rnn(a, model, make_get_state_GRU(64))

In [11]:
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

I0000 00:00:1727498812.338173   67784 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1727498812.339773   67784 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1727498812.340729   67784 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1727498812.439272   67784 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

In [12]:
embedding = tf.keras.layers.Embedding(50257, 64, name='embedding')
# rnn = LSTM(64, 'lstm_news')
rnn = GRU(64, 'gru_news')
# gen_state = make_get_state_LSTM(64)
gen_state = make_get_state_GRU(64)
prediction = tf.keras.layers.Dense(4, name='prediction')

In [13]:
x = tokenizer(train['text'][10:14], return_tensors='tf', padding='max_length', max_length=64, truncation=True)['input_ids']
x = embedding(x)
x = apply_rnn(x, rnn, gen_state)[0]
x = prediction(x)
# loss = loss_function(y, x)
# x = tf.keras.ops.argmax(x, axis=-1)

''+ptx85+ptx85' is not a recognized feature for this target' is not a recognized feature for this target (ignoring feature)
 (ignoring feature)
''+ptx85+ptx85' is not a recognized feature for this target' is not a recognized feature for this target (ignoring feature)
 (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring f

In [14]:
rng = np.random.default_rng()

In [113]:
loss

<tf.Tensor: shape=(), dtype=float32, numpy=1.3876143>

In [15]:
pbar = tqdm.trange(8)
for _ in pbar:
    pbar_epoch = tqdm.trange(1800, leave=False)
    for _ in pbar_epoch:
        ind = rng.choice(len(train), size=64, replace=False)
        data = train[ind]
        X, y = data['text'], tf.constant(data['label'])
        x = tokenizer(X, return_tensors='tf', padding='max_length', max_length=64, truncation=True)['input_ids']

        with tf.GradientTape() as tape:
            x = embedding(x)
            x = apply_rnn(x, rnn, gen_state)[0]
            x = prediction(x)
            loss = loss_function(y, x)

        g = tape.gradient(loss, (*embedding.trainable_weights, *rnn.trainable_weights, *prediction.trainable_weights))
        optimizer.apply_gradients(zip(g, (*embedding.trainable_weights, *rnn.trainable_weights, *prediction.trainable_weights)))

        pbar_epoch.set_description(f'loss: {tf.math.reduce_mean(loss):.3f}')

    pbar.set_description(f'accuracy: {get_accuracy(test, embedding, rnn, prediction, leave=False):.3f}')

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring feature)
'+ptx85' is not a recognized feature for this target (ignoring f

  0%|          | 0/119 [00:00<?, ?it/s]

  0%|          | 0/1800 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [16]:
get_accuracy(test, embedding, rnn, prediction, leave=False)

  0%|          | 0/119 [00:00<?, ?it/s]

0.91268384

In [17]:
get_accuracy(train, embedding, rnn, prediction)

  0%|          | 0/1875 [00:00<?, ?it/s]

0.94794166