<a href="https://colab.research.google.com/github/allanbatista/classificacao-de-produtos-no-e-commerce/blob/master/codigo/notebooks/Distor%C3%A7%C3%A3o_de_Texto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Distorção de Texto

Antes dos textos serem injetados no modelo, eles passam por dois processos de distorção:

* remoção de palavras;
* troca de posição de palavras

### Criação dos dados de Exemplo

In [21]:
def format_print(sequences):
  for sequence in sequences.numpy():
    print(*[str(x.round(2)).ljust(5) for x in sequence.reshape(-1)])

In [22]:
import tensorflow as tf

sequences = tf.stack(
  [
    tf.pad(tf.random.uniform(shape=[8, 1], minval=0, maxval=10), [[0, 0], [0, 0]]),
    tf.pad(tf.random.uniform(shape=[7, 1], minval=0, maxval=10), [[0, 1], [0, 0]]),
    tf.pad(tf.random.uniform(shape=[6, 1], minval=0, maxval=10), [[0, 2], [0, 0]]),
    tf.pad(tf.random.uniform(shape=[5, 1], minval=0, maxval=10), [[0, 3], [0, 0]])
  ],
  axis=0
)

format_print(sequences)

1.88  7.14  6.06  3.7   7.49  7.27  6.07  9.24 
4.55  4.43  7.54  9.8   6.89  4.98  1.73  0.0  
8.54  1.15  2.13  9.12  4.42  7.33  0.0   0.0  
5.31  0.14  0.32  5.15  4.26  0.0   0.0   0.0  


### Definição das funções de Distorção do texto

* **uniform_random_drop_sequence_fn**: cria uma função para remover uma palavra de uma sequência simples.
* **uniform_random_swap_sequence_fn**: cria uma função para trocar aposição de duas palavras em uma sequência simples.
* **uniform_random_swap_sequences**: faz a troca de palavras de multiplas sequencias em paralelo.
* **uniform_random_drop_sequences**: remove palavras de multiplas sequencias em paralelo.
* **uniform_distorce_sequences**: Faz a distorção tanto de remoção quanto de troca de posiçãão de palavras ao mesmo tempo em multiplas sequencias em paralelo.

In [28]:
import tensorflow as tf


def uniform_random_drop_sequence_fn(sequence_size: int, minrate: float = 0.1, maxrate: float = 0.1, skip_rate: float = 0.5):
    """
    essa irá remover N elementos da sequencia e irá substituir por um novo elemento (zero) no final da sequencia
    """

    rate = tf.random.uniform(shape=[], minval=minrate, maxval=maxrate)
    min_quantity = tf.constant(1, dtype=tf.float32)
    skip_rate = tf.constant(skip_rate, dtype=tf.float32)

    def _uniform_random_drop_sequence(sequence):
        if tf.random.uniform(shape=[], minval=0.0, maxval=1.0) < skip_rate:
            return sequence

        reduce_sum = tf.reduce_sum(sequence, axis=1)
        sequence_clean = tf.boolean_mask(sequence, tf.cast(reduce_sum, dtype=tf.bool))
        total_nonzero = tf.math.count_nonzero(reduce_sum)

        # too low quantity words. should skip
        if total_nonzero <= 4:
            return sequence

        drop_quantity = tf.math.floor(tf.multiply(tf.cast(total_nonzero, dtype=tf.float32), rate))
        drop_quantity = tf.reduce_max([drop_quantity, min_quantity])
        drop_quantity = tf.cast(drop_quantity, dtype=tf.int32)

        drop_mask_mask = tf.argsort(tf.random.uniform([total_nonzero])) >= drop_quantity

        sequence_dropped = tf.boolean_mask(sequence_clean, drop_mask_mask)

        right_pad_size = sequence_size - tf.math.count_nonzero(tf.reduce_sum(sequence_dropped, axis=1))
        right_pad = tf.zeros((right_pad_size, sequence.get_shape()[1]))

        return tf.concat([sequence_dropped, right_pad], axis=0)

    return _uniform_random_drop_sequence

def uniform_random_swap_sequence_fn(sequence_size: int, minrate: float = 0.1, maxrate: float = 0.1, skip_rate: float = 0.5):
    rate = tf.random.uniform(shape=[], minval=minrate, maxval=maxrate)
    min_quantity = tf.constant(1, dtype=tf.float32)
    skip_rate = tf.constant(skip_rate, dtype=tf.float32)

    def _uniform_random_swap_sequence(sequence):
        if tf.random.uniform(shape=[], minval=0.0, maxval=1.0) < skip_rate:
          return sequence

        reduce_sum = tf.reduce_sum(sequence, axis=1)
        total_nonzero = tf.math.count_nonzero(reduce_sum)

        # too low quantity words. should skip
        if total_nonzero <= 2:
            return sequence

        swap_quantity = tf.math.floor(tf.multiply(tf.cast(total_nonzero, dtype=tf.float32), rate))
        swap_quantity = tf.reduce_max([swap_quantity, min_quantity])
        swap_quantity = tf.cast(swap_quantity, dtype=tf.int32)

        indexies_to_swap = tf.argsort(tf.random.uniform([total_nonzero - 1]))[:swap_quantity]

        indexies = tf.range(sequence_size)
        indexies = tf.tensor_scatter_nd_update(indexies, tf.reshape(indexies_to_swap, (-1, 1)), indexies_to_swap + 1)
        indexies = tf.tensor_scatter_nd_update(indexies, tf.reshape(indexies_to_swap + 1, (-1, 1)), indexies_to_swap)

        return tf.gather(sequence, indexies)

    return _uniform_random_swap_sequence


def uniform_random_swap_sequences(sequences, minrate: float = 0.1, maxrate: float = 0.1, skip_rate: float = 0.5):
    map_fn = uniform_random_swap_sequence_fn(sequence_size=sequences.shape[1], minrate=minrate, maxrate=maxrate, skip_rate=skip_rate)
    return tf.map_fn(map_fn, sequences, dtype=sequences.dtype, parallel_iterations=sequences.shape[0])


def uniform_random_drop_sequences(sequences, minrate: float = 0.1, maxrate: float = 0.1, skip_rate: float = 0.5):
    map_fn = uniform_random_drop_sequence_fn(sequence_size=sequences.shape[1], minrate=minrate, maxrate=maxrate, skip_rate=skip_rate)
    return tf.map_fn(map_fn, sequences, dtype=sequences.dtype, parallel_iterations=sequences.shape[0])


def uniform_distorce_sequences(sequences,
                               drop_minrate=0.1, drop_maxrate=0.1,
                               swap_minrate=0.1, swap_maxrate=0.1,
                               drop_skip_rate=0.5, swap_skip_rate=0.5):
    _sequences = uniform_random_drop_sequences(sequences, minrate=drop_minrate, maxrate=drop_maxrate, skip_rate=drop_skip_rate)
    _sequences = uniform_random_swap_sequences(_sequences, minrate=swap_minrate, maxrate=swap_maxrate, skip_rate=swap_skip_rate)
    return _sequences

#### Exemplo de remoção de palavras

In [24]:
for rate in tf.range(0.0, 1, 0.25):
  print(f"rate={rate}")
  format_print(uniform_random_drop_sequences(sequences, minrate=rate, maxrate=rate, skip_rate=0.0))

rate=0.0
1.88  7.14  3.7   7.49  7.27  6.07  9.24  0.0  
4.43  7.54  9.8   6.89  4.98  1.73  0.0   0.0  
8.54  1.15  2.13  9.12  7.33  0.0   0.0   0.0  
5.31  0.14  0.32  5.15  0.0   0.0   0.0   0.0  
rate=0.25
1.88  3.7   7.49  7.27  6.07  9.24  0.0   0.0  
4.55  4.43  7.54  9.8   4.98  1.73  0.0   0.0  
8.54  1.15  9.12  4.42  7.33  0.0   0.0   0.0  
0.14  0.32  5.15  4.26  0.0   0.0   0.0   0.0  
rate=0.5
1.88  3.7   7.27  6.07  0.0   0.0   0.0   0.0  
4.55  4.43  7.54  1.73  0.0   0.0   0.0   0.0  
8.54  9.12  4.42  0.0   0.0   0.0   0.0   0.0  
5.31  0.14  0.32  0.0   0.0   0.0   0.0   0.0  
rate=0.75
7.27  6.07  0.0   0.0   0.0   0.0   0.0   0.0  
4.43  6.89  0.0   0.0   0.0   0.0   0.0   0.0  
8.54  2.13  0.0   0.0   0.0   0.0   0.0   0.0  
5.31  5.15  0.0   0.0   0.0   0.0   0.0   0.0  


#### Exemplo troca de palavras de posição

In [25]:
for rate in tf.range(0.0, 1, 0.25):
  print(f"rate={rate}")
  format_print(uniform_random_swap_sequences(sequences, minrate=rate, maxrate=rate, skip_rate=0.0))

rate=0.0
1.88  7.14  6.06  3.7   7.49  6.07  7.27  9.24 
4.55  4.43  7.54  9.8   6.89  1.73  4.98  0.0  
8.54  1.15  2.13  9.12  7.33  4.42  0.0   0.0  
5.31  0.14  5.15  0.32  4.26  0.0   0.0   0.0  
rate=0.25
1.88  6.06  7.14  3.7   7.49  6.07  7.27  9.24 
4.55  4.43  7.54  9.8   6.89  1.73  4.98  0.0  
1.15  8.54  2.13  9.12  4.42  7.33  0.0   0.0  
5.31  0.14  0.32  4.26  5.15  0.0   0.0   0.0  
rate=0.5
7.14  1.88  7.14  6.06  7.27  7.49  6.07  9.24 
4.43  4.55  7.54  6.89  9.8   6.89  1.73  0.0  
1.15  8.54  1.15  4.42  9.12  7.33  0.0   0.0  
5.31  0.14  5.15  0.32  5.15  0.0   0.0   0.0  
rate=0.75
7.14  1.88  7.14  6.06  3.7   7.49  7.27  9.24 
4.43  4.55  4.43  7.54  9.8   6.89  1.73  0.0  
1.15  8.54  1.15  2.13  7.33  4.42  0.0   0.0  
0.14  5.31  5.15  0.32  5.15  0.0   0.0   0.0  


### Exemplo de remoção e troca de posição de palavras.

In [29]:
for rate in tf.range(0.0, 1, 0.25):
  print(f"rate={rate}")
  format_print(uniform_distorce_sequences(sequences,
                                            drop_minrate=rate, drop_maxrate=rate,
                                            swap_minrate=rate, swap_maxrate=rate,
                                            drop_skip_rate=0.0, swap_skip_rate=0.0))

rate=0.0
7.14  1.88  6.06  3.7   7.49  6.07  9.24  0.0  
4.43  4.55  7.54  9.8   6.89  1.73  0.0   0.0  
8.54  2.13  1.15  9.12  7.33  0.0   0.0   0.0  
5.31  5.15  0.32  4.26  0.0   0.0   0.0   0.0  
rate=0.25
7.14  6.06  7.49  7.27  9.24  6.07  0.0   0.0  
4.55  9.8   7.54  6.89  4.98  1.73  0.0   0.0  
1.15  8.54  2.13  4.42  7.33  0.0   0.0   0.0  
0.14  5.31  0.32  4.26  0.0   0.0   0.0   0.0  
rate=0.5
7.14  1.88  7.14  6.07  0.0   0.0   0.0   0.0  
6.89  9.8   6.89  1.73  0.0   0.0   0.0   0.0  
2.13  8.54  4.42  0.0   0.0   0.0   0.0   0.0  
0.14  4.26  0.32  0.0   0.0   0.0   0.0   0.0  
rate=0.75
3.7   7.27  0.0   0.0   0.0   0.0   0.0   0.0  
9.8   1.73  0.0   0.0   0.0   0.0   0.0   0.0  
1.15  2.13  0.0   0.0   0.0   0.0   0.0   0.0  
5.31  0.14  0.0   0.0   0.0   0.0   0.0   0.0  
