# Transformers

In [214]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import ( 
    MultiHeadAttention, 
    LayerNormalization, 
    Dropout, 
    Layer,
    Embedding, 
    Input, 
    GlobalAveragePooling1D, 
    Dense,
    LSTM
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from nlp_utils import basic_cleaning, process_text
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer

In [376]:
class TransformerBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Size of the last axis of the input. It is also passed as
            ``key_dim`` to the attention layer and used as the output width of
            the feed-forward network, so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after the attention and feed-forward steps.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Hyper-parameters are kept so get_config() can rebuild the layer when
        # a model containing it is saved, cloned or reloaded.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"),
             Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to the dropout layers. ``None`` (the default)
                lets Keras decide from the calling context, so the layer can
                also be called directly without passing the flag.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config
    
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table; inputs must
            not be longer than this along their last axis.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embedding tables.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Hyper-parameters are kept so get_config() can rebuild the layer when
        # a model containing it is saved, cloned or reloaded.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed integer token ids and add the embedding of each position.

        Args:
            x: Integer tensor of token ids; its last axis is the sequence axis.

        Returns:
            Tensor with one extra trailing axis of size ``embed_dim``.
        """
        # Use the runtime length of the sequence axis (named seq_len so it is
        # not confused with the constructor's maxlen).
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings broadcast over the leading axes of x.
        return x + positions

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [3]:
# The source file is semicolon-separated and has no header row.
data = pd.read_csv("../../data/ArchivoProblemas.csv", sep=";", header=None)
data.columns = ["target", "id", "text"]
data = basic_cleaning(data, text_cols=["text"])

# Keep the processed text in two forms: as a token list and as a single string.
data["text_tokenized_list"] = data["text"].apply(process_text, keep_as_list=True)
data["text_tokenized"] = data["text"].apply(process_text, keep_as_list=False)

# Shuffle the rows (frac=1 keeps all of them). No seed is set, so the order
# differs between runs.
data = data.sample(frac=1)
data

Unnamed: 0,target,id,text,text_tokenized_list,text_tokenized
12305,cause,3340.0,bajo desempeno fiscal,"[bajo, desempeno, fiscal]",bajo desempeno fiscal
1982,effect,15215.0,aumento en los indices de trabajo infantil.,"[aumento, indices, trabajo, infantil]",aumento indices trabajo infantil
2447,effect,212098.0,inseguridad e incomodidad para usuarios y func...,"[inseguridad, incomodidad, usuarios, funcionar...",inseguridad incomodidad usuarios funcionarios
3773,effect,5373.0,incremento en los indices de pobreza multidime...,"[incremento, indices, pobreza, multidimensiona...",incremento indices pobreza multidimensional mu...
10441,cause,127192.0,mal estado de las vias,"[mal, vias]",mal vias
...,...,...,...,...,...
8700,cause,4784.0,"1. vias en mal estado, intransitables o con re...","[vias, mal, intransitables, restricciones, tra...",vias mal intransitables restricciones transito
7119,effect,5399.0,aumento de tiempos de viaje,"[aumento, tiempos, viaje]",aumento tiempos viaje
1272,cause,10.0,3.falta de tiempo y dificil desplazamiento pa...,"[tiempo, dificil, desplazamiento, asistir, ied]",tiempo dificil desplazamiento asistir ied
6937,cause,4603.0,deterioro de las vias,"[deterioro, vias]",deterioro vias


## Text preprocessing

In [276]:
MAX_LEN = 7  # sequences are padded/truncated to this many tokens
EMBED_DIM = 100  # width of the embedding vectors

# One-hot encode the target labels.
label_binarizer = LabelBinarizer()
y = label_binarizer.fit_transform(data["target"])

# Build the word -> index vocabulary from the whole processed corpus.
tf_tokenizer = Tokenizer()
fit_text = [" ".join(data["text_tokenized"])]
tf_tokenizer.fit_on_texts(fit_text)

# Must be computed after the tokenizer is fitted: on a fresh kernel
# tf_tokenizer does not exist before this point, and word_index is only
# populated by fit_on_texts.
VOCAB_SIZE = len(tf_tokenizer.word_index)

In [277]:
def text_to_index(text):
    """Convert a whitespace-separated string into a list of vocabulary indices.

    Words are looked up in ``tf_tokenizer.word_index``. Words that are not in
    the vocabulary are skipped instead of raising ``KeyError``, and an empty
    or all-whitespace string yields an empty list.

    Args:
        text: String of tokens separated by whitespace.

    Returns:
        List of integer indices, one per known word, in order of appearance.
    """
    vocabulary = tf_tokenizer.word_index
    # split() without an argument ignores repeated/leading/trailing whitespace,
    # so no empty-string tokens are produced.
    return [vocabulary[word] for word in text.split() if word in vocabulary]

# Encode every processed text as its list of vocabulary indices.
data["index_text"] = data["text_tokenized"].apply(text_to_index)

# Transformer 

In [280]:
num_heads = 2  # attention heads inside the transformer block
ff_dim = 32  # hidden width of the block's feed-forward network

# One list of word indices per row.
X = np.array(data["index_text"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Bring both splits to a fixed length of MAX_LEN tokens.
X_train, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(split, maxlen=MAX_LEN)
    for split in (X_train, X_test)
)

# Embedding -> transformer block -> average pooling -> small dense head.
inputs = Input(shape=(MAX_LEN,))
embedding_layer = TokenAndPositionEmbedding(
    maxlen=MAX_LEN, vocab_size=VOCAB_SIZE + 1, embed_dim=EMBED_DIM
)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(
    embed_dim=EMBED_DIM, num_heads=num_heads, ff_dim=ff_dim
)
x = transformer_block(x)
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.1)(x)
outputs = Dense(3, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

In [281]:
# Softmax output trained with categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=["accuracy"],
)

# The held-out split is used as validation data during training.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_test, y_test),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# LSTM

In [287]:
def define_nn():
    """Build the (uncompiled) LSTM classifier and print its architecture.

    The network is: embedding -> LSTM(128) -> Dense(128, relu) -> dropout ->
    Dense(16, relu) -> dropout -> Dense(3, softmax). It reads the notebook
    globals ``MAX_LEN``, ``VOCAB_SIZE`` and ``EMBED_DIM``.

    Returns:
        The ``Sequential`` model, not yet compiled.
    """
    network = Sequential()
    network.add(Input(shape=(MAX_LEN,)))
    network.add(Embedding(input_dim=VOCAB_SIZE+1, output_dim=EMBED_DIM))
    network.add(LSTM(128))
    network.add(Dense(128, activation="relu"))
    network.add(Dropout(0.1))
    network.add(Dense(16, activation="relu"))
    network.add(Dropout(0.1))
    network.add(Dense(3, activation="softmax"))
    print('NeuralNetwork architecture: \n')
    # summary() prints the table itself and returns None; wrapping it in
    # print() would append a stray "None" line to the output.
    network.summary()
    return network

In [288]:
# Instantiate the LSTM model; define_nn() also prints the layer summary.
nn_model = define_nn()

NeuralNetwork architecture: 

Model: "sequential_51"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_93 (Embedding)    (None, 7, 100)            541800    
                                                                 
 lstm_10 (LSTM)              (None, 128)               117248    
                                                                 
 dense_165 (Dense)           (None, 128)               16512     
                                                                 
 dropout_155 (Dropout)       (None, 128)               0         
                                                                 
 dense_166 (Dense)           (None, 16)                2064      
                                                                 
 dropout_156 (Dropout)       (None, 16)                0         
                                                                 
 dense_167 (Dense)     

In [289]:
# Same optimizer, loss and metric as the transformer model.
nn_model.compile(
    optimizer="adam",
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=["accuracy"],
)

# Note: this rebinds `history`, replacing the transformer's training history.
history = nn_model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=50,
    validation_data=(X_test, y_test),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# Predictions

In [367]:
import tensorflow as tf

def make_prediction(
    text: str, model, 
    prediction_threshold: float = 0.35
    ) -> str:
    """Predict the category of a free-text description with the given model.

    The text is lower-cased and passed through ``process_text``; words that
    are not in the tokenizer vocabulary are dropped. The remaining words are
    converted to indices, padded to ``MAX_LEN`` and fed to ``model``.

    Args:
        text: Raw input text.
        model: Fitted model exposing ``predict``; it must return one
            probability per class of ``label_binarizer`` for each input row.
        prediction_threshold: The most probable class is returned only if its
            probability is strictly greater than this value.

    Returns:
        The predicted class label, or an explanatory message when no class
        exceeds the threshold or when none of the words is in the vocabulary.
    """
    no_prediction = "Predicciones no superan el umbral para seleccionar almenos una categoria"
    vocabulary = tf_tokenizer.word_index

    # Membership is tested on the dict itself (constant time) instead of
    # rebuilding a list of all vocabulary keys for every word.
    known_words = [
        word for word in process_text(text.lower()).split(" ")
        if word in vocabulary
    ]
    if not known_words:
        # Nothing to encode: without this guard the lookup of the empty
        # string would raise KeyError.
        return no_prediction

    indices = [vocabulary[word] for word in known_words]
    vector_ = tf.keras.preprocessing.sequence.pad_sequences(
        [indices], maxlen=MAX_LEN
    )

    # Single input row -> take the first (only) row of probabilities.
    probabilities = np.asarray(model.predict(vector_))[0]
    best = int(np.argmax(probabilities))

    # "Any class above the threshold" is equivalent to "the best class is
    # above the threshold".
    if probabilities[best] > prediction_threshold:
        return label_binarizer.classes_[best]
    return no_prediction

In [375]:
# Example call using the LSTM model. Words that are not in the tokenizer
# vocabulary are dropped inside make_prediction before the text is encoded.
make_prediction("recursos insuficientes del presupuest ", model=nn_model)



'cause'