#### Chapter 4 - Design Patterns for Non Visual Representation learning

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus used to demonstrate character-level tokenization
sentences = [
    "This is the first sentence.",
    "And here is another sentence.",
    "Yet another sentence follows.",
]

# With char_level=True the tokenizer builds its vocabulary from characters
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(sentences)

# Map every sentence to its sequence of character indices and show the result
tokenized_sentences = tokenizer.texts_to_sequences(sentences)
print(tokenized_sentences)


In [None]:
# Same toy corpus, this time tokenized with the tokenizer's default settings
sentences = [
    "This is the first sentence.",
    "And here is another sentence.",
    "Yet another sentence follows.",
]

# Build the vocabulary from the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Map every sentence to its sequence of token indices and show the result
tokenized_sentences = tokenizer.texts_to_sequences(sentences)
print(tokenized_sentences)


##### Bert Tokenizer

In [None]:
import os
import tempfile

from tokenizers import BertWordPieceTokenizer

# Sample sentence
sentence = 'This is the first sentence.'

# Initialize the tokenizer
tokenizer = BertWordPieceTokenizer()

# The tokenizer is trained from files on disk, so write the sentence to a
# temporary file. delete=False keeps the file around after it is closed so the
# trainer can open it by name; it is removed explicitly below.
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    temp_file.write(sentence.encode('utf-8'))

try:
    # Train the tokenizer on the sentence
    tokenizer.train([temp_file.name])
finally:
    # Always clean up the temporary file, even if training fails
    os.remove(temp_file.name)

# Encode the sentence into subword tokens
encoding = tokenizer.encode(sentence)

# Get the subword tokens
subword_tokens = encoding.tokens

print(subword_tokens)


#### Word Embeddings

In [None]:
# Size of the lookup table: number of token ids and length of each vector
vocab_size = 10000
embedding_dim = 128

# Build the embedding layer
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
)

# Example usage: look up the vectors for a handful of token indices
token_indices = [1, 3, 5, 7, 9]
token_indices_tensor = tf.constant(token_indices)
embedded_tokens = embedding_layer(token_indices_tensor)

print(embedded_tokens)


#### Sequence labelling

In [None]:
import tensorflow as tf

# Raw input sentences for the tagging example
input_sequence = [
    "This is an example sentence.",
    "Another sentence for testing.",
]

# One list of part-of-speech tags per input sentence
pos_labels = [
    ['DET', 'VERB', 'DET', 'NOUN', 'NOUN', 'PUNCT'],
    ['DET', 'NOUN', 'ADP', 'NOUN', 'PUNCT'],
]

# Every sequence is padded to this many time steps
maxlen = 10

# Vocabulary size and embedding dimension used by the model
vocab_size = 100
embedding_dim = 128


In [None]:
# Tokenize the input sequence
# NOTE: `input_sequence` is rebound below (raw text -> index lists -> padded array),
# so this cell is not idempotent; re-run the cell that defines the raw text first.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(input_sequence)
input_sequence = tokenizer.texts_to_sequences(input_sequence)

# Convert the input sequence and labels to numpy arrays
# Sequences are post-padded (padding='post') to `maxlen` entries.
input_sequence = tf.keras.preprocessing.sequence.pad_sequences(input_sequence, maxlen=maxlen, padding='post')
# Convert POS labels to numerical indices
# filters='' disables the tokenizer's character filtering for the tag strings.
pos_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<OOV>', filters='')
# The tags of all sentences are flattened so that each tag is fitted as its own text.
pos_tokenizer.fit_on_texts([label for sentence_labels in pos_labels for label in sentence_labels])
# Each tag is passed as a separate text, so every sentence presumably becomes a list of
# one-element index lists (e.g. [[i], [j], ...]) - TODO confirm the resulting label shape.
pos_labels_num = [pos_tokenizer.texts_to_sequences(sentence_labels) for sentence_labels in pos_labels]
# NOTE(review): the tag lists include 'PUNCT', while the input tokenizer uses its default
# filters; the number of tags may not match the number of input tokens - verify alignment.

# Pad the POS labels
pos_labels_padded = tf.keras.preprocessing.sequence.pad_sequences(pos_labels_num, maxlen=maxlen, padding='post')


In [None]:
# Number of hidden units in the recurrent layer
rnn_units = 64

# Assemble the tagger layer by layer: embedding -> RNN -> per-step classifier
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
# return_sequences=True keeps one output per time step, as needed for tagging
model.add(tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True))
# One class per known POS index, +1 for padding
model.add(
    tf.keras.layers.Dense(
        len(pos_tokenizer.word_index) + 1,
        activation='softmax',
    )
)

# Labels are integer indices, hence the sparse categorical loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train the model
# Inputs are the padded token indices; targets are the padded POS label indices.
model.fit(input_sequence, pos_labels_padded, epochs=10)


#### Sequence Classification Pattern

In [None]:
import numpy as np
import tensorflow as tf

# Define the vocabulary size, embedding dimension and padded length up front.
# They are used by the tokenizer and padding below, so defining them first keeps
# this cell independent of values left over from earlier sections.
vocab_size = 100
embedding_dim = 128
maxlen = 10

# Define the input sequence as text
input_sequence = ["This is an example sentence.", "Another sentence for testing."]

# Define the labels (one binary label per sentence)
labels = [0, 1]

# Tokenize the input sequence
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(input_sequence)
input_sequence = tokenizer.texts_to_sequences(input_sequence)

# Pad the tokenized sequences to a fixed length (returns a numpy array)
input_sequence = tf.keras.preprocessing.sequence.pad_sequences(input_sequence, maxlen=maxlen, padding='post')
# Convert the labels to numpy arrays
labels = np.array(labels)


In [None]:
# Number of hidden units in the LSTM layer (this name was previously undefined)
lstm_units = 64

# Create the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # Without return_sequences the LSTM emits only its final output,
    # which summarises the whole sequence for classification
    tf.keras.layers.LSTM(lstm_units),
    # Single sigmoid unit for the binary label
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(input_sequence, labels, epochs=10)




#### Language Generation Pattern

The sample text used in this section:

```python
text = '''Natural language processing (NLP) is an interdisciplinary subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.'''
```

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding,LSTM,Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

# Sample corpus for the language model. It is defined here in code so that the
# section runs on a fresh kernel.
text = '''Natural language processing (NLP) is an interdisciplinary subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.'''

# Create tokenizer with character-level tokenization
tokenizer = Tokenizer(char_level=True)
# `text` is a single string, so it is iterated character by character here
tokenizer.fit_on_texts(text)


In [None]:
def textProcessor(text,tokenizer,train=True,length=10):
    """Turn raw text into character-index data for the language model.

    Args:
        text: String to process; it is handed to the tokenizer character by character.
        tokenizer: Fitted character-level tokenizer exposing ``texts_to_sequences``.
        train: If True, build the training set; otherwise build a single
            inference window from the end of ``text``.
        length: Number of context characters per input window (default 10).

    Returns:
        When ``train`` is True: ``(X, y, vocab_size)`` where ``X`` holds windows of
        ``length`` indices, ``y`` is the one-hot encoded next character and
        ``vocab_size`` is the number of distinct indices seen in ``text``.
        When ``train`` is False: a one-element list holding the last ``length``
        character indices of ``text``.

    Raises:
        ValueError: If ``train`` is True and ``text`` yields ``length`` or fewer
            known characters, so no training window can be built.
    """
    # Tokenize the sentences at the character level
    tokenized_sentences = tokenizer.texts_to_sequences(text)
    # Create tokenized sequence by unlisting the tokens. Characters the tokenizer
    # does not know come back as empty lists and are skipped instead of raising
    # an IndexError.
    tokenized_sequence = [char[0] for char in tokenized_sentences if char]

    if not train:
        return [tokenized_sequence[-length:]]

    # Imported here so that inference does not depend on these names being
    # imported elsewhere in the notebook.
    import numpy as np
    from tensorflow.keras.utils import to_categorical

    if len(tokenized_sequence) <= length:
        raise ValueError(
            "text must contain more than {} known characters to build training windows".format(length)
        )

    # Each row holds `length` context characters followed by the target character
    dataset = np.array([
        tokenized_sequence[i-length:i+1]
        for i in range(length,len(tokenized_sequence))
    ])

    # Generate the sequence and the target
    X,Y = dataset[:,:-1],dataset[:,-1]
    # Get the Vocab size also
    vocab_size = len(set(tokenized_sequence))
    # Reshaping the target to convert to one hot encoded format
    Y_reshape = Y.reshape(-1, 1)
    # Converting target to one hot encoded shape (+1 leaves room for index 0)
    y = to_categorical(Y_reshape, num_classes=vocab_size+1)
    return X,y,vocab_size


In [None]:
# Create the dataset.
dataset,target,vocab_size = textProcessor(text,tokenizer)

# Define the LSTM model
# tf.keras.Sequential is used because `Sequential` is not imported by name.
# NOTE: `embedding_dim` is expected to be defined by an earlier cell.
model = tf.keras.Sequential()
# vocab_size+1 leaves room for index 0
model.add(Embedding(vocab_size+1, embedding_dim, input_length=dataset.shape[1]))
# The first LSTM returns the full sequence so that it can feed the second LSTM
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(128))
# Probability distribution over the next character
model.add(Dense(vocab_size+1, activation='softmax'))


In [None]:
# Compile the model
# categorical_crossentropy is used because the targets are one-hot encoded
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Train the model
# validation_split=0.2 holds out part of the samples for validation
model.fit(dataset,target, epochs=20, batch_size=32, validation_split=0.2)


In [None]:
def generate_text(model, start_text,tokenizer, max_len=100):
    """Greedily extend ``start_text`` one character at a time.

    Args:
        model: Trained model exposing ``predict``.
        start_text: Seed string that generation starts from.
        tokenizer: Character-level tokenizer used for encoding and decoding.
        max_len: Number of prediction steps to run.

    Returns:
        The seed string followed by the generated characters.
    """
    output = start_text
    step = 0
    while step < max_len:
        # Encode the tail of the text generated so far as the model input
        window = textProcessor(output,tokenizer,False)
        probabilities = model.predict(window)
        # Greedy choice: index with the highest predicted probability
        best_index = np.argmax(probabilities)
        # Decode the chosen index back into its character
        decoded = tokenizer.sequences_to_texts(np.array([best_index]).reshape(-1, 1))
        output += decoded[0]
        step += 1
    return output





In [None]:
# Run 20 generation steps from a seed string; the result is stored in
# `generated_text` and is not displayed by this cell.
generated_text = generate_text(model, "This is a sample text",tokenizer, max_len=20)

#### Encoder Decoder Architecture

The code below is a sample that demonstrates the concepts. The full implementation can be found at the following link:

https://machinelearningmastery.com/develop-encoder-decoder-model-sequence-sequence-prediction-keras/.

In [None]:
# Sample model for encoder decoder architecture
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, RepeatVector, TimeDistributed

# Illustrative sizes only - replace them with the values of your own dataset
enc_vocab = 1000      # source vocabulary size
dec_vocab = 1000      # target vocabulary size
enc_timesteps = 10    # source sequence length
dec_timesteps = 10    # target sequence length
n_units = 256         # embedding / LSTM width

model = Sequential()
# Encoder: embed the source tokens and compress them into one fixed-size vector
model.add(Embedding(enc_vocab, n_units, input_length=enc_timesteps, mask_zero=True))
model.add(LSTM(n_units))
# Decoder: repeat the encoded vector once per output step and decode it
model.add(RepeatVector(dec_timesteps))
model.add(LSTM(n_units, return_sequences=True))
# Apply the same softmax classifier at every output time step
model.add(TimeDistributed(Dense(dec_vocab, activation='softmax')))

# compile model
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
model.summary()


#### Attention Mechanism

In [None]:
# Defining the dimensions
hidden_state = 16
time_sequence = 4
# Defining the input layers. tf.keras.Input is used because `Input` is not
# imported by name.
Query = tf.keras.Input(shape=(hidden_state,), name='Query')
Key = tf.keras.Input(shape=(time_sequence,hidden_state), name='Key')

# Expanding the dimension of Query so that it broadcasts against Key along axis 1
# NOTE(review): applying raw tf ops to Keras inputs may depend on the Keras
# version in use - confirm in your environment.
Query_with_time_axis = tf.expand_dims(Query, 1,name="Query_expansion_layer")

# Dense Layers for each of the inputs
W_Query = tf.keras.layers.Dense(hidden_state,name="Query_dense_layer")
W_Key = tf.keras.layers.Dense(hidden_state,name="Key_dense_layer")
V_consolidated = tf.keras.layers.Dense(1,name="Consolidated_dense_layer")

# Getting the Score: V(tanh(W_Query(query) + W_Key(key)))
scores = V_consolidated(tf.nn.tanh(W_Query(Query_with_time_axis) + W_Key(Key)))


In [None]:
# Deriving the normalized scores using Softmax
# axis=1 normalizes along the second axis of `scores` - presumably the
# time_sequence axis; TODO confirm the shape of `scores`.
Attention_weight = tf.nn.softmax(scores, axis=1)


#### RNN with attention Mechanism

Please download the data set from the following link

https://github.com/Rishav09/Neural-Machine-Translation-System/blob/master/english-german-both.pkl

You can download the pickled data and then store it on your local drive in a folder named 'data/'.

In [None]:
from pickle import load
from numpy.random import shuffle

dataPath = "data/english-german-both.pkl"

# Load a clean dataset
# SECURITY NOTE: unpickling can execute arbitrary code - only load this file
# from a source you trust.
with open(dataPath, 'rb') as dataset_file:
    clean_dataset = load(dataset_file)

n_sentences = 10000
train_split = 0.9

# Reduce dataset size
dataset = clean_dataset[:n_sentences, :]

# Random shuffle the dataset (in place)
shuffle(dataset)

# Split the dataset: keep the first `train_split` share for training
train = dataset[:int(n_sentences * train_split)]

In [None]:
 # Fit a tokenizer
def create_tokenizer(dataset):
    """Return a new ``Tokenizer`` fitted on the texts in ``dataset``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(dataset)
    return fitted

def find_seq_length(dataset):
    """Return the length, in whitespace-separated words, of the longest text.

    Args:
        dataset: Iterable of strings.

    Returns:
        The maximum word count over ``dataset``, or 0 when ``dataset`` is empty
        (instead of raising ``ValueError`` from ``max``).
    """
    return max((len(seq.split()) for seq in dataset), default=0)

def find_vocab_size(tokenizer, dataset):
    """Fit ``tokenizer`` on ``dataset`` and return its vocabulary size.

    The returned value is the number of entries in ``tokenizer.word_index`` plus one.
    """
    tokenizer.fit_on_texts(dataset)
    vocabulary = tokenizer.word_index
    return 1 + len(vocabulary)


# Prepare tokenizer for the encoder input (column 0 of `train`)
enc_tokenizer = create_tokenizer(train[:, 0])
enc_seq_length = find_seq_length(train[:, 0])
enc_vocab_size = find_vocab_size(enc_tokenizer, train[:, 0])

# Encode and pad the input sequences.
# pad_sequences / convert_to_tensor are referenced through `tf` because they are
# not imported by name.
trainX = enc_tokenizer.texts_to_sequences(train[:, 0])
trainX = tf.keras.preprocessing.sequence.pad_sequences(trainX, maxlen=enc_seq_length, padding='post')
# NOTE(review): token ids are stored as float32 here; an integer dtype would be
# more natural for embedding lookups - confirm before changing.
trainX = tf.convert_to_tensor(trainX, dtype=tf.float32)

# Prepare tokenizer for the decoder input (column 1 of `train`)
dec_tokenizer = create_tokenizer(train[:, 1])
dec_seq_length = find_seq_length(train[:, 1])
dec_vocab_size = find_vocab_size(dec_tokenizer, train[:, 1])

# Encode and pad the target sequences
trainY = dec_tokenizer.texts_to_sequences(train[:, 1])
trainY = tf.keras.preprocessing.sequence.pad_sequences(trainY, maxlen=dec_seq_length, padding='post')
trainY = tf.convert_to_tensor(trainY, dtype=tf.float32)

# Show the resulting shapes instead of dumping the full tensors
trainX.shape, trainY.shape

In [None]:
# Hyper-parameters shared by the encoder, the decoder and the training loop
config = {"batch_size" : 64,
          # +1 because tokenizer word indices start at 1 (index 0 is left for padding)
          "src_vocabSize" : len(enc_tokenizer.word_index)+1,
          "tar_vocabSize" : len(dec_tokenizer.word_index)+1,
          "embedding_dim" : 128,
          "hidden_units" : 512  }
# Derive the number of steps from the configured batch size rather than
# repeating the literal, so the two values cannot drift apart
config["steps_per_epoch"] = len(trainX) // config["batch_size"]

In [None]:
class Encoder(tf.keras.Model):
    """Embeds source token ids and runs them through a single LSTM."""

    def __init__(self, config):
        """Build the layers from ``config`` (batch_size, hidden_units, src_vocabSize, embedding_dim)."""
        super().__init__()
        hidden_units = int(config["hidden_units"])
        self.batch_size = int(config["batch_size"])
        self.encoder_hu = hidden_units
        self.embedding = tf.keras.layers.Embedding(
            int(config["src_vocabSize"]),
            int(config["embedding_dim"]),
            dtype=tf.float32,
        )
        # return_sequences / return_state: expose every step's output and the final states
        self.lstm = tf.keras.layers.LSTM(
            hidden_units,
            return_sequences=True,
            return_state=True,
            dtype=tf.float32,
        )

    def call(self, x, hidden_lyr):
        """Encode ``x`` starting from ``hidden_lyr``.

        Returns the LSTM output sequence and ``[state_h, state_c]``.
        """
        embedded = self.embedding(x)
        output, state_h, state_c = self.lstm(embedded, initial_state=hidden_lyr)
        return output, [state_h, state_c]

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Attention layer scoring ``values`` against ``query`` as V(tanh(W1(values) + W2(query)))."""

    def __init__(self,config):
        """Create the three dense projections; W1 and W2 use ``config["hidden_units"]`` units."""
        super().__init__()
        units = config["hidden_units"]
        self.W1 = Dense(units, dtype=tf.float32)  # applied to `values`
        self.W2 = Dense(units, dtype=tf.float32)  # applied to `query`
        self.V = Dense(1, dtype=tf.float32)

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)``.

        Assumes ``values`` is (batch_size, source_length, hidden_units) and
        ``query`` is (batch_size, hidden_units) - TODO confirm against callers.
        """
        # Give the query an axis 1 so that it broadcasts over the axis-1 positions of `values`
        query_with_time_axis = tf.expand_dims(query, axis=1)
        projected_values = self.W1(values)
        projected_query = self.W2(query_with_time_axis)
        # One score per axis-1 position of `values`
        score = self.V(tf.nn.tanh(projected_values + projected_query))
        # Normalize the scores along axis 1
        attention_weights = tf.nn.softmax(score, axis=1)
        # Weighted sum of `values` along axis 1
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights

In [None]:
class Decoder(tf.keras.Model):
    """Single-step decoder: attention context + embedded input -> LSTM -> dense layer."""

    def __init__(self, config):
        """Build the layers from ``config`` (batch_size, hidden_units, tar_vocabSize, embedding_dim)."""
        super().__init__()
        hidden_units = config["hidden_units"]
        target_vocab = config["tar_vocabSize"]
        self.batch_size = config["batch_size"]
        self.decoder_hu = hidden_units
        self.embedding = Embedding(target_vocab, config["embedding_dim"], dtype=tf.float32)
        self.lstm = LSTM(
            hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
            dtype=tf.float32,
        )
        # Fully connected layer with one output per target-vocabulary entry
        self.fc = Dense(target_vocab, dtype=tf.float32)
        # Attention over the `values` passed to call()
        self.attention = BahdanauAttention(config)

    def call(self, x, query, values):
        """Run one decoding step.

        Returns ``(fc_output, [state_h, state_c], attention_weights)``.
        ``query`` is only used by the attention layer; the LSTM is called
        without an explicit initial state.
        """
        # Context vector from attention between `query` and `values`
        context_vector, attention_weights = self.attention(query, values)
        # Embed the current decoder input (assumed to hold one time step - TODO confirm)
        embedded = self.embedding(x)
        # Prepend the context vector to the embedding along the feature axis
        lstm_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)
        output, state_h, state_c = self.lstm(lstm_input)
        # Collapse the leading axes so that each row has output.shape[2] features
        flat_output = tf.reshape(output, (-1, output.shape[2]))
        # Final decoder output
        fc_output = self.fc(flat_output)
        return fc_output, [state_h, state_c], attention_weights

In [None]:
# Define the optimizer (Adam with its default settings); it is used by train_step
optimizer = tf.keras.optimizers.Adam()

In [None]:
def train_step(inp, targ, encoder_hidden,config):
    """Run one teacher-forced training step and return the batch loss.

    Uses the module-level ``encoder``, ``decoder`` and ``optimizer``.

    Args:
        inp: Batch of padded source sequences.
        targ: Batch of padded target sequences (0 is the padding index).
        encoder_hidden: Initial ``[state_h, state_c]`` for the encoder LSTM.
        config: Dict providing ``"batch_size"``.

    Returns:
        Scalar tensor: the loss averaged over the target time steps.
    """
    loss = 0
    with tf.GradientTape() as tape:
        # Get the encoder outputs
        encoder_output, [enc_state_h, enc_state_c] = encoder(inp, encoder_hidden)
        # Initialise the decoder hidden state as the encoder hidden state
        decoder_hidden = enc_state_h
        # Initialise the decoder first input as 0. This can be altered to make it as start token
        decoder_input = tf.expand_dims([0] * config["batch_size"], 1)
        # Iterate through each of the target sequence to get the predicted output and loss calculation
        for t in range(targ.shape[1]):
            # Generating the decoder output for the target sequence time step
            predictions, [decoder_hidden, _], _ = decoder(decoder_input, decoder_hidden, encoder_output)
            # The decoder's final Dense layer has no activation, so its outputs
            # are logits and the loss must be told so
            step_loss = tf.keras.losses.sparse_categorical_crossentropy(
                targ[:, t], predictions, from_logits=True)
            # Ignore padded positions and reduce the per-example losses to a scalar
            mask = tf.cast(tf.not_equal(targ[:, t], 0), step_loss.dtype)
            loss += tf.reduce_mean(step_loss * mask)
            # Teacher forcing: the next input will be the current target
            decoder_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [None]:
# Create the dataset in memory and shuffle it. The shuffle buffer spans all
# examples; a buffer of only one batch would shuffle just locally.
dataset = tf.data.Dataset.from_tensor_slices((trainX, trainY)).shuffle(len(trainX))
# Group the examples into batches. drop_remainder=True keeps every batch at
# exactly config['batch_size'] examples, matching the fixed-size encoder state.
dataset = dataset.batch(config['batch_size'], drop_remainder=True)



In [None]:
import time

EPOCHS = 10
# Initialise encoder and decoder
encoder = Encoder(config)
decoder = Decoder(config)
for epoch in range(EPOCHS):
    start = time.time()
    # Initialise the encoder hidden state ([state_h, state_c]) with zeros
    enc_hidden = [tf.zeros((config["batch_size"], config["hidden_units"])),
                  tf.zeros((config["batch_size"], config["hidden_units"]))]
    total_loss = 0
    # Train the model in batches
    for (batch, (inp, targ)) in enumerate(dataset.take(config["steps_per_epoch"])):
        batch_loss = train_step(inp, targ, enc_hidden, config)
        total_loss += batch_loss
        # Report progress every 100 batches
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {}'.format(epoch + 1,
                                                     batch,
                                                     batch_loss.numpy()))
    print('Epoch {} Loss {}'.format(epoch + 1,
                                    total_loss / config["steps_per_epoch"]))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))