## THIS NOTEBOOK IS FOR BEING FAMILIAR WITH DEEP LEARNING
- We will use tensorflow and keras for training
- We will have a small, custom dataset

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


2023-08-13 00:10:20.314968: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-13 00:10:20.363521: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.


In [2]:
# Sample data
sentences = [
    ("hello world", "hi world"),
    ("apple orange", "apple banana"),
    ("machine learning", "deep learning"),
    ("openai", "openai gpt"),
    # Add more data here
]

In [10]:
# Combine sentences to form corrupted input
corrupted_sentences = [s1 + ' ' + s2 + ' ' + s1 for s1, s2 in sentences]
labels = [s2 for s1, s2 in sentences]
print(corrupted_sentences)

['hello world hi world hello world', 'apple orange apple banana apple orange', 'machine learning deep learning machine learning', 'openai openai gpt openai']


In [4]:
# Define tokenizer and fit on corrupted sentences
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(corrupted_sentences)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 18


In [31]:
# Tokenizing and add padding

# Input data
max_length = max([len(s) for s in corrupted_sentences])
X = tokenizer.texts_to_sequences(corrupted_sentences)
X = pad_sequences(X, maxlen=max_length, padding='post')

In [30]:
# Labels
# Do the same for the labels
Y = tokenizer.texts_to_sequences(labels)
Y = pad_sequences(Y, maxlen=max_length, padding='post')

print("X shape: ", X.shape)
print("Y shape: ", Y.shape)


X shape:  (4, 47)
Y shape:  (4, 47)


In [34]:
# Build the autoencoder model (same as before)
input_layer = Input(shape=(max_length,))
embedding = Embedding(vocab_size, 64, input_length=max_length)(input_layer)
bi_lstm = Bidirectional(LSTM(32))(embedding)
decoded = Dense(vocab_size, activation='softmax')(bi_lstm)

autoencoder = Model(inputs=input_layer, outputs=decoded)
# autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
autoencoder.compile(optimizer='adam', loss='categorical_crossentropy')
print(autoencoder.summary())


autoencoder.fit(X, Y, epochs=10, batch_size=4)

Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_11 (InputLayer)       [(None, 47)]              0         
                                                                 
 embedding_10 (Embedding)    (None, 47, 64)            1152      
                                                                 
 bidirectional_10 (Bidirect  (None, 64)                24832     
 ional)                                                          
                                                                 
 dense_10 (Dense)            (None, 18)                1170      
                                                                 
Total params: 27154 (106.07 KB)
Trainable params: 27154 (106.07 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10


ValueError: in user code:

    File "/home/tam/tam-code/string-matching-rnn/venv/lib/python3.10/site-packages/keras/src/engine/training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "/home/tam/tam-code/string-matching-rnn/venv/lib/python3.10/site-packages/keras/src/engine/training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/tam/tam-code/string-matching-rnn/venv/lib/python3.10/site-packages/keras/src/engine/training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "/home/tam/tam-code/string-matching-rnn/venv/lib/python3.10/site-packages/keras/src/engine/training.py", line 1081, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/tam/tam-code/string-matching-rnn/venv/lib/python3.10/site-packages/keras/src/engine/training.py", line 1139, in compute_loss
        return self.compiled_loss(
    File "/home/tam/tam-code/string-matching-rnn/venv/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/tam/tam-code/string-matching-rnn/venv/lib/python3.10/site-packages/keras/src/losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "/home/tam/tam-code/string-matching-rnn/venv/lib/python3.10/site-packages/keras/src/losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/tam/tam-code/string-matching-rnn/venv/lib/python3.10/site-packages/keras/src/losses.py", line 2122, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/home/tam/tam-code/string-matching-rnn/venv/lib/python3.10/site-packages/keras/src/backend.py", line 5560, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (4, 47) and (4, 18) are incompatible


In [None]:
# Encode the sentences
encoder = Model(inputs=input_layer, outputs=bi_lstm)
encoded_data = encoder.predict(X)



In [None]:
# Now you can use the encoded_data for prediction
new_samples = [
    ("hello world", "hi there"),
    ("apple banana", "banana apple"),
    ("machine learning", "learning machine"),
    ("openai", "gpt openai"),
    # Add more new samples here
]

new_encoded_samples = []

In [None]:
for s1, s2 in new_samples:
    new_input = tokenizer.texts_to_sequences([s1 + ' ' + s2 + ' ' + s1])
    new_input = pad_sequences(new_input, maxlen=max_length, padding='post')
    new_encoded = encoder.predict(new_input)
    new_encoded_samples.append(new_encoded)

# Now you can use new_encoded_samples for further analysis or predictions
print("Encoded data for new samples:")
print(new_encoded_samples)

Encoded data for new samples:
[array([[-0.01078334, -0.01630737,  0.02797763, -0.00336495, -0.01090707,
        -0.02515487,  0.02766134, -0.01381304,  0.01710702, -0.02640632,
        -0.0233629 , -0.00292518, -0.0191737 ,  0.01790585,  0.02886255,
        -0.00611169,  0.03289169, -0.01349095, -0.01487449, -0.02337017,
         0.02534136, -0.00255498,  0.01700653, -0.00702298,  0.00211998,
         0.02680595,  0.0008493 ,  0.00223542, -0.02161465,  0.02137545,
        -0.0032118 , -0.02374905,  0.01421249,  0.00284617,  0.00553894,
        -0.00778293, -0.00454393, -0.00229641,  0.0053709 ,  0.00148447,
        -0.00575233,  0.00627638,  0.00236789, -0.00053764, -0.00858179,
         0.00280549,  0.00697564, -0.0032546 ,  0.00166743,  0.0016205 ,
        -0.0065305 ,  0.00274602, -0.01002082,  0.00664434,  0.0036468 ,
        -0.00441237, -0.00139709,  0.00259375, -0.01472466,  0.00917792,
        -0.00579543, -0.0083159 ,  0.00594948,  0.01332669]],
      dtype=float32), array([[-