<a href="https://colab.research.google.com/github/arksolutionzz/ark/blob/master/LSTM_AutoEncoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from keras.datasets import imdb
from keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, Embedding
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import set_random_seed

# Seed every random number generator the notebook relies on.
# Seeding NumPy alone is not enough for reproducibility: Keras draws the
# initial layer weights from the backend's own generator. set_random_seed
# seeds Python's `random`, NumPy and the backend in a single call.
SEED = 42
set_random_seed(SEED)

In [None]:
# Dataset configuration
max_features = 10000  # vocabulary size handed to imdb.load_data as num_words
max_len = 100  # every review is forced to exactly this many tokens

# Fetch both splits and keep only the token-id sequences; the sentiment
# labels are dropped because the autoencoder's target is its own input.
x_train, x_test = (
    sequences for sequences, _labels in imdb.load_data(num_words=max_features)
)

# Bring every review to length max_len. The padding/truncating modes are the
# pad_sequences defaults, written out explicitly: short reviews get zeros
# prepended, long reviews lose their *leading* tokens (the last max_len
# tokens are kept).
pad_options = dict(maxlen=max_len, padding='pre', truncating='pre')
x_train = pad_sequences(x_train, **pad_options)
x_test = pad_sequences(x_test, **pad_options)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
embedding_dim = 128  # width of each learned word vector (also the decoder LSTM width)
latent_dim = 64  # size of the fixed-length code produced by the encoder

# --- Encoder: token ids -> word vectors -> one latent vector per review ---
inputs = Input(shape=(max_len,))
embedding = Embedding(
    input_dim=max_features,
    output_dim=embedding_dim,
    input_length=max_len,
)(inputs)
encoded = LSTM(units=latent_dim)(embedding)

# --- Decoder: latent vector -> per-time-step distribution over the vocabulary ---
# The latent vector is copied max_len times so the decoder LSTM receives one
# input per output position.
repeated_code = RepeatVector(n=max_len)(encoded)
decoder_states = LSTM(units=embedding_dim, return_sequences=True)(repeated_code)
# The same softmax Dense layer is applied independently at every time step.
decoded = TimeDistributed(
    layer=Dense(units=max_features, activation='softmax')
)(decoder_states)

# --- Full model: reconstruct the input token ids from the latent code ---
autoencoder = Model(inputs=inputs, outputs=decoded)
# Sparse categorical cross-entropy lets the targets stay integer token ids
# instead of one-hot vectors of size max_features.
autoencoder.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 repeat_vector (RepeatVecto  (None, 100, 64)           0         
 r)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 100, 128)          98816     
                                                                 
 time_distributed (TimeDist  (None, 100, 10000)        1290000   
 ributed)                                                    

In [None]:

# Reconstruction targets: the padded token ids themselves, with a trailing
# length-1 axis appended so each time step carries one integer class id.
x_train_exp = x_train[..., np.newaxis]
x_test_exp = x_test[..., np.newaxis]

In [None]:
# Fit the autoencoder to reproduce its own input; the test split is used
# as validation data during training.
autoencoder.fit(
    x=x_train,
    y=x_train_exp,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, x_test_exp),
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x782bf2b5b970>

In [None]:
# Run the trained autoencoder on the first ten test reviews.
predictions = autoencoder.predict(x_test[:10])

# Prepare what is needed to turn token ids back into text (simplified and
# illustrative).
# NOTE(review): this Tokenizer is never fitted or referenced again in the
# visible cells - confirm whether it is still needed.
tokenizer = Tokenizer(num_words=max_features)
word_index = imdb.get_word_index()  # word -> id, as returned by Keras
# Invert the mapping so a word can be looked up by its id.
reverse_word_index = dict((idx, word) for word, idx in word_index.items())

def decode_review(encoded_review, index_from=3, word_lookup=None):
    """Turn a sequence of IMDB token ids back into a space-separated string.

    ``imdb.load_data`` shifts every word id by ``index_from`` (3 by default)
    to reserve the low ids for padding, start-of-sequence and
    out-of-vocabulary markers, whereas ``imdb.get_word_index`` returns the
    unshifted ids. The offset therefore has to be subtracted before the
    lookup; otherwise every id decodes to the wrong word.

    Args:
        encoded_review: Iterable of integer token ids as produced by
            ``imdb.load_data`` (optionally padded).
        index_from: Offset that was applied to the word ids when the data
            was loaded. Must match the ``index_from`` used by ``load_data``.
        word_lookup: Mapping from unshifted word id to word. Defaults to the
            notebook-level ``reverse_word_index``.

    Returns:
        The decoded review. Ids without a matching word (including the
        reserved ids below ``index_from``) are rendered as ``'?'``.
    """
    if word_lookup is None:
        word_lookup = reverse_word_index
    return ' '.join(word_lookup.get(i - index_from, '?') for i in encoded_review)

# Compare the first test review with the model's reconstruction of it.
# For the reconstruction, the most probable token id is taken at each
# time step.
original_review = decode_review(x_test[0])
most_likely_ids = predictions[0].argmax(axis=-1)
reconstructed_review = decode_review(most_likely_ids)

print("Original review: {}".format(original_review))
print("Reconstructed review: {}".format(reconstructed_review))


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
Original review: ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? the wonder own as by is sequence i i and and to of hollywood br of down shouting getting boring of ever it sadly sadly sadly i i was then does don't close faint after one carry as by are be favourites all family turn in does as three part in another some to be probably with world and her an have faint beginning own as is sequence
Reconstructed review: ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? the the as as as of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of of
