#### This code develops and trains a model to take text embeddings as input and generates the impression sentence.

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Reshape

# Parameters
embedding_dim = 768  # Dimension of your sentence embeddings
lstm_units = 256     # Number of units in LSTM
# Assuming your vocabulary size and maximum sentence length
vocab_size = 4001

# Encoder
encoder_inputs = Input(shape=(embedding_dim,))  # Your embeddings size

# Reshape input to add a time dimension (e.g., treating each feature as a time step)
encoder_reshaped = Reshape((embedding_dim, 1))(encoder_inputs)

encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_reshaped)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=256)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_dense = Dense(vocab_size, activation='softmax')

embedded_decoder_inputs = decoder_embedding(decoder_inputs)
decoder_outputs, _, _ = decoder_lstm(embedded_decoder_inputs, initial_state=encoder_states)
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Summary
model.summary()

# Training
# You need to prepare your data accordingly
# model.fit([input_embeddings, decoder_input_data], decoder_target_data, batch_size=64, epochs=100)


2023-12-09 04:37:42.074673: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-09 04:37:42.123854: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-09 04:37:44.015089: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:4b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-09 04:37:44.038044: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not op

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 768)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 reshape (Reshape)           (None, 768, 1)               0         ['input_1[0][0]']             
                                                                                                  
 embedding (Embedding)       (None, None, 256)            1024256   ['input_2[0][0]']             
                                                                                              

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Reshape, Dropout
from tensorflow.keras.regularizers import L1L2

# Parameters
embedding_dim = 768
lstm_units = 256
vocab_size = 2161
dropout_rate = 0.2  # Dropout rate
l2_reg = 1e-4       # L2 regularization factor

# Encoder
encoder_inputs = Input(shape=(embedding_dim,))
encoder_reshaped = Reshape((embedding_dim, 1))(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True, dropout=dropout_rate, 
                    recurrent_regularizer=L1L2(l2=l2_reg))  # Apply L2 regularization here
encoder_outputs, state_h, state_c = encoder_lstm(encoder_reshaped)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=256)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True, dropout=dropout_rate, 
                    recurrent_regularizer=L1L2(l2=l2_reg))  # Apply L2 regularization here
decoder_dense = Dense(vocab_size, activation='softmax')

embedded_decoder_inputs = decoder_embedding(decoder_inputs)
decoder_outputs, _, _ = decoder_lstm(embedded_decoder_inputs, initial_state=encoder_states)
decoder_outputs = Dropout(dropout_rate)(decoder_outputs)  # Dropout after LSTM
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Model summary
model.summary()

# Training with early stopping
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# You need to prepare your data accordingly
# model.fit([input_embeddings, decoder_input_data], decoder_target_data,
#           batch_size=64, epochs=100, callbacks=[early_stopping], validation_data=(val_input, val_target))


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 768)]                0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 reshape_1 (Reshape)         (None, 768, 1)               0         ['input_3[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 256)            553216    ['input_4[0][0]']             
                                                                                            

In [3]:
import pickle
# Load the data from the pickle file
with open('uniqueSentences_embeddings.pkl', 'rb') as fin:
    data = pickle.load(fin)


In [4]:
text_embeddings = data

In [5]:
len(text_embeddings)

1771

In [6]:
max_length = max(embedding.shape[0] for embedding in text_embeddings)
max_length

768

In [7]:
import numpy as np
# Function to pad embeddings
def pad_embeddings(embeddings, max_length):
    padded_embeddings = []
    for embedding in embeddings:
        # Calculate padding length
        padding_length = max_length - embedding.shape[0]
        
        # Create padding (zero padding in this example)
        padding = np.zeros(padding_length)

        # Append padding to the embedding
        padded_embedding = np.append(embedding, padding)
        padded_embeddings.append(padded_embedding)

    return np.array(padded_embeddings)

# Apply padding
padded_embeddings = pad_embeddings(text_embeddings, max_length)

In [8]:
with open("unique_sentences.txt", "r") as file:
    lines = file.readlines()

sentences_list = [line.strip() for line in lines]

In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Parameters
vocab_size = 2161  # Your vocabulary size
max_sentence_length = 768  # Max length of the sentence


# Tokenization
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences_list)
target_sequences = tokenizer.texts_to_sequences(sentences_list)


In [10]:
# Padding
target_padded = pad_sequences(target_sequences, maxlen=max_sentence_length, padding='post')

# One-hot Encoding
#target_one_hot = np.array([to_categorical(seq, num_classes=vocab_size) for seq in target_padded])


In [11]:
# Now, split your data into training and validation sets
from sklearn.model_selection import train_test_split
input_train, input_val, target_train, target_val = train_test_split(padded_embeddings, target_padded, test_size=0.2)


In [12]:
target_train.shape

(1416, 768)

In [13]:
from tensorflow.keras.callbacks import EarlyStopping

# Define the EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Metric to monitor for improvement (validation loss)
    min_delta = 0.001, 
    patience=5,  # Number of epochs with no improvement after which training will stop
    restore_best_weights=True  # Restore the model weights from the epoch with the best value of the monitored metric
)

In [14]:
# Shift target sequences by one for the decoder input
decoder_input_data = np.zeros_like(target_padded)
decoder_input_data[:, 1:] = target_padded[:, :-1]  # Shift target sequence

# The decoder output data is the original target sequence
decoder_target_data = target_padded

# Split data into training and validation sets
input_train, input_val, decoder_input_train, decoder_input_val, target_train, target_val = train_test_split(
    padded_embeddings, decoder_input_data, target_padded, test_size=0.2
)


In [15]:
# Training
model.fit([input_train, target_train], target_train,
          batch_size=16,
          epochs=100,
          validation_data=([input_val, target_val], target_val),
         callbacks=[early_stopping])

Epoch 1/100


2023-12-09 04:38:26.928679: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8905
2023-12-09 04:38:27.317597: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-12-09 04:38:27.380486: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x36dd71f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-12-09 04:38:27.380538: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA RTX 6000 Ada Generation, Compute Capability 8.9
2023-12-09 04:38:27.421246: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-12-09 04:38:27.699459: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-12-09 04:38:27.81

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100


<keras.src.callbacks.History at 0x7f429476a610>

In [16]:
import pickle

# Save the tokenizer
with open('tokenizer3.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [17]:
model.save('model4_unique3.h5')

  saving_api.save_model(
