In [None]:
import pandas as pd
import re
import tensorflow as tf

# Read the question/answer pairs (the file is Latin-1 encoded) and discard
# every row that has a missing value in any column.
train_data = (
    pd.read_csv("/content/dialogs_expanded.csv", encoding='latin1')
    .dropna()
)

def preprocess_text(text):
    """Normalize one utterance before tokenization.

    The text is lowercased, every character other than an ASCII letter,
    a digit or whitespace is deleted, and each run of whitespace is
    collapsed to a single space with the ends trimmed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    lowered = text.lower()
    without_symbols = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    return re.sub(r'\s+', ' ', without_symbols).strip()

# Clean both text columns in place with the shared normalizer
train_data['question'] = train_data['question'].apply(preprocess_text)
train_data['answer'] = train_data['answer'].apply(preprocess_text)

# Questions are the model inputs, answers are the targets
input_texts = train_data['question'].tolist()
target_texts = train_data['answer'].tolist()

# One tokenizer fitted on questions and answers together, so both sides
# share a single word -> id mapping
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(input_texts + target_texts)

input_sequences, target_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (input_texts, target_texts)
)

# Pad every sequence at the end (with the default value 0) up to the length
# of the longest question or answer
max_seq_length = max(len(seq) for seq in input_sequences + target_sequences)
pad_options = {'maxlen': max_seq_length, 'padding': 'post'}
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, **pad_options)
target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, **pad_options)

# input_sequences / target_sequences are now equal-width integer arrays
# ready to be used for training a seq2seq model with attention.


In [None]:
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split


# Train/validation split (80% / 20%).
# A fixed random_state makes the shuffle reproducible: the same rows end up in
# the validation set on every run, so results can be compared between runs.
input_train, input_val, target_train, target_val = train_test_split(
    input_sequences, target_sequences, test_size=0.2, random_state=42
)

# Keep the targets as NumPy arrays; asarray does not copy data that is
# already an array.
target_train = np.asarray(target_train)
target_val = np.asarray(target_val)




In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention, Concatenate

# Hyperparameters
embedding_dim = 128
lstm_units = 128
# +1 because the tokenizer's word ids start at 1; id 0 is what the sequences
# were padded with.
vocab_size = len(tokenizer.word_index) + 1

# Encoder: embeds the question and returns the per-step outputs (used by the
# attention layer) plus the final hidden/cell state (used to seed the decoder).
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder: embeds the previous answer tokens and starts from the encoder state.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Attention Mechanism
# Dot-product (Luong-style) attention with the decoder outputs as query and the
# encoder outputs as value. The scores are NOT scaled: use_scale defaults to False.
attention = Attention()
context_vector = attention([decoder_outputs, encoder_outputs])

# Concatenate context vector and decoder outputs
decoder_concat_input = Concatenate(axis=-1)([context_vector, decoder_outputs])

# Dense Layer: per-step probability distribution over the vocabulary
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model. Targets are integer ids, hence the sparse loss.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


def shift_targets_right(sequences, start_id=0):
    """Build teacher-forcing decoder inputs from a 2-D array of target ids.

    Position t of the result holds target token t-1, and position 0 holds
    ``start_id``, so the decoder predicts token t from the tokens before it.
    The padding id 0 is used as the start symbol by default because it is the
    only id that is never assigned to a word.
    """
    shifted = np.full_like(sequences, start_id)
    shifted[:, 1:] = sequences[:, :-1]
    return shifted


# The decoder must be fed the answer shifted one step to the right, not the
# question: the question already reaches the model through the encoder input.
decoder_input_train = shift_targets_right(target_train)
decoder_input_val = shift_targets_right(target_val)

# Training
model.fit(
    [input_train, decoder_input_train],
    target_train,
    batch_size=16,
    epochs=1,
    validation_data=([input_val, decoder_input_val], target_val),
)


[1m  20/6971[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5:03:43[0m 3s/step - accuracy: 0.5724 - loss: 9.5784

KeyboardInterrupt: 

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention, Concatenate
import numpy as np
import tensorflow as tf

# Inference models
#
# The encoder and decoder used at inference time are assembled from the tensors
# and layers created in the training cell (encoder_inputs, encoder_outputs,
# state_h, state_c, decoder_inputs, decoder_embedding, decoder_lstm, attention,
# decoder_dense). Re-using them means both models share the weights learned by
# model.fit; building fresh Embedding/LSTM/Dense layers here would decode with
# randomly initialised weights. lstm_units, vocab_size and max_seq_length are
# likewise taken from the earlier cells rather than replaced by placeholder
# values, so every shape matches the trained layers.

# decoder_embedding must still be the embedded decoder tensor produced by the
# training cell; fail early with a clear message if it was rebound to a layer.
if isinstance(decoder_embedding, Embedding):
    raise RuntimeError(
        "decoder_embedding is an Embedding layer, not the embedded decoder tensor; "
        "re-run the training cell before building the inference models"
    )

# Encoder model: token ids -> (per-step outputs, final hidden state, final cell state)
encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Extra inputs the decoder needs at inference time: the LSTM state to start
# from and the encoder's per-step outputs for the attention layer.
decoder_state_input_h = Input(shape=(lstm_units,), name='decoder_state_input_h')
decoder_state_input_c = Input(shape=(lstm_units,), name='decoder_state_input_c')
decoder_hidden_state_input = Input(shape=(None, lstm_units), name='decoder_hidden_state_input')

# Run the trained decoder LSTM on the embedded decoder tokens, starting from
# the supplied state instead of the encoder state wired in during training.
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    decoder_embedding, initial_state=[decoder_state_input_h, decoder_state_input_c]
)

# Attend over the encoder outputs passed in as an input
context_vector_inf = attention([decoder_outputs_inf, decoder_hidden_state_input])

# Concatenate context vector and decoder outputs
decoder_concat_input_inf = Concatenate(axis=-1)([context_vector_inf, decoder_outputs_inf])

# Per-step probability distribution over the vocabulary (trained Dense layer)
decoder_outputs_inf = decoder_dense(decoder_concat_input_inf)

# Define the decoder model
decoder_model = Model(
    [decoder_inputs, decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs_inf, state_h_inf, state_c_inf]
)

# Function to Decode Sequence Using Beam Search
def beam_search_decode_sequence(input_seq, beam_width=3):
    """Decode a reply for ``input_seq`` with beam search.

    Hypotheses are scored by their summed negative log-probability (lower is
    better). A hypothesis is closed when it emits the end token or when its
    length (start token included) exceeds ``max_seq_length``. Closed
    hypotheses are collected and the best-scoring one is returned, rather
    than the first one that happens to close.

    Args:
        input_seq: Encoder input, passed unchanged to ``encoder_model.predict``.
        beam_width: Number of hypotheses kept alive per step, and number of
            next-token expansions considered per hypothesis. Must be >= 1.

    Returns:
        The decoded words joined by single spaces, without the start and end
        markers, or '' if no hypothesis was completed.

    Raises:
        ValueError: If ``beam_width`` is smaller than 1, or if 'starttoken' or
            'endtoken' is missing from the tokenizer's word index.
    """
    if beam_width < 1:
        # A width of 0 would make the [-beam_width:] slice select every token.
        raise ValueError("beam_width must be at least 1")

    # Encode the input once; the result is reused for every decoder call.
    encoder_outputs, state_h, state_c = encoder_model.predict(input_seq, verbose=0)

    # Check if 'starttoken' and 'endtoken' are in the tokenizer's vocabulary
    start_token_index = tokenizer.word_index.get('starttoken', None)
    end_token_index = tokenizer.word_index.get('endtoken', None)

    if start_token_index is None or end_token_index is None:
        raise ValueError("Start token or end token not found in tokenizer's word index")

    beams = [([start_token_index], 0.0)]  # live hypotheses: (sequence, score)
    finished = []                         # closed hypotheses: (sequence, score)

    while beams:
        all_candidates = []
        for seq, score in beams:
            # The whole prefix is re-decoded from the encoder state each step,
            # so the states returned by the decoder are not needed.
            target_seq = np.array(seq).reshape(1, -1)
            output_tokens, _, _ = decoder_model.predict(
                [target_seq, encoder_outputs, state_h, state_c], verbose=0
            )
            next_token_probs = output_tokens[0, -1, :]
            top_tokens = np.argsort(next_token_probs)[-beam_width:]

            for token in top_tokens:
                token = int(token)
                new_seq = seq + [token]
                # Clamp the probability so an exact 0 does not produce inf.
                probability = max(float(next_token_probs[token]), 1e-12)
                new_score = score - float(np.log(probability))

                if token == end_token_index or len(new_seq) > max_seq_length:
                    finished.append((new_seq, new_score))
                else:
                    all_candidates.append((new_seq, new_score))

        # Select the top beam_width sequences
        beams = sorted(all_candidates, key=lambda x: x[1])[:beam_width]

        # Scores never decrease as a hypothesis grows, so once a closed
        # hypothesis is at least as good as the best live one it cannot be beaten.
        if finished and beams and min(s for _, s in finished) <= beams[0][1]:
            break

    if not finished:
        return ''

    best_seq, _ = min(finished, key=lambda item: item[1])
    markers = (start_token_index, end_token_index)
    words = [tokenizer.index_word.get(t, '') for t in best_seq if t not in markers]
    return ' '.join(word for word in words if word)

# Function to Preprocess Input Text
def preprocess_input_text(input_text):
    """Turn one raw user utterance into a padded batch of token ids.

    The text is cleaned with ``preprocess_text``, mapped to ids with the
    module-level ``tokenizer`` and padded at the end to ``max_seq_length``.

    Args:
        input_text: The raw utterance.

    Returns:
        The result of ``pad_sequences`` for a one-element batch.
    """
    cleaned = preprocess_text(input_text)
    token_ids = tokenizer.texts_to_sequences([cleaned])
    return tf.keras.preprocessing.sequence.pad_sequences(
        token_ids, maxlen=max_seq_length, padding='post'
    )

# Inspect the tokenizer without dumping the whole vocabulary into the output
word_index_preview = dict(list(tokenizer.word_index.items())[:20])
print(
    f"Tokenizer word index ({len(tokenizer.word_index)} entries, first 20 shown):",
    word_index_preview,
)

# Register the sequence markers that beam_search_decode_sequence looks up.
# They must not take ids 1 and 2: those already belong to real words ('i' and
# 'you' in the word index), so reusing them would make those words
# undecodable and would end decoding whenever 'you' is predicted.
# Id 0 is the only id never assigned to a word, and it is the value the target
# sequences were post-padded with, i.e. what follows the last word of every
# answer in the training data. It therefore serves as both start and end marker.
PAD_INDEX = 0
special_tokens = {
    'starttoken': PAD_INDEX,
    'endtoken': PAD_INDEX
}

for marker, marker_index in special_tokens.items():
    # setdefault keeps an id the tokenizer may already have for the marker
    tokenizer.word_index.setdefault(marker, marker_index)
# tokenizer.index_word is left untouched on purpose: id 0 has no word, so it
# decodes to '' through index_word.get(t, '').

# Update vocab_size: number of distinct ids (highest id + 1). No new id was
# introduced above, so the size of the already-built models is unaffected.
vocab_size = max(tokenizer.word_index.values()) + 1

# Example Usage
input_text = "upset?"
input_seq = preprocess_input_text(input_text)
decoded_sentence = beam_search_decode_sequence(input_seq)

print(f"Input: {input_text}")
print(f"Response: {decoded_sentence}")


Tokenizer word index: {'i': 1, 'you': 2, 'the': 3, 'to': 4, 'a': 5, 'it': 6, 'that': 7, 'do': 8, 'what': 9, 'is': 10, 'of': 11, 'and': 12, 'have': 13, 'are': 14, 'in': 15, 'they': 16, 'was': 17, 'its': 18, 'did': 19, 'so': 20, 'like': 21, 'yes': 22, 'for': 23, 'my': 24, 'dont': 25, 'but': 26, 'he': 27, 'no': 28, 'be': 29, 'me': 30, 'im': 31, 'thats': 32, 'about': 33, 'we': 34, 'on': 35, 'how': 36, 'go': 37, 'not': 38, 'think': 39, 'too': 40, 'why': 41, 'your': 42, 'well': 43, 'going': 44, 'good': 45, 'will': 46, 'with': 47, 'want': 48, 'really': 49, 'get': 50, 'know': 51, 'all': 52, 'at': 53, 'there': 54, 'ill': 55, 'one': 56, 'just': 57, 'can': 58, 'this': 59, 'would': 60, 'if': 61, 'youre': 62, 'people': 63, 'see': 64, 'then': 65, 'she': 66, 'right': 67, 'nice': 68, 'didnt': 69, 'out': 70, 'should': 71, 'whats': 72, 'time': 73, 'need': 74, 'her': 75, 'money': 76, 'oh': 77, 'maybe': 78, 'him': 79, 'course': 80, 'lot': 81, 'when': 82, 'or': 83, 'much': 84, 'okay': 85, 'got': 86, 'up': 