In [15]:
import re
import pandas as pd
import numpy as np
from datetime import datetime
from google.colab import drive

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.models import Model

from sklearn.model_selection import train_test_split

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Path to WhatsApp chat
file_path = "/content/drive/MyDrive/Github/chat_NLP/Dataset/Chat.txt"

# Read all lines
with open(file_path, encoding="utf-8") as f:
    lines = f.read().splitlines()

# Pattern for the start of a WhatsApp message
message_pattern = re.compile(
    r'^(\d{1,2}/\d{1,2}/\d{2,4}),\s'        # date (e.g. 6/26/17),
    r'(\d{1,2}:\d{2}\s[AP]M)\s-\s'           # time (e.g. 5:19 PM) -
    r'([^:]+):\s'                            # user name (anything up to the colon)
    r'(.*)$'                                 # the actual message
)

records = []
current = None

for line in lines:
    m = message_pattern.match(line)
    if m:
        # Start a new record
        date_str, time_str, user, text = m.groups()
        timestamp = datetime.strptime(f"{date_str} {time_str}", "%m/%d/%y %I:%M %p")
        current = {"date": timestamp, "user": user, "message": text}
        records.append(current)
    else:
        # Continuation of the previous message
        if current:
            current["message"] += "\n" + line

# Build DataFrame
parsed_df = pd.DataFrame(records)


# # Deleted Messages
parsed_df = parsed_df[~parsed_df["message"].str.contains("This message was deleted")]

# Delete all rows with "Media omitted" Message
parsed_df = parsed_df[~parsed_df["message"].str.contains("Media omitted")]

# Quick check
parsed_df.head()


Unnamed: 0,date,user,message
0,2017-06-26 17:19:00,Linda Roldán,Hoola
1,2017-06-26 17:19:00,Linda Roldán,Bebe
2,2017-06-26 17:46:00,Alejandro Castellanos,Hola chiquita
3,2017-06-26 17:46:00,Alejandro Castellanos,Y este número¿
4,2017-06-26 17:54:00,Linda Roldán,Este es mi numero anterior


In [3]:
parsed_df.shape

(203094, 3)

In [4]:
parsed_df["user"].value_counts()

Unnamed: 0_level_0,count
user,Unnamed: 1_level_1
Linda Roldán,116973
Alejandro Castellanos,86121


In [5]:
# Define the “self” user whose replies we want to model
self_user = 'Alejandro Castellanos'

# Step 1: collapse consecutive messages by the same speaker into dialogue turns
parsed_df['turn_id'] = (parsed_df['user'] != parsed_df['user'].shift()).cumsum()

turns = (
    parsed_df
    .groupby('turn_id')
    .agg({
        'user': 'first',
        'message': lambda msgs: ' '.join(msgs)   # join all messages in the turn with a space
    })
    .reset_index()
)

# Step 2: select your turns and attach the previous turn as input
self_turns = turns[turns['user'] == self_user].copy()
self_turns['prev_turn_id'] = self_turns['turn_id'] - 1

# Step 3: get the other-person turns
other_turns = turns[turns['user'] != self_user][['turn_id', 'message']]

# Step 4: merge to form input–target pairs
pairs_df = pd.merge(
    self_turns,
    other_turns,
    left_on='prev_turn_id',
    right_on='turn_id',
    how='inner',
    suffixes=('_self', '_other')
)

# Step 5: keep and rename the columns
pairs_df = (
    pairs_df
    [['message_other', 'message_self']]
    .rename(columns={
        'message_other': 'input_text',
        'message_self':  'target_text'
    })
    .reset_index(drop=True)
)

# Inspect the first examples
pairs_df.head()

Unnamed: 0,input_text,target_text
0,Hoola Bebe,Hola chiquita Y este número¿
1,Este es mi numero anterior,"A síii, y que cel tienes?"
2,El mio le mande a arreglar,A síii???
3,Siiii,Muy bien Y cuánto te costó?
4,30.000 pero lo estoy pagando a plazos,Y quedó bien?


## Tensorflow

In [6]:
# Prepare raw texts for tokenizer
input_texts = pairs_df['input_text'].tolist()

# Add start/end tokens to targets
target_texts = ['<start> ' + txt + ' <end>' for txt in pairs_df['target_text'].tolist()]

In [7]:
vocab_size = 10000
oov_token = '<unk>'

# Create and fit input tokenizer
input_tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
input_tokenizer.fit_on_texts(input_texts)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
max_encoder_len = max(len(seq) for seq in input_sequences)
encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_len, padding='post')
encoder_input_data

array([[5324,   19,    0, ...,    0,    0,    0],
       [ 176,   16,   23, ...,    0,    0,    0],
       [  12,  230,   38, ...,    0,    0,    0],
       ...,
       [   7,  281, 1042, ...,    0,    0,    0],
       [  37,   15,   67, ...,    0,    0,    0],
       [  27,   53,    4, ...,    0,    0,    0]], dtype=int32)

In [8]:
# Create and fit target tokenizer

target_tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
target_tokenizer.fit_on_texts(target_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)
max_decoder_len = max(len(seq) for seq in target_sequences)
target_data = pad_sequences(target_sequences, maxlen=max_decoder_len, padding='post')
target_data

array([[   2,   45,   78, ...,    0,    0,    0],
       [   2,    6, 3241, ...,    0,    0,    0],
       [   2,    6, 3241, ...,    0,    0,    0],
       ...,
       [   2,  481,    3, ...,    0,    0,    0],
       [   2,  575,   17, ...,    0,    0,    0],
       [   2,   34,  168, ...,    0,    0,    0]], dtype=int32)

In [9]:
# Split decoder inputs & outputs
decoder_input_data  = target_data[:, :-1]
decoder_target_data = target_data[:, 1:]

In [10]:
# Train/validation split
(enc_in_train, enc_in_val,
 dec_in_train, dec_in_val,
 dec_tar_train, dec_tar_val) = train_test_split(
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    test_size=0.2,
    random_state=42
)

In [11]:
# Build tf.data datasets
batch_size = 32

train_dataset = tf.data.Dataset.from_tensor_slices(
    ((enc_in_train, dec_in_train), dec_tar_train)
).shuffle(buffer_size=len(enc_in_train)).batch(batch_size, drop_remainder=True)

val_dataset = tf.data.Dataset.from_tensor_slices(
    ((enc_in_val, dec_in_val), dec_tar_val)
).batch(batch_size, drop_remainder=True)

# Verify shapes
print("Encoder input shape:", enc_in_train.shape)
print("Decoder input shape:", dec_in_train.shape)
print("Decoder target shape:", dec_tar_train.shape)

Encoder input shape: (40581, 941)
Decoder input shape: (40581, 1170)
Decoder target shape: (40581, 1170)


In [12]:
# Hyperparameters
vocab_size    = 10000    # must match num_words in Tokenizer
embedding_dim = int(256/2)
lstm_units    = int(512/2)

# ----- Encoder -----
# encoder_inputs: sequences of token IDs (padded)
encoder_inputs = Input(shape=(None,), name='encoder_inputs')

# encoder_embedding: map token IDs → dense vectors
encoder_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    mask_zero=True,       # ignore padding token (0) in the LSTM
    name='encoder_embedding'
)(encoder_inputs)

# encoder_lstm: return the hidden & cell state
encoder_outputs, state_h, state_c = LSTM(
    lstm_units,
    return_state=True,
    name='encoder_lstm'
)(encoder_embedding)

# keep only the states (we don’t need encoder_outputs)
encoder_states = [state_h, state_c]

# ----- Decoder -----
# decoder_inputs: teacher-forced target sequences (shifted right, without <end>)
decoder_inputs = Input(shape=(None,), name='decoder_inputs')

# decoder_embedding: separate embedding layer for the decoder
decoder_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    mask_zero=True,
    name='decoder_embedding'
)(decoder_inputs)

# decoder_lstm: return full sequence + states
decoder_lstm = LSTM(
    lstm_units,
    return_sequences=True,
    return_state=True,
    name='decoder_lstm'
)
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedding,
    initial_state=encoder_states
)

# dense layer to generate probability over the vocabulary
decoder_dense = Dense(
    vocab_size,
    activation='softmax',
    name='decoder_dense'
)
decoder_outputs = decoder_dense(decoder_outputs)

# ----- Define & compile model -----
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
    name='seq2seq_model'
)

model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Show the model’s architecture
model.summary()

In [13]:
# Train the seq2seq model
epochs = 5 # few epochs for faster demonstration

history = model.fit(
    x=[enc_in_train, dec_in_train],   # encoder inputs & decoder inputs
    y=dec_tar_train,                  # decoder target outputs
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([enc_in_val, dec_in_val], dec_tar_val)
)

Epoch 1/5
[1m1269/1269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m725s[0m 556ms/step - accuracy: 0.3946 - loss: 6.0176 - val_accuracy: 0.0019 - val_loss: 5.2498
Epoch 2/5
[1m1269/1269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m719s[0m 551ms/step - accuracy: 0.0020 - loss: 5.1571 - val_accuracy: 0.0021 - val_loss: 4.9743
Epoch 3/5
[1m1269/1269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m741s[0m 550ms/step - accuracy: 0.0022 - loss: 4.8967 - val_accuracy: 0.0023 - val_loss: 4.8218
Epoch 4/5
[1m1269/1269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m742s[0m 551ms/step - accuracy: 0.0023 - loss: 4.7178 - val_accuracy: 0.0024 - val_loss: 4.6826
Epoch 5/5
[1m1269/1269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m741s[0m 550ms/step - accuracy: 0.0024 - loss: 4.6112 - val_accuracy: 0.0025 - val_loss: 4.5927


In [16]:
val_loss = history.history['val_loss'][-1]
perplexity = np.exp(val_loss)
print(f"Perplexity de validación: {perplexity:.2f}")

Perplexity de validación: 98.76


In [26]:
# ----- 1. Encoder inference model -----
# This model maps an input sequence to its final LSTM states [h, c].
encoder_model = tf.keras.Model(
    inputs=encoder_inputs,     # original encoder Input layer
    outputs=encoder_states     # [state_h, state_c] from encoder LSTM
)

# ----- 2. Decoder inference model -----
# Define placeholders for one time-step input and decoder LSTM states.
decoder_state_input_h = Input(shape=(lstm_units,), name='decoder_state_input_h')
decoder_state_input_c = Input(shape=(lstm_units,), name='decoder_state_input_c')
decoder_inputs_inf   = Input(shape=(1,),        name='decoder_inputs_inf')

# Reuse layers from the trained model
decoder_embedding_layer = model.get_layer('decoder_embedding')
decoder_lstm_layer      = model.get_layer('decoder_lstm')
decoder_dense_layer     = model.get_layer('decoder_dense')

# Embed the single input token
decoder_embedded_inf = decoder_embedding_layer(decoder_inputs_inf)

# Run one step of LSTM, conditioned on previous states
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm_layer(
    decoder_embedded_inf,
    initial_state=[decoder_state_input_h, decoder_state_input_c]
)

# Compute probability distribution over the vocabulary
decoder_probs_inf = decoder_dense_layer(decoder_outputs_inf)

# Assemble the decoder inference model
decoder_model = tf.keras.Model(
    inputs=[decoder_inputs_inf, decoder_state_input_h, decoder_state_input_c],
    outputs=[decoder_probs_inf, state_h_inf, state_c_inf],
    name='decoder_inference'
)

# ----- 3. Build reverse lookup dictionaries -----
inverse_input_token_index  = {v: k for k, v in input_tokenizer.word_index.items()  if v < vocab_size}
inverse_target_token_index = {v: k for k, v in target_tokenizer.word_index.items() if v < vocab_size}

# ----- 4. Decoding function -----
def decode_sequence(input_seq):
    """
    Given an input sequence, encode it to state vectors,
    then decode token-by-token until 'end' token is produced.
    """
    # 4.1 Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # 4.2 Initialize target sequence with the 'start' token
    start_idx = target_tokenizer.word_index['start']
    target_seq = np.array([[start_idx]])

    decoded_tokens = []
    while True:
        # 4.3 Predict next token probabilities & new states
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # 4.4 Select the token with highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = inverse_target_token_index.get(sampled_token_index, '<unk>')

        # 4.5 Check for end-of-sequence or maximum length
        if sampled_word == 'end' or len(decoded_tokens) > max_decoder_len:
            break

        # 4.6 Append the sampled word to the output
        decoded_tokens.append(sampled_word)

        # 4.7 Update the target sequence and states for the next step
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return ' '.join(decoded_tokens)

# ----- 5. Example inference -----
# Take the first validation example
sample_input = enc_in_val[0:1]

# Decode it to a reply
decoded_reply = decode_sequence(sample_input)

# Convert the input back to words (skip padding zeros)
input_text = ' '.join(
    [inverse_input_token_index[idx] for idx in sample_input[0] if idx != 0]
)

print("Input text:   ", input_text)
print("Decoded reply:", decoded_reply)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 588ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 605ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
Input text:    está se parece a lina castellanos
Decoded reply: bueno


In [33]:
def chat():
    """
    Start an interactive session where the user can input messages
    and the chatbot (trained seq2seq model) responds.
    Type 'exit' or 'quit' to end the session.
    """
    print("Chatbot is ready! Type 'exit' or 'quit' to stop.")
    while True:
        # Read user input
        user_input = input("Linda: ")
        if user_input.lower() in ('exit', 'quit'):
            print("Alejo_Bot: Adios!")
            break

        # Tokenize and pad the user input
        seq = input_tokenizer.texts_to_sequences([user_input])
        padded_seq = pad_sequences(seq, maxlen=max_encoder_len, padding='post')

        # Generate chatbot reply
        reply = decode_sequence(padded_seq)

        # Display the chatbot response
        print("Alejo_Bot:", reply)

# Launch the chat interface
chat()

Chatbot is ready! Type 'exit' or 'quit' to stop.
Linda: Me voy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Alejo_Bot: cómo vas
Linda: bien y tu?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Alejo_Bot: bien chiquita cómo estás
Linda: Hola Bebe
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1