In [2]:
import pandas as pd
# Load the per-thread summaries (columns: thread_id, summary) from the CSV
# that is expected to sit next to the notebook.
data = pd.read_csv("email_thread_summaries.csv")

# Plain-text preview of the first five rows.
print(data.head(n=5))


   thread_id                                            summary
0          1  The email thread discusses the Master Terminat...
1          2  A lunch meeting has been scheduled for May 5th...
2          3  Ben is updating a friend on his progress with ...
3          4  The recipient of the email thread initially ex...
4          5  The email thread discusses the long form confi...


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re  # Added missing import
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Input, Dropout
from tensorflow.keras.models import Model

# Toy corpus of (email, reference summary) pairs, one tuple per row.
# Placeholder only: replace with the actual dataset before real training.
data = pd.DataFrame(
    [
        (
            "Hello John, as per our last discussion, please find the report attached. Let me know if you have questions.",
            "Report attached for review.",
        ),
        (
            "Reminder: The meeting is scheduled for 10 AM tomorrow. Please confirm your availability.",
            "Reminder for tomorrow's meeting.",
        ),
        (
            "Team, please update your task progress by EOD. Let me know if any blockers.",
            "Update task progress by EOD.",
        ),
    ],
    columns=['email_text', 'summary'],
)

# Text Preprocessing Function
def preprocess_text(text):
    """Normalize raw text for tokenization.

    Lowercases the input and strips every character that is neither a word
    character (letters, digits, underscore) nor whitespace.

    Args:
        text: The raw text. Anything that is not a ``str`` (None, NaN,
            numbers, ...) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    # Guard clause: nothing to clean when the value is not text.
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    # Punctuation is removed outright (not replaced by a space).
    return re.sub(r'[^\w\s]', '', lowered)

# Clean both text columns element-wise; missing values become empty strings
# before cleaning. Results are stored in new 'processed_*' columns so the raw
# text stays available.
data['processed_email'] = data['email_text'].fillna('').map(preprocess_text)
data['processed_summary'] = data['summary'].fillna('').map(preprocess_text)

# Vocabulary and sequence-length limits shared by tokenization and the model.
MAX_VOCAB_SIZE = 5000
MAX_EMAIL_LENGTH = 100  # Adjust as needed
MAX_SUMMARY_LENGTH = 20

# Source side: vocabulary fitted on the cleaned emails, then each email is
# mapped to integer ids and zero-padded at the end up to MAX_EMAIL_LENGTH.
email_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
email_tokenizer.fit_on_texts(data['processed_email'])
email_sequences = email_tokenizer.texts_to_sequences(data['processed_email'])
X = pad_sequences(
    email_sequences,
    maxlen=MAX_EMAIL_LENGTH,
    padding='post',
)

# Target side: a separate vocabulary fitted on the cleaned summaries, padded
# the same way up to MAX_SUMMARY_LENGTH.
summary_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
summary_tokenizer.fit_on_texts(data['processed_summary'])
summary_sequences = summary_tokenizer.texts_to_sequences(data['processed_summary'])
y = pad_sequences(
    summary_sequences,
    maxlen=MAX_SUMMARY_LENGTH,
    padding='post',
)

# Define Bi-GRU Model
# Encoder-decoder network: a bidirectional GRU reads the email, and its final
# states initialise a GRU decoder that predicts summary tokens.
EMBEDDING_DIM = 128  # Size of each token embedding vector
HIDDEN_UNITS = 64  # GRU units per encoder direction

# Encoder
# Takes padded email token ids of length MAX_EMAIL_LENGTH.
encoder_inputs = Input(shape=(MAX_EMAIL_LENGTH,))
# mask_zero=True marks id 0 (the padding value) as masked for downstream layers.
embedding_layer = Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(encoder_inputs)
encoder_gru = Bidirectional(GRU(HIDDEN_UNITS, return_sequences=True, return_state=True, dropout=0.2, recurrent_dropout=0.2))
# encoder_outputs (the per-timestep outputs) is not used further in this cell;
# only the two final states feed the decoder.
encoder_outputs, forward_state, backward_state = encoder_gru(embedding_layer)
# Join the forward and backward final states into one vector of size
# HIDDEN_UNITS * 2, which is the state size the decoder GRU is built with.
encoder_state = tf.keras.layers.Concatenate()([forward_state, backward_state])

# Decoder
# Takes padded summary token ids of length MAX_SUMMARY_LENGTH.
decoder_inputs = Input(shape=(MAX_SUMMARY_LENGTH,))
# The decoder has its own embedding layer (weights are not shared with the encoder's).
decoder_embedding = Embedding(MAX_VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(decoder_inputs)
decoder_gru = GRU(HIDDEN_UNITS * 2, return_sequences=True, return_state=True, dropout=0.2, recurrent_dropout=0.2)
# The decoder's final state is discarded; only the per-timestep outputs are kept.
decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=encoder_state)
decoder_dense = Dense(MAX_VOCAB_SIZE, activation='softmax')  # One probability per vocabulary id at each timestep
decoder_outputs = decoder_dense(decoder_outputs)

# Model Definition
# Inputs: [email ids, summary ids]; output: per-timestep distribution over vocabulary ids.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# The sparse loss takes integer token ids as targets, so no one-hot encoding is needed.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Teacher-forcing targets: at every position the target is the next summary
# token, i.e. `y` shifted one step to the left with a trailing 0 (padding)
# filling the last column.
y_train = np.pad(y[:, 1:], ((0, 0), (0, 1)), mode='constant')

# Training hyperparameters
EPOCHS = 10
BATCH_SIZE = 16

# The decoder is fed the unshifted summaries and learns to emit the shifted
# ones; the last 20% of the samples are held out for validation.
model.fit(
    [X, y],
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
)

# Function to Generate Summary
def generate_summary(email_text):
    """Greedily decode a summary for one email with the trained model.

    The email is cleaned, tokenized and padded like the training data. The
    decoder input is seeded with the summary tokenizer's ``<OOV>`` id and
    extended one token at a time with the most probable next id, until the
    padding id (0) is predicted or MAX_SUMMARY_LENGTH is reached.

    Args:
        email_text: Raw email text. Non-string input is treated as empty
            text by ``preprocess_text``.

    Returns:
        The generated words joined by single spaces. The seed token,
        padding and ids without a vocabulary entry are left out, so the
        result may be an empty string.
    """
    processed_text = preprocess_text(email_text)
    email_seq = email_tokenizer.texts_to_sequences([processed_text])
    email_padded = pad_sequences(email_seq, maxlen=MAX_EMAIL_LENGTH, padding='post')

    # NOTE(review): the summaries used for training carry no explicit start
    # token, so the <OOV> id only serves as a stand-in seed here. Consider
    # adding dedicated start/end tokens to the training data.
    start_id = summary_tokenizer.word_index.get('<OOV>', 0)

    # Token ids are integers; an integer buffer avoids float ids leaking into
    # the vocabulary lookup below.
    summary_seq = np.zeros((1, MAX_SUMMARY_LENGTH), dtype='int32')
    summary_seq[0, 0] = start_id

    for i in range(1, MAX_SUMMARY_LENGTH):
        # verbose=0: otherwise Keras prints a progress bar for every step.
        prediction = model.predict([email_padded, summary_seq], verbose=0)
        # Output at position i-1 is the prediction for the token at position i.
        predicted_word_index = int(np.argmax(prediction[0, i - 1, :]))
        if predicted_word_index == 0:  # Padding id: nothing more to generate
            break
        summary_seq[0, i] = predicted_word_index

    # Convert ids back to text. Position 0 holds the seed, not generated
    # output, so it is skipped; special ids and ids with no vocabulary entry
    # are dropped instead of becoming "<OOV>" or stray spaces.
    index_to_word = summary_tokenizer.index_word
    summary_words = []
    for idx in summary_seq[0, 1:]:
        token_id = int(idx)
        if token_id == 0 or token_id == start_id:
            continue
        word = index_to_word.get(token_id, '')
        if word:
            summary_words.append(word)
    return ' '.join(summary_words)

# Smoke test: summarize an email that is not part of the sample training
# data above and print the generated text.
print(generate_summary("Reminder: Submit your project report by EOD."))


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 22s/step - accuracy: 0.0000e+00 - loss: 8.5172 - val_accuracy: 0.0000e+00 - val_loss: 8.5148
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 431ms/step - accuracy: 0.0000e+00 - loss: 8.5095 - val_accuracy: 0.0500 - val_loss: 8.5140
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 418ms/step - accuracy: 0.1500 - loss: 8.5017 - val_accuracy: 0.1000 - val_loss: 8.5131
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 461ms/step - accuracy: 0.2250 - loss: 8.4925 - val_accuracy: 0.1500 - val_loss: 8.5120
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 443ms/step - accuracy: 0.3250 - loss: 8.4834 - val_accuracy: 0.2500 - val_loss: 8.5108
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471ms/step - accuracy: 0.6750 - loss: 8.4711 - val_accuracy: 0.8000 - val_loss: 8.5094
Epoch 7/10
[1m1/1[0m [32m━