In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LSTM, Bidirectional
from tensorflow.keras.models import Model

In [2]:
# Load the input data
# DATA_PATH is resolved relative to the notebook's working directory; edit it
# here if the CSV lives somewhere else.
DATA_PATH = 'input_data.csv'
# Columns the later cells read: 'text' feeds the tokenizer, 'label' the targets.
REQUIRED_COLUMNS = ('text', 'label')

try:
    data = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    # Re-raise with a message that says what to do, instead of the bare errno.
    raise FileNotFoundError(
        f"Could not find {DATA_PATH!r}. Put the CSV next to this notebook "
        "or point DATA_PATH at its location."
    ) from exc

# Fail here, not several cells later, when the file has the wrong schema.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in data.columns]
if missing_columns:
    raise ValueError(
        f"{DATA_PATH!r} is missing required column(s): {missing_columns}"
    )


FileNotFoundError: [Errno 2] No such file or directory: 'input_data.csv'

In [None]:
# Positional 80/20 hold-out: the first 80% of rows train the model and the
# remaining rows are kept for evaluation. Rows are not shuffled, so the split
# follows the order of the rows in `data`.
train_size = int(len(data) * 0.8)
train_data, test_data = data.iloc[:train_size], data.iloc[train_size:]

In [None]:
# Build the word index from the training texts only, so the vocabulary is not
# influenced by the held-out split.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

# Keras word indices start at 1 (0 is never assigned to a word), so the
# embedding table needs one extra row.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Map every text in both splits to its list of integer word ids, using the
# vocabulary fitted on the training split.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['text'])
    for split in (train_data, test_data)
)

In [None]:
# Bring every sequence to exactly max_len ids: short sequences are padded at
# the end, long ones are truncated at the end.
max_len = 1000
_pad_options = dict(maxlen=max_len, padding='post', truncating='post')
train_padded = pad_sequences(train_sequences, **_pad_options)
test_padded = pad_sequences(test_sequences, **_pad_options)

In [None]:
# Define the RNN model
# Pipeline: padded word ids -> 100-d embeddings -> bidirectional LSTM ->
# dense head -> softmax over max_len classes (one distribution per sample).
inputs = Input(shape=(max_len,))
x = Embedding(vocab_size, 100)(inputs)
# return_sequences=False makes the LSTM emit a single vector per sample.
# With return_sequences=True the network would output (batch, max_len, max_len),
# which does not match the (batch, max_len) one-hot labels used for training
# and makes model.fit fail with a shape error.
x = Bidirectional(LSTM(128, return_sequences=False))(x)
x = Dropout(0.5)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
# NOTE(review): the number of output classes is tied to max_len because the
# training cell one-hot encodes labels with num_classes=max_len - confirm that
# this is the intended label space.
outputs = Dense(max_len, activation='softmax')(x)
model = Model(inputs, outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# One-hot encode the 'label' column of each split into max_len classes, then
# fit for 10 epochs while validating on the held-out split.
# NOTE(review): num_classes=max_len mirrors the width of the model's output
# layer - confirm that every label lies in [0, max_len).
train_labels, test_labels = (
    tf.keras.utils.to_categorical(split['label'], num_classes=max_len)
    for split in (train_data, test_data)
)
model.fit(
    train_padded,
    train_labels,
    epochs=10,
    validation_data=(test_padded, test_labels),
)

In [None]:
# Extract the summary
def extract_summary(text, top_k=10):
    """Return the highest-scoring vocabulary words for ``text`` as one string.

    The text is tokenized and padded exactly like the training data, scored by
    the module-level ``model``, and the output indices are looked up in
    ``tokenizer.index_word`` in descending order of score.

    Args:
        text: Raw input string to summarize.
        top_k: Maximum number of words to return (default 10). Values <= 0
            yield an empty string.

    Returns:
        Up to ``top_k`` words joined by single spaces, best score first.

    Raises:
        ValueError: If the model does not return one 1-D score vector per
            sample (for example, when it emits a score matrix per time step).
    """
    # Convert the text to a padded sequence of integers, as done for training.
    sequence = tokenizer.texts_to_sequences([text])
    sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

    # Scores for the single sample in the batch; verbose=0 hides the progress bar.
    prob = np.asarray(model.predict(sequence, verbose=0)[0])
    if prob.ndim != 1:
        # A 2-D result would otherwise surface as an opaque "unhashable type"
        # error in the dictionary lookup below.
        raise ValueError(
            f"Expected a 1-D score vector per sample, got shape {prob.shape}"
        )

    # Indices ordered from highest to lowest score.
    ranked = np.argsort(prob)[::-1]

    # Walk the whole ranking instead of only its first top_k entries, so that
    # indices without a vocabulary word (such as 0) do not use up result slots.
    # NOTE(review): output indices are treated as vocabulary ids here, while
    # the output layer is max_len wide - confirm this mapping is intended.
    words = []
    for idx in ranked:
        if len(words) >= top_k:
            break
        word = tokenizer.index_word.get(int(idx), '')
        if word:
            words.append(word)

    # Return the summary as a string
    return ' '.join(words)

In [None]:
# Smoke test: summarize one short sentence and print the result.
sample_text = 'The quick brown fox jumps over the lazy dog.'
summary = extract_summary(text=sample_text)
print(summary)
