In [1]:
# Project: Text Summarization using NLP

# Objective:
# Build a text summarization tool using NLP techniques to create extractive and abstractive summaries.

# Step 1: Import Required Libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import heapq
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Step 2: Load the Text Data
def load_text():
    text = """Natural Language Processing (NLP) is a subfield of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to read, decipher, understand, and make sense of human languages in a valuable way. Many challenges in NLP involve natural language understanding, enabling computers to derive meaning from human or natural language input, and others involve natural language generation."""
    return text

In [4]:
# Step 3: Preprocess the Text Data
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)

    # Remove stopwords and punctuation
    filtered_words = [word for word in word_tokens if word.isalnum() and word not in stop_words]
    return filtered_words

In [5]:
# Step 4: Create a Frequency Table
def create_frequency_table(filtered_words):
    freq_table = {}
    for word in filtered_words:
        if word.lower() not in freq_table:
            freq_table[word.lower()] = 1
        else:
            freq_table[word.lower()] += 1
    return freq_table

In [6]:
# Step 5: Score Sentences Based on Frequency
def score_sentences(text, freq_table):
    sentences = sent_tokenize(text)
    sentence_scores = {}
    for sent in sentences:
        for word in nltk.word_tokenize(sent.lower()):
            if word in freq_table.keys():
                if sent not in sentence_scores:
                    sentence_scores[sent] = freq_table[word]
                else:
                    sentence_scores[sent] += freq_table[word]
    return sentence_scores

In [21]:
# Step 6: Generate Extractive Summary
def generate_extractive_summary(sentence_scores, threshold=25):
    summary_sentences = heapq.nlargest(
        int(len(sentence_scores) * 0.3) + 1, sentence_scores, key=sentence_scores.get
    )
    summary = ' '.join(summary_sentences)
    return summary

In [8]:
# Step 7: Generate Abstractive Summary using T5
def generate_abstractive_summary(text):
    # Load pre-trained T5 model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')

    # Preprocess input text for T5
    input_text = "summarize: " + text
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate summary
    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return summary

In [11]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [22]:
text = load_text()

# Extractive Summarization
filtered_words = preprocess_text(text)
freq_table = create_frequency_table(filtered_words)
sentence_scores = score_sentences(text, freq_table)

print("Sentence Scores:", sentence_scores)
extractive_summary = generate_extractive_summary(sentence_scores)

# Abstractive Summarization
abstractive_summary = generate_abstractive_summary(text)

print("Original Text:\n", text)
print("\nExtractive Summary:\n", extractive_summary)
print("\nAbstractive Summary:\n", abstractive_summary)

Sentence Scores: {'Natural Language Processing (NLP) is a subfield of artificial intelligence that focuses on the interaction between computers and humans through natural language.': 33, 'The ultimate objective of NLP is to read, decipher, understand, and make sense of human languages in a valuable way.': 16, 'Many challenges in NLP involve natural language understanding, enabling computers to derive meaning from human or natural language input, and others involve natural language generation.': 50}
Original Text:
 Natural Language Processing (NLP) is a subfield of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to read, decipher, understand, and make sense of human languages in a valuable way. Many challenges in NLP involve natural language understanding, enabling computers to derive meaning from human or natural language input, and others involve natural language generation.

Extractive Sum