In [1]:
# Step 1: Import nltk and heapq
import nltk
import heapq
import re

In [2]:
# Step 2: Download necessary NLTK data
nltk.download('punkt')        # For tokenization
nltk.download('stopwords')     # For stopwords list
nltk.download('punkt_tab')     # Additional punkt data

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [3]:
# Step 3: Load or input text document
def load_text():
    print("Choose input method:")
    print("1. Enter text manually")
    print("2. Read from file")

    choice = input("Enter your choice (1 or 2): ")

    if choice == '1':
        print("\nEnter your text (press Enter twice to finish):")
        lines = []
        while True:
            line = input()
            if line:
                lines.append(line)
            else:
                break
        text = ' '.join(lines)
        return text

    elif choice == '2':
        filename = input("Enter filename (e.g., sample.txt): ")
        try:
            with open(filename, 'r', encoding='utf-8') as file:
                text = file.read()
            return text
        except FileNotFoundError:
            print("File not found!")
            return None

    else:
        print("Invalid choice!")
        return None

# Sample text for testing
sample_text = """
Artificial Intelligence (AI) is transforming the world at an unprecedented pace.
From healthcare to transportation, AI technologies are being integrated into various
sectors to improve efficiency and accuracy. Machine learning, a subset of AI, enables
systems to learn and improve from experience without being explicitly programmed.
Deep learning, a more advanced form of machine learning, uses neural networks with
multiple layers to analyze various factors of data. These technologies have led to
breakthroughs in image recognition, natural language processing, and autonomous vehicles.
However, the rapid advancement of AI also raises ethical concerns about privacy,
job displacement, and decision-making transparency. Many experts argue for the need
to develop responsible AI frameworks that ensure these technologies benefit humanity
while minimizing potential harm. As AI continues to evolve, it will be crucial to
strike a balance between innovation and regulation. The future of AI holds immense
potential, but it must be guided by ethical principles and human values.
"""

# Load text
text = load_text()
if text is None:
    print("Using sample text instead...")
    text = sample_text

print("\nOriginal Text:")
print("-" * 50)
print(text)

Choose input method:
1. Enter text manually
2. Read from file
Enter your choice (1 or 2): 1

Enter your text (press Enter twice to finish):


Original Text:
--------------------------------------------------



In [4]:
# Step 4: Tokenize text into sentences and words

# Tokenize into sentences
sentences = sent_tokenize(text)
print(f"\nNumber of sentences: {len(sentences)}")
print("\nSentences:")
for i, sentence in enumerate(sentences, 1):
    print(f"{i}. {sentence[:50]}...")

# Tokenize into words
words = word_tokenize(text.lower())
print(f"\nNumber of words: {len(words)}")
print(f"First 20 words: {words[:20]}")


Number of sentences: 0

Sentences:

Number of words: 0
First 20 words: []


In [7]:
# Step 5: Compute word frequencies, ignoring stopwords

# Get stopwords
stop_words = set(stopwords.words('english'))
print(f"\nStopwords sample: {list(stop_words)[:10]}")

# Calculate word frequencies
word_frequencies = {}

for word in words:
    if word not in stop_words and word.isalnum():  # Ignore stopwords and punctuation
        if word not in word_frequencies:
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

print(f"\nUnique words after removing stopwords: {len(word_frequencies)}")

# Normalize frequencies (divide by maximum frequency)
if word_frequencies: # Add this check
    max_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / max_frequency
else:
    print("\nCannot normalize word frequencies: No words were processed (text might be empty or contain only stopwords/punctuation).")
    # Handle the empty case appropriately if subsequent steps rely on a normalized frequency.

# Show top 10 words by frequency
print("\nTop 10 important words:")
if word_frequencies: # Add this check for printing top words
    sorted_words = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)
    for word, freq in sorted_words[:10]:
        print(f"{word}: {freq:.3f}")
else:
    print("No words to display.")


Stopwords sample: ['until', 'been', "doesn't", 'herself', "i'll", "hasn't", 'hasn', "needn't", 'above', 'm']

Unique words after removing stopwords: 0

Cannot normalize word frequencies: No words were processed (text might be empty or contain only stopwords/punctuation).

Top 10 important words:
No words to display.


In [8]:
# Step 6: Score each sentence based on important words

sentence_scores = {}

for sentence in sentences:
    # Check sentence length (avoid very long sentences)
    if len(sentence.split()) < 30:
        for word in word_tokenize(sentence.lower()):
            if word in word_frequencies:
                if sentence not in sentence_scores:
                    sentence_scores[sentence] = word_frequencies[word]
                else:
                    sentence_scores[sentence] += word_frequencies[word]

print("\nSentence Scores:")
# Show scores for first few sentences
for i, sentence in enumerate(sentences[:5]):
    score = sentence_scores.get(sentence, 0)
    print(f"Sentence {i+1}: Score = {score:.3f}")
    print(f"  Text: {sentence[:70]}...")


Sentence Scores:


In [9]:
# Step 7: Select top sentences to form summary

# Calculate how many sentences to include in summary (e.g., 30% of original)
summary_percentage = 0.3
num_summary_sentences = max(1, int(len(sentences) * summary_percentage))

print(f"\nOriginal sentences: {len(sentences)}")
print(f"Summary sentences: {num_summary_sentences}")

# Select top scoring sentences
summary_sentences = heapq.nlargest(num_summary_sentences,
                                  sentence_scores,
                                  key=sentence_scores.get)

print("\nSelected sentences (unsorted):")
for i, sentence in enumerate(summary_sentences):
    print(f"{i+1}. {sentence[:70]}...")

# Sort sentences in original order
summary_sentences.sort(key=lambda x: sentences.index(x))

print("\nSelected sentences (in original order):")
for i, sentence in enumerate(summary_sentences):
    print(f"{i+1}. {sentence[:70]}...")


Original sentences: 0
Summary sentences: 1

Selected sentences (unsorted):

Selected sentences (in original order):


In [11]:
# Step 8: Display summarized text

# Join sentences to form summary
summary = ' '.join(summary_sentences)

print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)
print(summary)

# Display statistics
print("\n" + "="*60)
print("STATISTICS")
print("="*60)
original_words = len(words)
summary_words = len(word_tokenize(summary))
original_sentences = len(sentences)
summary_sentences_count = len(summary_sentences)

print(f"Original text: {original_words} words, {original_sentences} sentences")
print(f"Summary text: {summary_words} words, {summary_sentences_count} sentences")

# Add checks to prevent ZeroDivisionError if original_words or original_sentences is 0
if original_words > 0:
    print(f"Compression ratio: {summary_words/original_words*100:.1f}%")
else:
    print("Compression ratio: N/A (Original text has no words)")

if original_sentences > 0:
    print(f"Summary keeps {summary_sentences_count/original_sentences*100:.1f}% of sentences")
else:
    print("Summary keeps: N/A (Original text has no sentences)")


FINAL SUMMARY


STATISTICS
Original text: 0 words, 0 sentences
Summary text: 0 words, 0 sentences
Compression ratio: N/A (Original text has no words)
Summary keeps: N/A (Original text has no sentences)
