In [1]:
import re
from collections import Counter
from typing import List, Dict

In [4]:
class QAFocusedSummarizer:    
    def __init__(self):
        # Keywords that indicate important information for QA
        self.qa_keywords = [
            'what', 'when', 'where', 'who', 'why', 'how', 'which',
            'because', 'due to', 'caused by', 'resulted in', 'led to',
            'definition', 'means', 'refers to', 'defined as'
        ]
    
    def extractive_summary(self, document: str, query: str, max_sentences: int = 3) -> str:
        sentences = self._split_sentences(document)
        query_words = set(query.lower().split())
        
        # Score sentences based on query relevance
        scored_sentences = []
        for sentence in sentences:
            score = self._calculate_relevance_score(sentence, query_words)
            scored_sentences.append((sentence, score))
        
        # Select top sentences
        top_sentences = sorted(scored_sentences, key=lambda x: x[1], reverse=True)[:max_sentences]
        return ' '.join([sent[0] for sent in top_sentences])
    
    def abstractive_summary(self, document: str, query: str) -> str:
        sentences = self._split_sentences(document)
        query_words = set(query.lower().split())
        
        # Extract key facts and entities
        key_facts = self._extract_key_facts(sentences, query_words)
        entities = self._extract_entities(document)
        
        # Generate focused summary
        summary_parts = []
        
        # Add direct answer if found
        direct_answer = self._find_direct_answer(sentences, query_words)
        if direct_answer:
            summary_parts.append(direct_answer)
        
        # Add supporting context
        context = self._extract_supporting_context(key_facts, entities)
        if context:
            summary_parts.append(context)
        
        return ' '.join(summary_parts)
    
    def _split_sentences(self, text: str) -> List[str]:
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]
    
    def _calculate_relevance_score(self, sentence: str, query_words: set) -> float:
        sentence_words = set(sentence.lower().split())
        
        # Base score from word overlap
        overlap = len(query_words.intersection(sentence_words))
        base_score = overlap / len(query_words) if query_words else 0
        
        # Bonus for QA keywords
        qa_bonus = sum(1 for word in self.qa_keywords if word in sentence.lower()) * 0.1
        
        # Bonus for sentence length (avoid very short sentences)
        length_bonus = min(len(sentence.split()) / 20, 0.2)
        
        return base_score + qa_bonus + length_bonus
    
    def _extract_key_facts(self, sentences: List[str], query_words: set) -> List[str]:
        facts = []
        for sentence in sentences:
            if self._calculate_relevance_score(sentence, query_words) > 0.3:
                facts.append(sentence)
        return facts[:2]  # Top 2 facts
    
    def _extract_entities(self, text: str) -> List[str]:
        words = text.split()
        entities = [word for word in words if word[0].isupper() and len(word) > 2]
        return list(set(entities))[:5]  # Top 5 unique entities
    
    def _find_direct_answer(self, sentences: List[str], query_words: set) -> str:
        for sentence in sentences:
            sentence_lower = sentence.lower()
            # Look for definition patterns
            if any(pattern in sentence_lower for pattern in ['is defined as', 'means', 'refers to']):
                return sentence
        return ""
    
    def _extract_supporting_context(self, facts: List[str], entities: List[str]) -> str:
        if not facts:
            return ""
        
        # Combine most relevant fact with entities
        main_fact = facts[0] if facts else ""
        entity_context = f"Key entities mentioned: {', '.join(entities[:3])}" if entities else ""
        
        return f"{main_fact} {entity_context}".strip()


In [5]:
class GenericSummarizer:
    
    def summarize(self, document: str, max_sentences: int = 3) -> str:
        sentences = self._split_sentences(document)
        
        # Score sentences based on general importance
        scored_sentences = []
        for sentence in sentences:
            score = self._calculate_generic_score(sentence, sentences)
            scored_sentences.append((sentence, score))
        
        # Select top sentences
        top_sentences = sorted(scored_sentences, key=lambda x: x[1], reverse=True)[:max_sentences]
        return ' '.join([sent[0] for sent in top_sentences])
    
    def _split_sentences(self, text: str) -> List[str]:
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]
    
    def _calculate_generic_score(self, sentence: str, all_sentences: List[str]) -> float:
        words = sentence.lower().split()
        
        # Word frequency across document
        all_words = ' '.join(all_sentences).lower().split()
        word_freq = Counter(all_words)
        
        # Score based on word frequencies
        freq_score = sum(word_freq[word] for word in words) / len(words)
        
        # Position bonus (first and last sentences often important)
        position_bonus = 0.1 if sentence in all_sentences[:2] or sentence in all_sentences[-2:] else 0
        
        # Length bonus
        length_bonus = min(len(words) / 15, 0.2)
        
        return freq_score + position_bonus + length_bonus



In [6]:
# Example usage and comparison
def compare_summarizers():
    # Sample document and query
    document = """
    Machine learning is a subset of artificial intelligence that focuses on algorithms 
    that can learn from data. Deep learning is a specialized form of machine learning 
    that uses neural networks with multiple layers. Neural networks are inspired by 
    the human brain and consist of interconnected nodes called neurons. The process 
    of training involves feeding data to the network and adjusting weights. 
    Backpropagation is the algorithm used to update these weights. Convolutional 
    neural networks are particularly effective for image recognition tasks. 
    They use filters to detect features in images. Recurrent neural networks 
    are designed for sequential data like text and time series.
    """
    
    query = "What is deep learning?"
    
    # Initialize summarizers
    qa_summarizer = QAFocusedSummarizer()
    generic_summarizer = GenericSummarizer()
    
    print("=== COMPARISON OF SUMMARIZATION APPROACHES ===\n")
    print(f"Query: {query}\n")
    print(f"Original Document Length: {len(document.split())} words\n")
    
    # QA-Focused Extractive Summary
    extractive_summary = qa_summarizer.extractive_summary(document, query)
    print("1. QA-Focused Extractive Summary:")
    print(f"{extractive_summary}\n")
    print(f"Length: {len(extractive_summary.split())} words")
    print(f"Noise Removal: Focuses on query-relevant sentences\n")
    
    # QA-Focused Abstractive Summary  
    abstractive_summary = qa_summarizer.abstractive_summary(document, query)
    print("2. QA-Focused Abstractive Summary:")
    print(f"{abstractive_summary}\n")
    print(f"Length: {len(abstractive_summary.split())} words")
    print(f"Noise Removal: Synthesizes answer-specific information\n")
    
    # Generic Summary (ChatGPT-like)
    generic_summary = generic_summarizer.summarize(document)
    print("3. Generic Summary (ChatGPT-like):")
    print(f"{generic_summary}\n")
    print(f"Length: {len(generic_summary.split())} words")
    print(f"Noise Removal: General importance, may include irrelevant info\n")
    
    # Analysis
    print("=== ANALYSIS ===")
    print("QA-Focused Summaries:")
    print("✓ Target specific information relevant to the query")
    print("✓ Remove noise unrelated to the question")
    print("✓ Maintain context needed for accurate answers")
    print()
    print("Generic Summaries:")
    print("• May include important but irrelevant information")
    print("• Don't prioritize query-specific content")
    print("• Good for general understanding, poor for specific QA")

if __name__ == "__main__":
    compare_summarizers()

=== COMPARISON OF SUMMARIZATION APPROACHES ===

Query: What is deep learning?

Original Document Length: 101 words

1. QA-Focused Extractive Summary:
Deep learning is a specialized form of machine learning 
    that uses neural networks with multiple layers Machine learning is a subset of artificial intelligence that focuses on algorithms 
    that can learn from data Backpropagation is the algorithm used to update these weights

Length: 42 words
Noise Removal: Focuses on query-relevant sentences

2. QA-Focused Abstractive Summary:
Machine learning is a subset of artificial intelligence that focuses on algorithms 
    that can learn from data Key entities mentioned: Neural, Convolutional, Backpropagation

Length: 23 words
Noise Removal: Synthesizes answer-specific information

3. Generic Summary (ChatGPT-like):
Deep learning is a specialized form of machine learning 
    that uses neural networks with multiple layers The process 
    of training involves feeding data to the network and