In [3]:
import html
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import spacy
from textblob import TextBlob


nlp = spacy.load("en_core_web_sm")


model_name = 'google/flan-t5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def analyze_content_structure(text):
    """
    Analyze text to understand its structure, key elements, and suggest the best summarization format.
    """
    doc = nlp(text)
    num_entities = len(doc.ents)
    num_sentences = len(list(doc.sents))
    num_tokens = len(doc)
    num_keywords = sum(1 for token in doc if not token.is_stop)
    num_bullets = text.count('- ') + text.count('* ') + text.count('• ')
    num_arrows = text.count('->') + text.count('=>')

    
    if num_bullets > 3 or 'list' in text.lower():
        return "bullet points"
    elif num_arrows > 2 or "process" in text.lower() or "steps" in text.lower():
        return "flowchart"
    elif num_entities > 5 and num_keywords / num_tokens < 0.5:
        return "table"
    else:
        return "paragraph"

def get_optional_context():
    context = input("Would you like to provide additional context for better summarization? (optional, press enter to skip): ")
    return context

def analyze_sentiment(text):
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity, sentiment.subjectivity

def clean_text(text):

    text = html.unescape(text)    
    text = text.replace('&nbsp;', ' ').replace('&lt;', '<').replace('&gt;', '>')
    return text

def clean_summary(summary, style):

    if style == "bullet points":
        cleaned_lines = []
        for line in summary.split(". "):
            line = line.strip().lstrip("1234567890.- ")
            if line:
                cleaned_lines.append(f"- {line}")
        return "\n".join(cleaned_lines)
    elif style == "flowchart":
        cleaned_lines = []
        for line in summary.split(". "):
            line = line.strip().lstrip("1234567890.- ")
            if line:
                cleaned_lines.append(f"-> {line}")
        return "\n".join(cleaned_lines)
    else:
        return summary.replace(". ", ".\n")

def summarize_section(section_text, context='', style='paragraph'):

    section_text = clean_text(section_text)
    combined_input = context + "\n" + section_text if context else section_text    
    prompt = f"summarize the content as {style}: {combined_input}"

    input_tokens = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    input_length = input_tokens.shape[1]
    max_length = 1000 
    min_length = 100 

    summary_ids = model.generate(
        input_tokens,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        no_repeat_ngram_size=3,  
        early_stopping=True
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def summarize_text(file_path, context=''):

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    text = clean_text(text)
    sections = text.split("\n## ")
    summarized_sections = []

    for section in sections:
        
        style = analyze_content_structure(section)
        
        section_summary = summarize_section(section, context, style)
        
        cleaned_summary = clean_summary(section_summary, style)
        summarized_sections.append(f"## {cleaned_summary}")

    
    final_summary = "\n\n".join(summarized_sections)
    
    output = f"\n{'='*40}\n"
    output += f"Summarized Text:\n{'-'*40}\n"
    output += final_summary + "\n"

    return output

def main():
    
    # input_file = input("Enter the path to the text file you want to summarize: ")
    input_file = 'try.txt'
    context = get_optional_context()
    summarized_text = summarize_text(input_file, context)
    print(summarized_text)

    with open('summarized_output.txt', 'w', encoding='utf-8') as output_file:
        output_file.write(summarized_text)
    print("\nSummarized text saved to 'summarized_output.txt'.")

if __name__ == "__main__":
    main()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



Summarized Text:
----------------------------------------
## # Project Management Guide Project Management: A Guide to Project Management and Project Management.
- a guide to project management and project management.
&lt;b&gt;...&nbsp; Project Management & Project Management |||| - Project Management is a project management tool for project managers and project managers.
It is designed to help project managers understand and implement the project's goals and objectives.
 Global Project Management, Inc.
(Georgia) - USATODAY.com

## - Introduction to Project Management Project management involves planning, organizing, and overseeing a project from start to finish
- It helps organizations achieve specific goals by effectively using resources, such as time, budget, and personnel
- ### Key Components: - **Planning:** Establishing the project's goals and defining the tasks required to achieve them
- **Organizing:*** Allocating resources and assigning tasks to team members
- **Monitoring: *