In [1]:
!pip install pypdf simplify_docx docx numpy tensorflow torch



DEPRECATION: simplify-docx 0.1.2 has a non-standard dependency specifier six>=1.12.0<2. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of simplify-docx or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [2]:
from transformers import logging, T5Tokenizer, T5ForConditionalGeneration
from pypdf import PdfReader
import docx
import torch
from simplify_docx import simplify

logging.set_verbosity_error()

def create_t5_tokenizer(model_name="t5-base"):
    """Load the pretrained T5 tokenizer registered under ``model_name``.

    Args:
        model_name: Checkpoint identifier handed to ``T5Tokenizer.from_pretrained``.

    Returns:
        Whatever ``T5Tokenizer.from_pretrained`` returns for that checkpoint.
    """
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    return tokenizer

def create_t5_summarizer(model_name="t5-base", device=None):
    """Load a pretrained T5 conditional-generation model, optionally moving it to ``device``.

    Args:
        model_name: Checkpoint identifier handed to
            ``T5ForConditionalGeneration.from_pretrained``.
        device: Target passed to the model's ``.to()``; a falsy value
            (``None``, ``""``) skips the move entirely.

    Returns:
        The loaded model (the result of ``.to(device)`` when a device was given).
    """
    summarizer = T5ForConditionalGeneration.from_pretrained(model_name)
    # Only relocate the model when the caller actually asked for a device.
    return summarizer.to(device) if device else summarizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

def break_text_in_sequences1(extracted_text, max_length_sequence):
    """Cut each text into fixed-width character windows.

    Newlines in every entry are replaced by single spaces first; each entry
    is then sliced independently, so a window never spans two entries and the
    last window of an entry may be shorter than ``max_length_sequence``.

    Args:
        extracted_text: Iterable of strings (e.g. one string per page).
        max_length_sequence: Window width in characters.

    Returns:
        A flat list of the character windows, in input order. Empty entries
        contribute nothing.
    """
    flattened = (entry.replace("\n", " ") for entry in extracted_text)
    return [
        line[start:start + max_length_sequence]
        for line in flattened
        for start in range(0, len(line), max_length_sequence)
    ]

def break_text_in_sequences(tokenizer, text, max_length_sequence):
    """Split ``text`` into word-aligned chunks that fit a token budget.

    Words (whitespace-separated) are appended to the current chunk for as long
    as the chunk, encoded with special tokens, stays within
    ``max_length_sequence`` tokens; the word that would overflow starts a new
    chunk instead.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``
            whose result supports ``len()``.
        text: The text to split.
        max_length_sequence: Maximum number of tokens allowed per chunk.

    Returns:
        List of non-empty chunk strings in original order. Empty or
        whitespace-only ``text`` yields ``[]``. A single word that on its own
        exceeds the budget is kept as its own (oversized) chunk, since words
        are never split.
    """
    chunks = []
    current = ''
    for word in text.split():
        if not current:
            # Nothing to compare against yet: the word opens the chunk even if
            # it is over budget by itself (avoids emitting an empty chunk).
            current = word
            continue
        candidate = current + ' ' + word
        if len(tokenizer.encode(candidate, add_special_tokens=True)) > max_length_sequence:
            chunks.append(current)
            current = word
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks

In [4]:

def extract_text_recursive(docx_json):
    """Collect the ``VALUE`` of every ``TYPE == 'text'`` node in a nested structure.

    Dicts and lists are walked depth-first in iteration order. A dict whose
    ``'TYPE'`` is ``'text'`` is treated as a leaf and contributes its
    ``'VALUE'`` (or ``''`` when the key is missing); any other dict has all of
    its values searched. Anything that is neither a dict nor a list
    contributes nothing.

    Args:
        docx_json: Nested dict/list structure (e.g. the result of ``simplify``).

    Returns:
        List of collected values in traversal order.
    """
    if isinstance(docx_json, dict):
        if docx_json.get('TYPE') == 'text':
            return [docx_json.get('VALUE', '')]
        children = docx_json.values()
    elif isinstance(docx_json, list):
        children = docx_json
    else:
        return []

    collected = []
    for child in children:
        collected += extract_text_recursive(child)
    return collected

def extract_text_from_pdf(filename):
    """Return the extracted text of every page of a PDF under ``./inputs/``.

    Args:
        filename: File name appended to the ``./inputs/`` directory prefix.

    Returns:
        List with one ``page.extract_text()`` result per page, in page order.
    """
    pdf = PdfReader('./inputs/' + filename)
    return [page.extract_text() for page in pdf.pages]

def extract_text_from_docx(filename):
    """Return the text values found in a .docx file under ``./inputs/``.

    The document is opened with ``docx.Document``, converted with
    ``simplify``, and the resulting structure is searched by
    ``extract_text_recursive``.

    Args:
        filename: File name appended to the ``./inputs/`` directory prefix.

    Returns:
        The list produced by ``extract_text_recursive`` for the simplified document.
    """
    simplified = simplify(docx.Document('./inputs/' + filename))
    return extract_text_recursive(simplified)

In [5]:
def concat_summaries(summarizer, text_segments, tokenizer, max_length=512, num_beams=4, early_stopping=True):
    """Summarize each segment independently and join the summaries.

    Args:
        summarizer: Model exposing ``generate(input_ids, ...)``. If it has a
            ``device`` attribute, the encoded inputs are moved to that device
            before generation so a model placed on the GPU receives GPU tensors.
        text_segments: Iterable of text chunks to summarize.
        tokenizer: Object exposing ``encode(text, return_tensors="pt")`` and
            ``decode(ids, skip_special_tokens=True)``.
        max_length: Forwarded to ``generate`` as ``max_length``.
        num_beams: Forwarded to ``generate`` as ``num_beams``.
        early_stopping: Forwarded to ``generate`` as ``early_stopping``.

    Returns:
        The per-segment summaries joined with single spaces (``""`` when there
        is nothing to summarize). Blank segments are skipped.
    """
    device = getattr(summarizer, "device", None)
    summaries = []
    for segment in text_segments:
        if not segment.strip():
            # An empty chunk carries no content worth sending to the model.
            continue
        input_ids = tokenizer.encode(segment, return_tensors="pt")
        if device is not None:
            # Inputs must live on the same device as the model weights.
            input_ids = input_ids.to(device)
        output = summarizer.generate(
            input_ids,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=early_stopping,
        )
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    # Separate the summaries so the last word of one does not fuse with the next.
    return " ".join(summaries)

def calculate_max_length_of_summary(text_list):
    """Return half the length of the shortest stripped text, capped at 1024 before halving.

    Args:
        text_list: Iterable of strings; each is measured after ``strip()``.

    Returns:
        ``int(shortest / 2)``, where ``shortest`` is the smallest stripped
        length, never above 1024. An empty ``text_list`` therefore gives 512.
    """
    ceiling = 1024
    shortest = min([ceiling] + [len(entry.strip()) for entry in text_list])
    return int(shortest / 2)

In [6]:
# Load the tokenizer for the default checkpoint ("t5-base").
tokenizer = create_t5_tokenizer()
# Read the sample document from ./inputs/ and join its text values with single spaces.
raw_text = " ".join(extract_text_from_docx('once-upon-a-time-test.docx'))
# Split the text into word-aligned chunks using a 1024-token budget per chunk.
# NOTE(review): the tokenizer warning printed for this cell mentions a 512-token
# length for t5-base — confirm that 1024-token chunks are intended.
processed_text = break_text_in_sequences(tokenizer, raw_text, 1024)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [7]:
# Load t5-base and place it on the GPU when CUDA is available, otherwise on the CPU.
summarizer = create_t5_summarizer(model_name="t5-base", device="cuda" if torch.cuda.is_available() else "cpu")
# Summarize every chunk independently and combine the per-chunk summaries into one string.
final_summary = concat_summaries(summarizer, processed_text, tokenizer)
print(final_summary)

the Enchanted Slumber: Elara's Tale of Kindness and Courage Once upon a time, in a kingdom nestled between towering mountains, there lived a kind and beautiful princess named Elara. she was known far and wide for her ebony hair, porcelain skin, and lips as red as the blood-red roses that bloomed in the royal gardens. her stepmother, Queen Malvina, was an enchantress, but her heart was as cold as the snow that
