In [None]:
import os
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Source directory holding the original .txt documents
input_folder = r"C:\Users\ADMIN\Documents\Document\USTH\B3\nlp\legaldoc_summarize\under2k_chunked"

# Destination directory for the per-chunk files (created if it does not exist)
output_folder = r"C:\Users\ADMIN\Documents\Document\USTH\B3\nlp\split_chunks"
os.makedirs(output_folder, exist_ok=True)

# Zero-width lookahead: split right before each "### Chunk X ###" marker, so
# the marker stays at the head of the piece it introduces.
chunk_boundary = re.compile(r'(?=### Chunk\s*\d+\s*###)')

for filename in os.listdir(input_folder):
    # Only .txt documents are processed
    if not filename.endswith(".txt"):
        continue

    with open(os.path.join(input_folder, filename), "r", encoding="utf-8") as src:
        content = src.read()

    # Trim every piece and drop the ones that are empty after trimming
    trimmed_pieces = (piece.strip() for piece in chunk_boundary.split(content))
    chunks = [piece for piece in trimmed_pieces if piece]

    # File name without its extension, e.g. "187" for "187.txt"
    base_name = os.path.splitext(filename)[0]

    # Piece k is written to "<base_name>_<k>.txt", numbering from 1
    for index, chunk in enumerate(chunks, start=1):
        output_path = os.path.join(output_folder, f"{base_name}_{index}.txt")
        with open(output_path, "w", encoding="utf-8") as dst:
            dst.write(chunk)

print("✅ All chunks have been successfully split and saved to 'split_chunks' folder.")


In [None]:
# Hugging Face checkpoint used for Vietnamese summarization
model_name = "VietAI/vit5-base-vietnews-summarization"

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the seq2seq model, placing the model on `device`
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [None]:
def summarize_text(text, max_sentences=2):
    """Summarize a piece of text with the notebook-level model and tokenizer.

    Relies on the module-level `tokenizer`, `model` and `device` objects.

    Args:
        text: Raw text to summarize. Any "### Chunk N ###" markers are removed
            before the text is passed to the model.
        max_sentences: Maximum number of leading sentences of the generated
            summary to keep (default 2).

    Returns:
        The first `max_sentences` sentences of the generated summary joined by
        single spaces, or "" when no text remains after removing the markers.
    """
    # Remove headers like "### Chunk 1 ###"
    text = re.sub(r"### Chunk\s*\d+\s*###", "", text).strip()

    # Nothing left to summarize: return early instead of letting the model
    # generate text from the bare prompt prefix.
    if not text:
        return ""

    # Tokenize the prompt; anything beyond 512 tokens is truncated.
    # NOTE(review): the task prefix used here is "summarize: " - confirm it
    # against the checkpoint's model card, which may expect a different prefix.
    input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    input_ids = input_ids.to(device)

    # Generate summary with beam search (4 beams, 30-100 tokens, no repeated
    # 3-grams)
    summary_ids = model.generate(
        input_ids,
        max_length=100,
        min_length=30,
        length_penalty=1.0,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Keep only the first `max_sentences` sentences; a sentence boundary is
    # one or more spaces that follow '.', '!' or '?'.
    sentences = re.split(r'(?<=[.!?]) +', summary)
    return " ".join(sentences[:max_sentences])


In [None]:
# Summarize every split chunk file (e.g. 187_1.txt) found in `output_folder`.
# The chunks written by the splitting step live there; reading the un-split
# originals from `input_folder` would bypass the split and let the 512-token
# truncation in summarize_text drop most of each document.
summary_suffix = "_summarize.txt"

# os.listdir returns a snapshot, so files written during the loop are not
# revisited; sorting makes the processing order deterministic.
for filename in sorted(os.listdir(output_folder)):
    # Skip non-text files and summaries produced by a previous run, so that
    # re-running this cell never summarizes a summary.
    if not filename.endswith(".txt") or filename.endswith(summary_suffix):
        continue

    input_path = os.path.join(output_folder, filename)
    with open(input_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Generate summary
    summary = summarize_text(text)

    # Construct output filename, e.g., 187_1.txt -> 187_1_summarize.txt
    base_name = os.path.splitext(filename)[0]
    output_filename = base_name + summary_suffix
    output_path = os.path.join(output_folder, output_filename)

    # Save summary to file
    with open(output_path, "w", encoding="utf-8") as out_f:
        out_f.write(summary)

    print(f"✅ Summarized: {filename} → {output_filename}")