<a href="https://colab.research.google.com/github/Zerothlaw0095/docpdf/blob/main/Fine_tune_Bart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning BART on a Custom Dataset (the textbook *Software Engineering* by Ian **Sommerville**)

In [None]:
!pip install transformers datasets evaluate openpyxl accelerate sentencepiece --quiet

import pandas as pd
from datasets import Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from evaluate import load

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h


# Loading dataset

In [None]:
# Read the spreadsheet, give the two relevant columns short names, and discard
# any row that lacks either the source text or its reference summary.
column_aliases = {"Chapter Text (first 2000 chars)": "text", "Summary": "summary"}
df = (
    pd.read_excel("/content/Clean_Book_Summary.xlsx")
    .rename(columns=column_aliases)
    .dropna(subset=["text", "summary"])
)

# Quick look at what was loaded.
print("Sample rows:")
print(df.loc[:, ["text", "summary"]].head(2))
print("\nTotal samples:", len(df))

# Wrap the frame as a Hugging Face Dataset and hold out 10% of the rows for
# evaluation (fixed seed so the split is reproducible).
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.1, seed=42)

Sample rows:
                                                text  \
0  Global edition Software Engineering TENTH edit...   
1  . Minor changes and updates have been made to ...   

                                             summary  
0  Software Engineering, 10th edition, by Ian Som...  
1  The book is aimed at university and college st...  

Total samples: 535


# Tokenizing

In [None]:
# Start from the CNN/DailyMail summarization checkpoint of BART-large.
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

def preprocess(batch):
    """Tokenize a batch of examples into model inputs and training labels.

    Args:
        batch: Mapping with a "text" list (source passages) and a "summary"
            list (reference summaries), as supplied by ``Dataset.map`` with
            ``batched=True``.

    Returns:
        The tokenizer output for the source texts (truncated/padded to 1024
        tokens) with an added "labels" entry: the summary token ids
        (truncated/padded to 128 tokens) in which every padding position is
        replaced by -100.
    """
    inputs = tokenizer(batch["text"], max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(batch["summary"], max_length=128, truncation=True, padding="max_length")
    # Padding positions must not contribute to the training loss. The loss used
    # by the model ignores label id -100, so mask every position the tokenizer
    # marked as padding (attention_mask == 0) instead of leaving pad token ids.
    inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(label_ids, label_mask)]
        for label_ids, label_mask in zip(labels["input_ids"], labels["attention_mask"])
    ]
    return inputs

# Tokenize both splits and drop the raw columns so only model inputs remain.
tokenized = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Map:   0%|          | 0/481 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

In [None]:
import os
# Disable the Weights & Biases logging integration for the training run below.
# NOTE(review): the warning printed by the training cell reports that this
# environment variable is deprecated and recommends `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"


# Training setup

In [None]:
# Training configuration.
# - report_to="none" switches off external loggers explicitly; the
#   WANDB_DISABLED environment variable is deprecated (see the warning this
#   cell used to print).
# - Checkpoints are written once per epoch and the checkpoint with the lowest
#   validation loss is restored when training ends, so a later, over-fitted
#   epoch is not the one that gets saved.
args = TrainingArguments(
    output_dir="./bart-software-eng-manual",
    eval_strategy="epoch", # Changed from evaluation_strategy
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_steps=50,
    fp16=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
)

trainer.train()

# Persist the model and tokenizer side by side so the directory can be loaded
# directly by `pipeline(...)` / `from_pretrained(...)`.
model.save_pretrained("./bart-software-eng-manual")
tokenizer.save_pretrained("./bart-software-eng-manual")
print("✅ Fine-tuning complete and model saved!")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.3264,1.058386
2,0.6177,1.062588
3,0.5829,1.192145




✅ Fine-tuning complete and model saved!


In [None]:
!pip install rouge_score --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


# Evaluate briefly with ROUGE

In [None]:
# Score generated summaries against the reference summaries on (at most) the
# first 50 held-out examples.
rouge = load("rouge")
preds, refs = [], []
model.eval()  # make sure dropout is off; the model may still be in training mode
device = model.device # Get the device of the model
eval_subset = dataset["test"].select(range(min(50, len(dataset["test"]))))
for s in eval_subset:
    # Keep the whole encoding (input ids AND attention mask) and move it to the
    # model's device so generate() receives an explicit mask.
    encoded = tokenizer(s["text"], return_tensors="pt", truncation=True, max_length=1024).to(device)
    summary_ids = model.generate(**encoded, max_length=128, min_length=40)
    preds.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
    refs.append(s["summary"])

print("ROUGE:", rouge.compute(predictions=preds, references=refs))

Downloading builder script: 0.00B [00:00, ?B/s]

ROUGE: {'rouge1': np.float64(0.557433070072213), 'rouge2': np.float64(0.4408551193608171), 'rougeL': np.float64(0.47738645704701266), 'rougeLsum': np.float64(0.4768300563275404)}


# Test Summarization

In [None]:
from transformers import pipeline
# Build a summarization pipeline from the directory the training cell saved
# the fine-tuned model and tokenizer into.
summarizer = pipeline("summarization", model="./bart-software-eng-manual")

text_query = """As computer systems have become deeply embedded in our business and personal
lives, the problems that result from system and software failure are increasing. A
failure of server software in an e-commerce company could lead to a major loss of
revenue and customers for that company. A software error in an embedded control
system in a car could lead to expensive recalls of that model for repair and, in the
worst case, could be a contributory factor in accidents. The infection of company
PCs with malware requires expensive clean-up operations to sort out the problem
and could lead to the loss of or damage to sensitive information.
 Because software-intensive systems are so important to governments, companies,
and individuals, we have to be able to trust these systems. The software should be
available when it is needed, and it should operate correctly without undesirable side
effects, such as unauthorized information disclosure. In short, we should be able to
depend on our software systems.
 The term dependability was proposed by Jean-Claude Laprie in 1995 to cover the
related systems attributes of availability, reliability, safety, and security. His ideas
were revised over the next few years and are discussed in a definitive paper pub
lished in 2004 (Avizienis et al. 2004). As I discuss in Section 10.1, these properties
are inextricably linked, so having a single term to cover them all makes sense.
 The dependability of systems is usually more important than their detailed func
tionality for the following reasons:
 1. System failures affect a large number of people Many systems include func
tionality that is rarely used. If this functionality were left out of the system, only
a small number of users would be affected. System failures that affect the avail
ability of a system potentially affect all users of the system. Unavailable sys
tems may mean that normal business is impossible.
 2. Users often reject systems that are unreliable, unsafe, or insecure If users find
that a system is unreliable or insecure, they will refuse to use it. Furthermore,
they may also refuse to buy or use other products from the company that pro
duced the unreliable system. They do not want a repetition of their bad experi
ence with an undependable system.
 3. System failure costs may be enormous For some applications, such as a reactor
control system or an aircraft navigation system, the cost of system failure is
orders of magnitude greater than the cost of the control system. Failures in sys
tems that control critical infrastructure such as the power network have wide
spread economic consequences.
 4.
Undependable systems may cause information loss Data is very expensive to collect
and maintain; it is usually worth much more than the computer system on which it
is processed. The cost of recovering lost or corrupt data is usually very high."""
# Summarize the sample passage and show only the generated text.
pipeline_output = summarizer(text_query, max_length=150, min_length=40)
generated_summary = pipeline_output[0]["summary_text"]
print("\nGenerated Summary:\n", generated_summary)

Device set to use cuda:0



Generated Summary:
 Software-intensive systems are so important to governments, companies,  metrics, and individuals, we have to be able to trust these systems. The term dependability was proposed by Jean-Claude Laprie in 1995 to cover the related systems attributes of availability, reliability, safety, and security. System failures affect a large number of people. Data is very expensive to collect  and maintain; it is usually worth much more than the computer system on which it is processed.


In [None]:
# 🔧 Install dependencies
!apt install -y tesseract-ocr
!pip install pytesseract PyMuPDF pandas Pillow tqdm opencv-python-headless

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytesseract, PyMuPDF
Successfully installed PyMuPDF-1.26.6 pytesseract-0.3.13


# PDF TEXT EXTRACTION

# EXAMPLE 1 (Assumption: a PDF named Split.pdf has already been uploaded to this Colab session)

# IMAGE PREPROCESSING FUNCTION

In [None]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import pandas as pd
from tqdm import tqdm
import io
import cv2
import numpy as np

PDF_PATH = "/content/Split.pdf" # Path of the PDF to run OCR on; assumed to already exist at this location

def preprocess_image_for_ocr(pil_img):
    """
    Turn a PIL image into a denoised, binarized OpenCV image for OCR.

    Args:
        pil_img: PIL image of a page. Any mode is accepted; it is normalised
            to RGB before processing.

    Returns:
        A 2-D uint8 NumPy array containing the Otsu-thresholded
        (black/white) version of the page.
    """
    # PIL stores channels in RGB order, not OpenCV's BGR, so the matching
    # conversion code is RGB2GRAY. Converting the mode first also lets
    # grayscale, palette and RGBA images through without cvtColor failing.
    rgb = np.array(pil_img.convert("RGB"))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    gray = cv2.medianBlur(gray, 3)  # remove salt-and-pepper noise
    # With THRESH_OTSU the threshold is computed from the image histogram and
    # the explicit threshold argument is ignored, so pass 0 rather than a
    # value that looks meaningful but has no effect.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # NOTE(review): with a 1x1 kernel these morphological operations should
    # leave the image unchanged; enlarge the kernel if speckle removal is wanted.
    kernel = np.ones((1, 1), np.uint8)
    processed = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)
    processed = cv2.morphologyEx(processed, cv2.MORPH_CLOSE, kernel)
    return processed

# ============================================================
# OCR PIPELINE — MERGED TEXT
# ============================================================
def extract_pdf_text_as_one(pdf_path):
    """
    Performs full-page OCR across all pages and merges all extracted text into one paragraph.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A single string containing the OCR text of every page in page order,
        with all runs of whitespace (including line breaks) collapsed to one
        space.
    """
    page_texts = []
    pdf = fitz.open(pdf_path)
    try:
        for page_num in tqdm(range(len(pdf)), desc="🧠 Extracting OCR from all pages"):
            page = pdf[page_num]
            pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))  # 3x scaling for clarity
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            processed_img = preprocess_image_for_ocr(img)
            page_texts.append(pytesseract.image_to_string(processed_img, lang="eng"))
    finally:
        # Release the document even if rendering or OCR fails part-way through.
        pdf.close()

    # Joining the pages and re-splitting on whitespace merges everything into
    # one paragraph and removes newlines and repeated spaces in a single pass.
    return " ".join(" ".join(page_texts).split())

# ============================================================
# RUN EXTRACTION
# ============================================================
# OCR the whole document, then print the merged text after one blank line.
merged_text = extract_pdf_text_as_one(PDF_PATH)
print(f"\n{merged_text}")

🧠 Extracting OCR from all pages: 100%|██████████| 33/33 [03:59<00:00,  7.26s/it]


23 Project planning Objectives The objective of this chapter is to introduce project planning, scheduling, and cost estimation. When you have read the chapter, you will: understand the fundamentals of software costing and the factors that affect the price of a software system to be developed for external clients; know what sections should be included in a project plan that is created within a plan-driven development process; understand what is involved in project scheduling and the use of bar charts to present a project schedule; have been introduced to agile project planning based on the “planning game”; understand cost estimation techniques and how the COCOMO II model can be used for software cost estimation. Contents 23.1 Software pricing 23.2 Plan-driven development 23.3 Project scheduling 23.4 Agile planning 23.5 Estimation techniques 23.6 COCOMO cost modeling 668 Chapter 23 Project planning Project planning is one of the most important jobs of a software project manager. As a ma




# Test Summarization on the Text Extracted from the PDF

In [None]:
import textwrap

from transformers import pipeline

# Initialize the summarization pipeline with the fine-tuned model
summarizer = pipeline("summarization", model="./bart-software-eng-manual", device_map="auto")

# Chunk size in CHARACTERS. The model's input limit (1024) is measured in
# tokens, not characters, so a 1024-character chunk is a conservative size;
# `truncation=True` below additionally guards against an over-long chunk.
max_input_length = 1024

# Split the merged text into chunks on whitespace so that no word is cut in
# half at a chunk boundary (fixed-offset slicing would split words).
text_chunks = textwrap.wrap(merged_text, max_input_length)

# Summarize each chunk
summaries = []
print("\nGenerating summaries for text chunks:")
for i, chunk in enumerate(text_chunks, start=1):
    print(f"Summarizing chunk {i}/{len(text_chunks)}")
    try:
        # Generate summary for the current chunk
        summary = summarizer(
            chunk, max_length=150, min_length=40, do_sample=False, truncation=True
        )[0]["summary_text"]
        summaries.append(summary)
    except Exception as e:
        # Best effort: report the failing chunk and carry on with the rest.
        print(f"Error summarizing chunk {i}: {e}")
        continue

# Join the summaries of the chunks to get a comprehensive summary
full_summary = " ".join(summaries)

print("\nGenerated Full Summary:\n", full_summary)

Device set to use cpu



Generating summaries for text chunks:
Summarizing chunk 1/80
Summarizing chunk 2/80
Summarizing chunk 3/80
Summarizing chunk 4/80
Summarizing chunk 5/80
Summarizing chunk 6/80
Summarizing chunk 7/80
Summarizing chunk 8/80
Summarizing chunk 9/80
Summarizing chunk 10/80
Summarizing chunk 11/80
Summarizing chunk 12/80
Summarizing chunk 13/80
Summarizing chunk 14/80
Summarizing chunk 15/80
Summarizing chunk 16/80
Summarizing chunk 17/80
Summarizing chunk 18/80
Summarizing chunk 19/80
Summarizing chunk 20/80
Summarizing chunk 21/80
Summarizing chunk 22/80
Summarizing chunk 23/80
Summarizing chunk 24/80
Summarizing chunk 25/80
Summarizing chunk 26/80
Summarizing chunk 27/80
Summarizing chunk 28/80
Summarizing chunk 29/80
Summarizing chunk 30/80
Summarizing chunk 31/80
Summarizing chunk 32/80
Summarizing chunk 33/80
Summarizing chunk 34/80
Summarizing chunk 35/80
Summarizing chunk 36/80
Summarizing chunk 37/80
Summarizing chunk 38/80
Summarizing chunk 39/80
Summarizing chunk 40/80
Summarizin

# After uploading the model above to the Hugging Face Hub, we can pull it from there for use in a backend

# EXAMPLE 2 (Note: the PDF has to be uploaded when prompted after running the cell)

In [None]:
%pip install pymupdf pytesseract
%pip install PyPDF2

In [None]:
import os
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import cv2
import numpy as np
import requests
from tqdm import tqdm
from google.colab import files

uploaded = files.upload()  # upload PDF interactively
if not uploaded:
    # Without this check an empty upload surfaces as a bare StopIteration.
    raise ValueError("No file was uploaded; run this cell again and select a PDF.")
PDF_PATH = next(iter(uploaded))  # get the first uploaded filename


# ============================================================
# IMAGE PREPROCESSING FOR OCR
# ============================================================
def preprocess_image_for_ocr(pil_img):
    """Clean and enhance scanned PDF images for OCR.

    Args:
        pil_img: PIL image of a page. Any mode is accepted; it is normalised
            to RGB before processing.

    Returns:
        A 2-D uint8 NumPy array containing the Otsu-thresholded
        (black/white) version of the page.
    """
    # PIL stores channels in RGB order, not OpenCV's BGR, so the matching
    # conversion code is RGB2GRAY. Converting the mode first also lets
    # grayscale, palette and RGBA images through without cvtColor failing.
    rgb = np.array(pil_img.convert("RGB"))
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    gray = cv2.medianBlur(gray, 3)  # remove salt-and-pepper noise
    # With THRESH_OTSU the threshold is computed from the image histogram and
    # the explicit threshold argument is ignored, so pass 0 rather than a
    # value that looks meaningful but has no effect.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # NOTE(review): with a 1x1 kernel these morphological operations should
    # leave the image unchanged; enlarge the kernel if speckle removal is wanted.
    kernel = np.ones((1, 1), np.uint8)
    processed = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)
    processed = cv2.morphologyEx(processed, cv2.MORPH_CLOSE, kernel)
    return processed


# ============================================================
# OCR FUNCTION — MERGE ALL TEXT
# ============================================================
def extract_pdf_text_as_one(pdf_path):
    """Perform OCR on all pages and merge the extracted text.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A single string containing the OCR text of every page in page order,
        with all runs of whitespace (including line breaks) collapsed to one
        space.
    """
    page_texts = []
    pdf = fitz.open(pdf_path)
    try:
        for page_num in tqdm(range(len(pdf)), desc="Extracting OCR from all pages"):
            page = pdf[page_num]
            pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))  # scale up for clarity
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            processed_img = preprocess_image_for_ocr(img)
            page_texts.append(pytesseract.image_to_string(processed_img, lang="eng"))
    finally:
        # Release the document even if rendering or OCR fails part-way through.
        pdf.close()

    # Joining the pages and re-splitting on whitespace merges everything into
    # one paragraph and removes newlines and repeated spaces in a single pass.
    return " ".join(" ".join(page_texts).split())


# ============================================================
# CALL HUGGING FACE MODEL
# ============================================================
def query_hf_model(text, api_url=None, request_headers=None, timeout=60):
    """Send text chunk to the Hugging Face summarization model.

    Args:
        text: The text to summarize; sent as the "inputs" field of the JSON
            payload.
        api_url: Endpoint to POST to. Defaults to the module-level
            ``HF_MODEL_API_URL`` when that name is defined.
        request_headers: HTTP headers for the request. Defaults to the
            module-level ``headers`` when that name is defined, else none.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The "summary_text" of the first item in the JSON response, or an
        empty string when the request fails or the response does not have
        that shape (the problem is printed in both cases).

    Raises:
        RuntimeError: If no endpoint URL is available.
    """
    # Resolve the configuration lazily so that a missing definition produces
    # a clear message instead of a NameError from inside the request call.
    if api_url is None:
        api_url = globals().get("HF_MODEL_API_URL")
    if request_headers is None:
        request_headers = globals().get("headers")
    if not api_url:
        raise RuntimeError(
            "HF_MODEL_API_URL is not defined; define it (and optionally `headers`) "
            "or pass api_url=... to query_hf_model()."
        )

    payload = {"inputs": text}
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(api_url, headers=request_headers, json=payload, timeout=timeout)

    if response.status_code != 200:
        print(f" Error {response.status_code}: {response.text}")
        return ""

    try:
        return response.json()[0]["summary_text"]
    except (ValueError, KeyError, IndexError, TypeError):
        # A 200 response without the expected shape is not a summary; report
        # it rather than returning the raw payload as if it were one.
        print(f" Unexpected response format: {response.text}")
        return ""


# ============================================================
# CHUNKING & SUMMARIZATION
# ============================================================
def summarize_text(text, chunk_size=1024):
    """Summarize ``text`` piece by piece through the API and join the results.

    The text is cut into consecutive slices of ``chunk_size`` characters, each
    slice is passed to ``query_hf_model``, and the returned strings are joined
    with single spaces.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    total = len(pieces)

    print(f"\n📑 Generating summaries for {total} chunks:")
    collected = []
    for position, piece in enumerate(pieces, start=1):
        print(f"⏳ Summarizing chunk {position}/{total} ...")
        collected.append(query_hf_model(piece))

    return " ".join(collected)


# ============================================================
# MAIN EXECUTION
# ============================================================
# Fail loudly when the PDF is missing: the summarization cell below reads
# `merged_text`, and merely printing an error here would leave that name
# undefined or still holding the text extracted in Example 1.
if not os.path.exists(PDF_PATH):
    raise FileNotFoundError(f"PDF not found at {PDF_PATH}")

merged_text = extract_pdf_text_as_one(PDF_PATH)
print(f"\n Extracted {len(merged_text)} characters of text.")


In [None]:
from huggingface_hub import login
from transformers import pipeline
import textwrap

# ------------------------------------------------------------
# Authenticate with Hugging Face
# ------------------------------------------------------------
# SECURITY: never hardcode an access token in a notebook. A token was
# previously committed on this line; it should be revoked in the Hugging Face
# account settings. Read the token from the HF_TOKEN environment variable and
# fall back to an interactive prompt that does not echo the input.
import os
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(hf_token)

# ------------------------------------------------------------
# Load your model from Hugging Face
# ------------------------------------------------------------
# Summarization pipeline whose model and tokenizer both come from the same
# Hub repository; device placement is delegated via device_map="auto".
summarizer = pipeline(
    "summarization",
    model="Aditya-devop/academica_summarizer",
    tokenizer="Aditya-devop/academica_summarizer",
    device_map="auto"
)

# ------------------------------------------------------------
# Summarization helper
# ------------------------------------------------------------
def summarize_text(text, max_chunk_len=1500):
    """Summarize ``text`` chunk by chunk with the module-level ``summarizer``.

    The text is wrapped into chunks of at most ``max_chunk_len`` characters,
    each chunk is summarized on its own, and the stripped summaries are joined
    with single spaces.
    """
    pieces = textwrap.wrap(text, max_chunk_len)
    total = len(pieces)
    print(f"🪄 Summarizing {total} chunks...")

    results = []
    for number, piece in enumerate(pieces, start=1):
        print(f"⏳ Chunk {number}/{total}")
        output = summarizer(piece, max_length=300, min_length=80, do_sample=False)
        results.append(output[0]["summary_text"].strip())

    return " ".join(results)


# ------------------------------------------------------------
# Run summarization
# ------------------------------------------------------------
# Summarize the OCR text, display it, and keep a copy on disk.
final_summary = summarize_text(merged_text)

# Header line followed by a blank line, then the summary itself.
print("\n FINAL SUMMARY:\n", final_summary, sep="\n")

with open("/content/summary_output.txt", "w", encoding="utf-8") as summary_file:
    summary_file.write(final_summary)

print("\n Saved summary_output.txt")