<a href="https://colab.research.google.com/github/badrinath2605/sentiment_semmarization_sih/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Check GPU
!nvidia-smi -L
import torch
print("Torch:", torch.__version__, "CUDA available:", torch.cuda.is_available())


GPU 0: Tesla T4 (UUID: GPU-b5488df8-6677-421c-0e45-a7126a719027)
Torch: 2.8.0+cu126 CUDA available: True


In [None]:
# Run this as a single code cell (starts with ! so it's executed in shell)
!pip install -q sentence-transformers transformers[torch] torch torchvision torchaudio scikit-learn nltk && python -m nltk.downloader punkt


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# hybrid_colab.py (paste in a cell)
import gc, os, sys
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Fetch the Punkt sentence-tokenizer data used by sent_tokenize (quiet: no progress log).
nltk.download('punkt', quiet=True)

# Also fetch the 'punkt_tab' resource as a fallback (some tokenizers expect this).
# NOTE(review): this call is not quiet, so it logs to the cell output on every run.
nltk.download('punkt_tab')


# ---------- CONFIG (colab-friendly defaults) ----------
# Sentence-embedding model used for the extractive step.
SENT_MODEL = "sentence-transformers/all-mpnet-base-v2"   # good general SBERT (replace with legal SBERT if you want)
# For Colab (T4 16GB) start with a small abstractive model; switch to LED only if you run out of options
ABSTR_MODEL = "t5-small"   # alternatives: "facebook/bart-large-cnn" or "allenai/led-base-16384" (may OOM)
# Default number of sentences the extractive step keeps (default `top_k` of the extract_* functions).
TOP_K = 6
# Minimum sentence length in characters; shorter sentences are dropped by split_sentences.
MIN_SENT_LEN = 15
# Device the abstractive model and its input tensors are moved to.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Default number of tokens shared between consecutive chunks in chunk_text_for_tokenizer.
CHUNK_OVERLAP = 50
# When True, the pipeline prints progress messages.
VERBOSE = True

# ---------- Setup models ----------
# Report which device the abstractive model will be placed on.
print("Device:", DEVICE)
# Sentence embedder for the extractive step; no explicit device is passed to the constructor.
sent_model = SentenceTransformer(SENT_MODEL)

def load_abstractive(model_name):
    """Load a seq2seq model and its tokenizer and move the model to ``DEVICE``.

    Args:
        model_name: Hugging Face model id or local path passed to
            ``from_pretrained``.

    Returns:
        A tuple ``(tokenizer, model, is_led)``. ``is_led`` is True when the
        model should receive a ``global_attention_mask`` during generation.

    Raises:
        Exception: any error raised while loading or moving the model is
            printed and then re-raised unchanged.
    """
    try:
        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        model.to(DEVICE)
    except Exception as e:
        print("Error loading abstractive model:", e)
        raise

    # Prefer the architecture recorded in the model config: a plain substring
    # test on the name also matches unrelated names that merely contain "led"
    # (e.g. "...distilled...") and misses LED checkpoints named differently.
    model_type = getattr(getattr(model, "config", None), "model_type", None)
    if model_type:
        is_led = str(model_type).lower() in ("led", "longformer")
    else:
        # No architecture information available: fall back to the name heuristic.
        lowered = model_name.lower()
        is_led = ("led" in lowered) or ("longformer" in lowered)

    if VERBOSE:
        print(f"Loaded {model_name} on {DEVICE} (LED-like={is_led})")
    return tok, model, is_led

# Load the abstractive model once; these three module-level names are read by hybrid_summarize.
tokenizer, abstr_model, abstr_is_led = load_abstractive(ABSTR_MODEL)

# ---------- helpers ----------
def split_sentences(text, min_len=MIN_SENT_LEN):
    """Tokenize ``text`` into sentences and drop the short ones.

    Each sentence is stripped of surrounding whitespace first; only sentences
    whose stripped length is at least ``min_len`` characters are returned, in
    their original order.
    """
    stripped = (raw.strip() for raw in sent_tokenize(text))
    return [sentence for sentence in stripped if len(sentence) >= min_len]

def embed_sentences(sents):
    """Encode ``sents`` with the module-level ``sent_model``.

    Returns the result of ``sent_model.encode`` requested as NumPy output,
    with the progress bar disabled.
    """
    encode_options = {"convert_to_numpy": True, "show_progress_bar": False}
    return sent_model.encode(sents, **encode_options)

def extract_by_centroid(text, top_k=TOP_K):
    """Pick the sentences closest to the mean embedding of the document.

    Args:
        text: Document to summarize extractively.
        top_k: Maximum number of sentences to keep. Values below zero are
            treated as zero.

    Returns:
        A tuple ``(summary, chosen)``. ``chosen`` is a list of
        ``(sentence_index, sentence)`` pairs in document order and ``summary``
        is those sentences joined by single spaces. Returns ``("", [])`` when
        no sentence survives ``split_sentences``.
    """
    sents = split_sentences(text)
    if not sents:
        return "", []
    embs = embed_sentences(sents)
    centroid = embs.mean(axis=0, keepdims=True)
    # ravel() always yields a 1-D score vector; squeeze() would collapse the
    # single-sentence case to a 0-d array.
    sims = cosine_similarity(embs, centroid).ravel()
    # Clamp so a negative top_k cannot turn into a "[:-n]" slice that keeps
    # almost every sentence.
    keep = max(0, min(top_k, len(sents)))
    top_idx = np.argsort(-sims)[:keep]
    # Restore document order so the extract reads naturally.
    chosen = [(i, sents[i]) for i in sorted(int(i) for i in top_idx)]
    summary = " ".join(s for _, s in chosen)
    return summary, chosen

def chunk_text_for_tokenizer(text, tokenizer, max_tokens, overlap=CHUNK_OVERLAP):
    """Split ``text`` into overlapping windows of at most ``max_tokens`` tokens.

    Args:
        text: Text to split.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and ``decode(ids, clean_up_tokenization_spaces=True)``.
        max_tokens: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by consecutive chunks. It is clamped
            to the range ``[0, max_tokens - 1]``.

    Returns:
        ``[text]`` unchanged when it already fits; otherwise a list of decoded
        chunk strings covering all tokens in order.

    Raises:
        ValueError: if the text needs chunking and ``max_tokens`` is not
            positive.
    """
    ids = tokenizer.encode(text, add_special_tokens=False)
    if len(ids) <= max_tokens:
        return [text]
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")

    # The window has to advance by at least one token per step, otherwise the
    # loop never terminates; a negative overlap would silently skip tokens.
    overlap = min(max(overlap, 0), max_tokens - 1)
    stride = max_tokens - overlap

    chunks = []
    for start in range(0, len(ids), stride):
        end = start + max_tokens
        chunks.append(tokenizer.decode(ids[start:end], clean_up_tokenization_spaces=True))
        if end >= len(ids):
            # This window already reached the last token.
            break
    return chunks

def set_led_global_attention(input_ids):
    """Build a global-attention mask that flags only the first position.

    Returns a new tensor shaped like ``input_ids`` that is zero everywhere
    except index 0 along dimension 1, which is set to 1. ``input_ids`` itself
    is not modified.
    """
    global_mask = torch.zeros_like(input_ids)
    # select() returns a view, so filling it writes into global_mask.
    first_position = global_mask.select(1, 0)
    first_position.fill_(1)
    return global_mask

def generate_with_model(texts, tokenizer, model, is_led, max_new_tokens=120, num_beams=4, batch_size=4):
    """Generate one output string per input text, processing in batches.

    Args:
        texts: List of input strings.
        tokenizer: Tokenizer used to encode the inputs and decode the outputs.
        model: Model exposing ``eval()`` and ``generate(**kwargs)``.
        is_led: When True, a global attention mask built by
            ``set_led_global_attention`` is passed to ``generate``.
        max_new_tokens: Forwarded to ``generate``.
        num_beams: Forwarded to ``generate``.
        batch_size: Number of texts encoded and generated together.

    Returns:
        List of decoded strings, in the same order as ``texts``.
    """
    target_device = DEVICE
    results = []
    model.eval()
    with torch.no_grad():  # inference only
        for offset in range(0, len(texts), batch_size):
            encoded = tokenizer(
                texts[offset:offset + batch_size],
                return_tensors="pt",
                truncation=True,
                padding=True,
            )
            ids = encoded["input_ids"].to(target_device)
            generation_args = {
                "input_ids": ids,
                "attention_mask": encoded["attention_mask"].to(target_device),
                "max_new_tokens": max_new_tokens,
                "num_beams": num_beams,
                "early_stopping": True,
            }
            if is_led:
                generation_args["global_attention_mask"] = set_led_global_attention(ids)
            for sequence in model.generate(**generation_args):
                results.append(
                    tokenizer.decode(sequence, skip_special_tokens=True,
                                     clean_up_tokenization_spaces=True)
                )
    return results

def hybrid_summarize(long_text, extract_method="centroid", top_k=TOP_K,
                     abstr_max_new_tokens=120, beam_size=4):
    """Summarize ``long_text``: extract key sentences, then rewrite them.

    The extractive step selects up to ``top_k`` sentences. The abstractive
    step feeds them to the module-level model loaded by ``load_abstractive``.
    If the extract is longer than the model input budget it is split into
    overlapping chunks, each chunk is summarized, and the chunk summaries are
    rewritten once more when they fit.

    Args:
        long_text: Document to summarize.
        extract_method: ``"kmeans"`` selects ``extract_by_kmeans``; any other
            value selects ``extract_by_centroid``.
        top_k: Number of sentences requested from the extractive step.
        abstr_max_new_tokens: ``max_new_tokens`` for every generation call.
        beam_size: ``num_beams`` for every generation call.

    Returns:
        A tuple ``(summary, provenance)`` where ``provenance`` is the list of
        ``(sentence_index, sentence)`` pairs picked by the extractive step.
        Returns ``("", provenance)`` when nothing was extracted.
    """
    if extract_method == "kmeans":
        extracted_text, provenance = extract_by_kmeans(long_text, top_k)
    else:
        extracted_text, provenance = extract_by_centroid(long_text, top_k)
    if VERBOSE:
        print(f"Extracted {len(provenance)} sentences.")
    if not extracted_text:
        return "", provenance

    tok, model, is_led = tokenizer, abstr_model, abstr_is_led

    # Tokenizers without a configured limit report a huge sentinel value as
    # model_max_length; treat that like "unknown" so chunking still happens.
    model_max = getattr(tok, "model_max_length", None)
    if not model_max or model_max > 1_000_000:
        model_max = 1024
    # Leave 32 tokens of headroom below the model limit.
    abstr_max_input = max(1, int(model_max) - 32)
    # Keep the overlap below the window size so chunking always advances.
    chunk_overlap = min(CHUNK_OVERLAP, abstr_max_input // 2)

    # NOTE(review): no task prefix (e.g. "summarize: ") is prepended to the
    # input; confirm whether the configured checkpoint expects one.
    def summarize(texts):
        return generate_with_model(texts, tok, model, is_led,
                                   max_new_tokens=abstr_max_new_tokens,
                                   num_beams=beam_size)

    try:
        enc_len = len(tok.encode(extracted_text, add_special_tokens=False))
        if enc_len <= abstr_max_input:
            if VERBOSE:
                print("Fits model, generating...")
            return summarize([extracted_text])[0], provenance

        if VERBOSE:
            print(f"Too long for model ({enc_len} tokens). Chunking...")
        chunks = chunk_text_for_tokenizer(extracted_text, tok, abstr_max_input,
                                          overlap=chunk_overlap)
        if VERBOSE:
            print(f"{len(chunks)} chunk(s). Summarizing chunks...")
        chunk_summaries = summarize(chunks)

        if len(chunk_summaries) == 1:
            final = chunk_summaries[0]
        else:
            combined = " ".join(chunk_summaries)
            comb_len = len(tok.encode(combined, add_special_tokens=False))
            if comb_len <= abstr_max_input:
                if VERBOSE:
                    print("Combining chunks and doing final rewrite...")
                final = summarize([combined])[0]
            else:
                if VERBOSE:
                    print("Combined too long: returning concatenated chunk summaries.")
                final = combined
        return final, provenance
    finally:
        # Free memory on every exit path, not only after chunked runs.
        gc.collect()
        torch.cuda.empty_cache()

# Fallback kmeans (if user chooses)
def extract_by_kmeans(text, top_k=TOP_K):
    """Pick one representative sentence per K-Means cluster of embeddings.

    The sentences are clustered into ``min(top_k, number_of_sentences)``
    groups; from every non-empty group the sentence most cosine-similar to
    the cluster centre is kept.

    Returns:
        A tuple ``(summary, chosen)`` where ``chosen`` is a list of
        ``(sentence_index, sentence)`` pairs in document order and ``summary``
        is those sentences joined by single spaces. Returns ``("", [])`` when
        no sentence survives ``split_sentences``.
    """
    sentences = split_sentences(text)
    if not sentences:
        return "", []

    vectors = embed_sentences(sentences)
    k = min(top_k, len(sentences))
    clustering = KMeans(n_clusters=k, random_state=42).fit(vectors)

    picked = []
    for cluster_id in range(k):
        members = np.where(clustering.labels_ == cluster_id)[0]
        if len(members) == 0:
            continue
        centre = clustering.cluster_centers_[cluster_id].reshape(1, -1)
        closeness = cosine_similarity(vectors[members], centre).squeeze()
        picked.append(members[int(np.argmax(closeness))])

    # Report the representatives in document order.
    picked.sort()
    selected = [(int(i), sentences[i]) for i in picked]
    return " ".join(sentence for _, sentence in selected), selected

# ---------- Example usage ----------
# Placeholder input: replace this string with the document to summarize.
# As written, the placeholder text itself is what gets summarized.
sample_text = """Paste your long legal document here. Or load from Drive:
with open('/content/yourfile.txt') as f: text = f.read()
"""
# Run the pipeline: centroid-based extraction of up to 6 sentences, then abstractive rewrite.
final_summary, provenance = hybrid_summarize(sample_text, extract_method="centroid", top_k=6)
print("\nFINAL SUMMARY:\n", final_summary)
# Show which source sentences (index and text) fed the summary.
print("\nPROVENANCE:")
for idx, sent in provenance:
    print(idx, ":", sent)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Device: cuda
Loaded t5-small on cuda (LED-like=False)
Extracted 2 sentences.
Fits model, generating...

FINAL SUMMARY:
 .txt') as f: text = f.read().

PROVENANCE:
0 : Paste your long legal document here.
1 : Or load from Drive:
with open('/content/yourfile.txt') as f: text = f.read()


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from transformers import pipeline
import evaluate

# Make sure the NLTK sentence-tokenizer resources are downloaded before nltk.sent_tokenize is used.
nltk.download("punkt")
nltk.download('punkt_tab')

# 1. Load models (module-level objects used by the functions defined below)
embedder = SentenceTransformer("all-MiniLM-L6-v2")   # fast embeddings
# Summarization pipeline used for the abstractive rewrite.
abstractive_model = pipeline("summarization", model="facebook/bart-large-cnn")
# ROUGE metric used by check_accuracy.
rouge = evaluate.load("rouge")

# 2. Split text and pick key sentences (extractive step)
def extract_key_sentences(text, top_k=3):
    """Return the ``top_k`` sentences most similar to the document centroid.

    Sentences are stripped of surrounding whitespace and empty ones are
    dropped. When the text has at most ``top_k`` sentences they are all
    returned in document order without computing embeddings. Otherwise the
    sentences are ranked by cosine similarity to the mean embedding and
    returned best first. Intermediate values (centroid, embeddings, scores,
    ranking) are printed to stdout.

    Args:
        text: Document to extract from.
        top_k: Maximum number of sentences to return; values below zero are
            treated as zero.
    """
    # sent_tokenize keeps leading/trailing newlines (e.g. from triple-quoted
    # input), which would leak into the printed extracts and the model input.
    sentences = [s.strip() for s in nltk.sent_tokenize(text)]
    sentences = [s for s in sentences if s]
    if len(sentences) <= top_k:
        return sentences
    embeddings = embedder.encode(sentences)
    centroid = np.mean(embeddings, axis=0)
    print(centroid)
    print(embeddings)
    scores = cosine_similarity([centroid], embeddings)[0]
    print(scores)
    ranked = [s for _, s in sorted(zip(scores, sentences), reverse=True)]
    print(ranked)
    # Clamp so a negative top_k cannot become a "[:-n]" slice.
    return ranked[:max(top_k, 0)]

# 3. Abstractive rewrite
def abstractive_summarize(sentences, max_len=80):
    """Rewrite the given sentences into one abstractive summary.

    Args:
        sentences: Iterable of sentence strings; they are joined with spaces.
        max_len: ``max_length`` passed to the summarization pipeline.

    Returns:
        The pipeline's ``summary_text`` for the joined input, or ``""`` when
        there is no non-whitespace input to summarize.
    """
    joined = " ".join(sentences)
    if not joined.strip():
        # Nothing to summarize; do not call the model on empty input.
        return ""
    # Keep min_length feasible when the caller asks for a very short summary.
    min_len = min(20, max_len)
    result = abstractive_model(
        joined,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        truncation=True,  # cut over-long input to the model limit instead of failing
    )
    return result[0]["summary_text"]

# 4. Full hybrid pipeline (now also returns extracted sentences)
def hybrid_summarize(text, top_k=3, show_extracts=True):
    """Run extraction followed by abstractive rewriting and return the summary.

    When ``show_extracts`` is truthy, the extracted sentences are printed as a
    numbered list (starting at 1) after the summary has been generated.
    """
    extracted = extract_key_sentences(text, top_k=top_k)
    abstract = abstractive_summarize(extracted)
    if not show_extracts:
        return abstract
    print("\n🔎 Key Extracted Sentences:")
    for position, sentence in enumerate(extracted, start=1):
        print(f"{position}. {sentence}")
    return abstract

# 5. Accuracy check (with reference summary)
def check_accuracy(text, reference, top_k=3):
    """Summarize ``text`` and score the result against ``reference`` with ROUGE.

    Returns:
        A tuple ``(prediction, scores)`` where ``scores`` is whatever
        ``rouge.compute`` returns for the single prediction/reference pair.
    """
    prediction = hybrid_summarize(text, top_k)
    rouge_scores = rouge.compute(
        predictions=[prediction],
        references=[reference],
    )
    return prediction, rouge_scores

# -------------------------------
# 🔹 TEST WITH NEW EXAMPLES
# -------------------------------

# Four-sentence sample document; the triple-quoted literal starts and ends with a newline.
example_text = """
The new environmental regulation draft requires companies to report carbon emissions every quarter.
However, the section on penalty clauses is vague and does not specify fines clearly.
Many stakeholders argue that without strict enforcement mechanisms, the draft will not be effective.
Others support the draft, saying it encourages gradual compliance rather than harsh punishment.
"""

# Hand-written reference used to score the model summary.
reference_summary = "The draft regulation mandates quarterly emission reports but lacks clarity on penalties and enforcement."

# Run the summarizer and score it against the reference with ROUGE.
summary, metrics = check_accuracy(example_text, reference_summary, top_k=3)

print("\n📝 Model Summary:\n", summary)
print("\n📊 ROUGE Metrics:\n", metrics)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Device set to use cuda:0
Your max_length is set to 80, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)



🔎 Key Extracted Sentences:
1. Others support the draft, saying it encourages gradual compliance rather than harsh punishment.
2. Many stakeholders argue that without strict enforcement mechanisms, the draft will not be effective.
3. 
The new environmental regulation draft requires companies to report carbon emissions every quarter.

📝 Model Summary:
 Many stakeholders argue that without strict enforcement mechanisms, the draft will not be effective. Others support the draft, saying it encourages gradual compliance rather than harsh punishment.

📊 ROUGE Metrics:
 {'rouge1': np.float64(0.14634146341463417), 'rouge2': np.float64(0.05128205128205129), 'rougeL': np.float64(0.0975609756097561), 'rougeLsum': np.float64(0.0975609756097561)}


In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
!pip install rouge_score evaluate


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=739187de051d61ab355b4c21927dfbf29e067b42024a1e0a1e229f3c4242f0fe
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
def extract_key_sentences(text, top_k=3):
    """Return the ``top_k`` sentences most similar to the document centroid.

    Sentences are stripped of surrounding whitespace and empty ones are
    dropped. When the text has at most ``top_k`` sentences they are all
    returned in document order without computing embeddings. Otherwise the
    sentences are ranked by cosine similarity to the mean embedding and
    returned best first. Labelled intermediate values (centroid, embeddings,
    scores, ranking) are printed to stdout for inspection.

    Args:
        text: Document to extract from.
        top_k: Maximum number of sentences to return; values below zero are
            treated as zero.
    """
    # sent_tokenize keeps leading/trailing newlines (e.g. from triple-quoted
    # input), which would leak into the printed extracts and the model input.
    sentences = [s.strip() for s in nltk.sent_tokenize(text)]
    sentences = [s for s in sentences if s]
    if len(sentences) <= top_k:
        return sentences
    embeddings = embedder.encode(sentences)
    centroid = np.mean(embeddings, axis=0)
    print("centroid",centroid)
    print("embeddings",embeddings)
    scores = cosine_similarity([centroid], embeddings)[0]
    print("scores",scores)
    ranked = [s for _, s in sorted(zip(scores, sentences), reverse=True)]
    print("ranked",ranked)
    # Clamp so a negative top_k cannot become a "[:-n]" slice.
    return ranked[:max(top_k, 0)]


In [None]:
# Same sample document as before; re-run after redefining extract_key_sentences with labelled debug prints.
example_text = """
The new environmental regulation draft requires companies to report carbon emissions every quarter.
However, the section on penalty clauses is vague and does not specify fines clearly.
Many stakeholders argue that without strict enforcement mechanisms, the draft will not be effective.
Others support the draft, saying it encourages gradual compliance rather than harsh punishment.
"""

# Hand-written reference used to score the model summary.
reference_summary = "The draft regulation mandates quarterly emission reports but lacks clarity on penalties and enforcement."

# Run the summarizer and score it against the reference with ROUGE.
summary, metrics = check_accuracy(example_text, reference_summary, top_k=3)

print("\n📝 Model Summary:\n", summary)
print("\n📊 ROUGE Metrics:\n", metrics)


Your max_length is set to 80, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)


centroid [ 7.80226663e-03  5.72779067e-02  1.44031523e-02  4.09040367e-03
  5.34865782e-02  4.86362427e-02 -5.39609864e-02 -6.64051920e-02
 -1.80745032e-04  3.37831527e-02  2.74890084e-02  3.70184258e-02
 -7.62730511e-03  3.72977778e-02  2.42742766e-02 -2.34567225e-02
  3.52557711e-02 -1.13050584e-02 -2.87542585e-03  1.23913437e-02
  3.89079675e-02  2.39060614e-02  1.60450414e-02  7.87554979e-02
 -1.32592302e-02 -3.64009142e-02 -3.44943851e-02  2.07959525e-02
 -7.84841832e-04 -2.29594409e-02 -4.50914018e-02  6.70173019e-02
 -2.12794635e-02  3.16747017e-02  5.29159512e-03 -9.14734509e-03
  2.85529979e-02 -1.07273525e-02 -5.57542662e-05 -2.39412710e-02
 -2.17141267e-02 -1.46933766e-02 -5.04318513e-02  4.30593342e-02
 -5.19973878e-03 -5.48326643e-05 -7.29370490e-03 -5.17267622e-02
 -9.50782746e-02 -2.03482620e-02  4.98113548e-03  8.04084446e-03
 -1.16994279e-02  9.92887001e-03 -1.72812231e-02 -6.22903258e-02
 -1.19393338e-02 -4.64573875e-02  3.14650685e-02 -3.33945416e-02
 -1.87320244e-02

In [None]:
The new environmental regulation draft requires companies to report carbon emissions every quarter.
However, the section on penalty clauses is vague and does not specify fines clearly.
Many stakeholders argue that without strict enforcement mechanisms, the draft will not be effective.
Others support the draft, saying it encourages gradual compliance rather than harsh punishment.

In [None]:
Many stakeholders argue that without strict enforcement mechanisms, the draft will not be effective. Others support the draft, saying it encourages gradual
compliance rather than harsh punishment.


In [None]:
The draft regulation mandates quarterly emission reports but lacks clarity on penalties and enforcement.