<a href="https://colab.research.google.com/github/badrinath2605/sentiment_semmarization_sih/blob/main/Untitled7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import random
from datasets import Dataset
from transformers import (
    T5TokenizerFast,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
import evaluate

# -----------------------
# CONFIG
# -----------------------
MODEL_NAME = "t5-small"  # checkpoint used for both tokenizer and model; smaller model = faster
MAX_INPUT_LENGTH = 512   # token cap applied (via truncation) to model inputs
MAX_TARGET_LENGTH = 100  # token cap for target summaries and for generated output
BATCH_SIZE = 8           # per-device train/eval batch size; raise if GPU memory allows
EPOCHS = 10              # number of training epochs; lower it (e.g. 1) for a quick smoke run
OUTPUT_DIR = "./t5_econsult_quick"  # training output dir; final model and tokenizer are saved here too

# -----------------------
# SAMPLE DATA (fast prototyping)
# -----------------------
topics = [
    "data privacy", "environmental regulations", "labor law",
    "tax compliance", "consumer protection", "financial reporting"
]
issues = [
    "reporting timelines", "compliance requirements",
    "enforcement mechanisms", "penalty clauses", "accountability"
]

# (comment template, summary template) pairs, ordered long -> medium -> short.
# "{topic}" is the raw topic, "{Topic}" its capitalized form.
COMMENT_TEMPLATES = [
    ("The draft on {topic} is confusing regarding {issue}. Clearer guidance is needed.",
     "{Topic} unclear on {issue}."),
    ("Clause on {topic} is useful but {issue} is unclear.",
     "{Topic} clause needs clarification on {issue}."),
    ("{Topic} unclear on {issue}.",
     "{Topic} clause unclear on {issue}."),
]


def build_synthetic_examples(n_per_template=100, seed=42):
    """Build parallel lists of synthetic comments and their reference summaries.

    For every template pair in ``COMMENT_TEMPLATES``, ``n_per_template``
    examples are produced by drawing a random topic and issue.

    Args:
        n_per_template: Number of examples generated per template pair.
        seed: Seed for a private ``random.Random`` instance. Using a private
            generator keeps the data identical across runs without touching
            the global ``random`` state.

    Returns:
        A ``(texts, summaries)`` tuple of equally long lists of strings.
    """
    rng = random.Random(seed)
    generated_texts, generated_summaries = [], []
    for text_template, summary_template in COMMENT_TEMPLATES:
        for _ in range(n_per_template):
            topic, issue = rng.choice(topics), rng.choice(issues)
            fields = {"topic": topic, "Topic": topic.capitalize(), "issue": issue}
            generated_texts.append(text_template.format(**fields))
            generated_summaries.append(summary_template.format(**fields))
    return generated_texts, generated_summaries


# Generate 300 synthetic examples (100 per template), reproducibly.
texts, summaries = build_synthetic_examples()

# Wrap the parallel lists in a Dataset and hold out 20% for validation.
# The fixed seed keeps the split, and therefore the reported metrics,
# comparable from run to run.
# NOTE: only 6 topics x 5 issues exist per template, so validation examples
# also occur in the training split; ROUGE here is a smoke test, not a
# measure of generalization.
dataset = Dataset.from_dict({"text": texts, "summary": summaries})
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_ds, valid_ds = dataset["train"], dataset["test"]

# -----------------------
# TOKENIZER & MODEL
# -----------------------
# Both objects are loaded from the same MODEL_NAME checkpoint. They are
# module-level because preprocess, compute_metrics, the trainer and
# summarize_comments all use them.
tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

def preprocess(examples):
    """Tokenize a batch of examples for seq2seq training.

    Each text is prefixed with ``"summarize: "`` and truncated to
    ``MAX_INPUT_LENGTH`` tokens; each summary is truncated to
    ``MAX_TARGET_LENGTH`` tokens and stored under ``"labels"``.

    No padding is applied here on purpose. Padding to ``max_length`` would
    leave pad-token ids inside ``labels``, so the loss would also be computed
    on padding, and every short comment would be blown up to 512 tokens.
    ``DataCollatorForSeq2Seq`` pads each batch to its longest member instead
    and fills label padding with -100, which the loss ignores.

    Args:
        examples: Batched mapping with ``"text"`` and ``"summary"`` lists
            (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the inputs with an added ``"labels"`` entry
        holding the tokenized summaries.
    """
    inputs = ["summarize: " + t for t in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)
    labels = tokenizer(text_target=examples["summary"],
                       max_length=MAX_TARGET_LENGTH, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the same tokenization over both splits, dropping the raw string columns
# so that only model-ready features remain.
train_tokenized, valid_tokenized = (
    split.map(preprocess, batched=True, remove_columns=split.column_names)
    for split in (train_ds, valid_ds)
)

# Collator that assembles seq2seq batches for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# -----------------------
# METRICS
# -----------------------
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores (scaled to 0-100) for generated summaries.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id sequences as
            handed over by the trainer. ``predictions`` may also be a tuple
            whose first element holds the generated ids.

    Returns:
        Dict mapping each metric name reported by ``rouge.compute`` to its
        value multiplied by 100, as a plain ``float``.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        # Only the generated token ids are needed for decoding.
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _restore_padding(sequences):
        # -100 is the ignore index used to pad labels, and gathered
        # predictions can be padded with it as well. It is not a valid token
        # id, so it must be swapped for the pad id before decoding.
        return [[int(tok) if tok != -100 else pad_id for tok in seq]
                for seq in sequences]

    decoded_preds = tokenizer.batch_decode(_restore_padding(preds),
                                           skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_restore_padding(labels),
                                            skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: float(v * 100) for k, v in result.items()}

# -----------------------
# TRAINING
# -----------------------
# Hyperparameters are collected in one mapping, grouped by concern, and then
# expanded into the arguments object.
_training_config = {
    # where outputs go and how many checkpoints are kept
    "output_dir": OUTPUT_DIR,
    "save_total_limit": 2,
    # optimization schedule
    "num_train_epochs": EPOCHS,
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    "per_device_train_batch_size": BATCH_SIZE,
    "per_device_eval_batch_size": BATCH_SIZE,
    # mixed precision only when a CUDA device is present
    "fp16": torch.cuda.is_available(),
    # evaluation: once per epoch, scoring generated text
    "eval_strategy": "epoch",
    "predict_with_generate": True,
    # logging: every 50 steps, no external reporters
    "logging_steps": 50,
    "report_to": [],
}
training_args = Seq2SeqTrainingArguments(**_training_config)

# Wire model, data, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    # NOTE(review): the accepted keyword for passing the tokenizer may differ
    # between transformers releases -- confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Fine-tune, then persist the model and the tokenizer side by side in
# OUTPUT_DIR so both can be reloaded from that one directory.
trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# -----------------------
# INFERENCE FUNCTION
# -----------------------
# Run inference on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

def summarize_comments(comments):
    """Summarize a collection of comments into one short text.

    The comments are joined with newlines, prefixed with ``"summarize: "``,
    truncated to ``MAX_INPUT_LENGTH`` tokens and passed through beam search
    (4 beams) limited to ``MAX_TARGET_LENGTH`` tokens.

    Args:
        comments: An iterable of comment strings. A single string is treated
            as one comment rather than being joined character by character.
            Empty or whitespace-only entries are skipped.

    Returns:
        The decoded summary, or ``""`` when there is nothing to summarize
        (``None``, an empty collection, or only blank comments).
    """
    if not comments:
        return ""
    if isinstance(comments, str):
        comments = [comments]
    usable = [c for c in comments if c and c.strip()]
    if not usable:
        # Avoid generating from a bare "summarize: " prompt.
        return ""

    combined_text = "\n".join(usable)
    inputs = tokenizer("summarize: " + combined_text, return_tensors="pt",
                       max_length=MAX_INPUT_LENGTH, truncation=True)
    # Follow the model to whatever device it currently lives on.
    target_device = model.device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    # Generate in eval mode so dropout is disabled, then restore the previous
    # mode so an ongoing training session is not disturbed.
    was_training = model.training
    model.eval()
    try:
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=MAX_TARGET_LENGTH,
            num_beams=4,
            early_stopping=True,
            length_penalty=1.0,
            no_repeat_ngram_size=2
        )
    finally:
        model.train(was_training)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# -----------------------
# TEST
# -----------------------
# Smoke test: summarize two hand-written comments with the fine-tuned model
# and print the result.
example_comments = [
    "Clause 5 is ambiguous. Please clarify the definitions.",
    "Supportive of transparency, but suggest clear timelines for reporting."
]
summary = summarize_comments(example_comments)
print("Summary:", summary)

ModuleNotFoundError: No module named 'evaluate'