In [None]:
import os

# Set the WANDB_DISABLED flag in the process environment up front.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
!pip install -q transformers datasets accelerate

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset

In [None]:
# Checkpoint name shared by the model and the tokenizer.
model_id = "google/flan-t5-base"

# Both objects are loaded from the same checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Load the dataset named "FiscalNote/billsum".
dataset = load_dataset(path="FiscalNote/billsum")

In [None]:
# Keep direct handles on the "train" and "test" splits.
train_data, test_data = dataset["train"], dataset["test"]

In [None]:
# Display the object returned by load_dataset.
dataset

In [None]:
# Display the "train" split.
train_data

In [None]:
# Display the "test" split.
test_data

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping with a ``"text"`` list (source documents) and
            a ``"summary"`` list (target summaries).

    Returns:
        The tokenizer output for the prefixed documents, with an added
        ``"labels"`` entry holding the token ids of the summaries.
    """
    # Every document gets the same task prefix.
    inputs = ["summarize: " + doc for doc in examples["text"]]

    # No static padding here: padding every example to max_length would put
    # pad-token ids into the labels, and those positions would then count
    # towards the loss. Leaving sequences unpadded lets the batch collator pad
    # per batch (and mask the label padding) instead.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding targets.
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
# Apply the preprocessing in batches and drop the raw columns afterwards.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=["text", "summary", "title"],
)


In [None]:
# Batch collator built for this tokenizer/model pair.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Settings for the fine-tuning run, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Output and logging locations
    output_dir="./flan-t5-p100-output",
    logging_dir="./logs",
    logging_steps=250,
    report_to="none",  # or "wandb"
    # Evaluate and save once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    # Schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    # Per-device batch size of 1 with 4 gradient-accumulation steps
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    # fp16=False for P100 (no native support)
    gradient_checkpointing=True,
    # Generation-related settings
    predict_with_generate=True,
    generation_max_length=128,
)


In [None]:
# Wire the model, arguments, tokenized splits and collator into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

In [None]:
# Run nvidia-smi in a shell and show its output in the notebook.
!nvidia-smi

In [None]:
import time

# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() can jump if the system clock is adjusted during a long run.
start = time.perf_counter()
trainer.train()
print("Training finished in", time.perf_counter() - start, "seconds.")


In [None]:
# Write the trained model and then the tokenizer into the same directory.
# The tokenizer call stays last, so its return value is what the cell displays.
trainer.save_model("./flan-t5-legal-summary")
tokenizer.save_pretrained("./flan-t5-legal-summary")


In [None]:
from transformers import pipeline

# Build a summarization pipeline from the directory written by the save cell.
summarizer = pipeline(
    "summarization",
    model="./flan-t5-legal-summary",
    tokenizer="./flan-t5-legal-summary",
)

# Fetch the first test record once and reuse it for every printout.
first_test_example = dataset["test"][0]
sample_text = first_test_example["text"]
prediction = summarizer("summarize: " + sample_text, max_length=128, truncation=True)
summary = prediction[0]['summary_text']

print(" Original Title:", first_test_example["title"])
print(" Generated Summary:", summary)
print(" Reference Summary:", first_test_example["summary"])


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

# Loading fine-tuned summarization model
summ_model_path = "./flan-t5-legal-summary"

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and model are both read from the same directory; the model is
# moved to the selected device right after loading.
summ_tokenizer = AutoTokenizer.from_pretrained(summ_model_path)
summ_model = AutoModelForSeq2SeqLM.from_pretrained(summ_model_path).to(device)

# Loading a pretrained NER model
ner_pipeline = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1,
)

#  Inference function
def process_legal_document(text, prefix="summarize: "):
    """Summarize a legal document and extract its named entities.

    Args:
        text: Raw document text.
        prefix: Task prefix placed in front of ``text`` before it is
            summarized. The default matches the prefix used when the
            fine-tuning inputs were built. It is not applied to the text
            handed to the NER pipeline.

    Returns:
        A ``(summary, ner_results)`` tuple: the decoded summary string and
        whatever the NER pipeline returns for ``text``.
    """
    # Summarization
    # The fine-tuning inputs all start with the task prefix, so inference uses
    # the same prefix; it is skipped when the caller already supplied it.
    if prefix and not text.startswith(prefix):
        summ_input = prefix + text
    else:
        summ_input = text

    inputs = summ_tokenizer(summ_input, return_tensors="pt", truncation=True, padding=True, max_length=1024).to(device)
    summary_ids = summ_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=200,
        num_beams=4,
        early_stopping=True
    )
    summary = summ_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # NER runs on the original text, without the summarization prefix.
    ner_results = ner_pipeline(text)

    return summary, ner_results

# Example usage
if __name__ == "__main__":
    # Sample judgment text passed through the summarizer and the NER pipeline.
    legal_text = """
    IN THE SUPREME COURT OF PAKISTAN  
CIVIL APPELLATE JURISDICTION  
CIVIL APPEAL NO. 1456 OF 2022  

Between:  
Mr. Ahsan Raza, Resident of F-6/3, Islamabad  
— Appellant  

And  

The Federal Board of Revenue (FBR), through Chairman, Islamabad  
The Ministry of Law and Justice, Government of Pakistan  
The Commissioner Inland Revenue, Zone-IV, Karachi  
— Respondents  

JUDGMENT

Justice Syed Mansoor Ali Shah delivering the opinion of the Court:

The present appeal arises out of the impugned judgment dated 16th November 2021 passed by the Islamabad High Court in W.P. No. 3189/2020, whereby the petitioner’s plea for tax exemption under Section 53(2)(c) of the Income Tax Ordinance, 2001 was dismissed.

The appellant, Mr. Ahsan Raza, a corporate consultant by profession, claims that the consultancy fee received by him from GlobalTech Solutions (Pvt.) Ltd. — a Singapore-based IT firm with regional operations in Pakistan — was wrongly taxed by the Inland Revenue Department under Section 21(b)(iv) of the Ordinance. He contends that the said fee qualifies as income from a foreign source and should be exempt under bilateral tax treaties signed between Pakistan and the Republic of Singapore.

The FBR, on the other hand, asserts that the consultancy was rendered within Pakistan's territorial jurisdiction and thus attracts domestic tax obligations. The Ministry of Law and Justice also submitted that no overriding clause in the treaty nullifies the domestic provisions in this context.

After hearing both parties and examining the nature of cross-border service agreements submitted into evidence — including the Memorandum of Understanding (MoU) signed on 12th February 2020 between Mr. Raza and GlobalTech — the Court is of the opinion that the payment was subject to tax under Pakistani law.
.
Accordingly, the appeal is dismissed. No order as to costs

    """

    summary, entities = process_legal_document(legal_text)

    # Print the summary first, then one line per recognised entity.
    print(" Summary:\n", summary)
    print("\n Named Entities:")
    for ent in entities:
        print("{} ({}) — Score: {:.2f}".format(ent['word'], ent['entity_group'], ent['score']))


In [None]:
# Recursively zip the flan-t5-legal-summary directory into flan-t5-legal-summary.zip.
!zip -r flan-t5-legal-summary.zip flan-t5-legal-summary


In [30]:
!pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()  # logs you in

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model.push_to_hub("flan-t5-legal-summary")
tokenizer.push_to_hub("flan-t5-legal-summary")




VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/aun09/flan-t5-legal-summary/commit/ef21e3af61e72711b6ef18e50364b8706c3c36ea', commit_message='Upload tokenizer', commit_description='', oid='ef21e3af61e72711b6ef18e50364b8706c3c36ea', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aun09/flan-t5-legal-summary', endpoint='https://huggingface.co', repo_type='model', repo_id='aun09/flan-t5-legal-summary'), pr_revision=None, pr_num=None)