In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import Dataset
import os

ModuleNotFoundError: No module named 'transformers'

In [4]:
train_judgement_path = 'IN-Abs/train-data/judgement'
train_summary_path = 'IN-Abs/train-data/summary'
test_judgement_path = 'IN-Abs/test-data/judgement'
test_summary_path = 'IN-Abs/test-data/summary'

In [3]:
model_name = "nsi319/legal-led-base-16384"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
peft_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    inference_mode=False,
    r=8,                
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj"]
)
model = get_peft_model(model, peft_config)

In [5]:
def load_data(judgement_path, summary_path):
    judgements, summaries = [], []
    # Load judgement files
    for filename in os.listdir(judgement_path):
        with open(os.path.join(judgement_path, filename), 'r', encoding='utf-8') as f:
            judgements.append(f.read())
    # Load summary files
    for filename in os.listdir(summary_path):
        with open(os.path.join(summary_path, filename), 'r', encoding='utf-8') as f:
            summaries.append(f.read())
    return judgements, summaries

In [6]:
train_judgements, train_summaries = load_data(train_judgement_path, train_summary_path)
test_judgements, test_summaries = load_data(test_judgement_path, test_summary_path)

In [7]:
train_dataset = Dataset.from_dict({"text": train_judgements, "summary": train_summaries})
test_dataset = Dataset.from_dict({"text": test_judgements, "summary": test_summaries})

In [8]:
def tokenize_function(examples):
    inputs = examples["text"]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
    labels = tokenizer(examples["summary"], max_length=128, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 7030/7030 [01:30<00:00, 77.86 examples/s]
Map: 100%|██████████| 100/100 [00:01<00:00, 68.46 examples/s]


In [11]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,          
    gradient_accumulation_steps=16,         
    num_train_epochs=3,                     
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=2
)



In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

trainer.train()

  0%|          | 0/1317 [00:00<?, ?it/s]Input ids are automatically padded from 512 to 1024 to be a multiple of `config.attention_window`: 1024
  1%|          | 10/1317 [07:28<16:53:35, 46.53s/it]

{'loss': 3.9139, 'grad_norm': 12.986339569091797, 'learning_rate': 4.962034927866363e-05, 'epoch': 0.02}


  2%|▏         | 20/1317 [15:17<17:02:34, 47.30s/it]

{'loss': 3.6122, 'grad_norm': 13.790632247924805, 'learning_rate': 4.924069855732726e-05, 'epoch': 0.05}


  2%|▏         | 30/1317 [22:21<14:55:45, 41.76s/it]

{'loss': 3.6346, 'grad_norm': 14.327286720275879, 'learning_rate': 4.886104783599089e-05, 'epoch': 0.07}


  3%|▎         | 40/1317 [33:19<20:02:33, 56.50s/it] 

{'loss': 3.4928, 'grad_norm': 14.004500389099121, 'learning_rate': 4.848139711465452e-05, 'epoch': 0.09}


  4%|▍         | 50/1317 [41:14<16:16:39, 46.25s/it]

{'loss': 3.464, 'grad_norm': 12.694475173950195, 'learning_rate': 4.810174639331815e-05, 'epoch': 0.11}


  5%|▍         | 60/1317 [51:24<28:36:38, 81.94s/it]

{'loss': 3.3033, 'grad_norm': 13.04708194732666, 'learning_rate': 4.772209567198178e-05, 'epoch': 0.14}


  5%|▌         | 70/1317 [1:20:09<57:28:56, 165.95s/it]

{'loss': 3.326, 'grad_norm': 12.691873550415039, 'learning_rate': 4.734244495064541e-05, 'epoch': 0.16}


  6%|▌         | 80/1317 [1:42:29<42:56:17, 124.96s/it]

{'loss': 3.1017, 'grad_norm': 11.350688934326172, 'learning_rate': 4.696279422930904e-05, 'epoch': 0.18}


  7%|▋         | 90/1317 [2:03:42<47:32:25, 139.48s/it]

{'loss': 3.0369, 'grad_norm': 10.93515396118164, 'learning_rate': 4.658314350797267e-05, 'epoch': 0.2}


  8%|▊         | 100/1317 [2:25:28<45:13:59, 133.80s/it]

{'loss': 3.1726, 'grad_norm': 11.622984886169434, 'learning_rate': 4.62034927866363e-05, 'epoch': 0.23}


  8%|▊         | 110/1317 [2:44:27<34:33:11, 103.06s/it]

{'loss': 3.2037, 'grad_norm': 11.53631591796875, 'learning_rate': 4.5823842065299926e-05, 'epoch': 0.25}


  9%|▉         | 116/1317 [2:55:33<36:35:53, 109.70s/it]

In [None]:
trainer.save_model("./final_model/IN_model")