In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import Dataset
import os

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
#!pip list

Package                      Version
---------------------------- -----------
absl-py                      2.1.0
accelerate                   1.1.1
aiohappyeyeballs             2.4.3
aiohttp                      3.10.10
aiosignal                    1.3.1
anyio                        4.6.2
argon2-cffi                  21.3.0
argon2-cffi-bindings         21.2.0
asttokens                    2.0.5
astunparse                   1.6.3
async-lru                    2.0.4
async-timeout                4.0.3
attrs                        24.2.0
Babel                        2.11.0
beautifulsoup4               4.12.3
bleach                       4.1.0
Brotli                       1.0.9
cachetools                   5.5.0
certifi                      2024.8.30
cffi                         1.17.1
charset-normalizer           3.3.2
colorama                     0.4.6
comm                         0.2.1
datasets                     3.1.0
debugpy                      1.6.7
decorator                    5.1.1


In [3]:
#pip install datasets

Collecting datasets
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Using cached datasets-3.1.0-py3-none-any.whl (480 kB)
Installing collected packages: datasets
Successfully installed datasets-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
#!pip uninstall numpy -y
#!pip install numpy==1.25

Found existing installation: numpy 2.0.0
Uninstalling numpy-2.0.0:
  Successfully uninstalled numpy-2.0.0
Collecting numpy==1.25
  Downloading numpy-1.25.0-cp310-cp310-win_amd64.whl.metadata (5.7 kB)
Downloading numpy-1.25.0-cp310-cp310-win_amd64.whl (15.0 MB)
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
    --------------------------------------- 0.3/15.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/15.0 MB 1.9 MB/s eta 0:00:08
   --- ------------------------------------ 1.3/15.0 MB 2.2 MB/s eta 0:00:07
   ---- ----------------------------------- 1.8/15.0 MB 2.5 MB/s eta 0:00:06
   ----- ---------------------------------- 2.1/15.0 MB 2.6 MB/s eta 0:00:06
   ------- -------------------------------- 2.9/15.0 MB 2.4 MB/s eta 0:00:06
   --------- ------------------------------ 3.4/15.0 MB 2.5 MB/s eta 0:00:05
   ----------- ---------------------------- 4.2/15.0 MB 2.7 MB/s eta 0:00:05
   ----------- ---------------------------- 4.5/15.0 MB 2.5 

In [3]:
import tensorflow as tf

# List all available GPU devices
gpus = tf.config.list_physical_devices('GPU')
print("GPUs:", gpus)

# Optional: Display details of the first GPU, if available
if gpus:
    print("GPU details:", tf.config.experimental.get_device_details(gpus[0]))
else:
    print("No GPU found.")


GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details: {'device_name': 'NVIDIA GeForce RTX 3050 Laptop GPU', 'compute_capability': (8, 6)}


In [4]:
train_judgement_path = 'IN-Abs/train-data/judgement'
train_summary_path = 'IN-Abs/train-data/summary'
test_judgement_path = 'IN-Abs/test-data/judgement'
test_summary_path = 'IN-Abs/test-data/summary'

In [5]:
model_name = "nsi319/legal-led-base-16384"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
peft_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    inference_mode=False,
    r=8,                
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj"]
)
model = get_peft_model(model, peft_config)

In [9]:
def load_data(judgement_path, summary_path):
    judgements, summaries = [], []
    # Load judgement files
    for filename in os.listdir(judgement_path):
        with open(os.path.join(judgement_path, filename), 'r', encoding='utf-8') as f:
            judgements.append(f.read())
    # Load summary files
    for filename in os.listdir(summary_path):
        with open(os.path.join(summary_path, filename), 'r', encoding='utf-8') as f:
            summaries.append(f.read())
    return judgements, summaries

In [11]:
train_judgements, train_summaries = load_data(train_judgement_path, train_summary_path)
test_judgements, test_summaries = load_data(test_judgement_path, test_summary_path)

In [12]:
train_dataset = Dataset.from_dict({"text": train_judgements, "summary": train_summaries})
test_dataset = Dataset.from_dict({"text": test_judgements, "summary": test_summaries})

In [13]:
def tokenize_function(examples):
    inputs = examples["text"]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
    labels = tokenizer(examples["summary"], max_length=128, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 7030/7030 [01:05<00:00, 106.80 examples/s]
Map: 100%|██████████| 100/100 [00:01<00:00, 86.35 examples/s]


In [14]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,          
    gradient_accumulation_steps=16,         
    num_train_epochs=3,                     
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=2
)



In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

with tf.device('/GPU:0'):  # This forces operations to run on GPU
    tf.debugging.set_log_device_placement(True)
    trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    per_device_train_batch_size=4,  # Set based on your GPU memory
    per_device_eval_batch_size=4,  # Set based on your GPU memory
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,2.6489,2.700825
2,2.4568,2.641926
3,2.4759,2.627377


TrainOutput(global_step=5274, training_loss=2.5892653365816907, metrics={'train_runtime': 21104.2575, 'train_samples_per_second': 0.999, 'train_steps_per_second': 0.25, 'total_flos': 7156628796211200.0, 'train_loss': 2.5892653365816907, 'epoch': 3.0})

In [20]:
trainer.save_model("./final_model/IN_model")