In [None]:
Initialising the GPU in PyTorch

In [1]:
import torch

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"using device:{device}")

# The CUDA queries below raise on a machine without a usable GPU, so they are
# only executed on the CUDA path. On the CPU path both names are still bound
# (to None) so that later references do not hit a NameError.
if device.type == "cuda":
    devNumber = torch.cuda.current_device()
    print(f"Current Device number is:{devNumber}")
    devName = torch.cuda.get_device_name(devNumber)
    print(f"Gpu name is {devName}")
else:
    devNumber = None
    devName = None

using device:cuda
Current Device number is:0
Gpu name is NVIDIA GeForce RTX 3050 Laptop GPU


In [2]:
!pip install transformers peft datasets




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import Dataset

import os

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
!pip install datasets




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Relative locations of the IN-Abs corpus: each split has one directory of
# judgement files and one directory of summary files.
train_judgement_path, train_summary_path = (
    'IN-Abs/train-data/judgement',
    'IN-Abs/train-data/summary',
)
test_judgement_path, test_summary_path = (
    'IN-Abs/test-data/judgement',
    'IN-Abs/test-data/summary',
)

In [4]:
# Identifier of the pretrained checkpoint; the tokenizer and the seq2seq model
# are both loaded from it so that they stay in sync.
model_name = "nsi319/legal-led-base-16384"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [5]:
# LoRA adapter configuration for seq2seq fine-tuning.
# NOTE(review): the model repr printed later in this notebook shows encoder
# self-attention projections named query/key/value, so these *_proj targets
# presumably only match the decoder attention modules - confirm this is intended.
peft_config = LoraConfig(
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj"],
    r=16,
    lora_alpha=64,
    lora_dropout=0.1,
    inference_mode=False,
    task_type="SEQ_2_SEQ_LM",
)

# Wrap the base model with the adapters. `model` is rebound to the wrapped
# model, so this cell should be run exactly once per loaded base model.
model = get_peft_model(model, peft_config)

In [6]:
def load_data(judgement_path, summary_path):
    """Read all judgement files and all summary files into two parallel lists.

    Files are read in sorted filename order. ``os.listdir`` makes no ordering
    guarantee, so without sorting the i-th judgement and the i-th summary are
    not guaranteed to belong together even when both directories use the same
    filenames.

    Args:
        judgement_path: Directory holding one UTF-8 text file per judgement.
        summary_path: Directory holding one UTF-8 text file per summary.

    Returns:
        A ``(judgements, summaries)`` tuple of lists of file contents, each
        ordered by filename.
    """

    def _read_sorted(directory):
        # Read every regular file in `directory`, in sorted filename order.
        texts = []
        for filename in sorted(os.listdir(directory)):
            path = os.path.join(directory, filename)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
            # open() cannot read as text files.
            if not os.path.isfile(path):
                continue
            with open(path, 'r', encoding='utf-8') as f:
                texts.append(f.read())
        return texts

    return _read_sorted(judgement_path), _read_sorted(summary_path)

In [7]:
# Read the raw text of both splits from disk.
train_judgements, train_summaries = load_data(train_judgement_path, train_summary_path)
test_judgements, test_summaries = load_data(test_judgement_path, test_summary_path)

# Judgements and summaries are paired by list position downstream, so a count
# mismatch means the pairing is broken; fail here with a clear message.
assert len(train_judgements) == len(train_summaries), (
    f"train split: {len(train_judgements)} judgements vs {len(train_summaries)} summaries"
)
assert len(test_judgements) == len(test_summaries), (
    f"test split: {len(test_judgements)} judgements vs {len(test_summaries)} summaries"
)

In [8]:
# Build one Dataset per split with a "text" column (judgements) and a
# "summary" column (reference summaries), paired by position.
train_dataset, test_dataset = (
    Dataset.from_dict({"text": texts, "summary": targets})
    for texts, targets in (
        (train_judgements, train_summaries),
        (test_judgements, test_summaries),
    )
)

In [9]:
def tokenize_function(examples, max_input_length=512, max_target_length=128):
    """Tokenize a batch of judgement/summary pairs for seq2seq training.

    Inputs and targets are padded to a fixed width (``padding="max_length"``)
    rather than to the longest item of the current ``map`` batch. With
    longest-in-batch padding the sequence width can differ from one map batch
    to the next, and label rows of different widths cannot be stacked by a
    collator that does not pad labels.

    Args:
        examples: Batch mapping with a "text" list (model inputs) and a
            "summary" list (targets).
        max_input_length: Width the inputs are truncated/padded to.
        max_target_length: Width the targets are truncated/padded to.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the target token ids where padding positions are replaced
        by -100.
    """
    model_inputs = tokenizer(
        examples["text"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )
    labels = tokenizer(
        examples["summary"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )
    # Replace pad ids with -100, the label value conventionally ignored by the
    # training loss, so padding does not contribute to it.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in label]
        for label in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both splits in batched mode (tokenize_function receives lists of
# examples per call), train first and then test.
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map: 100%|█████████████████████████████████████████████████████████████████| 7030/7030 [01:09<00:00, 101.22 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 70.68 examples/s]


Moving the model to the GPU

In [10]:
import torch

# Prefer CUDA when PyTorch can see a GPU; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Place the model on the selected device. As the last expression of the cell,
# the value returned by .to() is what the notebook displays as output.
model.to(device)

Using device: cuda


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): LEDForConditionalGeneration(
      (led): LEDModel(
        (shared): Embedding(50265, 768, padding_idx=1)
        (encoder): LEDEncoder(
          (embed_tokens): Embedding(50265, 768, padding_idx=1)
          (embed_positions): LEDLearnedPositionalEmbedding(16384, 768)
          (layers): ModuleList(
            (0-5): 6 x LEDEncoderLayer(
              (self_attn): LEDEncoderAttention(
                (longformer_self_attn): LEDEncoderSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)
                  (value): Linear(in_features=768, out_features=768, bias=True)
                  (query_global): Linear(in_features=768, out_features=768, bias=True)
                  (key_global): Linear(in_features=768, out_features=768, bias=True)
                  (value_global): Linear(in_features=768, out_features

In [11]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# Imported locally so this cell brings the collator it needs into scope itself.
from transformers import DataCollatorForSeq2Seq

# Seq2seq batches carry a "labels" field next to the encoder inputs.
# DataCollatorWithPadding only pads the tokenizer's own fields, so label rows of
# unequal length cannot be batched, and its max_length=128 did not match the
# 512-token inputs. DataCollatorForSeq2Seq pads inputs and labels together,
# filling label padding with -100.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=-100)

# hyperparameters
lr = 2e-4
batch_size = 4
num_epochs = 10

# define training arguments
training_args = TrainingArguments(
    output_dir= "shawgpt-ft",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    # Log, evaluate and checkpoint once per epoch; the matching evaluation and
    # save cadence is what allows load_best_model_at_end to pick a checkpoint.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Effective per-device batch is batch_size * gradient_accumulation_steps = 16.
    gradient_accumulation_steps=4,
    warmup_steps=2,
    # NOTE(review): fp16 and the paged 8-bit optimizer presumably require a CUDA
    # GPU and the bitsandbytes package - confirm both are available before running.
    fp16=True,
    optim="paged_adamw_8bit",

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)

# Last expression of the cell: the returned TrainOutput is displayed as output.
trainer.train()


Input ids are automatically padded from 512 to 1024 to be a multiple of `config.attention_window`: 1024


Epoch,Training Loss,Validation Loss
1,10.3889,2.573749
2,9.5063,2.482429
3,9.2079,2.438074
4,9.012,2.415317
5,8.8735,2.399761
6,8.7608,2.381631
7,8.6763,2.378129
8,8.609,2.369457
9,8.5399,2.366632




TrainOutput(global_step=4390, training_loss=9.014280057659454, metrics={'train_runtime': 6648.7558, 'train_samples_per_second': 10.573, 'train_steps_per_second': 0.66, 'total_flos': 2.39316362330112e+16, 'train_loss': 9.014280057659454, 'epoch': 9.978384527872583})

In [12]:
# Persist the model currently held by the trainer (the LoRA-wrapped model that
# was passed to Trainer) before the adapters are merged below.
trainer.save_model("./final_model/IN_model")

# Merge LoRA adapters into the base model
# NOTE(review): `model` is rebound to the merged model here, so re-running this
# cell would call merge_and_unload() on an already-merged model - presumably an
# error; confirm before re-executing.
model = model.merge_and_unload()

# Save the full model with merged weights
# NOTE(review): this writes into the same directory as trainer.save_model()
# above; confirm that keeping both sets of files side by side is intended.
model.save_pretrained("./final_model/IN_model")
# Store the tokenizer next to the model; the returned tuple of written file
# paths is displayed as the cell output.
tokenizer.save_pretrained("./final_model/IN_model")

('./final_model/IN_model\\tokenizer_config.json',
 './final_model/IN_model\\special_tokens_map.json',
 './final_model/IN_model\\vocab.json',
 './final_model/IN_model\\merges.txt',
 './final_model/IN_model\\added_tokens.json',
 './final_model/IN_model\\tokenizer.json')

In [1]:
pip freeze

absl-py==2.1.0
accelerate==1.2.1
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
anyio==4.8.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==3.0.0
async-lru==2.0.4
attrs==24.3.0
babel==2.16.0
beautifulsoup4==4.12.3
bleach==6.2.0
certifi==2024.12.14
cffi==1.17.1
charset-normalizer==3.4.1
click==8.1.8
colorama==0.4.6
comm==0.2.2
datasets==3.2.0
debugpy==1.8.11
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.8
evaluate==0.4.3
executing==2.1.0
fastjsonschema==2.21.1
filelock==3.13.1
fqdn==1.5.1
frozenlist==1.5.0
fsspec==2024.2.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
huggingface-hub==0.27.1
idna==3.10
ipykernel==6.29.5
ipython==8.31.0
isoduration==20.11.0
jedi==0.19.2
Jinja2==3.1.3
joblib==1.4.2
json5==0.10.0
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
jupyter-events==0.11.0
jupyter-lsp==2.2.5
jupyter_client==8.6.3
jupyter_core==5.7.2
jupyter_server==2.15.0
jupyter_server_terminals==0.5.3
jupyterlab==4.3.4
jupyterlab_pygmen

In [None]:
!pip install bitsa