In [None]:
# Google Drive share link; presumably the training-data archive — TODO confirm.
# NOTE(review): not referenced by any other cell visible in this notebook.
drive_url = "https://drive.google.com/file/d/12bDMoJPop-jID8E8SekZM-PFLNTc8MtL/view?usp=sharing"

In [1]:
# Authenticate with Weights & Biases using the API key stored as the Kaggle
# secret named "wandb_api".
from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()
wandb_key = user_secrets.get_secret("wandb_api")

# Log in through the Python API instead of `! wandb login $wandb_key`, so the
# secret is never expanded onto a shell command line.
wandb.login(key=wandb_key);

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [2]:
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl
!pip install -q pdfplumber
!pip install -q gdown

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.4.1 requires cubinlinker, which is not installed.
cudf 24.4.1 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.4.1 requires ptxcompiler, which is not installed.
cuml 24.4.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 24.4.1 requires cupy-cuda11x>=12.0.0, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 16.1.0 which is incompatible.
beatrix-jupyterlab 2023.128.151533 requires jupyterlab~=3.6.0, but you have jupyterlab 4.2.1 which is incompatible.
cudf 24.4.1 requires cuda-python<12.0a0,>=11.7.1, but you have cuda-python 12.5.0 

In [2]:
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [3]:
import os
import torch
from time import time
from decouple import config
from datasets import load_dataset, load_from_disk
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)
from trl import SFTTrainer,setup_chat_format

2024-07-09 19:17:52.152528: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-09 19:17:52.152644: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-09 19:17:52.292910: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Filesystem path of the base checkpoint passed to the from_pretrained calls below;
# judging by the path, the Llama 3 8B chat model mounted as a Kaggle input — TODO confirm.
model_id = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"

In [None]:
# Quantization settings handed to the model loader: 4-bit NF4 weights with
# double quantization enabled and bfloat16 as the compute dtype.
compute_dtype = torch.bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
# Load the config, the quantized model and the tokenizer from `model_id`,
# and report how long the whole step took.
time_start = time()

model_config = AutoConfig.from_pretrained(
    model_id, trust_remote_code=True, max_new_tokens=1024
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

time_end = time()
print(f"Prepare model, tokenizer: {round(time_end-time_start, 3)} sec.")

In [None]:
# Run TRL's setup_chat_format on the model/tokenizer pair (both are rebound to
# the returned objects), then pass the model through PEFT's
# prepare_model_for_kbit_training.
# NOTE(review): presumably setup_chat_format installs the <|im_start|>/<|im_end|>
# markers that form_llama_3_dataset writes — confirm against the TRL docs.
model, tokenizer = setup_chat_format(model, tokenizer)
model = prepare_model_for_kbit_training(model)

### ================================ DATASET ================================

In [None]:
import os
import pandas as pd
import pdfplumber

In [None]:
def extract_pdf_data(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        str: One entry per page, in page order, separated by newline
        characters. A page whose ``extract_text()`` yields nothing contributes
        an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can return None for a page without extractable text
        # (e.g. a scanned image); str.join would raise TypeError on None.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)

In [None]:
def create_dataset(pdf_dir):
    """Build a prompt/answer table with one row per PDF found in ``pdf_dir``.

    Args:
        pdf_dir: Directory whose direct entries are scanned (no recursion).

    Returns:
        pandas.DataFrame: Column ``"prompt"`` holds the same fixed system
        prompt on every row; column ``"answer"`` holds the text returned by
        ``extract_pdf_data`` for each PDF. Rows are ordered by file name.
    """
    system_prompt = "You have to answer lawyer questions based on this legal data"

    # os.listdir returns entries in arbitrary order, so sort to make the row
    # order reproducible; match the extension case-insensitively so files
    # named "*.PDF" are not silently skipped.
    pdf_names = sorted(
        name for name in os.listdir(pdf_dir) if name.lower().endswith(".pdf")
    )
    pdf_contents = [
        extract_pdf_data(os.path.join(pdf_dir, name)) for name in pdf_names
    ]

    return pd.DataFrame({
        "prompt": [system_prompt] * len(pdf_contents),
        "answer": pdf_contents,
    })

In [None]:
!gdown --id config("DATA_FOLDER")

In [None]:
import zipfile

# Make sure the target folder exists, then unpack the downloaded PDF archive.
os.makedirs("sample_pdfs", exist_ok=True)

with zipfile.ZipFile("/kaggle/working/200-pdfs.zip", mode='r') as archive:
    archive.extractall(path="/kaggle/working/sample_pdfs")

In [None]:
# Build the prompt/answer DataFrame from the PDFs in the extraction folder.
dataset = create_dataset("/kaggle/working/sample_pdfs")

In [None]:
# Spot-check: show the "answer" value stored under index label 0.
dataset['answer'][0]

In [None]:
# Persist the table as an Excel workbook without the index column; the next
# cell reads this file back.
dataset.to_excel("data.xlsx", index=False)

In [None]:
# Hugging Face Dataset class. NOTE: this rebinds `Dataset`, shadowing the
# torch.utils.data.Dataset imported near the top of the notebook.
from datasets import Dataset

# Reload the workbook written by the previous cell and convert it to a
# Hugging Face Dataset, replacing the pandas `dataset` binding.
excel_file = "data.xlsx"
df = pd.read_excel(excel_file)

dataset = Dataset.from_pandas(df)
dataset.save_to_disk("training_data")

# Show the dataset summary. It has to be the cell's last expression: a bare
# name in the middle of a cell is evaluated but never rendered.
dataset

In [None]:
# saved_dataset = load_from_disk("training_data")
# saved_dataset

In [None]:
# Keep the tokenizer's end-of-sequence token in a module-level constant.
# NOTE(review): not referenced by any other cell visible in this notebook.
EOS_TOKEN = tokenizer.eos_token

In [None]:
def form_llama_3_dataset(hf_data_format):
    """Render each prompt/answer record as a single chat-formatted string.

    Args:
        hf_data_format: Iterable of mappings with ``"prompt"`` and ``"answer"``
            keys (for example the rows of a Hugging Face ``Dataset``).

    Returns:
        list[str]: One string per record, in input order. Each consists of a
        system turn holding the prompt followed by an assistant turn holding
        the answer, both wrapped in ``<|im_start|>`` / ``<|im_end|>`` markers.
    """
    # The system turn must close with the complete "<|im_end|>" marker. The
    # earlier version wrote "|im_end|>" (missing "<"), which is plain text
    # rather than the end-of-turn marker used for the assistant turn.
    return [
        f"<|im_start|>system\n{record['prompt']}<|im_end|>"
        f"<|im_start|>assistant\n{record['answer']}<|im_end|>"
        for record in hf_data_format
    ]

In [None]:
# Render every dataset row into its chat-formatted training string.
formatted_messages = form_llama_3_dataset(dataset)

In [None]:
# Number of formatted examples.
len(formatted_messages)

In [None]:
# Attach the formatted strings as a "text" column; the trainer below reads this field.
# NOTE(review): rebinding `dataset` makes this cell non-idempotent — re-running it
# once "text" exists presumably fails; confirm.
dataset = dataset.add_column('text', formatted_messages)

In [None]:
# Display the dataset after adding the "text" column.
dataset

In [None]:
# Write the dataset, including the "text" column, to the "final_training_data" folder.
dataset.save_to_disk("final_training_data")

In [None]:
# Reload the saved dataset from disk, rebinding `dataset`.
dataset = load_from_disk("final_training_data")

### ============================= Training Configure =============================

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank 4, alpha 64,
# dropout 0.05, no bias terms, applied to the listed projection modules.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

In [None]:
# Trainer hyper-parameters. Evaluation stays switched off; the options that
# would enable it are kept here for reference:
#     evaluation_strategy="steps", do_eval=True, eval_steps=1
training_arguments = TrainingArguments(
    output_dir="./results_llama3_sft/",
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_8bit",
    learning_rate=8e-6,
    lr_scheduler_type="linear",
    warmup_steps=3,
    # Batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    # Run length. NOTE(review): both limits are set; per the Trainer docs a
    # positive max_steps takes precedence over num_train_epochs — confirm.
    max_steps=20,
    num_train_epochs=20,
    # Logging and checkpointing, both on every step.
    log_level="debug",
    logging_steps=1,
    save_steps=1,
)

In [None]:
# Set WANDB_DISABLED to the string "false" for this process;
# presumably this keeps Weights & Biases reporting enabled — TODO confirm.
os.environ["WANDB_DISABLED"] = "false"

In [None]:
# import os
# os.environ["HF_TOKEN"] = "<REDACTED>"  # never hard-code tokens; load them from Kaggle secrets

# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
# dataset.push_to_hub("yusuf802/legal-llama3-data")

In [None]:
# dataset_name = "yusuf802/legal-llama3-data"
# dataset = load_dataset(dataset_name)

In [None]:
# Display the dataset that will be handed to the trainer.
dataset

In [None]:
# Select the training split. `dataset` is a single Dataset when it comes from
# load_from_disk("final_training_data"), so indexing it with "train" would look
# up a column that does not exist. It is a DatasetDict (a dict subclass) with a
# "train" split only on the commented-out load_dataset(...) path. Handle both.
train_dataset = dataset["train"] if isinstance(dataset, dict) else dataset

In [None]:
# Supervised fine-tuning trainer: wraps the prepared model with the LoRA
# config and trains on the "text" field of `dataset`.
# NOTE(review): max_seq_length=512 while each example holds a whole PDF's
# text — presumably only the start of each document is used; confirm.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
)

In [None]:
# Run the fine-tuning loop with the arguments configured above.
trainer.train()

In [None]:
# Display the trainer object's repr.
trainer