In [None]:
# 🛠️ Packages

!pip install -q \
    accelerate==0.29.2 \
    aiohttp==3.9.5 \
    aiosignal==1.3.1 \
    async-timeout==4.0.3 \
    attrs==23.2.0 \
    bitsandbytes==0.43.1 \
    certifi==2024.2.2 \
    charset-normalizer==3.3.2 \
    click==8.1.7 \
    colorama==0.4.6 \
    datasets==2.18.0 \
    dill==0.3.8 \
    filelock==3.13.4 \
    frozenlist==1.4.1 \
    fsspec==2024.2.0 \
    huggingface-hub==0.22.2 \
    idna==3.7 \
    jinja2==3.1.3 \
    joblib==1.4.0 \
    markupsafe==2.1.5 \
    mpmath==1.3.0 \
    multidict==6.0.5 \
    multiprocess==0.70.16 \
    networkx==3.2.1 \
    nltk==3.8.1 \
    numpy==1.26.4 \
    packaging==24.0 \
    pandas==2.2.2 \
    peft==0.10.0 \
    psutil==5.9.8 \
    pyarrow==15.0.2 \
    pyarrow-hotfix==0.6 \
    python-dateutil==2.9.0.post0 \
    pytz==2024.1 \
    pyyaml==6.0.1 \
    regex==2023.12.25 \
    requests==2.31.0 \
    safetensors==0.4.3 \
    six==1.16.0 \
    sympy==1.12 \
    tokenizers==0.15.2 \
    torch==2.2.2 \
    tqdm==4.66.2 \
    transformers==4.39.3 \
    typing-extensions==4.11.0 \
    urllib3==2.2.1 \
    xxhash==3.4.1 \
    yarl==1.9.4


In [None]:
# NOTE(review): numpy==1.26.4 is already pinned in the install cell above; presumably this
# re-run forces the pin after the other installs — confirm whether this cell is still needed.
!pip install numpy==1.26.4

In [None]:
# Log in to the Hugging Face Hub. login() is called without arguments, so no access
# token is hard-coded in the notebook.
from huggingface_hub import login
login()

In [None]:
import torch

# Report whether PyTorch can see a CUDA device.
cuda_available = torch.cuda.is_available()
print("CUDA:", cuda_available)
# get_device_name(0) raises when no CUDA device is present, so only query it when one is.
print("GPU:", torch.cuda.get_device_name(0) if cuda_available else "not available")

In [None]:
import numpy as np
import pandas as pd

# Show the installed versions so they can be compared against the pins in the install cell.
print("NumPy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))

Import Libraries

In [None]:
# Import libraries
import bitsandbytes as bnb
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed,  BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments, LlamaTokenizer, EarlyStoppingCallback
from datasets import load_dataset
import random
import pandas as pd

# Fix the random seed through transformers' set_seed. The same `seed` value is passed on
# later to preprocess_dataset, which uses it to shuffle the tokenized datasets.
seed = 42
set_seed(seed)

Tokenizer and Quantization

In [None]:
def create_bnb_config():
    """Build the bitsandbytes quantization settings used when loading the base model.

    Returns:
        BitsAndBytesConfig: 4-bit loading with the "nf4" quantization type, double
        quantization enabled, and torch.bfloat16 as the compute dtype.
    """
    quantization_options = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    return BitsAndBytesConfig(**quantization_options)

def load_model(model_name, bnb_config):
    """Load a causal language model with the given quantization config, plus its tokenizer.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.
        bnb_config: Quantization config forwarded as ``quantization_config``.

    Returns:
        tuple: ``(model, tokenizer)``. The tokenizer reuses its EOS token as the pad
        token and pads on the right.
    """
    gpu_count = torch.cuda.device_count()
    print("N GPUS:", gpu_count)

    load_kwargs = {"quantization_config": bnb_config, "device_map": "auto"}
    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # Padding reuses the EOS token and is applied on the right-hand side.
    tokenizer.padding_side = "right"
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer

Prompt Format and Dataset

In [None]:
def create_prompt_formats(sample):
    """Attach an ``[INST]``-style training prompt to a dataset row.

    The prompt consists of the fixed instruction, a blank line, the row's sentence and,
    after the closing ``[/INST]`` marker, the expected keyword answer.

    Args:
        sample: Mapping that provides the ``"Sentence"`` and ``"Keywords"`` fields.

    Returns:
        The same ``sample`` object, with the prompt stored under the ``"text"`` key.
    """
    instruction = "What is the key word that represents the interaction between the proteins which are tagged with [Protein1] and [Protein2] in the given sentence?"
    # Real newlines are required here: the doubled backslashes used previously wrote the
    # two characters backslash + "n" into every training prompt instead of a blank line.
    sample["text"] = f"<s>[INST] {instruction}\n\n{sample['Sentence']} [/INST] {sample['Keywords']} </s>"
    return sample

def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the ``"text"`` field of a batch with truncation enabled.

    Args:
        batch: Mapping whose ``"text"`` entry holds the prompts to tokenize.
        tokenizer: Callable tokenizer; receives the texts plus ``max_length`` and
            ``truncation=True``.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded

def preprocess_dataset(tokenizer, max_length, seed, dataset):
    """Format, tokenize, length-filter and shuffle a dataset for training.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_batch``.
        max_length: Truncation length; samples whose token count is not strictly
            below this value are removed.
        seed: Seed used for the final shuffle.
        dataset: Dataset whose rows are accepted by ``create_prompt_formats``.

    Returns:
        The processed dataset, shuffled with ``seed``.
    """
    dataset = dataset.map(create_prompt_formats)
    # Guard the preview so an empty dataset does not fail on dataset[0]; a real newline is
    # used so the sample starts on its own line instead of after a literal backslash-n.
    if len(dataset) > 0:
        print("Sample prompt:\n", dataset[0]["text"])
    tokenize = partial(preprocess_batch, tokenizer=tokenizer, max_length=max_length)
    dataset = dataset.map(tokenize, batched=True)
    # Tokenization truncates at max_length, so this drops every sample that reached the cap.
    dataset = dataset.filter(lambda row: len(row["input_ids"]) < max_length)
    return dataset.shuffle(seed=seed)

LoRA Settings

In [None]:
def create_peft_config(target_modules):
    """Build the LoRA configuration for causal language model fine-tuning.

    Args:
        target_modules: Names of the modules that LoRA adapters are attached to.

    Returns:
        LoraConfig: rank 8, alpha 32, no dropout, no bias training, task type
        "CAUSAL_LM".
    """
    lora_options = {
        "r": 8,
        "lora_alpha": 32,
        "target_modules": target_modules,
        "lora_dropout": 0.0,
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }
    return LoraConfig(**lora_options)

def find_all_linear_names(model, exclude=("lm_head",)):
    """Collect the leaf names of every ``torch.nn.Linear`` module in ``model``.

    The leaf name is the last dotted component of the module path (for example
    ``"q_proj"`` for ``"layers.0.attn.q_proj"``).

    Args:
        model: Module whose ``named_modules()`` are scanned.
        exclude: Leaf names to leave out. Defaults to the output head ``"lm_head"``,
            so adapters are attached to the inner linear layers only (the same
            choice PEFT makes for ``target_modules="all-linear"``). Pass ``()`` to
            keep every linear layer.

    Returns:
        list[str]: Unique leaf names in sorted order, so the result is the same on
        every run (a list built directly from a set has no stable order).
    """
    linear_cls = torch.nn.Linear
    skipped = set(exclude)
    leaf_names = {
        name.split(".")[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }
    return sorted(leaf_names - skipped)

Train Function

In [None]:
def train(model, tokenizer, train_dataset, val_dataset, output_dir):
    """Fine-tune ``model`` with LoRA adapters and save the result to ``output_dir``.

    Steps: enable gradient checkpointing, prepare the model for k-bit training, attach
    LoRA adapters to the modules reported by ``find_all_linear_names``, print the
    trainable parameter share, train with step-wise evaluation and early stopping, then
    save the model and tokenizer.

    Args:
        model: Base model to adapt.
        tokenizer: Tokenizer used by the language-modeling collator and saved with the model.
        train_dataset: Tokenized training split.
        val_dataset: Tokenized validation split used for evaluation and early stopping.
        output_dir: Directory for checkpoints and the final model/tokenizer.
    """
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    modules = find_all_linear_names(model)
    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")

    # Evaluate and checkpoint on the same schedule. With save_steps left at its default,
    # no checkpoint would exist for most evaluations, so load_best_model_at_end could not
    # restore the best one.
    eval_every = 5

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,
        num_train_epochs=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        # NOTE(review): create_bnb_config uses bfloat16 as the 4-bit compute dtype while
        # mixed precision here is fp16 — confirm this combination is intended.
        fp16=True,
        logging_steps=1,
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=eval_every,
        save_steps=eval_every,
        # Frequent checkpoints add up; keep only the most recent ones plus the best.
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",  # required by EarlyStoppingCallback
        # The string "none" disables logging integrations; the Python value None does not.
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
    )

    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

Load Dataset and Train

In [None]:
# Dataset
dataset = load_dataset("bengisucam/LLL_INO-tagged", split="train")
# Seed the split explicitly so the train/validation partition is the same on every run
# and does not depend on how much global RNG state earlier cells have consumed.
train_test_dataset = dataset.train_test_split(test_size=0.1, seed=seed)
train_dataset = train_test_dataset["train"]
val_dataset = train_test_dataset["test"]

# Model and tokenizer
model_name = "meta-llama/Llama-2-13b-chat-hf"  # "mistralai/Mistral-7B-Instruct-v0.1"
bnb_config = create_bnb_config()
model, tokenizer = load_model(model_name, bnb_config)

# Preprocessing
max_length = 1024
train_dataset_processed = preprocess_dataset(tokenizer, max_length, seed, train_dataset)
val_dataset_processed = preprocess_dataset(tokenizer, max_length, seed, val_dataset)

# Train
output_dir = "results/llama2_colab/"
train(model, tokenizer, train_dataset_processed, val_dataset_processed, output_dir)

Test the Fine-Tuned Model

In [None]:
import logging
import os
from datetime import datetime as dt

import torch
from datasets import load_dataset
from peft import  AutoPeftModelForCausalLM
from transformers import  LlamaTokenizer, set_seed

In [None]:
# Reproducibility: fix the random seed before running the evaluation below.
seed = 42
set_seed(seed)

def create_prompt_formats_for_test(sample):
    """
    Build the inference prompt for one test sample and store it under sample["text"].

    The prompt has four sections joined by blank lines: an intro blurb, the instruction,
    the input sentence, and an empty response header.

    NOTE(review): this "### Instruction / ### Input / ### Response" layout differs from the
    "[INST] ... [/INST]" layout that create_prompt_formats builds for training — confirm
    the mismatch is intended.

    :param sample: Sample dictionary; must contain the 'Sentence' field
    :return: The same sample, with the formatted prompt added as 'text'
    """
    intro = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

    # The instruction refers to the proteins through their [Protein1]/[Protein2] tags.
    # Untagged alternative: name the proteins via sample["Gene1"] and sample["Gene2"] instead.
    instruction = "### Instruction: What is the key word that represents the interaction between the proteins which are tagged with [Protein1] and [Protein2] in the given sentence?"

    sentence_section = "### Input:\n{}".format(sample["Sentence"])   # Sentence, passage
    response_section = "### Response:\n"

    # Every section is non-empty by construction, so all four are always joined.
    sections = (intro, instruction, sentence_section, response_section)
    sample["text"] = "\n\n".join(sections)
    return sample

In [None]:
# NOTE(review): the log file name says "7B" while the training cell fine-tunes a 13b
# checkpoint — confirm the name is still accurate.
log_path = "finetune_results/finetuned-7B-chat-test-5.log"
# Create the log directory first; basicConfig fails with FileNotFoundError if it is missing.
os.makedirs(os.path.dirname(log_path), exist_ok=True)
# force=True replaces handlers already attached to the root logger. Without it basicConfig
# silently does nothing in that situation and no message reaches the log file.
logging.basicConfig(filename=log_path, level=logging.INFO, force=True)
logging.info(f"({dt.now().strftime('%d/%m/%Y %H:%M:%S')})| START")


test_on_lll=True

# Specify device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## TEST Finetuned Model From Checkpoint ##
tmp_model_path = "results/llama2_colab"
print("Loading the checkpoint in a Llama model.")
model = AutoPeftModelForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(device)
tokenizer = LlamaTokenizer.from_pretrained(tmp_model_path, use_fast=False)

## check the total model parameters
print(sum(p.numel() for p in model.parameters()))

if test_on_lll:
    test_dataset = load_dataset("bengisucam/LLL_INO-tagged", split="test")
else:
    test_dataset = load_dataset("bengisucam/HPRD50_true_only_tagged", split="test")
    print(test_dataset[:2])
    test_dataset = test_dataset.filter(lambda example: example["isValid"]==True)
    print(test_dataset[:2])

print(len(test_dataset))
# Add prompt to each sample
print("Preprocessing dataset...")
dataset = test_dataset.map(create_prompt_formats_for_test)  # , batched=True)
print(len(dataset))


# Iterate over the rows directly so each example is fetched once instead of once per field.
for i, example in enumerate(dataset):
    # Specify input
    text = example["text"]
    sentence_id = example["Unnamed: 0"]

    # Tokenize input text and move the tensors to the target device
    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Get answer
    # (Adjust max_new_tokens as you wish: it is the maximum number of tokens the model can generate)
    outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"],
                             max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)

    print("EXAMPLE ", i+1)
    # Decode output & print it (the decoded sequence includes the prompt tokens)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("Sentence Id: ", sentence_id)
    print(response)
    print("##############################################################################")
    logging.info("Sentence Id: %s, Response: %s  .\n\n", sentence_id, response)