In [None]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import torch
from transformers import AutoTokenizer , AutoModelForCausalLM , BitsAndBytesConfig

MODEL_HF_ID = "tiiuae/falcon-7b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True , 
    bnb_4bit_use_double_quant=True  , 
    bnb_4bit_quant_type="nf4" , 
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [4]:

tokenizer = AutoTokenizer.from_pretrained(MODEL_HF_ID , trust_remote_code = True )
model = AutoModelForCausalLM.from_pretrained(MODEL_HF_ID , quantization_config = bnb_config , device_map="auto" , trust_remote_code = True)


In [5]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [7]:
model.lm_head

Linear(in_features=4544, out_features=65024, bias=False)

In [8]:
from torch import nn
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [9]:
model

RWForCausalLM(
  (transformer): RWModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x DecoderLayer(
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
        (self_attention): Attention(
          (maybe_rotary): RotaryEmbedding()
          (query_key_value): Linear4bit(in_features=4544, out_features=4672, bias=False)
          (dense): Linear4bit(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): MLP(
          (dense_h_to_4h): Linear4bit(in_features=4544, out_features=18176, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): Linear4bit(in_features=18176, out_features=4544, bias=False)
        )
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): CastOutputToFloat(
    (0): Linear(in_features=4544, out_features=65024, bias=False)
  )
)

In [10]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM" , 
     target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 16318464 || all params: 3625063296 || trainable%: 0.4501566639679441


In [11]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files={
    "train": "./Dataset/train.csv" , 
    'test': "./Dataset/test.csv" , 
    'valid' : './Dataset/valid.csv'
    })


Found cached dataset csv (/home/ahora/.cache/huggingface/datasets/csv/default-4caa2e3dddaa112f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
dataset['train'][0]

{'Unnamed: 0': 0,
 'Abstract': 'Catecholamine-depleting drugs (eg, reserpine) may have an additive effect when given with beta-blocking agents. Patients treated with TENORMIN plus a catecholamine depletor should therefore be closely observed for evidence of hypotension and/or marked bradycardia which may produce vertigo, syncope, or postural hypotension. Calcium channel blockers may also have an additive effect when given with TENORMIN . Beta blockers may exacerbate the rebound hypertension which can follow the withdrawal of clonidine. If the two drugs are coadministered, the beta blocker should be withdrawn several days before the gradual withdrawal of clonidine. If replacing clonidine by beta-blocker therapy, the introduction of beta blockers should be delayed for several days after clonidine administration has stopped. Concomitant use of prostaglandin synthase inhibiting drugs, eg, indomethacin, may decrease the hypotensive effects of beta blockers. Information on concurrent usage o

In [13]:
from pprint import pprint
def merge_columns(example):
    example["prediction"] = example["Abstract"] + " ->: " + str(example["Relations"])
    return example

dataset['train'] = dataset['train'].map(merge_columns)
dataset['test'] = dataset['test'].map(merge_columns)
dataset['valid'] = dataset['valid'].map(merge_columns)
pprint(dataset['train']["prediction"][:1])
pprint(dataset['test']["prediction"][:1])
pprint(dataset['valid']["prediction"][:1])

Loading cached processed dataset at /home/ahora/.cache/huggingface/datasets/csv/default-4caa2e3dddaa112f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-7626c92e47fdcc1c.arrow
Loading cached processed dataset at /home/ahora/.cache/huggingface/datasets/csv/default-4caa2e3dddaa112f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-45288de2ad65f53f.arrow
Loading cached processed dataset at /home/ahora/.cache/huggingface/datasets/csv/default-4caa2e3dddaa112f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-1e1f40f73e403fa0.arrow


['Catecholamine-depleting drugs (eg, reserpine) may have an additive effect '
 'when given with beta-blocking agents. Patients treated with TENORMIN plus a '
 'catecholamine depletor should therefore be closely observed for evidence of '
 'hypotension and/or marked bradycardia which may produce vertigo, syncope, or '
 'postural hypotension. Calcium channel blockers may also have an additive '
 'effect when given with TENORMIN . Beta blockers may exacerbate the rebound '
 'hypertension which can follow the withdrawal of clonidine. If the two drugs '
 'are coadministered, the beta blocker should be withdrawn several days before '
 'the gradual withdrawal of clonidine. If replacing clonidine by beta-blocker '
 'therapy, the introduction of beta blockers should be delayed for several '
 'days after clonidine administration has stopped. Concomitant use of '
 'prostaglandin synthase inhibiting drugs, eg, indomethacin, may decrease the '
 'hypotensive effects of beta blockers. Information on 

In [14]:
dataset = dataset.map(lambda samples: tokenizer(samples['prediction'] , max_length=2048 , truncation=True), batched=True)

Loading cached processed dataset at /home/ahora/.cache/huggingface/datasets/csv/default-4caa2e3dddaa112f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-f61f1de940ffe691.arrow
Loading cached processed dataset at /home/ahora/.cache/huggingface/datasets/csv/default-4caa2e3dddaa112f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-9bcb954d79e509a7.arrow
Loading cached processed dataset at /home/ahora/.cache/huggingface/datasets/csv/default-4caa2e3dddaa112f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-be1143ecb1d9ec25.arrow


In [15]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Abstract', 'Relations', 'prediction', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 469
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Abstract', 'Relations', 'prediction', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 191
    })
    valid: Dataset({
        features: ['Unnamed: 0', 'Abstract', 'Relations', 'prediction', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 41
    })
})

In [16]:
print(dataset['train']['input_ids'][0])

[46, 375, 47037, 11759, 24, 8642, 274, 802, 6192, 204, 19, 12849, 23, 560, 246, 33550, 20, 724, 413, 267, 41367, 1334, 635, 2132, 335, 13220, 24, 54587, 7836, 25, 19705, 7016, 335, 301, 798, 1951, 22241, 2383, 241, 28431, 47037, 11759, 336, 12709, 252, 808, 4859, 314, 8285, 7790, 312, 3941, 275, 30194, 2874, 273, 26, 252, 9673, 809, 3968, 10006, 494, 585, 724, 4634, 7686, 10280, 23, 18362, 1419, 23, 379, 1106, 2240, 30194, 2874, 25, 50666, 6443, 60032, 724, 614, 413, 267, 41367, 1334, 635, 2132, 335, 301, 798, 1951, 22241, 204, 25, 26455, 60032, 724, 26701, 375, 248, 33346, 24841, 585, 418, 1122, 248, 15204, 275, 514, 245, 32720, 25, 972, 248, 847, 6192, 362, 739, 34906, 1263, 23, 248, 13220, 50602, 808, 314, 33309, 1988, 1522, 996, 248, 28237, 15204, 275, 514, 245, 32720, 25, 972, 15951, 514, 245, 32720, 431, 13220, 24, 8575, 246, 6030, 23, 248, 9705, 275, 13220, 60032, 808, 314, 16648, 312, 1988, 1522, 852, 514, 245, 32720, 6354, 504, 5954, 25, 1412, 994, 25023, 745, 275, 11253, 353,

In [17]:
import transformers

# needed for gpt-neo-x tokenizer
tokenizer.pad_token = tokenizer.eos_token

In [18]:
len(dataset['train'])

469

In [19]:

trainer = transformers.Trainer(
    model=model,
    train_dataset=dataset["train"],
    # eval_dataset = dataset['valid'] , 
    args=transformers.TrainingArguments(
        do_eval = False , 
        auto_find_batch_size = True , 
        # per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=50,
        num_train_epochs=10,
        learning_rate=2e-4,
        bf16=True,
        logging_steps=10,
        output_dir="outputs",
        optim="paged_adamw_8bit" , 
        dataloader_drop_last= True , 
        # eval_steps = 100
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [20]:
trainer.train()

  0%|          | 0/140 [00:00<?, ?it/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/290 [00:00<?, ?it/s]

  0%|          | 0/580 [00:00<?, ?it/s]

{'loss': 1.6708, 'learning_rate': 5.2000000000000004e-05, 'epoch': 0.17}
{'loss': 1.4578, 'learning_rate': 9.200000000000001e-05, 'epoch': 0.34}
{'loss': 1.2144, 'learning_rate': 0.000132, 'epoch': 0.51}
{'loss': 1.2381, 'learning_rate': 0.000172, 'epoch': 0.68}
{'loss': 1.1776, 'learning_rate': 0.00019333333333333333, 'epoch': 0.85}
{'loss': 1.1932, 'learning_rate': 0.0001711111111111111, 'epoch': 1.03}
{'loss': 1.0665, 'learning_rate': 0.0001488888888888889, 'epoch': 1.2}
{'loss': 1.0587, 'learning_rate': 0.00012666666666666666, 'epoch': 1.37}
{'loss': 1.0676, 'learning_rate': 0.00010444444444444445, 'epoch': 1.54}
{'loss': 1.1134, 'learning_rate': 8.222222222222222e-05, 'epoch': 1.71}
{'loss': 1.0081, 'learning_rate': 6e-05, 'epoch': 1.88}
{'loss': 1.0023, 'learning_rate': 3.777777777777778e-05, 'epoch': 2.05}
{'loss': 0.9774, 'learning_rate': 1.5555555555555555e-05, 'epoch': 2.22}
{'loss': 0.8137, 'learning_rate': 0.0, 'epoch': 2.39}
{'loss': 0.8324, 'learning_rate': 0.0, 'epoch': 

TrainOutput(global_step=580, training_loss=0.9256400059009421, metrics={'train_runtime': 4996.7766, 'train_samples_per_second': 0.939, 'train_steps_per_second': 0.116, 'train_loss': 0.9256400059009421, 'epoch': 9.91})

In [21]:
save_directory = "./Model_Saves"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./Model_Saves/tokenizer_config.json',
 './Model_Saves/special_tokens_map.json',
 './Model_Saves/tokenizer.json')

In [22]:
test_prompt = "Although clinical studies have not established a cause and effect relationship, physicians should be aware that variable effects an blood coagulation have been reported very rarely in patients receiving oral anticoagulants and chlordiazepoxide. The concomitant use of alcohol or other central nervous system depressants may have an additive effect."

In [44]:
inputs = tokenizer.encode_plus(test_prompt, return_tensors="pt" )

In [45]:
input_ids = inputs["input_ids"].to('cuda')
attention_mask = inputs["attention_mask"].to('cuda')

In [49]:
with torch.no_grad():
    output = model.generate(input_ids =input_ids , attention_mask = attention_mask , max_length=512, num_return_sequences=1)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


In [50]:
output[0]

tensor([ 8127,  6077,  4048,   413,   416,  5305,   241,  2887,   273,  1334,
         2392,    23, 16923,   808,   314,  3671,   325,  9647,  3524,   267,
         2902,   739, 51627,   413,   650,  3713,   829, 10803,   272,  2634,
         7537, 10720, 54788,   353,   338,  1221,   273,   425, 42322, 19383,
          538, 24862,    25,   390, 59819, 25023,   745,   275,  7185,   379,
          599,  5213, 10886,  1092, 51929,  1221,   724,   413,   267, 41367,
         1334,    25,   204,  1579,    37,   258,    19,   425, 42322, 19383,
          538, 24862,   204,    23,  1334,    74,    47, 10211,   204,    23,
        54788,   353,   338,  1221,   204,    20,   204,    24,   258,    19,
          425, 42322, 19383,   538, 24862,   204,    23,  1334,    74,    47,
        10211,   204,    23,  5213, 10886,  1092, 51929,  1221,   204,    20,
          204,    24,   258,    19,  7185,   204,    23,  1334,    74,    47,
        10211,   204,    23,   425, 42322, 19383,   538, 24862, 

In [51]:
generated_text = tokenizer.decode(output[0])
print(generated_text)

Although clinical studies have not established a cause and effect relationship, physicians should be aware that variable effects an blood coagulation have been reported very rarely in patients receiving oral anticoagulants and chlordiazepoxide. The concomitant use of alcohol or other central nervous system depressants may have an additive effect. ->:  ( chlordiazepoxide, effect_DDI, anticoagulants ) -  ( chlordiazepoxide, effect_DDI, central nervous system depressants ) -  ( alcohol, effect_DDI, chlordiazepoxide ) -  ( alcohol, effect_DDI, central nervous system depressants ) -  ( central nervous system depressants, effect_DDI, chlordiazepoxide ) -  ( alcohol, effect_DDI, anticoagulants ) -  ( alcohol, effect_DDI, central nervous system depressants ) -  ( alcohol, effect_DDI, chlordiazepoxide ) -  ( alcohol, effect_DDI, anticoagulants ) -  ( alcohol, effect_DDI, central nervous system depressants ) -  ( alcohol, effect_DDI, chlordiazepoxide ) -  ( central nervous system depressants, ef