In [1]:
%pip install transformers
%pip install sacremoses
%pip install torch
%pip install datasets


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.



In [6]:
from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
import torch

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
model.to('cuda')



BioGptForCausalLM(
  (biogpt): BioGptModel(
    (embed_tokens): BioGptScaledWordEmbedding(42384, 1024, padding_idx=1)
    (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
    (layers): ModuleList(
      (0-23): 24 x BioGptDecoderLayer(
        (self_attn): BioGptSdpaAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((

In [7]:
import torch
torch.cuda.empty_cache()

In [8]:
from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True)

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
df = pd.read_csv('Question_5.csv') 

# Extract the abstracts
abstracts = df['Abstract']
dataset = Dataset.from_dict({"text": abstracts})

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

input_ids = tokenized_dataset['input_ids']
labels = input_ids.clone()

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients
)

train_size = int(0.7 * len(tokenized_dataset))
val_size = int(0.15 * len(tokenized_dataset))
test_size = len(tokenized_dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_dataset, [train_size, val_size, test_size]
)

# Update Trainer with train and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,    # Training set
    eval_dataset=val_dataset,       # Validation set for tuning during training
    data_collator=data_collator,
)

# Train the model
trainer.train()
metrics = trainer.evaluate(test_dataset)
print("Test set evaluation:", metrics)


Map:   0%|          | 0/13 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 13/13 [00:00<00:00, 123.94 examples/s]
Map: 100%|██████████| 13/13 [00:00<00:00, 2599.44 examples/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 6/6 [02:12<00:00, 22.14s/it]


{'train_runtime': 132.8642, 'train_samples_per_second': 0.203, 'train_steps_per_second': 0.045, 'train_loss': 2.5947402318318686, 'epoch': 2.67}


100%|██████████| 1/1 [00:00<00:00, 499.62it/s]

Test set evaluation: {'eval_loss': 2.5468780994415283, 'eval_runtime': 2.4243, 'eval_samples_per_second': 1.237, 'eval_steps_per_second': 0.412, 'epoch': 2.6666666666666665}





In [16]:
prompt = "Lung cancer is"
inputs = tokenizer(prompt, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_length=100,
    num_return_sequences=5, 
    no_repeat_ngram_size=2, 
    do_sample=True,
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Lung cancer is the leading cause of cancer-related death worldwide and is often diagnosed at an advanced stage.
