In [1]:
%pip install transformers
%pip install sacremoses
%pip install torch
%pip install datasets



Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [9]:
from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
import torch

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
model.to('cuda')



BioGptForCausalLM(
  (biogpt): BioGptModel(
    (embed_tokens): BioGptScaledWordEmbedding(42384, 1024, padding_idx=1)
    (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
    (layers): ModuleList(
      (0-23): 24 x BioGptDecoderLayer(
        (self_attn): BioGptSdpaAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((

In [3]:
import torch
torch.cuda.empty_cache()

In [8]:
from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=1024)

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
df = pd.read_csv('Question_1.csv') 

# Extract the abstracts
abstracts = df['Abstract']
abstracts = abstracts[:50]
print(abstracts)

dataset = Dataset.from_dict({"text": abstracts})
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

input_ids = tokenized_dataset['input_ids']
labels = input_ids.clone()

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    seed=42,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

print(tokenized_dataset)

train_size = 30
val_size = 10
test_size = 10

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_dataset, [train_size, val_size, test_size]
)

# Update Trainer with train and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,    # Training set
    eval_dataset=val_dataset,       # Validation set for tuning during training
    data_collator=data_collator,
)

# Train the model
trainer.train()
metrics = trainer.evaluate(test_dataset)
print("Test set evaluation:", metrics)




0     Osimertinib is used as the first-line treatmen...
1     We wanted to present a rare case of metastatic...
2     Chemotherapy in an integral part of cancer tre...
3     DNA methylation plays a regulatory role in the...
4     Lung cancer is the leading cause of cancer-rel...
5     Diffuse large B-cell lymphoma is the most comm...
6     To evaluate the efficacy and safety of program...
7     Breast cancer (BC) is the most frequently occu...
8     Breast cancer has the potential to metastasize...
9     Bronchiolitis obliterans syndrome (BOS) occurr...
10    For drugs with a narrow therapeutic window, th...
11    The IROC head and neck phantom is used to cred...
12    To compare the efficacy of first-line regimens...
13    This systematic review and network meta-analys...
14    Accurate staging improves lung cancer survival...
15    This research aimed to systematically uncover ...
16    Lymph node metastasis (LNM) plays a crucial ro...
17    Lung cancer is a leading cause of cancer-r

Map: 100%|██████████| 50/50 [00:00<00:00, 243.54 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 3697.70 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})


100%|██████████| 21/21 [05:40<00:00, 16.20s/it]


{'train_runtime': 340.2966, 'train_samples_per_second': 0.264, 'train_steps_per_second': 0.062, 'train_loss': 2.0035483950660344, 'epoch': 2.8}


100%|██████████| 2/2 [00:01<00:00,  1.33it/s]

Test set evaluation: {'eval_loss': 2.1885786056518555, 'eval_runtime': 13.9675, 'eval_samples_per_second': 0.716, 'eval_steps_per_second': 0.143, 'epoch': 2.8}





In [5]:
print(test_dataset)

prompt = "Treatment for lung cancer includes"
inputs = tokenizer(prompt, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_length=128,
    num_return_sequences=5, 
    no_repeat_ngram_size=2, 
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)

for i in range(5):
    generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
    print(generated_text)


<torch.utils.data.dataset.Subset object at 0x00000156B93948E0>
Treatment for lung cancer includes surgery, radiation therapy and chemotherapy. The role of radiation in the management of non-small-cell lung carcinoma. A multidisciplinary approach. Treatment strategies, the role and the value of the radiation oncologist. This article reviews the current state of lung-cancer treatment and discusses the indications, clinical role, and value for radiation oncologists. Our purpose is to provide a framework for multidisciplinary team (MDT) management for patients with lung cancers. We present a comprehensive review of treatment for non small cell carcinoma (NSCLC) in an MDT setting. Lung-care is a multidisciplinary effort that encompasses the patient, family, oncology,
Treatment for lung cancer includes chemotherapy, radiation, and targeted therapy. The first-line treatment options for non-small-cell lung carcinoma include cisplatin-based doublet chemotherapy and platinum-free doublet therapy

In [6]:
import torch
torch.cuda.empty_cache()

from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=1024)

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
df = pd.read_csv('Question_2.csv') 

# Extract the abstracts
abstracts = df['Abstract']
abstracts = abstracts[:50]
print(abstracts)

dataset = Dataset.from_dict({"text": abstracts})
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

input_ids = tokenized_dataset['input_ids']
labels = input_ids.clone()

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    seed=42,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

print(tokenized_dataset)

train_size = 30
val_size = 10
test_size = 10

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_dataset, [train_size, val_size, test_size]
)

# Update Trainer with train and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,    # Training set
    eval_dataset=val_dataset,       # Validation set for tuning during training
    data_collator=data_collator,
)

# Train the model
trainer.train()
metrics = trainer.evaluate(test_dataset)
print("Test set evaluation:", metrics)



0     Real-world data are limited for patients with ...
1     Risk management and self-management strategies...
2     Postoperative pancreatic fistula (POPF) contin...
3     We aim to develop a predictive model for lymph...
4     Breast cancer is the most common invasive canc...
5     The titled molecule 2-Amino-N-(2-chloro-6-meth...
6     In resource-limited settings, data regarding t...
7     Chemotherapy in an integral part of cancer tre...
8     STK11 germline pathogenic variants are typical...
9                Background Methods Results Conclusions
10             Introduction Methods Results Conclusions
11    The presented study depicts the synthesis of 1...
12    Breast cancer is one of the most common cancer...
13    Breast cancer ranks as the second most prevale...
14    As the spatial resolution of positron emission...
15    We aimed to investigate the impact of social c...
16    Breast cancer (BC) is the most frequently occu...
17    Breast Cancer (BC) poses significant chall

Map: 100%|██████████| 50/50 [00:00<00:00, 274.74 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 4165.73 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})


100%|██████████| 21/21 [06:14<00:00, 17.83s/it]


{'train_runtime': 374.4316, 'train_samples_per_second': 0.24, 'train_steps_per_second': 0.056, 'train_loss': 2.340392521449498, 'epoch': 2.8}


100%|██████████| 2/2 [00:23<00:00, 11.97s/it]

Test set evaluation: {'eval_loss': 2.2494518756866455, 'eval_runtime': 23.9823, 'eval_samples_per_second': 0.417, 'eval_steps_per_second': 0.083, 'epoch': 2.8}





In [7]:
print(test_dataset)

prompt = "Treatment for breast cancer includes"
inputs = tokenizer(prompt, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_length=128,
    num_return_sequences=5, 
    no_repeat_ngram_size=2, 
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)

for i in range(5):
    generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
    print(generated_text)


<torch.utils.data.dataset.Subset object at 0x00000157017E7A30>
Treatment for breast cancer includes surgery, chemotherapy and / or radiation, and is usually administered over a course of several weeks. (1) Despite being the leading cause of cancer death in women worldwide, the management of breast cancers has evolved dramatically over the past decades. The development of targeted therapy has improved the survival of patients with breast carcinoma; however, it has also increased the risk of drug-related adverse events (ADRs), which can limit the use of these drugs and cause unnecessary side effects. In this context, understanding the causes of ADRs is crucial for the development and clinical use, as well as for improving drug safety. We conducted
Treatment for breast cancer includes surgery, chemotherapy, and hormone therapy. An understanding of breast anatomy and the breast and its surrounding structures is essential for surgical planning. The aim of this study was to provide a compreh

In [8]:
import torch
torch.cuda.empty_cache()

from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=1024)

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
df = pd.read_csv('Question_3.csv') 

# Extract the abstracts
abstracts = df['Abstract']
abstracts = abstracts[:50]
print(abstracts)

dataset = Dataset.from_dict({"text": abstracts})
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

input_ids = tokenized_dataset['input_ids']
labels = input_ids.clone()

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    seed=42,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

print(tokenized_dataset)

train_size = 30
val_size = 10
test_size = 10

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_dataset, [train_size, val_size, test_size]
)

# Update Trainer with train and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,    # Training set
    eval_dataset=val_dataset,       # Validation set for tuning during training
    data_collator=data_collator,
)

# Train the model
trainer.train()
metrics = trainer.evaluate(test_dataset)
print("Test set evaluation:", metrics)



0     Pediatric neurosurgical practice is prevalent ...
1     Posttraumatic osteoarthritis is a common indic...
2     Delay in diagnosis and treatment of lung cance...
3     Optical imaging is a powerful tool for early d...
4     Successful acute migraine treatment potentiall...
5     Masses in the forefoot and midfoot are common ...
6     Acute kidney injury (AKI) is a prevalent compl...
7     Despite the significant decline in the inciden...
8     Neoadjuvant endocrine therapy presents an impo...
9     Auxiliary diagnosis of different types of cyst...
10    Osteoarthritis is more prevalent and severe am...
11    The integration of positron emission tomograph...
12    With the advent of PET imaging in 1976, 2-deox...
13    Alzheimer's disease (AD) is the most common ne...
14    Despite an increase in maternal prenatal canna...
15    There is an unmet need for effective topical t...
16    Attention Deficit Hyperactivity Disorder (ADHD...
17    Gastric cancer (GC) is one of the most com

Map: 100%|██████████| 50/50 [00:00<00:00, 241.58 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 3845.23 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})


100%|██████████| 21/21 [07:50<00:00, 22.39s/it]


{'train_runtime': 470.258, 'train_samples_per_second': 0.191, 'train_steps_per_second': 0.045, 'train_loss': 2.1990267435709634, 'epoch': 2.8}


100%|██████████| 2/2 [00:15<00:00,  7.58s/it]

Test set evaluation: {'eval_loss': 2.397444725036621, 'eval_runtime': 15.2271, 'eval_samples_per_second': 0.657, 'eval_steps_per_second': 0.131, 'epoch': 2.8}





In [9]:
print(test_dataset)

prompt = "Symptoms for lung cancer include"
inputs = tokenizer(prompt, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_length=128,
    num_return_sequences=5, 
    no_repeat_ngram_size=2, 
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)

for i in range(5):
    generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
    print(generated_text)


<torch.utils.data.dataset.Subset object at 0x000001571816AE60>
Symptoms for lung cancer include cough, dyspnea, hemoptysis, chest pain, and chest tightness. The purpose of this study was to investigate the impact of lung-cancer-specific symptoms on quality of life (QOL) of patients and caregivers. A survey was conducted among patients with lung cancers and their caregivers in China. Patients were administered questionnaires to collect socio-demographic data and lung symptoms, including cough and dyspnea. Caregivers were also administered the same questionnaire. QOL was evaluated using the lung Cancer Symptom Scale (LCSS). Multiple regression analysis was used to identify factors that influence patients' QOL and caregiver QOL. Results: A total of 200
Symptoms for lung cancer include dyspnea, chest pain, cough, and hemoptysis. The diagnosis of lung carcinoma typically requires advanced imaging techniques, such as computed tomography, which are usually performed in combination with tissue

In [10]:
import torch
torch.cuda.empty_cache()

from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=1024)

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
df = pd.read_csv('Question_4.csv') 

# Extract the abstracts
abstracts = df['Abstract']
abstracts = abstracts[:50]
print(abstracts)

dataset = Dataset.from_dict({"text": abstracts})
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

input_ids = tokenized_dataset['input_ids']
labels = input_ids.clone()

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    seed=42,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

print(tokenized_dataset)

train_size = 30
val_size = 10
test_size = 10

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_dataset, [train_size, val_size, test_size]
)

# Update Trainer with train and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,    # Training set
    eval_dataset=val_dataset,       # Validation set for tuning during training
    data_collator=data_collator,
)

# Train the model
trainer.train()
metrics = trainer.evaluate(test_dataset)
print("Test set evaluation:", metrics)



0     Pediatric neurosurgical practice is prevalent ...
1     Posttraumatic osteoarthritis is a common indic...
2     Delay in diagnosis and treatment of lung cance...
3     Paclitaxel-induced peripheral neuropathy (PN) ...
4     Breast cancer is the leading cause of cancer-r...
5     Optical imaging is a powerful tool for early d...
6     The study objective was to determine the effec...
7     Successful acute migraine treatment potentiall...
8     Masses in the forefoot and midfoot are common ...
9     Acute kidney injury (AKI) is a prevalent compl...
10    Despite the significant decline in the inciden...
11    Neoadjuvant endocrine therapy presents an impo...
12    Auxiliary diagnosis of different types of cyst...
13    Osteoarthritis is more prevalent and severe am...
14    The integration of positron emission tomograph...
15    With the advent of PET imaging in 1976, 2-deox...
16    Alzheimer's disease (AD) is the most common ne...
17    Despite an increase in maternal prenatal c

Map: 100%|██████████| 50/50 [00:00<00:00, 242.83 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 3845.23 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})


100%|██████████| 21/21 [05:40<00:00, 16.21s/it]


{'train_runtime': 340.3298, 'train_samples_per_second': 0.264, 'train_steps_per_second': 0.062, 'train_loss': 1.8107521420433408, 'epoch': 2.8}


100%|██████████| 2/2 [00:20<00:00, 10.25s/it]

Test set evaluation: {'eval_loss': 2.204596757888794, 'eval_runtime': 20.5334, 'eval_samples_per_second': 0.487, 'eval_steps_per_second': 0.097, 'epoch': 2.8}





In [11]:
print(test_dataset)

prompt = "Symptoms for breast cancer include"
inputs = tokenizer(prompt, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_length=128,
    num_return_sequences=5, 
    no_repeat_ngram_size=2, 
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)

for i in range(5):
    generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
    print(generated_text)


<torch.utils.data.dataset.Subset object at 0x00000156B9395690>
Symptoms for breast cancer include pain, swelling, and itching. A rare but serious complication of lymphedema is cellulitis, which can result in disfigurement and functional impairment. To our knowledge, this is the first report of cellulitis secondary to lymphedema after breast-conserving treatment. The patient was a 66-year-old woman with breast carcinoma who developed cellulitis after receiving adjuvant radiotherapy and chemotherapy. She had swelling and pain in the left axilla, resulting in considerable functional loss. After the cellulitis resolved, she underwent axillary lymph node dissection with axillary sentinel lymph nodes biopsy. Histopathological examination revealed that the lymph vessels were dilated and tortuous, with focal fibrinoid
Symptoms for breast cancer include pain, bleeding, and swelling. The diagnosis of breast lump is often delayed, which can lead to an increased risk of local recurrence and distan

In [12]:
import torch
torch.cuda.empty_cache()

from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=1024)

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
df = pd.read_csv('Question_5.csv') 

# Extract the abstracts
abstracts = df['Abstract']
abstracts = abstracts[:50]
print(abstracts)

dataset = Dataset.from_dict({"text": abstracts})
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

input_ids = tokenized_dataset['input_ids']
labels = input_ids.clone()

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    seed=42,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

print(tokenized_dataset)

train_size = 30
val_size = 10
test_size = 10

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_dataset, [train_size, val_size, test_size]
)

# Update Trainer with train and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,    # Training set
    eval_dataset=val_dataset,       # Validation set for tuning during training
    data_collator=data_collator,
)

# Train the model
trainer.train()
metrics = trainer.evaluate(test_dataset)
print("Test set evaluation:", metrics)



0     Delay in diagnosis and treatment of lung cance...
1     As catabolic enzyme, CD73 dephosphorylates ade...
2     Autophagy serves as a critical regulator of im...
3     This study aims to compare the cost-effectiven...
4     4D cone-beam computed tomography (CBCT) plays ...
5     Utidelone is an ebomycin derivative chemothera...
6     The enhancer of rudimentary homolog (ERH) is s...
7     Liver cancer (LC) is among the deadliest cance...
8     Lung cancer remains the most prevalent maligna...
9     Improvements in cancer control have led to a d...
10    Non-small cell lung cancer (NSCLC) is the lead...
11    Multiple microRNAs encapsulated in extracellul...
12    Cisplatin resistance is common in non‑small ce...
13    Cancer will affect more than one in three U.S....
14    Thyroid cancer is the most prevalent form of e...
15    With significant advancements in the study of ...
16    Dysregulation of lung tissue collagen level pl...
17    Primary gastric small cell carcinoma (GSCC

Map: 100%|██████████| 50/50 [00:00<00:00, 284.53 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 3767.52 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})


100%|██████████| 21/21 [09:33<00:00, 27.33s/it]


{'train_runtime': 573.8683, 'train_samples_per_second': 0.157, 'train_steps_per_second': 0.037, 'train_loss': 2.098057156517392, 'epoch': 2.8}


100%|██████████| 2/2 [00:17<00:00,  8.81s/it]

Test set evaluation: {'eval_loss': 1.9998762607574463, 'eval_runtime': 17.6431, 'eval_samples_per_second': 0.567, 'eval_steps_per_second': 0.113, 'epoch': 2.8}





In [13]:
print(test_dataset)

prompt = "Lung cancer is"
inputs = tokenizer(prompt, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_length=128,
    num_return_sequences=5, 
    no_repeat_ngram_size=2, 
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)

for i in range(5):
    generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
    print(generated_text)


<torch.utils.data.dataset.Subset object at 0x00000157B4A83790>
Lung cancer is the leading cause of cancer-related deaths worldwide. A substantial number of lung cancer patients are diagnosed at an advanced stage of the disease, resulting in a poor prognosis. The standard of care for advanced non-small cell lung cancers (NSCLC) is platinum-based chemotherapy, with targeted therapies and immunotherapy being investigated for the treatment of advanced NSCLC. This review summarizes the state of knowledge regarding the use of targeted therapy, immunotherapy and combination therapies in advanced lung adenocarcinoma. We provide a comprehensive review of clinical trials investigating targeted agents, immunotherapies and combined treatments for lung adenocarcinomas. Expert opinion: The advent of new targeted drugs has changed the landscape
Lung cancer is a leading cause of cancer-related deaths worldwide, with the majority of patients diagnosed in late stages. The diagnosis of lung cancer at an 

In [14]:
import torch
torch.cuda.empty_cache()

from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=1024)

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
df = pd.read_csv('Question_6.csv') 

# Extract the abstracts
abstracts = df['Abstract']
abstracts = abstracts[:50]
print(abstracts)

dataset = Dataset.from_dict({"text": abstracts})
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

input_ids = tokenized_dataset['input_ids']
labels = input_ids.clone()

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    seed=42,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

print(tokenized_dataset)

train_size = 30
val_size = 10
test_size = 10

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_dataset, [train_size, val_size, test_size]
)

# Update Trainer with train and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,    # Training set
    eval_dataset=val_dataset,       # Validation set for tuning during training
    data_collator=data_collator,
)

# Train the model
trainer.train()
metrics = trainer.evaluate(test_dataset)
print("Test set evaluation:", metrics)



0     Paclitaxel-induced peripheral neuropathy (PN) ...
1     Breast cancer is the leading cause of cancer-r...
2     The study objective was to determine the effec...
3     One of the main health issues in the modern wo...
4     There are currently no molecular tests to iden...
5     Utidelone is an ebomycin derivative chemothera...
6     Neoadjuvant endocrine therapy presents an impo...
7     Social support has been linked to increased us...
8     Sialyltransferases are enzymes that play a cru...
9     Eribulin is an inhibitor of microtubule dynami...
10    Benzofuropyridines (BFP) are polycyclic compou...
11    This study was conducted to determine the leve...
12    Improvements in cancer control have led to a d...
13    Gut microbiota and associated metabolites have...
14    Breast cancer (BC) remains a prevalent and cha...
15    Differential RNA expression is becoming increa...
16    Cancer will affect more than one in three U.S....
17    Salt‑induced kinase 1 (SIK1) is a serine/t

Map: 100%|██████████| 50/50 [00:00<00:00, 261.61 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 2324.85 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})


100%|██████████| 21/21 [04:40<00:00, 13.34s/it]


{'train_runtime': 280.142, 'train_samples_per_second': 0.321, 'train_steps_per_second': 0.075, 'train_loss': 2.1254348754882812, 'epoch': 2.8}


100%|██████████| 2/2 [00:01<00:00,  1.83it/s]

Test set evaluation: {'eval_loss': 2.038769245147705, 'eval_runtime': 3.1513, 'eval_samples_per_second': 3.173, 'eval_steps_per_second': 0.635, 'epoch': 2.8}





In [15]:
print(test_dataset)

prompt = "Breast cancer is"
inputs = tokenizer(prompt, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_length=128,
    num_return_sequences=5, 
    no_repeat_ngram_size=2, 
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)

for i in range(5):
    generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
    print(generated_text)


<torch.utils.data.dataset.Subset object at 0x0000015718E2BF10>
Breast cancer is the most common cancer affecting women worldwide. The incidence of this malignancy is increasing worldwide, with the highest incidence in developing countries. Despite significant advances in the diagnosis and treatment of breast cancer, the disease remains a major cause of mortality in women. New diagnostic and therapeutic methods are needed to improve the treatment outcomes for this cancer. In this review, we summarize the current advances and future prospects of non-invasive, real-time detection of circulating tumor cells (CTCs). We also discuss the clinical implications of CTCs in cancer treatment, including the use of cell-free DNA (cfDNA) as a biomarker for monitoring treatment response,
Breast cancer is the most common malignancy among women and the second leading cause of cancer-related death in women worldwide. The present study aimed to investigate the association between serum vitamin D levels an

In [4]:
import torch
torch.cuda.empty_cache()

from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=1024)

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
df = pd.read_csv('Question_7.csv') 

# Extract the abstracts
abstracts = df['Abstract']
abstracts = abstracts[:50]
print(abstracts)

dataset = Dataset.from_dict({"text": abstracts})
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

input_ids = tokenized_dataset['input_ids']
labels = input_ids.clone()

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    seed=42,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

print(tokenized_dataset)

train_size = 30
val_size = 10
test_size = 10

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_dataset, [train_size, val_size, test_size]
)

# Update Trainer with train and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,    # Training set
    eval_dataset=val_dataset,       # Validation set for tuning during training
    data_collator=data_collator,
)

# Train the model
trainer.train()
metrics = trainer.evaluate(test_dataset)
print("Test set evaluation:", metrics)



0     Pancreatic adenosquamous carcinoma (PASC) is a...
1     Sonodynamic therapy (SDT) is a promising strat...
2     Pancreatic ductal adenocarcinoma (PDAC), the m...
3     Pancreatoduodenectomy (PD) is a highly complex...
4     V-domain Imuunoglobulin suppressor of T-cell a...
5     Anaplastic thyroid cancer (ATC) is one of the ...
6     Several tyrosine kinase receptors inhibitors (...
7     Circulating tumor cells (CTCs) are tumor cells...
8     Indoleamine 2,3-dioxygenase 1 (IDO1) plays a k...
9     Tumor microenvironment (TME) is essential for ...
10    Thromboembolic events (TEEs) are frequent amon...
11    The aggressiveness of pancreatic ductal adenoc...
12    Disconnected pancreatic duct syndrome (DPDS) i...
13    Pancreatic ductal adenocarcinoma (PDAC) poses ...
14    The purpose of this study was to translate the...
15    Immunotherapy has made significant strides in ...
16    Type 1 diabetes (T1D) is characterized by an a...
17    Ubiquitination is one of the important mod

Map: 100%|██████████| 50/50 [00:00<00:00, 228.92 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 3124.39 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})


100%|██████████| 7/7 [07:49<00:00, 67.13s/it]


{'train_runtime': 469.9399, 'train_samples_per_second': 0.064, 'train_steps_per_second': 0.015, 'train_loss': 2.2471866607666016, 'epoch': 0.93}


100%|██████████| 2/2 [00:08<00:00,  4.03s/it]

Test set evaluation: {'eval_loss': 1.9083530902862549, 'eval_runtime': 207.2702, 'eval_samples_per_second': 0.048, 'eval_steps_per_second': 0.01, 'epoch': 0.9333333333333333}





In [12]:
print(test_dataset)

prompt = "Pancreatic cancer is"
inputs = tokenizer(prompt, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_length=128,
    num_return_sequences=5, 
    no_repeat_ngram_size=2, 
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)

for i in range(5):
    generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
    print(generated_text)


<torch.utils.data.dataset.Subset object at 0x0000013872AD5630>
Pancreatic cancer is a leading cause of cancer-related death worldwide. A pancreatic cancer stem cell (CSC) model has been developed, in which the pancreatic CSC subpopulation is enriched by enriching cells in a serum-free, chemically defined medium containing epidermal growth factor, hepatocyte growth factors, and insulin-like growth Factor-2. The pancreatic CSCs are characterized by a high level of aldehyde dehydrogenase (ALDH) activity and cancer cell self-renewal. Here, we investigated the effect of resveratrol (RSV), a polyphenol compound, on the proliferation and viability of pancreatic ductal adenocarcinoma (PDAC) cells, as well as on pancreatic and CSC characteristics.
Pancreatic cancer is a highly invasive and fatal cancer. A review of clinical studies using the most frequently used therapeutic approaches: chemotherapy, targeted therapy and immunotherapy. The aim of this review is to discuss the current evidence re

In [None]:
import torch
torch.cuda.empty_cache()

from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=1024)

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
df = pd.read_csv('Question_8.csv') 

# Extract the abstracts
abstracts = df['Abstract']
abstracts = abstracts[:50]
print(abstracts)

dataset = Dataset.from_dict({"text": abstracts})
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

input_ids = tokenized_dataset['input_ids']
labels = input_ids.clone()

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    seed=42,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

print(tokenized_dataset)

train_size = 30
val_size = 10
test_size = 10

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_dataset, [train_size, val_size, test_size]
)

# Update Trainer with train and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,    # Training set
    eval_dataset=val_dataset,       # Validation set for tuning during training
    data_collator=data_collator,
)

# Train the model
trainer.train()
metrics = trainer.evaluate(test_dataset)
print("Test set evaluation:", metrics)

In [None]:
print(test_dataset)

prompt = "Symptoms for pancreatic cancer include"
inputs = tokenizer(prompt, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_length=128,
    num_return_sequences=5, 
    no_repeat_ngram_size=2, 
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)

for i in range(5):
    generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
    print(generated_text)


In [17]:
import torch
torch.cuda.empty_cache()

from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=1024)

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
df = pd.read_csv('Question_9.csv') 

# Extract the abstracts
abstracts = df['Abstract']
abstracts = abstracts[:50]
print(abstracts)

dataset = Dataset.from_dict({"text": abstracts})
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

input_ids = tokenized_dataset['input_ids']
labels = input_ids.clone()

from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    seed=42,
    per_device_train_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients
    metric_for_best_model="loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

print(tokenized_dataset)

train_size = 30
val_size = 10
test_size = 10

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    tokenized_dataset, [train_size, val_size, test_size]
)

# Update Trainer with train and validation sets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,    # Training set
    eval_dataset=val_dataset,       # Validation set for tuning during training
    data_collator=data_collator,
)

# Train the model
trainer.train()
metrics = trainer.evaluate(test_dataset)
print("Test set evaluation:", metrics)



0     To investigate the practices of clinicians pre...
1     Postoperative pancreatic fistula (POPF) contin...
2     STK11 germline pathogenic variants are typical...
3     The overall treatment response among patients ...
4     Post-transplant HCC recurrence significantly i...
5     Patients with synchronous pancreatic ductal ad...
6     This research aimed to assess the value of rad...
7     Pancreatic Ductal Adenocarcinoma (PDAC) primar...
8     Muscular dystrophies and myotonic disorders ar...
9     Tumor-infiltrating lymphocytes (TILs) are sign...
10    Although atezolizumab plus bevacizumab (Atezo/...
11    Cutaneous malignant melanoma is one of the mos...
12    Pancreatic cancer, the 12th-most common cancer...
13    Cancer immunotherapy using immune checkpoint i...
14    Given the extensive role of lipids in cancer d...
15    Pancreatic cancer is a prevalent malignant tum...
16    Conducting"health economic evaluation"is one o...
17    The prognosis of pancreatic cancer (PDAC) 

Map: 100%|██████████| 50/50 [00:00<00:00, 225.81 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 2082.90 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})




KeyboardInterrupt: 

In [None]:
print(test_dataset)

prompt = "Treatment for pancreatic cancer include"
inputs = tokenizer(prompt, return_tensors="pt")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_length=128,
    num_return_sequences=5, 
    no_repeat_ngram_size=2, 
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
)

for i in range(5):
    generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
    print(generated_text)


In [6]:
# from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
# from datasets import Dataset
# import torch

# tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
# model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
# model.to('cuda')



BioGptForCausalLM(
  (biogpt): BioGptModel(
    (embed_tokens): BioGptScaledWordEmbedding(42384, 1024, padding_idx=1)
    (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
    (layers): ModuleList(
      (0-23): 24 x BioGptDecoderLayer(
        (self_attn): BioGptSdpaAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((

In [10]:
# import torch
# torch.cuda.empty_cache()

# from transformers import BioGptTokenizer, BioGptForCausalLM, Trainer, TrainingArguments
# from datasets import Dataset, concatenate_datasets
# import pandas as pd
# from datasets import Dataset

# tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
# q1 = pd.read_csv('Question_1.csv') 
# q2 = pd.read_csv('Question_2.csv')
# q3 = pd.read_csv('Question_3.csv')
# q4 = pd.read_csv('Question_4.csv')
# q5 = pd.read_csv('Question_5.csv')
# q6 = pd.read_csv('Question_6.csv')
# q7 = pd.read_csv('Question_7.csv')
# q8 = pd.read_csv('Question_8.csv')
# q9 = pd.read_csv('Question_9.csv')

# a1 = q1['Abstract'][:30]
# a2 = q2['Abstract'][:30]
# a3 = q3['Abstract'][:30]
# a4 = q4['Abstract'][:30]
# a5 = q5['Abstract'][:30]
# a6 = q6['Abstract'][:30]
# a7 = q7['Abstract'][:30]
# a8 = q8['Abstract'][:30]
# a9 = q9['Abstract'][:30]

# def preprocess_function(examples):
#     return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=1024)

# def get_abstracts(abstracts):
#     dataset = Dataset.from_dict({"text": abstracts})
#     tokenized_dataset = dataset.map(preprocess_function, batched=True)
#     tokenized_dataset = tokenized_dataset.remove_columns(["text"])
#     tokenized_dataset.set_format("torch")
#     tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})
#     return tokenized_dataset

# tokenized_dataset1 = get_abstracts(a1)
# tokenized_dataset2 = get_abstracts(a2)
# tokenized_dataset3 = get_abstracts(a3)
# tokenized_dataset4 = get_abstracts(a4)
# tokenized_dataset5 = get_abstracts(a5)
# tokenized_dataset6 = get_abstracts(a6)
# tokenized_dataset7 = get_abstracts(a7)
# tokenized_dataset8 = get_abstracts(a8)
# tokenized_dataset9 = get_abstracts(a9)

# def subset_to_dataset(subset):
#     return Dataset.from_dict(subset.dataset[subset.indices])

# def get_datasets(tokenized_dataset):
#     train_size = 20
#     val_size = 5
#     test_size = 5

#     train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
#         tokenized_dataset, [train_size, val_size, test_size]
#     )
#     return subset_to_dataset(train_dataset), subset_to_dataset(val_dataset), subset_to_dataset(test_dataset)

# a1_train_dataset, a1_val_dataset1, a1_test_dataset1 = get_datasets(tokenized_dataset1)
# a2_train_dataset, a2_val_dataset2, a2_test_dataset2 = get_datasets(tokenized_dataset2)
# a3_train_dataset, a3_val_dataset3, a3_test_dataset3 = get_datasets(tokenized_dataset3)
# a4_train_dataset, a4_val_dataset4, a4_test_dataset4 = get_datasets(tokenized_dataset4)
# a5_train_dataset, a5_val_dataset5, a5_test_dataset5 = get_datasets(tokenized_dataset5)
# a6_train_dataset, a6_val_dataset6, a6_test_dataset6 = get_datasets(tokenized_dataset6)
# a7_train_dataset, a7_val_dataset7, a7_test_dataset7 = get_datasets(tokenized_dataset7)
# a8_train_dataset, a8_val_dataset8, a8_test_dataset8 = get_datasets(tokenized_dataset8)
# a9_train_dataset, a9_val_dataset9, a9_test_dataset9 = get_datasets(tokenized_dataset9)


# final_test_dataset = concatenate_datasets([a1_test_dataset1,  a7_test_dataset7])
# final_train_dataset = concatenate_datasets([a1_train_dataset, a7_train_dataset])
# final_val_dataset = concatenate_datasets([a1_val_dataset1, a7_val_dataset7])

# # print(final_train_dataset)

# input_ids = final_train_dataset['input_ids']
# labels = input_ids

# from transformers import DataCollatorForLanguageModeling
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="steps",
#     learning_rate=2e-5,
#     lr_scheduler_type="linear",
#     seed=42,
#     per_device_train_batch_size=1,
#     num_train_epochs=1,
#     weight_decay=0.01,
#     gradient_accumulation_steps=4,  # Accumulate gradients
#     metric_for_best_model="loss",
#     greater_is_better=False,
#     load_best_model_at_end=True,
# )


# # Update Trainer with train and validation sets
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=final_train_dataset,    # Training set
#     eval_dataset=final_val_dataset,       # Validation set for tuning during training
#     data_collator=data_collator,
# )

# # Train the model
# trainer.train()
# metrics = trainer.evaluate(final_test_dataset)
# print("Test set evaluation:", metrics)

Map: 100%|██████████| 30/30 [00:00<00:00, 231.73 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 2726.47 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 221.59 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 2999.36 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 200.66 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 3332.43 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 197.82 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 2788.58 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 232.32 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 3332.69 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 216.81 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 3526.80 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 204.75 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 3332.60 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 186.80 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 2726.94 examples/s]
Map: 100%|██████

{'train_runtime': 270.2112, 'train_samples_per_second': 0.148, 'train_steps_per_second': 0.037, 'train_loss': 1.9663129806518556, 'epoch': 1.0}


100%|██████████| 2/2 [00:23<00:00, 11.60s/it]

Test set evaluation: {'eval_loss': 2.1346869468688965, 'eval_runtime': 23.3001, 'eval_samples_per_second': 0.429, 'eval_steps_per_second': 0.086, 'epoch': 1.0}





In [None]:
# def generate_text(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt")

#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.to(device)  
#     inputs = {key: val.to(device) for key, val in inputs.items()}

#     outputs = model.generate(
#         input_ids=inputs["input_ids"],
#         max_length=128,
#         num_return_sequences=5, 
#         no_repeat_ngram_size=2, 
#         do_sample=True,
#         top_k=50,
#         top_p=0.95,
#         temperature=0.7,
#     )

#     print('Prompt:', prompt)
#     for i in range(5):
#         generated_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
#         print(generated_text)


# generate_text("Treatment for lung cancer includes")
# generate_text("Treatment for breast cancer includes")
# generate_text("Symptoms for lung cancer include")
# generate_text("Symptoms for breast cancer include")
# generate_text("Lung cancer is")
# generate_text("Breast cancer is")
# generate_text("Pancreatic cancer is")
# generate_text("Symptoms for pancreatic cancer include")
# generate_text("Treatment for pancreatic cancer include")

In [None]:
# # write the final dataset to a csv file
# final_train_dataset.to_csv('final_train_dataset.csv')
# final_val_dataset.to_csv('final_val_dataset.csv')
# final_test_dataset.to_csv('final_test_dataset.csv')