In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader

  cpu = _conversion_method_template(device=torch.device("cpu"))
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class SummarizationDataset(Dataset):
    def __init__(self, articles_folder, summaries_folder, tokenizer, max_length):
        self.articles_folder = articles_folder
        self.summaries_folder = summaries_folder
        self.file_list = os.listdir(articles_folder)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_name = self.file_list[idx]
        article_path = os.path.join(self.articles_folder, file_name)
        summary_path = os.path.join(self.summaries_folder, file_name)

        with open(article_path, "r", encoding="utf-8") as f:
            article = f.read()

        with open(summary_path, "r", encoding="utf-8") as f:
            summary = f.read()

        inputs = self.tokenizer(
            article,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        labels = self.tokenizer(
            summary,
            max_length=self.max_length // 2,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels["input_ids"].squeeze(0),
        }


In [3]:
# Paths
articles_folder = r"C:\Users\abeya\OneDrive\Desktop\Summ_implementation\pmtexts"
summaries_folder = r"C:\Users\abeya\OneDrive\Desktop\Summ_implementation\pmabstract"

# Tokenizer and Dataset
model_name = "google/bigbird-pegasus-large-arxiv"
max_length = 1024
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=max_length)

dataset = SummarizationDataset(articles_folder, summaries_folder, tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


In [4]:
device = torch.device("cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [5]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Optimization settings
num_epochs = 5
learning_rate = 2e-5
accumulation_steps = 4  

# Initialize optimizer with weight decay
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
steps_per_epoch = max(1, len(dataloader) // accumulation_steps)  # Ensure it's at least 1


scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=learning_rate,
    epochs=num_epochs,
    steps_per_epoch=steps_per_epoch
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop with optimizations
model.train()
for epoch in range(num_epochs):
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch + 1}/{num_epochs}")
    for i, batch in progress_bar:
        # Transfer batch to device
        input_ids = batch["input_ids"].to(device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(device, non_blocking=True)
        labels = batch["labels"].to(device, non_blocking=True)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss / accumulation_steps 

        # Backward pass
        loss.backward()

        # Gradient accumulation
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad(set_to_none=True)

        
        progress_bar.set_postfix({"Loss": f"{loss.item() * accumulation_steps:.4f}"})

# Save model and tokenizer
output_dir = r"C:\Users\abeya\OneDrive\Desktop\Summ_implementation\mod_out17"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("Model and tokenizer saved!")


Epoch 1/5: 100%|██████████| 86/86 [1:07:41<00:00, 47.22s/it, Loss=7.9178] 
Epoch 2/5: 100%|██████████| 86/86 [1:03:08<00:00, 44.05s/it, Loss=7.0461]
Epoch 3/5: 100%|██████████| 86/86 [56:16<00:00, 39.26s/it, Loss=5.8017]
Epoch 4/5: 100%|██████████| 86/86 [56:51<00:00, 39.67s/it, Loss=3.4823]
Epoch 5/5: 100%|██████████| 86/86 [1:00:24<00:00, 42.14s/it, Loss=3.5326]


Model and tokenizer saved!


In [11]:
sample_text2 =  """BACKGROUND
micrornas  are a class of small non-coding rnas that regulate gene expression by binding to their target mrnas and triggering either protein translation repression or rna degradation  <cit> . recent studies show that some mirnas are located at fragile sites and genomic regions involved in cancers  <cit> . the aberrant expression of mirna genes could lead to human disease, including cancer  <cit> , and are regarded as potential biomarkers for cancer diagnosis  <cit> . the roles mirnas play have been demonstrated in a few cancer types including breast cancer  <cit> , lung cancer  <cit>  and chronic lymphocytic leukemia  <cit> , while the roles of mirna in other cancers remain largely unknown.

there are several approaches of studying mirnas and their expression profiles, including northern blotting and real-time pcr assay. there are also available high-throughput methods such as oligonucleotide mirna microarray analysis  <cit> , bead-based flow-cytometric technique  <cit> , and sage-based mirage  <cit> . mirna microarray analysis is a commonly used high-throughput technique for the assessment of previously discovered mirnas. with the sage-based technique, such as mirage, the expression profiles of known mirnas could be retrieved together with the unknown ones which are possible mirna candidates.

for gene expression sage studies  <cit> , there exist several well developed methods for data analysis together with web services provided, such as sagemap  <cit>  and sage genie  <cit> . for mirna-related sage, however, the data analysis is much more complicated. the extracted tags have to be compared with various rna databases in addition to mrna sequences. the tags also need to be mapped to the human genome and to be analyzed for precursors with thermodynamically stable hairpin structures. this is a very troublesome process and current users have to refer to several different databases to retrieve related biologically significant data  <cit> . to aid the processing and data analysis of this method, we constructed a web-based system, named mirna analysis system . the expression profile of known mirnas in submitted sequences were returned and compared with public dataset using fisher's exact test. public available datasets of known mirnas expression in liver were collected for the annotation of mirna expression in liver. several public available gene expression datasets were included to reveal differentially expressed genes in liver cancer and normal liver tissues. the differentially expressed mirnas and genes are highlighted and the relationship between mirnas and genes is shown according to mirna target prediction.

RESULTS
users could upload the raw sequencing data and specify the sequencing parameters through the web interface. the known mirnas and possible mirna candidates will be analyzed together with their expression profiles. the target genes predicted by mirna target prediction software are provided together with the annotation information. to demonstrate the biological significance of the retrieved mirnas, the profiles of public datasets of known mirnas and target genes were collected and included in the annotation.

the miras system provides an easy and friendly way for scientists to analyze and process raw mirna sequence data to obtain new mirna candidates. it also provides tools for the annotations of the predicted mirnas.

CONCLUSIONS
in this work, we established a web-based analysis platform for mirnas, called miras  <cit> , to analyze the mirna expression in specific tissue and to predict and study the possible mirna candidates. the differentially expressed mirnas that target differentially expressed genes are retrieved together with mirna and target gene annotation, to uncover the biological significance. currently it supports liver cancer genes, while in the future, the analysis platform is planned to be expanded to support other cancers and to integrate all public available expression data of the mirnas and genes in cancer and normal tissues.
"""

In [9]:
output_dir = r"C:\Users\abeya\OneDrive\Desktop\Summ_implementation\mod_out17"

In [6]:
# Load fine-tuned model
device = torch.device("cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to(device)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Generate summary for test input
test_text = sample_text2
inputs = tokenizer(test_text, return_tensors="pt", max_length=1024, truncation=True).to(device)
summary_ids = model.generate(inputs["input_ids"], max_length=512, min_length=100)

print("Generated Summary:")
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

Input ids are automatically padded from 797 to 832 to be a multiple of `config.block_size`: 64


Generated Summary:
expression data of micrornas ( mirnas ) are a class of small non -coding rnas that regulate gene expression by binding to their target mrnas and triggering either protein translation repression or rna degradation .<n> recent studies show that some mirnas are located at fragile sites and genomic regions involved in cancers , and are regarded as potential biomarkers for cancer diagnosis . in this work , we established a web - based system , named mirna analysis system , to analyze the expression profiles of known mirnas and to predict and study the possible mirna candidates .<n> the differentially expressed mirnas and genes were retrieved together with mirna and target gene annotation to uncover the biological significance . to aid the processing and data analysis of the retrieved mirna sequences , the profiles of known mirnas and target genes were collected and included in the annotation miras system .


In [10]:
# Load fine-tuned model
device = torch.device("cpu")
model2 = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to(device)
tokenizer = AutoTokenizer.from_pretrained(output_dir)
print(model2)

BigBirdPegasusForConditionalGeneration(
  (model): BigBirdPegasusModel(
    (shared): BigBirdPegasusScaledWordEmbedding(96103, 1024, padding_idx=0)
    (encoder): BigBirdPegasusEncoder(
      (embed_tokens): BigBirdPegasusScaledWordEmbedding(96103, 1024, padding_idx=0)
      (embed_positions): BigBirdPegasusLearnedPositionalEmbedding(4096, 1024)
      (layers): ModuleList(
        (0-15): 16 x BigBirdPegasusEncoderLayer(
          (self_attn): BigBirdPegasusEncoderAttention(
            (self): BigBirdPegasusBlockSparseAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=False)
              (key): Linear(in_features=1024, out_features=1024, bias=False)
              (value): Linear(in_features=1024, out_features=1024, bias=False)
            )
            (output): Linear(in_features=1024, out_features=1024, bias=False)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): NewGELUAc

In [5]:
# Hyperparameters
num_epochs = 3
learning_rate = 2e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training
model.train()
for epoch in range(num_epochs):
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}")

Epoch 1/3, Loss: 9.7796
Epoch 1/3, Loss: 9.6340
Epoch 1/3, Loss: 7.9094
Epoch 1/3, Loss: 9.3209
Epoch 1/3, Loss: 8.6479
Epoch 1/3, Loss: 8.5033
Epoch 1/3, Loss: 9.1245
Epoch 1/3, Loss: 6.0807
Epoch 1/3, Loss: 6.8284
Epoch 1/3, Loss: 9.0147
Epoch 1/3, Loss: 8.5330
Epoch 1/3, Loss: 10.2837
Epoch 1/3, Loss: 7.7836
Epoch 1/3, Loss: 10.3661
Epoch 1/3, Loss: 7.9583
Epoch 1/3, Loss: 7.3588
Epoch 1/3, Loss: 8.8226
Epoch 1/3, Loss: 6.5705
Epoch 1/3, Loss: 7.8581
Epoch 1/3, Loss: 8.0010
Epoch 1/3, Loss: 7.8379
Epoch 1/3, Loss: 6.9810
Epoch 1/3, Loss: 7.0189
Epoch 1/3, Loss: 5.7712
Epoch 1/3, Loss: 6.1234
Epoch 1/3, Loss: 7.4748
Epoch 1/3, Loss: 7.2503
Epoch 1/3, Loss: 7.2808
Epoch 1/3, Loss: 8.1749
Epoch 1/3, Loss: 6.4271
Epoch 1/3, Loss: 9.0498
Epoch 1/3, Loss: 6.6214
Epoch 1/3, Loss: 7.2302
Epoch 1/3, Loss: 7.3719
Epoch 1/3, Loss: 7.0062
Epoch 1/3, Loss: 7.0565
Epoch 1/3, Loss: 7.3765
Epoch 1/3, Loss: 8.0411
Epoch 1/3, Loss: 6.9570
Epoch 1/3, Loss: 6.2664
Epoch 1/3, Loss: 6.7050
Epoch 1/3, Los

In [6]:
output_dir = r"C:\Users\abeya\OneDrive\Desktop\Summ_implementation\mod_out7"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Model and tokenizer saved!")



Model and tokenizer saved!


In [7]:
output_dir = r"C:\Users\abeya\OneDrive\Desktop\Summ_implementation\mod_out7"

In [7]:
# Load fine-tuned model
device = torch.device("cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to(device)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Generate summary for a test input
test_text = sample_text2
inputs = tokenizer(test_text, return_tensors="pt", max_length=1024, truncation=True).to(device)
summary_ids = model.generate(inputs["input_ids"], max_length=512, min_length=100)

print("Generated Summary:")
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

Input ids are automatically padded from 797 to 832 to be a multiple of `config.block_size`: 64


Generated Summary:
expression data of micrornas ( mirnas ) are a class of small non-coding rnas that regulate gene expression by binding to their target mrnas and triggering either protein translation repression or rna degradation cit>. the differentially expressed mirnas and genes are retrieved together with mirna and target gene annotation , to uncover the biological significance . in this work , we established a web - based system , named mirna analysis system , to analyze the expression profiles of known mirnas and to predict and study the possible mirna candidates . the differentially expressed mirnas and genes are retrieved together with mirna and target gene annotation , to uncover the biological significance . the known mirnas and possible mirna candidates will be analyzed together with their expression profiles . to aid the processing and data analysis of this method , we constructed a web - based system , named mirna analysis system . the expression profile of known mirnas in

In [8]:
# Load fine-tuned model
device = torch.device("cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to(device)
tokenizer = AutoTokenizer.from_pretrained(output_dir)
print(model)

BigBirdPegasusForConditionalGeneration(
  (model): BigBirdPegasusModel(
    (shared): BigBirdPegasusScaledWordEmbedding(96103, 1024, padding_idx=0)
    (encoder): BigBirdPegasusEncoder(
      (embed_tokens): BigBirdPegasusScaledWordEmbedding(96103, 1024, padding_idx=0)
      (embed_positions): BigBirdPegasusLearnedPositionalEmbedding(4096, 1024)
      (layers): ModuleList(
        (0-15): 16 x BigBirdPegasusEncoderLayer(
          (self_attn): BigBirdPegasusEncoderAttention(
            (self): BigBirdPegasusBlockSparseAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=False)
              (key): Linear(in_features=1024, out_features=1024, bias=False)
              (value): Linear(in_features=1024, out_features=1024, bias=False)
            )
            (output): Linear(in_features=1024, out_features=1024, bias=False)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): NewGELUAc