# <center>**T5-Small Summarization Model**</center>

In [1]:
pip install beautifulsoup4 tqdm transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

In [3]:
# Pick the compute device: use the GPU when torch reports CUDA is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

Device: cuda


In [4]:
# Read the paper metadata table from the attached Kaggle dataset and report
# how many rows were loaded.
papers_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/biorxiv-genomics-papers-7070/biorxiv_genomics_papers_7070.csv'
)
print(f"Fetched {len(papers_df)} articles.")

Fetched 7070 articles.


In [5]:
# Preview the first five rows of the loaded table.
papers_df.head(n=5)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Title,Authors,DOI,Date,Paper URL
0,Genome sequence assembly and annotation of MAT...,,https://doi.org/10.1101/2025.03.24.645116,,https://www.biorxiv.org/content/10.1101/2025.0...
1,Development of metagenomic methods for non-inv...,,https://doi.org/10.1101/2025.01.21.633432,,https://www.biorxiv.org/content/10.1101/2025.0...
2,Evaluating Methods for the Prediction of Cell ...,,https://doi.org/10.1101/2024.08.21.609075,,https://www.biorxiv.org/content/10.1101/2024.0...
3,Single-cell multiome and enhancer connectome o...,,https://doi.org/10.1101/2025.03.21.644670,,https://www.biorxiv.org/content/10.1101/2025.0...
4,Phenotypic tolerance for rDNA copy number vari...,,https://doi.org/10.1101/2025.03.21.644675,,https://www.biorxiv.org/content/10.1101/2025.0...


In [6]:
# Fetch a paper page and pull out the text of its abstract block
def get_abstract(paper_url):
    """Download ``paper_url`` and return the text of its abstract element.

    The page is requested with a browser-like User-Agent header and a
    10-second timeout, then parsed with BeautifulSoup's "html.parser".

    Args:
        paper_url: Address of the paper page to request.

    Returns:
        The whitespace-stripped text of the element matched by
        ``find("div", class_="abstract")``, or one of these placeholder
        strings when no abstract text could be obtained:

        - "Abstract not available." if the HTTP status code is not 200
        - "Abstract not found." if no matching element is in the page
        - "Failed to fetch abstract." if requests raises a RequestException
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        page = requests.get(paper_url, headers=request_headers, timeout=10)
        if page.status_code != 200:
            return "Abstract not available."

        parsed_page = BeautifulSoup(page.text, "html.parser")
        abstract_div = parsed_page.find("div", class_="abstract")
        if not abstract_div:
            return "Abstract not found."
        return abstract_div.text.strip()
    except requests.RequestException:
        # Callers receive a placeholder string rather than an exception.
        return "Failed to fetch abstract."

In [7]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Load the pretrained weights, then move the model onto the selected device.
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
# Produce a summary of a piece of text with the loaded T5 model
def summarize_text(text, max_length=150):
    """Generate a summary of ``text`` using the notebook's tokenizer and model.

    The text is prefixed with "summarize: ", encoded with truncation enabled,
    moved to ``device``, and passed to ``model.generate`` with 4 beams and
    early stopping.

    Args:
        text: The string to summarize.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    prompt = "summarize: " + text
    encoded_prompt = tokenizer.encode(prompt, return_tensors="pt", truncation=True)
    encoded_prompt = encoded_prompt.to(device)

    generation_options = {
        "max_length": max_length,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(encoded_prompt, **generation_options)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [9]:
# Show the first five rows of the table again before the processing loop.
papers_df.head(n=5)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Title,Authors,DOI,Date,Paper URL
0,Genome sequence assembly and annotation of MAT...,,https://doi.org/10.1101/2025.03.24.645116,,https://www.biorxiv.org/content/10.1101/2025.0...
1,Development of metagenomic methods for non-inv...,,https://doi.org/10.1101/2025.01.21.633432,,https://www.biorxiv.org/content/10.1101/2025.0...
2,Evaluating Methods for the Prediction of Cell ...,,https://doi.org/10.1101/2024.08.21.609075,,https://www.biorxiv.org/content/10.1101/2024.0...
3,Single-cell multiome and enhancer connectome o...,,https://doi.org/10.1101/2025.03.21.644670,,https://www.biorxiv.org/content/10.1101/2025.0...
4,Phenotypic tolerance for rDNA copy number vari...,,https://doi.org/10.1101/2025.03.21.644675,,https://www.biorxiv.org/content/10.1101/2025.0...


In [10]:
# Placeholder strings returned by get_abstract() when no abstract text could be
# obtained (non-200 response, missing abstract <div>, request error). They are
# status messages, not abstracts, so they must never be sent to the summarizer.
ABSTRACT_FAILURE_MESSAGES = {
    "Abstract not available.",
    "Abstract not found.",
    "Failed to fetch abstract.",
}

# One record per paper: the URL, the scraped abstract (or a placeholder), and
# the generated summary (or "Summary not available.").
results = []
# Only the URL column is used, so iterate over it directly instead of building
# a Series for every row with iterrows().
for paper_url in tqdm(papers_df['Paper URL'], total=len(papers_df), desc="Processing Papers"):
    abstract = get_abstract(paper_url)

    # Skip summarization for every failure placeholder and for empty abstracts;
    # otherwise the model would "summarize" an error message or a bare prompt.
    if not abstract or abstract in ABSTRACT_FAILURE_MESSAGES:
        summary = "Summary not available."
    else:
        summary = summarize_text(abstract)

    results.append({"Paper URL": paper_url, "Abstract": abstract, "Summary": summary})

Processing Papers: 100%|██████████| 7070/7070 [7:43:08<00:00,  3.93s/it]


In [11]:
# Persist the collected abstracts and summaries as a CSV file (no index column)
abstract_summary_df = pd.DataFrame(data=results)
abstract_summary_df.to_csv(
    "generated_abstract_summaries.csv",
    index=False,
)
print("All papers processed and saved to generated_abstract_summaries.csv")

All papers processed and saved to generated_abstract_summaries.csv
