In [1]:
import pandas as pd

df = pd.read_csv('/biolaysumm2024_data/eLife_train.csv')
df.head(3)

Unnamed: 0,lay_summary,article,headings,keywords,id
0,"In the USA , more deaths happen in the winter ...","In temperate climates , winter deaths exceed s...","['Abstract', 'Introduction', 'Results', 'Discu...",['epidemiology and global health'],elife-35500-v1
1,Most people have likely experienced the discom...,Whether complement dysregulation directly cont...,"['Abstract', 'Introduction', 'Results', 'Discu...","['microbiology and infectious disease', 'immun...",elife-48378-v2
2,The immune system protects an individual from ...,Variation in the presentation of hereditary im...,"['Abstract', 'Introduction', 'Results', 'Discu...","['microbiology and infectious disease', 'immun...",elife-04494-v1


In [2]:
val_df = pd.read_csv('/biolaysumm2024_data/eLife_val.csv')
val_df.head(3)

Unnamed: 0,lay_summary,article,headings,keywords,id
0,The DNA in genes encodes the basic information...,Cell-fate reprograming is at the heart of deve...,"['Abstract', 'Introduction', 'Results', 'Discu...",['developmental biology'],elife-15477-v3
1,Klebsiella pneumoniae is a type of bacteria th...,"Klebsiella pneumoniae is a respiratory , blood...","['Abstract', 'Introduction', 'Results', 'Discu...","['microbiology and infectious disease', 'immun...",elife-56656-v2
2,Malaria is one of the world's most deadly infe...,Plasmodium vivax relapse infections occur foll...,"['Abstract', 'Introduction', 'Results', 'Discu...",['epidemiology and global health'],elife-04692-v2


In [None]:
from datasets import Dataset

data = Dataset.from_pandas(df[['lay_summary', 'article']])
val_data = Dataset.from_pandas(val_df[['lay_summary', 'article']])

In [17]:
# !huggingface-cli download TheBloke/Orca-2-13B-GGUF orca-2-13b.Q5_K_S.gguf --local-dir . --local-dir-use-symlinks False

In [None]:
model_type = 'orca13b' # orca13b

if model_type == 'gemma2b':
    model_id = "google/gemma-2b-it"
    
elif model_type == 'orca13b':
    model_id = 'microsoft/Orca-2-13b'

In [3]:
import os
import torch
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    
# model_id = "TheBloke/Orca-2-13B-GGUF"
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_quant_type="nf8",
    bnb_8bit_compute_dtype=torch.bfloat16
)
os.environ['HF_TOKEN'] = '<your access token>'
os.environ['HF_HOME'] = 'path/huggingface/hub'

tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'], cache_dir=os.environ['HF_HOME'])
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map={"":0}, token=os.environ['HF_TOKEN'], cache_dir=os.environ['HF_HOME']
)

lora_config = LoraConfig(
    r=4,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)


Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
def formatting_func(example):
    output_texts = []
    for i in range(len(example['article'])):
        messages = [
            {"role": "user",
             "content": f"""
                Summarize this document. Text: {example['article'][i]}. 
                Summary:
                """},
             {"role": "assistant",
             "content": "{}".format(example['lay_summary'][i])}
         ]
        output_texts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False))
    return output_texts
    
# Print the first training example
print(formatting_func(data[:1])[0])

<bos><start_of_turn>user
Summarize this document. Text: In temperate climates , winter deaths exceed summer ones . However , there is limited information on the timing and the relative magnitudes of maximum and minimum mortality , by local climate , age group , sex and medical cause of death . We used geo-coded mortality data and wavelets to analyse the seasonality of mortality by age group and sex from 1980 to 2016 in the USA and its subnational climatic regions . Death rates in men and women ≥ 45 years peaked in December to February and were lowest in June to August , driven by cardiorespiratory diseases and injuries . In these ages , percent difference in death rates between peak and minimum months did not vary across climate regions , nor changed from 1980 to 2016 . Under five years , seasonality of all-cause mortality largely disappeared after the 1990s . In adolescents and young adults , especially in males , death rates peaked in June/July and were lowest in December/January , d

In [10]:

import transformers
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=data,
    eval_dataset=val_data,
    max_seq_length=700,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=20,
        eval_steps=5, 
        learning_rate=1e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
    formatting_func=formatting_func,
)
trainer.train()


Map:   0%|          | 0/4346 [00:00<?, ? examples/s]

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
1,3.1143
2,3.3058
3,3.1894
4,3.2854
5,2.9474
6,3.1241
7,2.776
8,2.8651
9,3.0859
10,2.9531


TrainOutput(global_step=20, training_loss=2.8798494219779966, metrics={'train_runtime': 159.3942, 'train_samples_per_second': 1.004, 'train_steps_per_second': 0.125, 'total_flos': 1335121084416000.0, 'train_loss': 2.8798494219779966, 'epoch': 0.04})

### Save model

In [11]:
# trainer.model.save_pretrained('/data/vep52/nlp/model/lora_adapter')

# Merge the adapters into the base model so you can use the model like a normal transformers model
model = trainer.model.merge_and_unload()
model.save_pretrained(f'/nlp/model/{model_type}')



### Inference

In [15]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


trained_model = AutoModelForCausalLM.from_pretrained(f"/nlp/model/{model_type}", torch_dtype=torch.bfloat16)
    
os.environ['HF_TOKEN'] = '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'
os.environ['HF_HOME'] = '/data/vep52/nlp/huggingface/hub'

tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'], cache_dir=os.environ['HF_HOME'])

`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [1]:
# import torch
# torch.cuda.empty_cache()

In [16]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=trained_model,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": torch.float16},
    # device='cuda',
    max_new_tokens=700
)
messages = [
    {
        "role": "user",
        "content": f"""
        Summarize this document. Text: {val_df['article'].iloc[0]}. 
        Summary:
        """
    }
]

prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Using the previous pipeline with the previous model
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.7,
    top_k=20,
    top_p=0.3,
    add_special_tokens=True
)
print(outputs[0]["generated_text"][len(prompt):])

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.

































































































































































































































































































































































































































































































































































































































































































































Above output should have the model output text generation i.e., text summary

In [None]:
# TODO: write the code to save outputs of all testing data.

In [18]:
outputs

[{'generated_text': "<bos><start_of_turn>user\nSummarize this document. Text: Cell-fate reprograming is at the heart of development , yet very little is known about the molecular mechanisms promoting or inhibiting reprograming in intact organisms . In the C . elegans germline , reprograming germ cells into somatic cells requires chromatin perturbation . Here , we describe that such reprograming is facilitated by GLP-1/Notch signaling pathway . This is surprising , since this pathway is best known for maintaining undifferentiated germline stem cells/progenitors . Through a combination of genetics , tissue-specific transcriptome analysis , and functional studies of candidate genes , we uncovered a possible explanation for this unexpected role of GLP-1/Notch . We propose that GLP-1/Notch promotes reprograming by activating specific genes , silenced by the Polycomb repressive complex 2 ( PRC2 ) , and identify the conserved histone demethylase UTX-1 as a crucial GLP-1/Notch target facilitat

In [18]:
text = f"""
You are a knowledgable medical researcher.
Given this text: {df['article'].iloc[0]}. 
Summarize the medical document such that the quality of summary is best in terms of Relevance, Readability, and Factuality.
"""
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=300)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



You are a knowledgable medical researcher.
Given this text: In temperate climates , winter deaths exceed summer ones . However , there is limited information on the timing and the relative magnitudes of maximum and minimum mortality , by local climate , age group , sex and medical cause of death . We used geo-coded mortality data and wavelets to analyse the seasonality of mortality by age group and sex from 1980 to 2016 in the USA and its subnational climatic regions . Death rates in men and women ≥ 45 years peaked in December to February and were lowest in June to August , driven by cardiorespiratory diseases and injuries . In these ages , percent difference in death rates between peak and minimum months did not vary across climate regions , nor changed from 1980 to 2016 . Under five years , seasonality of all-cause mortality largely disappeared after the 1990s . In adolescents and young adults , especially in males , death rates peaked in June/July and were lowest in December/Januar

In [25]:
df['lay_summary'].iloc[0]



In [22]:
trainer.save_model('/data/vep52/nlp/model/gemma_2b_it')

In [26]:
model_id = '/data/vep52/nlp/model/gemma_2b_it'


# model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
os.environ['HF_TOKEN'] = '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'
os.environ['HF_HOME'] = '/data/vep52/nlp/huggingface/hub'

tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'], cache_dir=os.environ['HF_HOME'])
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map={"":0}, token=os.environ['HF_TOKEN'], cache_dir=os.environ['HF_HOME']
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [31]:
prefix_text = f"""
You are a knowledgable medical researcher. 
Summarize the medical document such that the quality of summary is best in terms of Relevance, Readability, and Factuality.
<start_of_turn>user: {df['article'].iloc[0]}  <end_of_turn>\n
<start_of_turn>model: {df['lay_summary'].iloc[0]} <end_of_turn>\n
<start_of_turn>user: {df['article'].iloc[1]}  <end_of_turn>\n
<start_of_turn>model: 
"""

device = "cuda:0"
inputs = tokenizer(prefix_text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=300)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


You are a knowledgable medical researcher. 
Summarize the medical document such that the quality of summary is best in terms of Relevance, Readability, and Factuality.
<start_of_turn>user: In temperate climates , winter deaths exceed summer ones . However , there is limited information on the timing and the relative magnitudes of maximum and minimum mortality , by local climate , age group , sex and medical cause of death . We used geo-coded mortality data and wavelets to analyse the seasonality of mortality by age group and sex from 1980 to 2016 in the USA and its subnational climatic regions . Death rates in men and women ≥ 45 years peaked in December to February and were lowest in June to August , driven by cardiorespiratory diseases and injuries . In these ages , percent difference in death rates between peak and minimum months did not vary across climate regions , nor changed from 1980 to 2016 . Under five years , seasonality of all-cause mortality largely disappeared after the 1

In [50]:
text = f"""
You are a knowledgable medical researcher.
Given this text: {df['article'].iloc[0]}. 
Summarize the medical document such that the quality of summary is best in terms of Relevance, Readability, and Factuality.
"""

device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=300)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



You are a knowledgable medical researcher.
Given this text: In temperate climates , winter deaths exceed summer ones . However , there is limited information on the timing and the relative magnitudes of maximum and minimum mortality , by local climate , age group , sex and medical cause of death . We used geo-coded mortality data and wavelets to analyse the seasonality of mortality by age group and sex from 1980 to 2016 in the USA and its subnational climatic regions . Death rates in men and women ≥ 45 years peaked in December to February and were lowest in June to August , driven by cardiorespiratory diseases and injuries . In these ages , percent difference in death rates between peak and minimum months did not vary across climate regions , nor changed from 1980 to 2016 . Under five years , seasonality of all-cause mortality largely disappeared after the 1990s . In adolescents and young adults , especially in males , death rates peaked in June/July and were lowest in December/Januar

In [34]:
!pip install -U accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[0mInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.21.0
    Uninstalling accelerate-0.21.0:
      Successfully uninstalled accelerate-0.21.0
Successfully installed accelerate-0.28.0
