In [4]:
import warnings
warnings.filterwarnings("ignore", message="The current process just got forked")
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"



from transformers import TrainerCallback, TrainingArguments, Trainer
from typing import Dict, Union
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
import os

In [5]:
class PubMedDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.abstracts = self.load_abstracts(file_path)
        print(f"File path: {file_path}")
        print(f"File exists: {os.path.exists(file_path)}")
        if os.path.exists(file_path):
            print(f"File size: {os.path.getsize(file_path)} bytes")
        print(f"Number of abstracts loaded: {len(self.abstracts)}")
        if len(self.abstracts) > 0:
            print(f"First few characters of the first abstract: {self.abstracts[0][:200]}")
        else:
            print("No abstracts were loaded. Check the file content and format.")

    def load_abstracts(self, file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            abstracts = [abs.strip() for abs in content.split('###') if abs.strip()]
            return abstracts
        except Exception as e:
            print(f"Error loading abstracts: {str(e)}")
            return []

    def __len__(self):
        return len(self.abstracts)

    def __getitem__(self, idx):
        abstract = self.abstracts[idx]
        encoded = self.tokenizer.encode_plus(
            abstract,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten()
        }

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

auth_token = "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"
model_name = "meta-llama/Meta-Llama-3-8B"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=auth_token)
    tokenizer.pad_token = tokenizer.eos_token
    
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    
    # Try loading with bfloat16 first
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=auth_token,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
    except RuntimeError:
        # If bfloat16 fails, try with float16
        print("Loading with bfloat16 failed, trying float16...")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=auth_token,
            torch_dtype=torch.float16,
            device_map="auto"
        )
    
    model.config.pad_token_id = tokenizer.pad_token_id
    
    print(f"Model loaded: {model_name}")
    print(f"Model dtype: {model.dtype}")
    print(f"Tokenizer vocabulary size: {len(tokenizer)}")
    print(f"Padding token: {tokenizer.pad_token}")
    print(f"Padding token ID: {tokenizer.pad_token_id}")

except ImportError as e:
    print(f"ImportError: {e}")
    print("Try installing the required libraries with:")
    print("pip install accelerate transformers torch")

except Exception as e:
    print(f"An error occurred: {e}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


CUDA available: True
Number of GPUs: 4


Downloading shards: 100%|██████████| 4/4 [00:44<00:00, 11.14s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.17it/s]


Model loaded: meta-llama/Meta-Llama-3-8B
Model dtype: torch.bfloat16
Tokenizer vocabulary size: 128256
Padding token: <|end_of_text|>
Padding token ID: 128001


In [8]:
lora_config = LoraConfig(
    r=8,  # Increased rank
    lora_alpha=4,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


In [9]:
train_dataset = PubMedDataset('pub-200k/train.txt', tokenizer)
print(f"Loaded {len(train_dataset)} abstracts for training")

if len(train_dataset) > 0:
    example_item = train_dataset[0]
    input_ids = example_item['input_ids']
    attention_mask = example_item['attention_mask']
    
    print("\nExample of tokenized input:")
    print("Input IDs (first 10):", input_ids[:10].tolist())
    print("Attention Mask (first 10):", attention_mask[:10].tolist())
    
    decoded_text = tokenizer.decode(input_ids)
    print("\nDecoded text (first 100 characters):")
    print(decoded_text[:100])

File path: pub-200k/train.txt
File exists: True
File size: 357620996 bytes
Number of abstracts loaded: 190657
First few characters of the first abstract: 24491034
BACKGROUND	The emergence of HIV as a chronic condition means that people living with HIV ar
Loaded 190657 abstracts for training

Example of tokenized input:
Input IDs (first 10): [128000, 13719, 21056, 1958, 198, 82522, 33026, 49179, 315, 23495]
Attention Mask (first 10): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Decoded text (first 100 characters):
<|begin_of_text|>24491034
BACKGROUND	The emergence of HIV as a chronic condition means that people l


In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=32,  # Reduced batch size
    gradient_accumulation_steps=4,  # Increased gradient accumulation
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=200,
    fp16=True,
    gradient_checkpointing=True,
    learning_rate=1e-4,  # Slightly reduced learning rate
    max_grad_norm=0.3,  # Reduced max gradient norm for stability
    remove_unused_columns=False,  # Important for some LoRA setups
)


In [11]:


class CustomCallback(TrainerCallback):
    def __init__(self):
        self.training_loss = 0.0
        self.logging_steps = 0

    def on_log(self, args: TrainingArguments, state, control, logs: Dict[str, float] = None, **kwargs):
        if state.is_local_process_zero:
            if 'loss' in logs:
                self.training_loss += logs['loss']
                self.logging_steps += 1
            
            if state.global_step % args.logging_steps == 0:
                avg_loss = self.training_loss / self.logging_steps if self.logging_steps > 0 else 0
                print(f"Step {state.global_step}: "
                      f"Loss: {avg_loss:.4f}, "
                      f"Learning Rate: {logs.get('learning_rate', 0):.2e}")
                self.training_loss = 0.0
                self.logging_steps = 0

    def on_evaluate(self, args: TrainingArguments, state, control, metrics: Dict[str, float] = None, **kwargs):
        if state.is_local_process_zero:
            print(f"Evaluation at step {state.global_step}:")
            for key, value in metrics.items():
                print(f"  {key}: {value:.4f}")

# Assuming you've already defined your model, tokenizer, dataset, and training arguments

# Create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# Add the custom callback to the trainer
custom_callback = CustomCallback()
trainer.add_callback(custom_callback)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [12]:
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

def load_model_and_tokenizer(base_model_name, peft_model_path):
    # Load the base model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    # Load the LoRA weights
    model = PeftModel.from_pretrained(model, peft_model_path)
    return model, tokenizer

def generate_prediction(model, tokenizer, prompt, max_new_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Load the model and tokenizer
base_model_name = "meta-llama/Meta-Llama-3-8B"  # Replace with your base model name
peft_model_path = "./results/checkpoint-10"  # Replace with your LoRA checkpoint path
model, tokenizer = load_model_and_tokenizer(base_model_name, peft_model_path)

# Example usage
prompt = "Summarize the main findings of recent studies on climate change:"
generated_text = generate_prediction(model, tokenizer, prompt)
print("Generated text:")
print(generated_text)

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.50s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Generated text:
Summarize the main findings of recent studies on climate change: what are the main impacts of climate change in the areas of land use, water, biodiversity, and human health?
The main impacts of climate change on land use, water, biodiversity, and human health include:
Land use:
Rising temperatures and changing


In [19]:
prompts = [
    "Summarize the key points of a recent medical study on diabetes:",
    "What are the main symptoms of COVID-19?",
    "Explain the process of photosynthesis in simple terms:",
]

for prompt in prompts:
    generated_text = generate_prediction(model, tokenizer, prompt)
    print(f"Prompt: {prompt}\nGenerated text: {generated_text}\n")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prompt: Summarize the key points of a recent medical study on diabetes:
Generated text: Summarize the key points of a recent medical study on diabetes: the 3-year Diabetes Prevention Program (DPP) trial
The 3-year Diabetes Prevention Program (DPP) trial, which included 3,234 adults with impaired glucose tolerance, was conducted to determine whether intensive lifestyle intervention or metformin



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prompt: What are the main symptoms of COVID-19?
Generated text: What are the main symptoms of COVID-19? The main symptoms of COVID-19 are fever, cough and tiredness. Other symptoms may include loss of smell and taste, headache, sore throat, muscle pain, nasal congestion, nausea, diarrhoea and vomiting. Some people may develop more severe

Prompt: Explain the process of photosynthesis in simple terms:
Generated text: Explain the process of photosynthesis in simple terms: What is the chemical formula for photosynthesis? Why is it important?
What is photosynthesis? What are its products? How does it differ from cellular respiration? What are the differences between C3, C4, and CAM photosynthesis?
What



In [26]:
def compare_models(base_model, lora_model, tokenizer, prompt):
       print(f"Prompt: {prompt}")
       
       base_output = generate_prediction(base_model, tokenizer, prompt)
       print("Base model output:")
       print(base_output)
       
       lora_output = generate_prediction(lora_model, tokenizer, prompt)
       print("\nLoRA model output:")
       print(lora_output)

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16, device_map="auto")

prompt = "Mean initial clinical activity score was 4.75 1.2 and 5 1.3 for group I and group II before treatment ,"
compare_models(base_model, model, tokenizer, prompt)

Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.87s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Prompt: Mean initial clinical activity score was 4.75 1.2 and 5 1.3 for group I and group II before treatment ,


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Base model output:
Mean initial clinical activity score was 4.75 1.2 and 5 1.3 for group I and group II before treatment, respectively. The mean clinical activity score was 1.75 1.1 and 1.7 1.1 for group I and group II after treatment, respectively. The mean clinical activity score was 2.25 1.1

LoRA model output:
Mean initial clinical activity score was 4.75 1.2 and 5 1.3 for group I and group II before treatment, respectively (p = 0.17).
Mean initial clinical activity score was 4.75 1.2 and 5 1.3 for group I and group II before treatment, respectively (p = 0.17).
Mean initial
