In [1]:
!pip install datasets bitsandbytes trl==0.12.1 transformers peft huggingface-hub accelerate safetensors pandas matplotlib numpy==1.26.4



In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    #AutoPeftModelForCausalLM, # Removed from transformers
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer, SFTConfig
# from trl.trainer.utils import DataCollatorForCompletionOnlyLM
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM, PeftConfig # Added to peft
from huggingface_hub import notebook_login
from trl import SFTTrainer, SFTConfig, setup_chat_format, DataCollatorForCompletionOnlyLM

In [3]:
support = torch.cuda.is_bf16_supported(including_emulation=False)
calculate_dtype = torch.bfloat16 if support else torch.float32

In [4]:
print(calculate_dtype)

torch.bfloat16


In [5]:
support = torch.cuda.is_bf16_supported(including_emulation=False)
calculate_dtype = torch.bfloat16 if support else torch.float32
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= calculate_dtype, #calculate_dtype can be bf16 or float32- use bf16 if supported
    bnb_4bit_use_double_quant= True
    )
repo = "microsoft/phi-1"
model = AutoModelForCausalLM.from_pretrained(repo, quantization_config= bnb_config, device_map= "cuda:0")

In [6]:
print(model.get_memory_footprint()/1024/1024)

977.1367797851562


In [7]:
model

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (dense): Linear4bit(in_features=2048, out_features=2048, bias=True)
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear4bit(in_features=8192, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (rotary_emb): PhiRotaryEmbedding()
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (final_layernorm): La

In [8]:
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r = 4, #. rank of LoRA - [4-16]
    bias = "none", # ["all", "lora_only"] - for train bias term
    lora_alpha = 8, # scalling factor
    lora_dropout = 0.10, # prevent overfit- used for regularisation
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type = "CAUSAL_LM"

)

model = get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2048)
        (layers): ModuleList(
          (0-23): 24 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(

In [9]:
print(model.get_memory_footprint()/1024/1024)

1382.7734985351562


In [10]:
print(model.get_base_model)

<bound method PeftModel.get_base_model of PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2048)
        (layers): ModuleList(
          (0-23): 24 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              

In [11]:
print(model.get_memory_footprint()/1e6)

1449.943104


In [12]:
trainable_params, total_params = model.get_nb_trainable_parameters()
percentage = (trainable_params / total_params) * 100

print(f"Trainable Parameters: {trainable_params:,}")
print(f"Total Parameters: {total_params:,}")
print(f"Percentage Trainable: {percentage:.2f}%")

Trainable Parameters: 1,179,648
Total Parameters: 1,419,450,368
Percentage Trainable: 0.08%


In [13]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load dataset
dataset = load_dataset("mteb/tweet_sentiment_extraction")

# Map into input-output format
def format_for_phi(example):
    return {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant that classifies tweets into sentiment: negative, neutral, or positive."},
            {"role": "user", "content": f"Classify the sentiment of this tweet:\n\n{example['text']}"},
            {"role": "assistant", "content": example["label_text"]}
        ]
    }

train_dataset = dataset["train"].select(range(500)).map(format_for_phi) # Select 2000 samples

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Print some samples to inspect the data
print("=== Dataset Info ===")
print(f"Original dataset: {dataset}")
print(f"Train dataset size: {len(train_dataset)}")
print(f"Dataset features: {train_dataset.features}")

print("\n=== Sample Data Inspection ===")
for i in range(3):
    print(f"\nSample {i+1}:")
    print(f"Original text: {dataset['train'][i]['text']}")
    print(f"Original label: {dataset['train'][i]['label_text']}")
    print(f"Formatted messages:")
    for msg in train_dataset[i]['messages']:
        print(f"  {msg['role']}: {msg['content']}")

# Custom function to format messages (since Phi-1 doesn't have chat template)
def format_conversation(messages):
    """Convert messages to a formatted string for Phi-1"""
    conversation = ""
    for message in messages:
        if message["role"] == "system":
            conversation += f"System: {message['content']}\n"
        elif message["role"] == "user":
            conversation += f"User: {message['content']}\n"
        elif message["role"] == "assistant":
            conversation += f"Assistant: {message['content']}"
    return conversation

# Test custom formatting
print("\n=== Custom Formatting Test ===")
sample_messages = train_dataset[0]['messages']
formatted_text = format_conversation(sample_messages)
print(f"Formatted conversation:\n{formatted_text}")

# Tokenize function for the dataset
def tokenize_function(examples):
    # Format conversations using our custom function
    texts = []
    for messages in examples["messages"]:
        conversation = format_conversation(messages)
        texts.append(conversation)

    # Tokenize the formatted conversations
    tokenized = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

    # For causal language modeling, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"].clone()

    return tokenized

# Apply tokenization
print("\n=== Tokenizing Dataset ===")
tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

print(f"Tokenized dataset: {tokenized_dataset}")
print(f"Dataset features: {tokenized_dataset.features}")

# Print tokenized sample details
print("\n=== Tokenized Sample Analysis ===")
sample_idx = 0
input_ids = tokenized_dataset[sample_idx]['input_ids']
attention_mask = tokenized_dataset[sample_idx]['attention_mask']
labels = tokenized_dataset[sample_idx]['labels']

print(f"Input IDs length: {len(input_ids)}")
print(f"Attention mask length: {len(attention_mask)}")
print(f"Labels length: {len(labels)}")
print(f"Sequence length: {len(input_ids)}")
print(f"Number of padding tokens: {input_ids.count(tokenizer.pad_token_id)}")

print(f"\nFirst 10 tokens: {input_ids[:10]}")
print(f"Decoded first 50 tokens: {tokenizer.decode(input_ids[:50])}")

# Check dataset statistics
print(f"\n=== Dataset Statistics ===")
sequence_lengths = [len(item['input_ids']) for item in tokenized_dataset]
print(f"Average sequence length: {sum(sequence_lengths) / len(sequence_lengths):.2f}")
print(f"Max sequence length: {max(sequence_lengths)}")
print(f"Min sequence length: {min(sequence_lengths)}")

print(f"\n=== Ready for Training Setup ===")
print("Dataset is tokenized and ready for Trainer!")

=== Dataset Info ===
Original dataset: DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 26732
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 3432
    })
})
Train dataset size: 500
Dataset features: {'id': Value('string'), 'text': Value('string'), 'label': Value('int64'), 'label_text': Value('string'), 'messages': List({'content': Value('string'), 'role': Value('string')})}

=== Sample Data Inspection ===

Sample 1:
Original text:  I`d have responded, if I were going
Original label: neutral
Formatted messages:
  system: You are a helpful assistant that classifies tweets into sentiment: negative, neutral, or positive.
  user: Classify the sentiment of this tweet:

 I`d have responded, if I were going
  assistant: neutral

Sample 2:
Original text:  Sooo SAD I will miss you here in San Diego!!!
Original label: negative
Formatted messages:
  system: You are a helpful assistant

In [14]:
import gc
import torch
import os

# Ultra aggressive memory management
# Set memory management environment variables - already set in a previous cell, but keeping for clarity
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Force garbage collection and clear cache
gc.collect()
torch.cuda.empty_cache()

# Optimized parameters for better training stability and performance - Further reduced for memory
min_effective_batch_size = 1  # Absolute minimum batch size per device
lr = 1e-4  # Further reduced learning rate
max_seq_length = 32  # Significantly reduced max sequence length
collator_fn = None
packing = False # Disable packing for memory saving
steps = 5  # Save/log more frequently to see progress, but might increase memory slightly during saves
num_train_epochs = 1  # Only 1 epoch for initial test
warmup_ratio = 0.0 # No warmup

sft_config = SFTConfig(
    output_dir = '/content/drive/MyDrive/phi1-sentiment-lora',
    packing = packing,
    max_seq_length = max_seq_length,
    gradient_checkpointing = True,
    gradient_checkpointing_kwargs = {'use_reentrant': False},
    gradient_accumulation_steps = 8, # Increased accumulation steps
    per_device_train_batch_size = min_effective_batch_size,
    auto_find_batch_size = False, # Disable auto batch size finding
    bf16 = True,
    fp16 = False,
    num_train_epochs = num_train_epochs,
    learning_rate = lr,
    lr_scheduler_type = "constant", # Simpler scheduler
    warmup_ratio = warmup_ratio,
    weight_decay = 0.01,
    max_grad_norm = 1.0,
    report_to = 'none', # Disable reporting to save memory
    run_name = "Phi1-Sentiment-LoRA-Finetune-Minimal",
    logging_dir = '/content/drive/MyDrive/phi1-sentiment-lora/logs_minimal',
    logging_strategy = 'steps',
    save_strategy = 'steps',
    logging_steps = steps,
    save_steps = steps,
    save_total_limit = 1, # Keep only the last checkpoint
    dataloader_pin_memory=False, # Disable pin memory
    remove_unused_columns=True, # Remove unused columns from dataset
    optim="adamw_torch", # Explicitly set optimizer
    group_by_length=False, # Disable group by length
    dataloader_num_workers=0, # Set workers to 0
)

# Clear memory before trainer creation
gc.collect()
torch.cuda.empty_cache()

print("Creating trainer with minimal settings...")
trainer = SFTTrainer(
    model = model,
    train_dataset = tokenized_dataset, # Using the tokenized dataset from cell before
    processing_class = tokenizer,
    data_collator = collator_fn, # Use default collator
    args = sft_config,
)

print("Starting minimal training...")
try:
    trainer.train()
    print("Training completed successfully!")
except Exception as e:
    print(f"Training failed with error: {e}")
    print("Model might still be too large for available GPU memory")
    print("Consider using a smaller model or upgrading GPU")

Creating trainer with minimal settings...


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 50256, 'bos_token_id': 50256, 'pad_token_id': 50256}.


Starting minimal training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Training failed with error: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 22.16 GiB of which 59.38 MiB is free. Process 145802 has 22.10 GiB memory in use. Of the allocated memory 21.48 GiB is allocated by PyTorch, and 391.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Model might still be too large for available GPU memory
Consider using a smaller model or upgrading GPU


In [17]:
# # Step 1: Save and merge the model locally first
# trainer.save_model('/content/phi1-sentiment-lora-saved')

# # Step 2: Load and merge the adapter
# from peft import AutoPeftModelForCausalLM

# # Load the saved PEFT model (use the same path as Step 1)
# peft_model = AutoPeftModelForCausalLM.from_pretrained('/content/phi1-sentiment-lora-saved')

# # Merge and unload adapter
# merged_model = peft_model.merge_and_unload()

# # Step 3: Save the merged model
# merged_model.save_pretrained('/content/phi1-sentiment-merged')
# tokenizer.save_pretrained('/content/phi1-sentiment-merged')

# # Step 4: Initialize Hugging Face API and upload the merged model
# from huggingface_hub import HfApi

# api = HfApi()
# api.upload_folder(
#     folder_path='/content/phi1-sentiment-merged',
#     repo_id="your-username/phi1-sentiment-classifier",  # Update with your HF username
#     repo_type="model",
#     commit_message="Upload Phi-1 fine-tuned on tweet sentiment classification dataset with LoRA"
# )