In [1]:
!pip install pyarrow==14.0.2
!pip uninstall cudf-cu12
!pip install transformers datasets peft fvcore

[0mCollecting pyarrow==14.0.2
  Using cached pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Using cached pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)
[0mInstalling collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 17.0.0
    Uninstalling pyarrow-17.0.0:
      Successfully uninstalled pyarrow-17.0.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.0.1 requires pyarrow>=15.0.0, but you have pyarrow 14.0.2 which is incompatible.[0m[31m
[0mSuccessfully installed pyarrow-14.0.2
Collecting fvcore
  Using cached fvcore-0.1.5.post20221221-py3-none-any.whl
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting iopath>=0.1.7 (from fvcore)
  Using cached iopath-0.1.10-py3-none-

In [2]:
# PyTorch imports
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW, SGD  # For optimization

# Hugging Face transformers and datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler
from datasets import load_dataset

# LoRA imports (from peft package)
from peft import LoraConfig, get_peft_model

# Utility imports
from tqdm.auto import tqdm  # For progress bars

In [3]:
# Load the Pythia model and tokenizer
model_name = "EleutherAI/pythia-1.4b"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)  # Use fast tokenizer
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

lora_config = LoraConfig(r=8)

model = get_peft_model(model, lora_config)

# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to GPU

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


PeftModel(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(50304, 2048)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXSdpaAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=6144, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048

In [9]:
# Load the dataset (ultrachat)
train_set = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft[:1%]")
test_set = load_dataset("HuggingFaceH4/ultrachat_200k", split="test_sft[:1%]")

print(train_set)
print(test_set)

Dataset({
    features: ['prompt', 'prompt_id', 'messages'],
    num_rows: 2079
})
Dataset({
    features: ['prompt', 'prompt_id', 'messages'],
    num_rows: 231
})


In [10]:
def preprocess_all_conversations(dataset):
    inputs = []
    outputs = []

    # Iterate over every conversation in the dataset
    for conversation in dataset:
        dialogue = ""
        for message in conversation["messages"]:
            if message["role"] == "user":
                # Append user's input to the dialogue history
                dialogue += f"User: {message['content']} "
            elif message["role"] == "assistant":
                # Create input-output pairs
                inputs.append(dialogue.strip())
                outputs.append(f"Assistant: {message['content']}")
                # Add assistant's response to the dialogue history for context
                dialogue += f"Assistant: {message['content']} "

    return inputs, outputs

In [12]:
# Tokenize both inputs and outputs in a batched manner
train_inputs, train_outputs = preprocess_all_conversations(train_set)
train_inputs = train_inputs[:100]
train_outputs = train_outputs[:100]
train_tokenized_inputs = tokenizer(train_inputs, padding=True, truncation=True, max_length=512, return_tensors="pt")
train_tokenized_outputs = tokenizer(train_outputs, padding=True, truncation=True, max_length=512, return_tensors="pt")

# Convert dataset into PyTorch Datasets and DataLoader
class ChatDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

train_dataset = ChatDataset(
    input_ids=train_tokenized_inputs['input_ids'],
    attention_mask=train_tokenized_inputs['attention_mask'],
    labels=train_tokenized_outputs['input_ids']
)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Preprocess the validation set similarly
test_inputs, test_outputs = preprocess_all_conversations(test_set)
test_inputs = test_inputs[:10]
test_outputs = test_outputs[:10]
test_tokenized_inputs = tokenizer(test_inputs, padding=True, truncation=True, max_length=512, return_tensors="pt")
test_tokenized_outputs = tokenizer(test_outputs, padding=True, truncation=True, max_length=512, return_tensors="pt")

test_dataset = ChatDataset(
    input_ids=test_tokenized_inputs['input_ids'],
    attention_mask=test_tokenized_inputs['attention_mask'],
    labels=test_tokenized_outputs['input_ids']
)

test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [13]:
# Initialize optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)  # Using SGD with momentum

# Optionally, you can set a learning rate scheduler
num_epochs = 3  # Set the number of epochs
train_steps_per_epoch = len(train_inputs) // 4  # Assuming batch size of 4
num_training_steps = num_epochs * train_steps_per_epoch
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [15]:
import time
import matplotlib.pyplot as plt
from fvcore.nn import FlopCountAnalysis
from tqdm.auto import tqdm

# Initialize lists to track losses for both training schemes
train_losses_sgd = []
val_losses_sgd = []

train_losses_ff = []
val_losses_ff = []

# Track FLOPs and training time
total_flops_sgd = 0
total_flops_ff = 0
start_time = time.time()  # Start the timer for total training time

# Function to measure FLOPs for a single forward pass
def get_flops(model, batch):
    try:
        flops = FlopCountAnalysis(model, batch["input_ids"]).total()
    except Exception as e:
        # print(f"Warning: Could not calculate FLOPs due to unsupported operator: {e}")
        flops = 0  # Return zero FLOPs for unsupported operations
    return flops

# Part 1: Traditional SGD/LoRA Training
final_sgd_loss = None

# Training loop for SGD/LoRA
model.train()
for epoch in range(num_epochs):
    epoch_start_time = time.time()  # Track time for each epoch
    total_train_loss = 0

    for batch in tqdm(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Measure FLOPs for one forward pass
        flops_per_batch = get_flops(model, batch)
        total_flops_sgd += flops_per_batch

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses_sgd.append(avg_train_loss)
    print(f"Epoch {epoch + 1}, Average Training Loss (SGD): {avg_train_loss}")

    # Validation phase
    model.eval()  # Set model to evaluation mode
    total_val_loss = 0

    with torch.no_grad():
        for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss = outputs.loss
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(test_dataloader)
    val_losses_sgd.append(avg_val_loss)
    print(f"Epoch {epoch + 1}, Validation Loss (SGD): {avg_val_loss}")

    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_start_time
    print(f"Epoch {epoch + 1} (SGD) completed in {epoch_time:.2f} seconds")

    model.train()  # Return the model to training mode after validation

# Record the final loss for comparison
final_sgd_loss = avg_val_loss
epsilon = 0.25  # Difference to allow for Fast Forward
target_loss_ff = final_sgd_loss - epsilon

print(f"Final SGD/LoRA Loss after 5 epochs: {final_sgd_loss}")
print(f"Target Loss for Fast Forwarding (within epsilon): {target_loss_ff}")

# End of traditional SGD/LoRA training
total_training_time_sgd = time.time() - start_time
print(f"Total Training Time for SGD: {total_training_time_sgd:.2f} seconds")
print(f"Total FLOPs for SGD training: {total_flops_sgd / 1e12:.2f} TFLOPs")

# Part 2: Fast Forward Training
start_time = time.time()
Tinterval = 6  # Number of regular SGD steps before Fast Forward starts
current_val_loss_ff = float('inf')

# Reset optimizer, scheduler, and FLOP counter
total_flops_ff = 0

# Fast Forward Training Loop
model.train()
step = 0

while current_val_loss_ff > target_loss_ff:
    step += 1
    total_train_loss = 0
    for _ in range(Tinterval):  # Perform regular SGD for Tinterval steps
        for batch in tqdm(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}

            # Measure FLOPs for one forward pass
            flops_per_batch = get_flops(model, batch)
            total_flops_ff += flops_per_batch

            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            total_train_loss += loss.item()

    # Fast Forward Phase (Repeat same update without recomputing gradient)
    model.eval()  # Set model to evaluation mode
    prev_grad = None

    with torch.no_grad():
        for batch in test_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            prev_grad = batch  # Take the last gradient

            # Fast Forward step in the same gradient direction
            for _ in range(100):  # Try fast-forwarding up to 100 steps
                for batch in test_dataloader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    flops_per_batch = get_flops(model, prev_grad)
                    total_flops_ff += flops_per_batch

                    outputs = model(**prev_grad)
                    val_loss = outputs.loss
                    if val_loss.item() > target_loss_ff:  # If validation loss stops decreasing
                        break
                    else:
                        current_val_loss_ff = val_loss.item()

    print(f"Fast Forward Training step {step}, Validation Loss: {current_val_loss_ff}")

print(f"Fast Forward Training completed with final loss {current_val_loss_ff}")

# End of Fast Forward training
total_training_time_ff = time.time() - start_time
print(f"Total Training Time for Fast Forward: {total_training_time_ff:.2f} seconds")
print(f"Total FLOPs for Fast Forward training: {total_flops_ff / 1e12:.2f} TFLOPs")

# Plotting Comparison of Training Methods
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_losses_sgd) + 1), train_losses_sgd, label="SGD Training Loss")
plt.plot(range(1, len(train_losses_ff) + 1), train_losses_ff, label="Fast Forward Training Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Comparison of SGD and Fast Forward Training")
plt.legend()
plt.show()

  0%|          | 0/25 [00:00<?, ?it/s]

base_model.model.gpt_neox.layers.0.attention.attention_dropout, base_model.model.gpt_neox.layers.1.attention.attention_dropout, base_model.model.gpt_neox.layers.10.attention.attention_dropout, base_model.model.gpt_neox.layers.11.attention.attention_dropout, base_model.model.gpt_neox.layers.12.attention.attention_dropout, base_model.model.gpt_neox.layers.13.attention.attention_dropout, base_model.model.gpt_neox.layers.14.attention.attention_dropout, base_model.model.gpt_neox.layers.15.attention.attention_dropout, base_model.model.gpt_neox.layers.16.attention.attention_dropout, base_model.model.gpt_neox.layers.17.attention.attention_dropout, base_model.model.gpt_neox.layers.18.attention.attention_dropout, base_model.model.gpt_neox.layers.19.attention.attention_dropout, base_model.model.gpt_neox.layers.2.attention.attention_dropout, base_model.model.gpt_neox.layers.20.attention.attention_dropout, base_model.model.gpt_neox.layers.21.attention.attention_dropout, base_model.model.gpt_neox.la

OutOfMemoryError: CUDA out of memory. Tried to allocate 394.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 79.06 MiB is free. Process 215462 has 14.67 GiB memory in use. Of the allocated memory 14.42 GiB is allocated by PyTorch, and 116.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)