In [1]:
!pip install transformers datasets accelerate peft
!pip install -U bitsandbytes

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
import time
from peft import get_peft_model, LoraConfig
from transformers import DataCollatorWithPadding

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
# Load SNLI dataset
dataset = load_dataset("snli")

# Select samples for training, validation, and testing
train_dataset = dataset['train'].select(range(0, 550000, 550))  # Every 550th sample
val_dataset = dataset['validation'].select(range(0, 10000, 100))  # Every 100th sample
test_dataset = dataset['test'].select(range(0, 10000, 100))  # Every 100th sample

# Print dataset sizes
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

Training samples: 1000
Validation samples: 100
Test samples: 100


In [3]:
# Load the Phi2 model and tokenizer from Hugging Face
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/phi-2",
    num_labels=3,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [4]:
# Tokenize the datasets with padding
def preprocess_function(examples):
    encoding = tokenizer(
        examples['premise'],
        examples['hypothesis'],
        truncation=True,
        padding="max_length",  # Pad to max length
        max_length=512         # Set a max length
    )
    encoding['label'] = examples['label']  # Ensure that label is included

    # Debugging: Print the encoding structure to confirm correct keys
    print(f"Encoding keys: {list(encoding.keys())}")
    return encoding
# Filter out any samples with unexpected label values
def filter_labels(dataset, allowed_labels=[0, 1, 2]):
    return dataset.filter(lambda example: example['label'] in allowed_labels)

# Apply filter to train, validation, and test datasets
train_dataset = filter_labels(train_dataset)
val_dataset = filter_labels(val_dataset)
test_dataset = filter_labels(test_dataset)

# Print dataset sizes after filtering
print(f"Filtered Training samples: {len(train_dataset)}")
print(f"Filtered Validation samples: {len(val_dataset)}")
print(f"Filtered Test samples: {len(test_dataset)}")

# Tokenize datasets
train_tokenized = train_dataset.map(preprocess_function, batched=True)
val_tokenized = val_dataset.map(preprocess_function, batched=True)
test_tokenized = test_dataset.map(preprocess_function, batched=True)
# Confirm the presence of columns before setting format
# Debugging: Verify that 'input_ids', 'attention_mask', and 'label' are present
print(f"Columns in train_tokenized: {train_tokenized.column_names}")
print(f"Sample entry in train_tokenized: {train_tokenized[0]}")
print(f"Keys in sample entry: {list(train_tokenized[0].keys())}")
print(f"Train tokenized format before setting format: {train_tokenized.column_names}")
# Set the format for PyTorch
train_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
val_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])


Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filtered Training samples: 1000
Filtered Validation samples: 99
Filtered Test samples: 100


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Encoding keys: ['input_ids', 'attention_mask', 'label']


Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Encoding keys: ['input_ids', 'attention_mask', 'label']


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Encoding keys: ['input_ids', 'attention_mask', 'label']
Columns in train_tokenized: ['premise', 'hypothesis', 'label', 'input_ids', 'attention_mask']
Sample entry in train_tokenized: {'premise': 'A person on a horse jumps over a broken down airplane.', 'hypothesis': 'A person is training his horse for a competition.', 'label': 1, 'input_ids': [32, 1048, 319, 257, 8223, 18045, 625, 257, 5445, 866, 19401, 13, 32, 1048, 318, 3047, 465, 8223, 329, 257, 5449, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256

In [5]:
# Ensure that the data types for input_ids and attention_mask are int64 and that labels are within range [0, 2]
def check_dataset(dataset):
    for sample in dataset:
        if sample['label'] not in [0, 1, 2]:
            print("Unexpected label value found:", sample['label'])
        if sample['input_ids'].dtype != torch.int64 or sample['attention_mask'].dtype != torch.int64:
            print("Data type mismatch found.")

# Check train, validation, and test datasets for any inconsistencies
check_dataset(train_tokenized)
check_dataset(val_tokenized)
check_dataset(test_tokenized)

In [6]:
# Configure PEFT with LoRA (Low-Rank Adaptation)
lora_config = LoraConfig(
    r=16,  # Rank for LoRA
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the model with PEFT
model = get_peft_model(model, lora_config)



In [7]:
import torch
from torch.utils.data import DataLoader

# Assuming model and tokenizer are already defined and loaded
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)

# Set padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Assuming eos_token can serve as padding

# Set pad_token_id for the model configuration
model.config.pad_token_id = tokenizer.pad_token_id

# Create a DataLoader for the test dataset
test_loader = DataLoader(test_tokenized, batch_size=4, shuffle=False)  # Adjust batch_size as needed

# Initialize variables for tracking predictions and labels
all_predictions = []
all_labels = []

# Set the model to evaluation mode
model.eval()

# Disable gradient calculation
with torch.no_grad():
    for batch in test_loader:
        # Move the batch to the GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Get logits and predictions
        logits = outputs.logits if hasattr(outputs, 'logits') else outputs[0]
        predictions = torch.argmax(logits, dim=-1)

        # Store predictions and labels if available
        all_predictions.append(predictions.cpu())  # Move predictions to CPU
        if 'label' in batch:  # Ensure 'label' is the correct key in your dataset
            all_labels.append(batch['label'].cpu())  # Store true labels

# Concatenate predictions and labels
all_predictions = torch.cat(all_predictions)
if all_labels:
    all_labels = torch.cat(all_labels)

    # Calculate accuracy only if true labels are present
    accuracy = (all_predictions == all_labels).float().mean().item()
    print(f"Accuracy on test set: {accuracy:.4f}")

    # Additional evaluation metrics can be calculated here (e.g., F1 score, precision, recall)

print(f"Pre trained model accuracy: {accuracy:.4f}")

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Accuracy on test set: 0.3600
Pre trained model accuracy: 0.3600
