In [None]:
! pip install peft==0.4.0 datasets

  pid, fd = os.forkpty()


Collecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl.metadata (21 kB)
Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.4.0


## Inspect the model's modules to find candidate LoRA targets


In [None]:
from transformers import DistilBertForSequenceClassification
import torch

def print_modules(model, indent=0):
    """Recursively print the module tree of ``model`` together with parameter shapes.

    Every child is printed as ``└─ <name>: <class name>``. It is followed by the
    parameters the child owns directly (name and shape), then by the child's own
    children, one indentation level deeper.

    Args:
        model: the ``torch.nn.Module`` whose children are listed.
        indent: nesting depth of the current level; each level adds two spaces.
    """
    child_prefix = '  ' * indent
    param_prefix = '  ' * (indent + 1)
    for name, module in model.named_children():
        print(child_prefix + f"└─ {name}: {type(module).__name__}")
        # List direct parameters for every module, not only for leaves, so that a
        # container which also owns weights of its own is not silently skipped.
        for param_name, param in module.named_parameters(recurse=False):
            print(param_prefix + f"└─ {param_name}: {param.shape}")
        # Leaf modules have no children, so this recursion simply prints nothing.
        print_modules(module, indent + 1)

# Instantiate DistilBERT with a two-label sequence-classification head
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Hierarchical view: every module, nested by depth
print("All modules in DistilBERT:")
print_modules(model)

# Flat view: the qualified name of every module, one per line
print("\nAll module names (flat list):")
print(*(qualified_name for qualified_name, _ in model.named_modules()), sep="\n")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


All modules in DistilBERT:
└─ distilbert: DistilBertModel
  └─ embeddings: Embeddings
    └─ word_embeddings: Embedding
      └─ weight: torch.Size([30522, 768])
    └─ position_embeddings: Embedding
      └─ weight: torch.Size([512, 768])
    └─ LayerNorm: LayerNorm
      └─ weight: torch.Size([768])
      └─ bias: torch.Size([768])
    └─ dropout: Dropout
  └─ transformer: Transformer
    └─ layer: ModuleList
      └─ 0: TransformerBlock
        └─ attention: MultiHeadSelfAttention
          └─ dropout: Dropout
          └─ q_lin: Linear
            └─ weight: torch.Size([768, 768])
            └─ bias: torch.Size([768])
          └─ k_lin: Linear
            └─ weight: torch.Size([768, 768])
            └─ bias: torch.Size([768])
          └─ v_lin: Linear
            └─ weight: torch.Size([768, 768])
            └─ bias: torch.Size([768])
          └─ out_lin: Linear
            └─ weight: torch.Size([768, 768])
            └─ bias: torch.Size([768])
        └─ sa_layer_norm: Layer

## Fine-tune the model with LoRA and push it to the Hugging Face Hub

In [None]:
import torch
from datasets import load_dataset, load_metric
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from huggingface_hub import HfApi, HfFolder
from getpass import getpass

# Ask for the Hugging Face access token interactively so it is never stored in the notebook
hf_token = getpass("Enter your Hugging Face access token: ")

# Train on the GPU when PyTorch can see one; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Fetch the SST-2 configuration of GLUE and the tokenizer of the base checkpoint
dataset = load_dataset(path='glue', name='sst2')
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``sentence`` field of a batch of examples.

    Sequences are truncated but deliberately left unpadded. Padding here would
    pad every example to the longest sentence of its ``map`` chunk; the
    ``Trainer`` is given the tokenizer and therefore already pads each batch
    to that batch's own longest sequence.

    Args:
        examples: a batch from ``dataset.map(..., batched=True)``; only its
            ``'sentence'`` entry is read.

    Returns:
        Whatever the tokenizer returns for the batch of sentences.
    """
    return tokenizer(examples['sentence'], truncation=True)

encoded_dataset = dataset.map(tokenize_function, batched=True)

# Accuracy is computed directly from the prediction arrays. The previous
# `load_metric('accuracy')` call is deprecated and pauses the run to ask for
# confirmation before executing downloaded code.

# Define the compute_metrics function
def compute_metrics(p):
    """Return the classification accuracy for one evaluation pass.

    Args:
        p: a pair ``(predictions, labels)``. ``predictions`` holds one row of
            class scores per example and ``labels`` the reference class ids.

    Returns:
        ``{"accuracy": <float>}``, the fraction of examples whose highest-scoring
        class equals the label.
    """
    predictions, labels = p
    predicted_classes = predictions.argmax(axis=1)
    return {"accuracy": float((predicted_classes == labels).mean())}

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',   # evaluate at the end of every epoch
    save_strategy='epoch',         # checkpoint on the same schedule as evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    push_to_hub=True,
    hub_token=hf_token,
    # Mixed precision is only requested when a CUDA device is present, so the
    # CPU fallback selected above does not abort on an unsupported fp16 setting.
    fp16=torch.cuda.is_available(),
)

# Instantiate DistilBERT with a two-label sequence-classification head
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Show the layer hierarchy
print(model)

# Every module whose qualified name contains "lin" is treated as a LoRA candidate
target_modules = list(
    filter(lambda qualified_name: "lin" in qualified_name, dict(model.named_modules()))
)
print("Potential target modules:", target_modules)

# LoRA settings for the sequence-classification adapter
peft_config = LoraConfig(
    target_modules=target_modules,
    task_type=TaskType.SEQ_CLS,
    bias="none",
    lora_dropout=0.1,
    lora_alpha=32,
    r=16,
)

# Wrap the base model with the adapter, place it on the selected device,
# then report how many parameters will actually be trained
peft_model = get_peft_model(model, peft_config).to(device)
peft_model.print_trainable_parameters()

# Wire the adapter-wrapped model, the data splits and the metric into a Trainer
trainer = Trainer(
    args=training_args,
    model=peft_model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
)

# Run the fine-tuning loop
trainer.train()

# Score the fine-tuned model on the validation split
print("Evaluating after fine-tuning...")
post_finetune_results = trainer.evaluate()
print("Results after fine-tuning:", post_finetune_results)

# Save the PEFT model locally
peft_model.save_pretrained('./peft-model')

# Push the PEFT model to Hugging Face's Model Hub.
# `token` is the current keyword for the access token; `use_auth_token` is deprecated.
peft_model.push_to_hub("my-peft-distilbert", token=hf_token)

Enter your Hugging Face access token:  ·····································


Using device: cuda


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

  metric = load_metric('accuracy')


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

The repository for accuracy contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/accuracy.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y




model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2507,0.283276,0.880734
2,0.2677,0.295353,0.883028
3,0.2256,0.289473,0.893349


Evaluating after fine-tuning...


Results after fine-tuning: {'eval_loss': 0.2894730865955353, 'eval_accuracy': 0.893348623853211, 'eval_runtime': 1.3155, 'eval_samples_per_second': 662.888, 'eval_steps_per_second': 41.811, 'epoch': 3.0}




adapter_model.safetensors:   0%|          | 0.00/7.69M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Benuehlinger/my-peft-distilbert/commit/b4ecedd371a0cdda52445f9eacc6151cd1701f30', commit_message='Upload model', commit_description='', oid='b4ecedd371a0cdda52445f9eacc6151cd1701f30', pr_url=None, pr_revision=None, pr_num=None)

## Evaluate the base model, the fully fine-tuned model, and the LoRA model on out-of-sample sentences

Note: the test set contains 20 sentences with clear sentiment, 1 sentence with mixed sentiment, and 1 string of nonsense characters.



In [None]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel, PeftConfig
import torch

# Run inference on the GPU when PyTorch can see one; otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Hub identifiers of the three checkpoints being compared
original_model_name = "distilbert-base-uncased"
fine_tuned_model_name = "Benuehlinger/my-fine-tuned-distilbert"
peft_model_name = "Benuehlinger/my-peft-distilbert"  # Replace with your actual PEFT model name

# The tokenizer of the base checkpoint is shared by all three models
tokenizer = AutoTokenizer.from_pretrained(original_model_name)

# Baseline checkpoint and the fully fine-tuned checkpoint, each placed on the selected device
original_model = AutoModelForSequenceClassification.from_pretrained(original_model_name)
original_model = original_model.to(device)
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_name)
fine_tuned_model = fine_tuned_model.to(device)

# LoRA variant: read the adapter config to find its base checkpoint, load that
# base model, then attach the adapter weights on top of it
peft_config = PeftConfig.from_pretrained(peft_model_name)
adapter_base_model = AutoModelForSequenceClassification.from_pretrained(
    peft_config.base_model_name_or_path
).to(device)
peft_model = PeftModel.from_pretrained(adapter_base_model, peft_model_name).to(device)

# Build one text-classification pipeline per model; they differ only in the model
# they wrap. Device index 0 selects the first GPU, -1 selects the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
original_classifier, fine_tuned_classifier, peft_classifier = (
    pipeline("text-classification", model=candidate_model, tokenizer=tokenizer, device=pipeline_device)
    for candidate_model in (original_model, fine_tuned_model, peft_model)
)

# Out-of-sample sentences used to compare the three models.
# The list holds 20 sentences with clear sentiment, followed by one
# mixed-sentiment sentence and one string of nonsense characters.
new_sentences = [
    "I love the new movie, it's fantastic!",
    "This product is terrible, I regret buying it.",
    "The weather today is amazing, perfect for a picnic!",
    "I feel so happy and excited about the upcoming event.",
    "The service at this restaurant was extremely slow and disappointing.",
    "I am very satisfied with the quality of this product.",
    "The performance of the team was outstanding in the match.",
    "The book I read recently was very engaging and well-written.",
    "I had a great experience shopping at this store.",
    "The movie I watched last night was a complete waste of time.",
    "The customer service at this company needs improvement.",
    "The food at the restaurant was delicious and well-prepared.",
    "I strongly recommend this product to everyone.",
    "The hotel I stayed in during my vacation was luxurious and comfortable.",
    "The new feature added to the app is very user-friendly.",
    "The concert I attended last week was amazing!",
    "I had a terrible experience with the customer support.",
    "The new design of the website is sleek and modern.",
    "The movie had a predictable plot and was not very entertaining.",
    "The delivery of my order was delayed and caused inconvenience.",
    # Mixed sentiment: one positive and one negative clause
    "This Movie had a good plot but weak special effects.",
    # Nonsense characters: no meaningful sentiment to detect
    "xyzdoaskeqw"
]

# Score the whole list with each pipeline in a single call. Calling a pipeline
# once per sentence makes transformers warn about sequential pipeline use on GPU;
# passing the list returns one prediction dict per sentence, in input order.
original_predictions = original_classifier(new_sentences)
fine_tuned_predictions = fine_tuned_classifier(new_sentences)
peft_predictions = peft_classifier(new_sentences)

# Compare predictions between the original, fine-tuned, and PEFT models for each sentence
for sentence, original_prediction, fine_tuned_prediction, peft_prediction in zip(
    new_sentences, original_predictions, fine_tuned_predictions, peft_predictions
):
    print("Sentence:", sentence)
    print("Original Model Prediction:", original_prediction)
    print("Fine-Tuned Model Prediction:", fine_tuned_prediction)
    print("PEFT Model Prediction:", peft_prediction)
    print()

Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/2.29k [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/7.69M [00:00<?, ?B/s]

The model 'PeftModelForSequenceClassification' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaForSequenceClassification'

Sentence: I love the new movie, it's fantastic!
Original Model Prediction: {'label': 'LABEL_0', 'score': 0.5196182727813721}
Fine-Tuned Model Prediction: {'label': 'LABEL_1', 'score': 0.9996926784515381}
PEFT Model Prediction: {'label': 'LABEL_1', 'score': 0.9993448853492737}

Sentence: This product is terrible, I regret buying it.
Original Model Prediction: {'label': 'LABEL_0', 'score': 0.5233114361763}
Fine-Tuned Model Prediction: {'label': 'LABEL_0', 'score': 0.9982353448867798}
PEFT Model Prediction: {'label': 'LABEL_0', 'score': 0.9869326949119568}

Sentence: The weather today is amazing, perfect for a picnic!
Original Model Prediction: {'label': 'LABEL_0', 'score': 0.5181533098220825}
Fine-Tuned Model Prediction: {'label': 'LABEL_1', 'score': 0.9995694756507874}
PEFT Model Prediction: {'label': 'LABEL_1', 'score': 0.9985925555229187}

Sentence: I feel so happy and excited about the upcoming event.
Original Model Prediction: {'label': 'LABEL_0', 'score': 0.5294833183288574}
Fine-T

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Sentence: The movie I watched last night was a complete waste of time.
Original Model Prediction: {'label': 'LABEL_0', 'score': 0.5215651392936707}
Fine-Tuned Model Prediction: {'label': 'LABEL_0', 'score': 0.9990159273147583}
PEFT Model Prediction: {'label': 'LABEL_0', 'score': 0.9972447156906128}

Sentence: The customer service at this company needs improvement.
Original Model Prediction: {'label': 'LABEL_0', 'score': 0.5050338506698608}
Fine-Tuned Model Prediction: {'label': 'LABEL_0', 'score': 0.9972436428070068}
PEFT Model Prediction: {'label': 'LABEL_0', 'score': 0.6307623386383057}

Sentence: The food at the restaurant was delicious and well-prepared.
Original Model Prediction: {'label': 'LABEL_0', 'score': 0.5133419036865234}
Fine-Tuned Model Prediction: {'label': 'LABEL_1', 'score': 0.9996544122695923}
PEFT Model Prediction: {'label': 'LABEL_1', 'score': 0.9976463913917542}

Sentence: I strongly recommend this product to everyone.
Original Model Prediction: {'label': 'LABEL_0'