In [None]:
%pip install --upgrade datasets fsspec transformers

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

In [42]:
batchSize = 16
lr = 5e-5
epochs = 3
temperature = 2.0
alpha = 0.5
maxLen = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
raw = load_dataset("tweet_eval", "sentiment")
labelFeatures = raw["train"].features["label"]

In [10]:
print("Label Name:", labelFeatures.names)

Label Name: ['negative', 'neutral', 'positive']


In [21]:
trainData = raw["train"].shuffle(seed = 42).select(range(2500))
validationData = raw["validation"]

In [16]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def tokenize(example):
  return tokenizer(example["text"], truncation = True, max_length=maxLen)

tokenized = {}
tokenized['train'] = trainData.map(tokenize, batched = True, remove_columns = ['text'])
tokenized['validation'] = validationData.map(tokenize, batched = True, remove_columns = ['text'])

In [23]:
tokenized

{'train': Dataset({
     features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 2500
 }),
 'validation': Dataset({
     features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 2000
 })}

In [24]:
collator = DataCollatorWithPadding(tokenizer = tokenizer, pad_to_multiple_of=8)
trainingD1 = DataLoader(tokenized["train"], batch_size= batchSize, shuffle=True, collate_fn=collator)
validationD1 = DataLoader(tokenized["validation"], batch_size= batchSize, shuffle= False, collate_fn= collator)

In [25]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

labels = 3
teacherModel = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased", num_labels = labels).to(device)
studentModel = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = labels).to(device)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/391 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: bert-large-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [27]:
teacherModel

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [26]:
studentModel

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [30]:
# Freezing Teacher Model
for p in teacherModel.parameters():
  p.requires_grad = False

teacherModel.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [32]:
crossEntropyLoss = nn.CrossEntropyLoss()
KLLoss = nn.KLDivLoss(reduction = "batchmean")
optimizer = optim.Adam(studentModel.parameters(), lr = lr)

In [33]:
from transformers import get_scheduler
lrScheduler = get_scheduler(
    name = "linear",
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = epochs * len(trainingD1)
)

In [34]:
lrScheduler

<torch.optim.lr_scheduler.LambdaLR at 0x7c010bb1a000>

In [40]:
from tqdm.auto import tqdm
def distillation():
  studentModel.train()
  pbar = tqdm(trainingD1, desc = "Train")
  for batch in pbar:
    inputIds = batch["input_ids"].to(device)
    attentionMask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    with torch.no_grad():
      teacherLogits = teacherModel(inputIds, attentionMask).logits
      teacherProbs = torch.softmax(teacherLogits/temperature, dim = -1)

    studentLogits = studentModel(inputIds, attentionMask).logits
    studentProbs = torch.log_softmax(studentLogits/temperature, dim = -1)


    hardLoss = crossEntropyLoss(studentLogits, labels)
    softLoss = KLLoss(studentProbs, teacherProbs) * (temperature**2)

    loss = alpha*(hardLoss) + (1-alpha)*(softLoss)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    lrScheduler.step()
    pbar.set_postfix({"loss": f"{loss.item():.3f}"})

In [37]:
def evaluate():
  studentModel.eval()
  correct, total = 0, 0
  with torch.no_grad():
    for batch in validationD1:
      ids = batch["input_ids"].to(device)
      attn = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)
      output = studentModel(ids, attn).logits
      prediction = output.argmax(dim = 1)
      correct += (prediction == labels).sum().item()
      total += len(labels)
  return round(correct/total * 100, 2)

In [43]:
for epoch in range(1, epochs+1):
  distillation()
  accuracy = evaluate()
  print(f"Epoch: {epoch}/{epochs} Accuracy: {accuracy}%")

Train:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 1/3 Accuracy: 65.15%


Train:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 2/3 Accuracy: 65.15%


Train:   0%|          | 0/157 [00:00<?, ?it/s]

Epoch: 3/3 Accuracy: 65.15%


In [45]:
studentModel.save_pretrained("distelledStudentModel")
tokenizer.save_pretrained("distelledStudentModel")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('distelledStudentModel/tokenizer_config.json',
 'distelledStudentModel/tokenizer.json')

In [47]:
testingData = load_dataset("tweet_eval", "sentiment", split = "test[:500]")
tokenizedTest = testingData.map(tokenize, batched = True, remove_columns = ["text"])
testingD1 = DataLoader(tokenizedTest, batch_size = batchSize, shuffle = False, collate_fn = collator)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [53]:
def PredictionAndEvaluation(model, name, testingD1):
    model.eval()
    preds, all_labels = [], []
    start = time.time()

    with torch.no_grad():
        for batch in testingD1:
            ids = batch["input_ids"].to(device)
            attn = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            logits = model(ids, attention_mask=attn).logits
            prediction = logits.argmax(dim=1)

            preds.extend(prediction.cpu().tolist())
            all_labels.extend(batch_labels.cpu().tolist())

    totalTime = time.time() - start
    accuracy = accuracy_score(all_labels, preds)

    avgTime = totalTime / len(testingD1.dataset)

    print(f"\n{name}")
    print(f"Accuracy: {accuracy*100:.2f}%")
    print(f"Total Inference Time: {totalTime:.4f} seconds")
    print(f"Average Time per Sample: {avgTime:.6f} seconds")

    return accuracy, totalTime, avgTime

In [None]:
PredictionAndEvaluation(teacherModel, "Teacher Model", testingD1)
PredictionAndEvaluation(studentModel, "Distilled Student Model", testingD1)

### LLM

In [22]:
%pip install transformers accelerate bitsandbytes

In [23]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig

In [24]:
teacherModelID = "microsoft/phi-2"
studentModelID = "microsoft/phi-1_5"

In [25]:
teacherTokenizer = AutoTokenizer.from_pretrained(teacherModelID)
studentTokenizer = AutoTokenizer.from_pretrained(studentModelID)

if teacherTokenizer.pad_token is None:
    teacherTokenizer.pad_token = teacherTokenizer.eos_token

if studentTokenizer.pad_token is None:
    studentTokenizer.pad_token = studentTokenizer.eos_token

In [26]:
bnbConfig = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [27]:
teacherConfig = AutoConfig.from_pretrained(teacherModelID)
teacherConfig.pad_token_id = teacherTokenizer.eos_token_id

teacherModel = AutoModelForCausalLM.from_pretrained(
    teacherModelID,
    config=teacherConfig,
    quantization_config=bnbConfig,
    device_map="auto"
)

Loading weights:   0%|          | 0/453 [00:00<?, ?it/s]

In [28]:
studentConfig = AutoConfig.from_pretrained(studentModelID)
studentConfig.pad_token_id = studentTokenizer.eos_token_id

studentModel = AutoModelForCausalLM.from_pretrained(
    studentModelID,
    config=studentConfig,
    quantization_config=bnbConfig,
    device_map="auto"
)

Loading weights:   0%|          | 0/341 [00:00<?, ?it/s]

In [29]:
teacherModel.eval()
for p in teacherModel.parameters():
    p.requires_grad = False

In [30]:
prompts = [
    "Explain why the sky is blue. ### The sky appears blue because molecules in Earth's atmosphere scatter sunlight, and blue light is scattered more than other colors due to its shorter wavelength.",
    "What is the capital of France? ### The capital of France is Paris.",
    "Write a short story about a robot and a cat. ### Once upon a time, a lonely robot found a stray cat. They became best friends, exploring the city together, and the robot learned the meaning of companionship."
]

In [32]:
temperature = 2.0
alpha_soft  = 0.7
CELoss     = nn.CrossEntropyLoss(ignore_index=studentTokenizer.pad_token_id)
KLLoss     = nn.KLDivLoss(reduction="batchmean")
optimizer   = optim.AdamW(studentModel.parameters(), lr=2e-5)

In [None]:
for prompt in prompts:
    t_inputs = teacherTokenizer(prompt, return_tensors="pt", padding=True).to(teacherModel.device)
    s_inputs = studentTokenizer(prompt, return_tensors="pt", padding=True).to(studentModel.device)

    with torch.no_grad():
        t_logits = teacherModel(**t_inputs).logits[:, :-1, :]
        t_soft = torch.softmax(t_logits / temperature, dim=-1)
        t_soft = torch.clamp(t_soft, min=1e-8)

    s_logits = studentModel(**s_inputs).logits[:, :-1, :]
    s_log_soft = torch.log_softmax(s_logits / temperature, dim=-1)

    labels = s_inputs["input_ids"][:, 1:].contiguous()

    loss_hard = CELoss(s_logits.reshape(-1, s_logits.size(-1)), labels.reshape(-1))
    loss_soft = KLLoss(s_log_soft, t_soft) * (temperature ** 2)

    loss = alpha_soft * loss_soft + (1 - alpha_soft) * loss_hard

    if torch.isnan(loss):
        print("NaN detected on prompt:", prompt[:50])
        continue

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Prompt: {prompt[:40]}..., Loss: {loss.item():.4f}")