# Intent Detection

## Load libraries

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import math
from pprint import pprint

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pandas as pd
import datasets
import transformers
datasets.logging.set_verbosity_error()
transformers.logging.set_verbosity_error()

from datasets import load_dataset, load_metric
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, 
                          TextClassificationPipeline, AdamW, get_linear_schedule_with_warmup)

from transformerlab.pruning import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using transformers v{transformers.__version__} and datasets v{datasets.__version__}")
print(f"Running on device: {device}")

Using transformers v4.1.1 and datasets v1.2.0
Running on device: cuda


## Load and inspect data

In [None]:
clinc = load_dataset("clinc_oos", "plus")
clinc

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 15250
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 5500
    })
})

In [None]:
clinc.rename_column_("intent", "labels")

In [None]:
clinc['train'][0]

{'labels': 61,
 'text': 'what expression would i use to say i love you if i were an italian'}

In [None]:
clinc.set_format('pandas')

In [None]:
df = clinc['train'][:]
df.head()

Unnamed: 0,labels,text
0,61,what expression would i use to say i love you ...
1,61,can you tell me how to say 'i do not speak muc...
2,61,"what is the equivalent of, 'life is good' in f..."
3,61,"tell me how to say, 'it is a beautiful morning..."
4,61,"if i were mongolian, how would i say that i am..."


In [None]:
df['labels'].value_counts().unique()

array([250, 100])

In [None]:
df['labels'].nunique()

151

In [None]:
df[df['labels'] == 42]

Unnamed: 0,labels,text
15000,42,how much is an overdraft fee for bank
15001,42,why are exponents preformed before multiplicat...
15002,42,what size wipers does this car take
15003,42,where is the dipstick
15004,42,how much is 1 share of aapl
...,...,...
15245,42,how can i get involved in yoga
15246,42,is yoga healthy
15247,42,what's the alma mater of the man that started ...
15248,42,who has the most subscribers on youtube


In [None]:
clinc.reset_format()

## Metrics

In [None]:
accuracy_score = load_metric('accuracy')

In [None]:
def compute_metrics(pred):
    """Score a batch of predictions with the notebook-level `accuracy_score` metric.

    `pred` unpacks into a pair: a 2-D array of per-class scores and the
    reference labels. The predicted class of each row is the index of its
    highest score.
    """
    class_scores, references = pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy_score.compute(predictions=predicted_ids, references=references)

## Config

In [None]:
num_labels = 151

## Fine-tune BERT-large

In [None]:
bl_ckpt = "bert-large-uncased-whole-word-masking"
bl_tokenizer = AutoTokenizer.from_pretrained(bl_ckpt)

In [None]:
def tokenize_and_encode(x, tokenizer):
    """Run `tokenizer` over the `'text'` field of `x` with truncation enabled.

    Written for `Dataset.map(..., batched=True)`; the tokenizer is supplied
    through `fn_kwargs` so the same helper serves every checkpoint.
    """
    texts = x['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

clinc_enc = clinc.map(tokenize_and_encode, fn_kwargs={'tokenizer' : bl_tokenizer}, batched=True)

In [None]:
# Teacher candidate: BERT-large with a classification head sized to `num_labels`.
bl_model = AutoModelForSequenceClassification.from_pretrained(bl_ckpt, num_labels=num_labels).to(device)

batch_size = 64
learning_rate = 2e-5
num_train_epochs = 3
# Number of full batches in the training split, so the training loss is
# logged about once per pass over the data (assuming a single device).
logging_steps = len(clinc_enc['train']) // batch_size

args = TrainingArguments(
    output_dir='checkpoints',
    evaluation_strategy='epoch',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=0.01,
    logging_steps=logging_steps,
    disable_tqdm=False
)

# A concrete model instance is passed here (not `model_init`), so this
# trainer always continues from the weights currently held by `bl_model`.
bl_trainer = Trainer(
    args=args,
    model= bl_model,
    train_dataset=clinc_enc['train'],
    eval_dataset=clinc_enc['validation'],
    tokenizer=bl_tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
bl_trainer.evaluate()

{'eval_loss': 5.149877548217773, 'eval_accuracy': 0.0064516129032258064}

In [None]:
bl_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,3.028045,1.064417,0.905806
2,0.689029,0.384993,0.951613
3,0.290054,0.303301,0.954839


TrainOutput(global_step=717, training_loss=1.3311438547184944)

In [None]:
bl_trainer.save_model("models/bert-large-uncased-wwm-finetuned-clinc")

## Fine-tune BERT-base

In [None]:
bb_ckpt = "bert-base-uncased"
bb_tokenizer = AutoTokenizer.from_pretrained(bb_ckpt)

In [None]:
def model_init():
    """Return a freshly loaded `bb_ckpt` classifier with `num_labels` outputs, placed on `device`."""
    return AutoModelForSequenceClassification.from_pretrained(bb_ckpt, num_labels=num_labels).to(device)

batch_size = 64
learning_rate = 2e-5
num_train_epochs = 6
# Number of full batches in the training split (one log entry per epoch on a single device).
logging_steps = len(clinc_enc['train']) // batch_size

# NOTE: rebinds the notebook-level `args` used by the BERT-large cell above.
args = TrainingArguments(
    output_dir='checkpoints',
    evaluation_strategy='epoch',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=0.01,
    logging_steps=logging_steps,
    disable_tqdm=False
)

# NOTE(review): `clinc_enc` at this point was produced with `bl_tokenizer`;
# presumably it shares its vocabulary with `bb_tokenizer` — confirm.
trainer = Trainer(
    args=args,
    model_init= model_init,
    train_dataset=clinc_enc['train'],
    eval_dataset=clinc_enc['validation'],
    tokenizer=bb_tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# trainer.evaluate()

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,4.428613,3.549596,0.603226
2,2.973511,2.29946,0.858065
3,1.980071,1.550097,0.915484
4,1.365313,1.12765,0.935484
5,1.027216,0.923742,0.941613
6,0.872311,0.860549,0.942903


TrainOutput(global_step=1434, training_loss=2.1027118009170893)

In [None]:
trainer.save_model("models/bert-base-uncased-finetuned-clinc")

## Fine-tune DistilBERT

In [None]:
dbert_ckpt = "distilbert-base-uncased"
dbert_tokenizer = AutoTokenizer.from_pretrained(dbert_ckpt)

# Re-encode the raw dataset with the DistilBERT tokenizer. This REBINDS the
# notebook-level `clinc_enc`, so every later cell that reads `clinc_enc`
# sees this encoding rather than the BERT-large one created earlier.
clinc_enc = clinc.map(tokenize_and_encode, fn_kwargs={'tokenizer' : dbert_tokenizer}, batched=True)

# Shadows the BERT-base `model_init` defined in the previous section.
def model_init():
    """Return a freshly loaded `dbert_ckpt` classifier with `num_labels` outputs, placed on `device`."""
    return AutoModelForSequenceClassification.from_pretrained(dbert_ckpt, num_labels=num_labels).to(device)

batch_size = 64
learning_rate = 2e-5
num_train_epochs = 6
# Number of full batches in the training split (one log entry per epoch on a single device).
logging_steps = len(clinc_enc['train']) // batch_size

args = TrainingArguments(
    output_dir='checkpoints',
    evaluation_strategy='epoch',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=0.01,
    logging_steps=logging_steps,
    disable_tqdm=False
)

# Rebinds `trainer`; the BERT-base trainer from the previous section is no longer reachable by name.
trainer = Trainer(
    args=args,
    model_init= model_init,
    train_dataset=clinc_enc['train'],
    eval_dataset=clinc_enc['validation'],
    tokenizer=dbert_tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# trainer.evaluate()

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,4.430927,3.567706,0.678387
2,2.968706,2.220834,0.823871
3,1.869094,1.402024,0.877097
4,1.229911,0.984138,0.906452
5,0.891678,0.794795,0.914839
6,0.749713,0.740458,0.919677


TrainOutput(global_step=1434, training_loss=2.017932115738362)

In [None]:
trainer.save_model("models/distilbert-base-uncased-finetuned-clinc")

## Distill from BERT-large to BERT-base

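The main thing we need to implement task-specific distillation is to augment the standard cross-entropy loss with a distillation term: $L = \alpha_{\text{ce}} L_{\text{CE}} + \alpha_{\text{distil}} \, T^2 \, \mathrm{KL}\big(\mathrm{softmax}(z_{\text{teacher}}/T) \,\|\, \mathrm{softmax}(z_{\text{student}}/T)\big)$, where $z$ are the logits and $T$ is the temperature. We can implement this by overriding the `compute_loss` method of the `Trainer`, but first let's define the training arguments we'll need: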

In [None]:
class DistillationTrainingArguments(TrainingArguments):
    """`TrainingArguments` carrying three extra settings for the distillation loss.

    Args:
        alpha_ce: stored as `self.alpha_ce`; `DistillationTrainer` multiplies the
            student's cross-entropy loss by it.
        alpha_distil: stored as `self.alpha_distil`; `DistillationTrainer`
            multiplies the KL-divergence term by it.
        temperature: stored as `self.temperature`; divides the logits before the
            (log-)softmax in `DistillationTrainer`.

    All other positional and keyword arguments go to `TrainingArguments` untouched.
    """

    def __init__(self, *args, alpha_ce=0.5, alpha_distil=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)

        distillation_settings = {
            "alpha_ce": alpha_ce,
            "alpha_distil": alpha_distil,
            "temperature": temperature,
        }
        for attr_name, attr_value in distillation_settings.items():
            setattr(self, attr_name, attr_value)

        # Set after the parent initializer so progress bars are always on,
        # whatever the caller passed for `disable_tqdm`.
        self.disable_tqdm = False

For the trainer, we'll need a few ingredients:

* We need two models (a teacher and student), and since the `model` attribute is the one that is optimized, we'll just add an attribute for the teacher
* When we pass the tokenized text to the student or teacher, we get a score (logit) for each intent class. Since we want to minimize the distance between the teacher and student predictions, we'll use the KL-divergence as our distillation loss
* Once the distillation loss is computed, we take a linear combination with the cross-entropy to obtain our final loss function

The following code does the trick:

In [None]:
class DistillationTrainer(Trainer):
    """`Trainer` whose loss mixes the student's cross-entropy with a distillation term.

    The student is the regular `model` / `model_init` of the trainer and is the
    only network that is optimized. The teacher is kept on `self.teacher`, put
    in eval mode, and only ever run under `torch.no_grad()`.

    Expects `self.args` to expose `alpha_ce`, `alpha_distil` and `temperature`
    (see `DistillationTrainingArguments`).

    Args:
        teacher_model: an already fine-tuned classifier producing logits of the
            same shape as the student's. Required.

    Raises:
        ValueError: if `teacher_model` is not provided.
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        # Fail early with a clear message instead of `None.eval()` further down.
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a `teacher_model`.")
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        self.teacher.eval()
        # NOTE(review): re-exposes every dataset column while keeping the current
        # format type — presumably to undo column filtering done by the base
        # `Trainer`, so fields the teacher needs stay in the batches. Confirm.
        if self.train_dataset is not None:
            self.train_dataset.set_format(
                type=self.train_dataset.format["type"], columns=list(self.train_dataset.features.keys()))

    def compute_loss(self, model, inputs):
        """Return `alpha_distil * KD + alpha_ce * CE` for one batch.

        CE is the student's own loss on `inputs["labels"]`. KD is the
        batch-mean KL divergence between the temperature-softened teacher and
        student distributions, multiplied by `temperature ** 2`.
        """
        # Both networks receive the same encoder inputs.
        encoder_inputs = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
        }
        # Segment ids are optional: data tokenized for DistilBERT has none.
        # The teacher call used to index this key unconditionally and crashed
        # with KeyError on such batches; now it is forwarded only when present.
        if "token_type_ids" in inputs:
            encoder_inputs["token_type_ids"] = inputs["token_type_ids"]

        outputs_stu = model(**encoder_inputs, labels=inputs["labels"])
        loss_ce = outputs_stu.loss
        logits_stu = outputs_stu.logits

        # The teacher is frozen; only its logits are needed, so labels are not
        # passed and no teacher loss is computed.
        with torch.no_grad():
            logits_tea = self.teacher(**encoder_inputs).logits
        assert logits_tea.size() == logits_stu.size(), (
            f"teacher logits {tuple(logits_tea.size())} != student logits {tuple(logits_stu.size())}")

        temperature = self.args.temperature
        # KLDivLoss takes log-probabilities as input and probabilities as target.
        kl_div = nn.KLDivLoss(reduction="batchmean")
        loss_kd = kl_div(
            F.log_softmax(logits_stu / temperature, dim=-1),
            F.softmax(logits_tea / temperature, dim=-1),
        ) * (temperature ** 2)

        return self.args.alpha_distil * loss_kd + self.args.alpha_ce * loss_ce

It's then a similar process to configure and initialise the trainer:

In [None]:
batch_size = 64
# Number of full batches in the training split (one log entry per epoch on a single device).
logging_steps = len(clinc_enc['train']) // batch_size

# `alpha_ce`, `alpha_distil` and `temperature` are not passed, so the class
# defaults apply (0.5, 0.5 and 2.0).
student_training_args = DistillationTrainingArguments(
    output_dir=f"checkpoints",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_steps=logging_steps,
)

In [None]:
# Must match the directory written by `bl_trainer.save_model(...)` in the
# BERT-large section, otherwise a fresh "Restart & Run All" cannot find the teacher.
teacher_checkpoint = "models/bert-large-uncased-wwm-finetuned-clinc"
student_checkpoint = "bert-base-uncased"
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_checkpoint, num_labels=num_labels).to(device)

def student_init():
    """Return a freshly loaded `student_checkpoint` classifier with `num_labels` outputs, placed on `device`."""
    return AutoModelForSequenceClassification.from_pretrained(student_checkpoint, num_labels=num_labels).to(device)

student_tokenizer = AutoTokenizer.from_pretrained(student_checkpoint)

# Encode with the student's own tokenizer so the batches carry every field a
# BERT model expects (e.g. `token_type_ids`), instead of reusing `clinc_enc`,
# which was last rebuilt with the DistilBERT tokenizer. A separate name is
# used so later cells that read `clinc_enc` are unaffected.
clinc_enc_student = clinc.map(tokenize_and_encode, fn_kwargs={'tokenizer': student_tokenizer}, batched=True)

distil_trainer = DistillationTrainer(
    model_init=student_init,
    teacher_model=teacher_model,
    args=student_training_args,
    train_dataset=clinc_enc_student['train'],
    eval_dataset=clinc_enc_student['validation'],
    compute_metrics=compute_metrics,
    tokenizer=student_tokenizer)

In [None]:
distil_trainer.evaluate()

{'eval_loss': 5.067701816558838, 'eval_accuracy': 0.005483870967741935}

In [None]:
distil_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,3.354883,3.548548,0.622581
2,2.252753,2.219483,0.82871
3,1.514512,1.468975,0.890968
4,1.08153,1.065988,0.917097
5,0.839902,0.876521,0.920645
6,0.733032,0.816523,0.92871


TrainOutput(global_step=1434, training_loss=1.6256008710156258)

### 60:40 teacher / student

In [None]:
def set_distill_ratio(r=0.5):
    """Re-balance the loss weights on the notebook-level `distil_trainer`.

    `r` becomes the weight of the distillation term (`alpha_distil`) and the
    cross-entropy weight (`alpha_ce`) is set to `1 - r`. The change is made in
    place on `distil_trainer.args`; nothing is returned.
    """
    trainer_args = distil_trainer.args
    trainer_args.alpha_distil = r
    trainer_args.alpha_ce = 1 - r

In [None]:
set_distill_ratio(0.6)

In [None]:
distil_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,3.070265,3.434687,0.656774
2,2.070854,2.180539,0.859677
3,1.414529,1.447803,0.905806
4,1.014256,1.041266,0.929355
5,0.791487,0.849802,0.937742
6,0.689871,0.791683,0.942258


TrainOutput(global_step=1434, training_loss=1.5051100949030707)

### 70:30 teacher / student

In [None]:
set_distill_ratio(0.7)

In [None]:
distil_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,2.852746,3.416343,0.653548
2,1.920609,2.159794,0.848387
3,1.317353,1.435072,0.89871
4,0.94908,1.032846,0.924839
5,0.74241,0.842547,0.933871
6,0.648488,0.785621,0.938065


TrainOutput(global_step=1434, training_loss=1.4019301113889473)

In [None]:
distil_trainer.save_model('models/bert-base-uncased-distilled-clinc')

## Distill from BERT-base to DistilBERT

In [None]:
batch_size = 64
# Number of full batches in the training split (one log entry per epoch on a single device).
logging_steps = len(clinc_enc['train']) // batch_size

# Fresh arguments object: the loss weights fall back to the class defaults
# (alpha_ce=0.5, alpha_distil=0.5, temperature=2.0). Changes made earlier via
# `set_distill_ratio` were applied to the previous trainer's args, not to this object.
student_training_args = DistillationTrainingArguments(
    output_dir=f"checkpoints",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_steps=logging_steps,
)

In [None]:
# Teacher: the BERT-base model saved by `trainer.save_model(...)` in the
# "Fine-tune BERT-base" section. Student: a pretrained DistilBERT.
teacher_checkpoint = "models/bert-base-uncased-finetuned-clinc"
student_checkpoint = "distilbert-base-uncased"
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_checkpoint, num_labels=num_labels).to(device)

# Shadows the `student_init` defined for the previous distillation run.
def student_init():
    """Return a freshly loaded `student_checkpoint` classifier with `num_labels` outputs, placed on `device`."""
    return AutoModelForSequenceClassification.from_pretrained(student_checkpoint, num_labels=num_labels).to(device)

student_tokenizer = AutoTokenizer.from_pretrained(student_checkpoint)

# NOTE(review): the batches come from `clinc_enc`, which was tokenized for
# DistilBERT earlier in the notebook; confirm they contain every field the
# BERT teacher is given inside `DistillationTrainer.compute_loss`.
distil_trainer = DistillationTrainer(
    model_init=student_init,
    teacher_model=teacher_model,
    args=student_training_args,
    train_dataset=clinc_enc['train'],
    eval_dataset=clinc_enc['validation'],
    compute_metrics=compute_metrics,
    tokenizer=student_tokenizer)

In [None]:
distil_trainer.train()

## Speed test

As a simple benchmark, here we compare the time it takes for our teacher and student to generate 100 predictions on a CPU (to simulate a production environment). First, we load our fine-tuned models:

In [None]:
# These paths must match the directories written by
# `distil_trainer.save_model(...)` and `bl_trainer.save_model(...)` earlier in
# the notebook; the previous `...-clinc150` names were never saved.
student_model_ckpt = 'models/bert-base-uncased-distilled-clinc'
teacher_model_ckpt = 'models/bert-large-uncased-wwm-finetuned-clinc'

# Both models are placed on the CPU on purpose: the benchmark below measures CPU latency.
student_tokenizer = AutoTokenizer.from_pretrained(student_model_ckpt)
student_model = AutoModelForSequenceClassification.from_pretrained(student_model_ckpt, num_labels=num_labels).to('cpu')

teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_ckpt)
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_ckpt, num_labels=num_labels).to('cpu')

Next we create two pipelines for the student and teacher:

In [None]:
student_pipe = TextClassificationPipeline(model=student_model, tokenizer=student_tokenizer)
teacher_pipe = TextClassificationPipeline(model=teacher_model, tokenizer=teacher_tokenizer)

And then run the inference test:

In [None]:
%%time

for idx in range(100):
    teacher_pipe(clinc['test'][idx]['text'])

CPU times: user 39min, sys: 26.3 s, total: 39min 26s
Wall time: 5min 54s


In [None]:
%%time

for idx in range(100):
    student_pipe(clinc['test'][idx]['text'])

CPU times: user 25min 57s, sys: 9.58 s, total: 26min 7s
Wall time: 4min 1s


From this example, we see roughly a 1.5x speedup (about 4 minutes versus about 6 minutes of wall time for 100 predictions) from using a distilled model, with less than a 3% drop in validation accuracy!

## Movement Pruning

In [None]:
class PruningTrainingArguments(TrainingArguments):
    """`TrainingArguments` carrying the extra settings read by `PruningTrainer`.

    Args:
        initial_threshold: threshold used at the start of the schedule.
        final_threshold: threshold used at the end of the schedule.
        initial_warmup: multiple of `warmup_steps` spent at `initial_threshold`.
        final_warmup: multiple of `warmup_steps` spent at `final_threshold`.
        final_lambda: scale of the regularization coefficient returned by the
            threshold schedule.
        mask_scores_learning_rate: learning rate of the parameters whose name
            contains "mask_score".

    All other positional and keyword arguments go to `TrainingArguments` untouched.
    """

    def __init__(self, *args, initial_threshold=1., final_threshold=0.1, initial_warmup=1, final_warmup=2, final_lambda=0.,
                 mask_scores_learning_rate=0., **kwargs):
        super().__init__(*args, **kwargs)

        pruning_settings = {
            "initial_threshold": initial_threshold,
            "final_threshold": final_threshold,
            "initial_warmup": initial_warmup,
            "final_warmup": final_warmup,
            "final_lambda": final_lambda,
            "mask_scores_learning_rate": mask_scores_learning_rate,
        }
        for attr_name, attr_value in pruning_settings.items():
            setattr(self, attr_name, attr_value)

        # Set after the parent initializer so progress bars are always on,
        # whatever the caller passed for `disable_tqdm`.
        self.disable_tqdm = False

In [None]:
class PruningTrainer(Trainer):
    """`Trainer` that feeds a scheduled pruning `threshold` to the model at every training step.

    Expects `self.args` to expose `initial_threshold`, `final_threshold`,
    `initial_warmup`, `final_warmup`, `final_lambda` and
    `mask_scores_learning_rate` (see `PruningTrainingArguments`), and a model
    whose forward pass accepts a `threshold` keyword argument.

    Attributes:
        t_total: total number of optimizer updates. Used both as the horizon of
            the threshold schedule and as `num_training_steps` of the LR scheduler.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Optimizer updates per pass over the training data.
        updates_per_epoch = len(self.get_train_dataloader()) // self.args.gradient_accumulation_steps
        if self.args.max_steps > 0:
            self.t_total = self.args.max_steps
            # `max(..., 1)` guards against a dataloader shorter than one accumulation window.
            self.args.num_train_epochs = self.args.max_steps // max(updates_per_epoch, 1) + 1
        else:
            self.t_total = updates_per_epoch * self.args.num_train_epochs

    def create_optimizer_and_scheduler(self, num_training_steps: int):
        """Build AdamW with three parameter groups and a linear warmup/decay schedule.

        Groups (trainable parameters only):
          1. names containing "mask_score" — learning rate `mask_scores_learning_rate`;
          2. remaining parameters outside `no_decay` — `learning_rate` with `weight_decay`;
          3. remaining parameters matching `no_decay` — `learning_rate`, no weight decay.

        `num_training_steps` is accepted for interface compatibility but the
        scheduler horizon is `self.t_total`, computed in `__init__`.
        """
        no_decay = ("bias", "LayerNorm.weight")
        mask_params, decayed_params, undecayed_params = [], [], []
        for name, param in self.model.named_parameters():
            if not param.requires_grad:
                continue
            if "mask_score" in name:
                mask_params.append(param)
            elif any(nd in name for nd in no_decay):
                undecayed_params.append(param)
            else:
                decayed_params.append(param)

        optimizer_grouped_parameters = [
            {
                "params": mask_params,
                "lr": self.args.mask_scores_learning_rate,
            },
            {
                "params": decayed_params,
                "lr": self.args.learning_rate,
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": undecayed_params,
                "lr": self.args.learning_rate,
                "weight_decay": 0.0,
            },
        ]

        self.optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
        self.lr_scheduler = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.t_total
        )

    def compute_loss(self, model, inputs):
        """Run the model with the threshold scheduled for the upcoming step and return its loss."""
        # NOTE: the regularization coefficient is computed by the schedule but
        # not applied here, so a non-zero `final_lambda` currently has no effect.
        threshold, _regu_lambda = self._schedule_threshold(
            step=self.state.global_step + 1,
            total_step=self.t_total,
            warmup_steps=self.args.warmup_steps,
            final_threshold=self.args.final_threshold,
            initial_threshold=self.args.initial_threshold,
            final_warmup=self.args.final_warmup,
            initial_warmup=self.args.initial_warmup,
            final_lambda=self.args.final_lambda,
        )
        # Build a new mapping rather than mutating the caller's batch; the
        # scheduled threshold overrides any "threshold" already present.
        outputs = model(**{**inputs, "threshold": threshold})
        # The loss is the first element. Indexing works for tuples of any length
        # and for ModelOutput objects, unlike a fixed two-way unpacking.
        return outputs[0]

    def _schedule_threshold(
        self,
        step: int,
        total_step: int,
        warmup_steps: int,
        initial_threshold: float,
        final_threshold: float,
        initial_warmup: int,
        final_warmup: int,
        final_lambda: float,
    ):
        """Return `(threshold, regu_lambda)` for optimizer step `step`.

        The threshold is held at `initial_threshold` for the first
        `initial_warmup * warmup_steps` steps, held at `final_threshold` for the
        last `final_warmup * warmup_steps` steps, and in between moves from the
        initial to the final value along a cubic curve.

        `regu_lambda` is `final_lambda * threshold / final_threshold`, or 0.0
        when `final_lambda` is zero.
        """
        if step <= initial_warmup * warmup_steps:
            threshold = initial_threshold
        elif step > (total_step - final_warmup * warmup_steps):
            threshold = final_threshold
        else:
            spars_warmup_steps = initial_warmup * warmup_steps
            spars_schedu_steps = (final_warmup + initial_warmup) * warmup_steps
            mul_coeff = 1 - (step - spars_warmup_steps) / (total_step - spars_schedu_steps)
            threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff ** 3)
        # Skip the division when regularization is off, so a final threshold of
        # 0 does not raise ZeroDivisionError.
        regu_lambda = final_lambda * threshold / final_threshold if final_lambda else 0.0
        return threshold, regu_lambda

In [None]:
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

masked_config = MaskedBertConfig(pruning_method='topK', mask_init='constant', mask_scale=0., num_labels=num_labels)

def model_init():
    """Return a fresh masked-BERT classifier built from `model_ckpt` and `masked_config`, placed on `device`."""
    masked_model = MaskedBertForSequenceClassification.from_pretrained(model_ckpt, config=masked_config)
    return masked_model.to(device)

Here we're using a `model_init` function so that we can perform multiple runs with the same trainer. Next we specify the hyperparameters that will be fixed across each run:

### 0% pruning

In [None]:
batch_size = 64
# Number of full batches in the training split (one log entry per epoch on a single device).
logging_steps = len(clinc_enc['train']) // batch_size

# pruning params
# Baseline run: initial and final thresholds are equal, so the schedule in
# `PruningTrainer._schedule_threshold` yields 1 at every step.
initial_threshold = 1.
final_threshold = 1
initial_warmup = 1
final_warmup = 2
final_lambda = 0
num_train_epochs = 6
# No warmup for the baseline; the commented expression is what the pruned runs below use.
warmup_steps = 0 #logging_steps * num_train_epochs * 0.1
# A learning rate of 0 leaves the "mask_score" parameter group unchanged by the optimizer.
mask_scores_learning_rate = 0 #1e-2

pruning_training_args = PruningTrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy = "epoch",
    learning_rate=3e-5, # reduce?
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=logging_steps,
    initial_threshold=initial_threshold,
    final_threshold=final_threshold,
    initial_warmup=initial_warmup,
    final_warmup=final_warmup,
    final_lambda=final_lambda,
    warmup_steps=warmup_steps,
    num_train_epochs=num_train_epochs,
    mask_scores_learning_rate=mask_scores_learning_rate,
    weight_decay=0.01

)

In [None]:
# need to do this manually for now!
train_ds = clinc_enc['train']
eval_ds = clinc_enc['validation'].map(lambda x : {'threshold': final_threshold})

In [None]:
pruning_trainer = PruningTrainer(
    model_init=model_init,
    args=pruning_training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
pruning_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,4.184197,2.965764,0.72129
2,2.223854,1.439246,0.920323
3,1.080129,0.748266,0.945484
4,0.545227,0.471417,0.952903
5,0.326512,0.372631,0.954839
6,0.240158,0.346093,0.955806


TrainOutput(global_step=1434, training_loss=1.4284142205904717)

### 90% weights

In [None]:
batch_size = 64
# Number of full batches in the training split (one log entry per epoch on a single device).
logging_steps = len(clinc_enc['train']) // batch_size

# pruning params
# The threshold starts at `initial_threshold` and ends at `final_threshold`;
# see `PruningTrainer._schedule_threshold` for the shape of the curve.
initial_threshold = 1.
final_threshold = 0.9
# The threshold is held at its initial value for `initial_warmup * warmup_steps`
# steps and at its final value for the last `final_warmup * warmup_steps` steps.
initial_warmup = 1
final_warmup = 2
final_lambda = 0
num_train_epochs = 10
# 10% of (batches per epoch * epochs). Note this expression evaluates to a float.
warmup_steps = logging_steps * num_train_epochs * 0.1
# Learning rate of the "mask_score" parameter group, separate from `learning_rate` below.
mask_scores_learning_rate = 1e-2

pruning_training_args = PruningTrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=logging_steps,
    initial_threshold=initial_threshold,
    final_threshold=final_threshold,
    initial_warmup=initial_warmup,
    final_warmup=final_warmup,
    final_lambda=final_lambda,
    warmup_steps=warmup_steps,
    num_train_epochs=num_train_epochs,
    mask_scores_learning_rate=mask_scores_learning_rate,
    weight_decay=0.01

)

In [None]:
# need to do this manually for now!
train_ds = clinc_enc['train']
eval_ds = clinc_enc['validation'].map(lambda x : {'threshold': final_threshold})

In [None]:
pruning_trainer = PruningTrainer(
    model_init=model_init,
    args=pruning_training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
pruning_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,4.870851,5.123807,0.006452
2,2.779082,2.396365,0.752258
3,1.001543,0.83431,0.913226
4,0.421721,0.543108,0.919677
5,0.221068,0.408917,0.939032
6,0.137422,0.368939,0.94
7,0.091896,0.318676,0.946129
8,0.070028,0.33023,0.944839
9,0.058563,0.311045,0.949355
10,0.053597,0.31024,0.949355


TrainOutput(global_step=2390, training_loss=0.9667764479645127)

In [None]:
pruning_trainer.save_model("models/prunebert-base-uncased-90-finetuned-clinc")

### 70% weights

In [None]:
batch_size = 64
logging_steps = len(clinc_enc['train']) // batch_size

# pruning params
initial_threshold = 1.
final_threshold = 0.7
initial_warmup = 1
final_warmup = 2
final_lambda = 0
num_train_epochs = 10
warmup_steps = logging_steps * num_train_epochs * 0.1
mask_scores_learning_rate = 1e-2

pruning_training_args = PruningTrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=logging_steps,
    initial_threshold=initial_threshold,
    final_threshold=final_threshold,
    initial_warmup=initial_warmup,
    final_warmup=final_warmup,
    final_lambda=final_lambda,
    warmup_steps=warmup_steps,
    num_train_epochs=num_train_epochs,
    mask_scores_learning_rate=mask_scores_learning_rate,
    weight_decay=0.01
    
)

In [None]:
# need to do this manually for now!
train_ds = clinc_enc['train']
eval_ds = clinc_enc['validation'].map(lambda x : {'threshold': final_threshold})

In [None]:
pruning_trainer = PruningTrainer(
    model_init=model_init,
    args=pruning_training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
pruning_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,4.870851,5.09547,0.003226
2,2.681642,3.792091,0.316129
3,1.095286,1.268739,0.832258
4,0.60882,0.734605,0.89
5,0.391964,0.615032,0.9
6,0.259445,0.557274,0.906774
7,0.178657,0.462829,0.924839
8,0.127867,0.431092,0.930968
9,0.089605,0.375977,0.93871
10,0.073907,0.373035,0.938387


TrainOutput(global_step=2390, training_loss=1.0338097955392493)

In [None]:
pruning_trainer.save_model("models/prunebert-base-uncased-70-finetuned-clinc")

### 50% weights

In [None]:
batch_size = 64
logging_steps = len(clinc_enc['train']) // batch_size

# pruning params
initial_threshold = 1.
final_threshold = 0.5
initial_warmup = 1
final_warmup = 2
final_lambda = 0
num_train_epochs = 10
warmup_steps = logging_steps * num_train_epochs * 0.1
mask_scores_learning_rate = 1e-2

pruning_training_args = PruningTrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=logging_steps,
    initial_threshold=initial_threshold,
    final_threshold=final_threshold,
    initial_warmup=initial_warmup,
    final_warmup=final_warmup,
    final_lambda=final_lambda,
    warmup_steps=warmup_steps,
    num_train_epochs=num_train_epochs,
    mask_scores_learning_rate=mask_scores_learning_rate,
    weight_decay=0.01
    
)

In [None]:
# need to do this manually for now!
train_ds = clinc_enc['train']
eval_ds = clinc_enc['validation'].map(lambda x : {'threshold': final_threshold})

In [None]:
pruning_trainer = PruningTrainer(
    model_init=model_init,
    args=pruning_training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
pruning_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,4.870851,5.135061,0.006774
2,2.746694,3.625696,0.417742
3,1.297235,1.415797,0.815161
4,0.778661,0.848564,0.861613
5,0.535028,0.670781,0.879677
6,0.358197,0.580886,0.903871
7,0.250263,0.518113,0.912903
8,0.166789,0.483631,0.919355
9,0.116787,0.447267,0.923871
10,0.088873,0.42312,0.929355


TrainOutput(global_step=2390, training_loss=1.1166658250357815)

In [None]:
pruning_trainer.save_model("models/prunebert-base-uncased-50-finetuned-clinc")

### 30% weights

In [None]:
batch_size = 64
logging_steps = len(clinc_enc['train']) // batch_size

# pruning params
initial_threshold = 1.
final_threshold = 0.3
initial_warmup = 1
final_warmup = 2
final_lambda = 0
num_train_epochs = 10
warmup_steps = logging_steps * num_train_epochs * 0.1
mask_scores_learning_rate = 1e-2

pruning_training_args = PruningTrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=logging_steps,
    initial_threshold=initial_threshold,
    final_threshold=final_threshold,
    initial_warmup=initial_warmup,
    final_warmup=final_warmup,
    final_lambda=final_lambda,
    warmup_steps=warmup_steps,
    num_train_epochs=num_train_epochs,
    mask_scores_learning_rate=mask_scores_learning_rate,
    weight_decay=0.01
    
)

In [None]:
# need to do this manually for now!
train_ds = clinc_enc['train']
eval_ds = clinc_enc['validation'].map(lambda x : {'threshold': final_threshold})

In [None]:
pruning_trainer = PruningTrainer(
    model_init=model_init,
    args=pruning_training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
pruning_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,4.870851,5.142257,0.001935
2,2.859436,4.495208,0.103871
3,1.440854,2.592907,0.585161
4,0.882373,1.055322,0.833871
5,0.55892,0.709714,0.87129
6,0.379529,0.596916,0.888387
7,0.26278,0.484533,0.910968
8,0.16643,0.434604,0.922258
9,0.117576,0.4003,0.927097
10,0.091225,0.394097,0.92871


TrainOutput(global_step=2390, training_loss=1.1585227120122152)

In [None]:
pruning_trainer.save_model("models/prunebert-base-uncased-30-finetuned-clinc")

### 10% weights

In [None]:
batch_size = 64
logging_steps = len(clinc_enc['train']) // batch_size

# pruning params
initial_threshold = 1.
final_threshold = 0.1
initial_warmup = 1
final_warmup = 2
final_lambda = 0
num_train_epochs = 10
warmup_steps = logging_steps * num_train_epochs * 0.1
mask_scores_learning_rate = 1e-2

pruning_training_args = PruningTrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=logging_steps,
    initial_threshold=initial_threshold,
    final_threshold=final_threshold,
    initial_warmup=initial_warmup,
    final_warmup=final_warmup,
    final_lambda=final_lambda,
    warmup_steps=warmup_steps,
    num_train_epochs=num_train_epochs,
    mask_scores_learning_rate=mask_scores_learning_rate,
    weight_decay=0.01
    
)

In [None]:
# need to do this manually for now!
train_ds = clinc_enc['train']
eval_ds = clinc_enc['validation'].map(lambda x : {'threshold': final_threshold})

In [None]:
pruning_trainer = PruningTrainer(
    model_init=model_init,
    args=pruning_training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
pruning_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,4.870851,5.037233,0.006452
2,2.989605,5.098973,0.006452
3,1.583201,5.098977,0.006774
4,0.924001,5.037778,0.015484
5,0.580629,3.960984,0.342258
6,0.39231,0.73544,0.895161
7,0.265442,0.451762,0.921935
8,0.181168,0.38665,0.929032
9,0.125114,0.371531,0.932258
10,0.102001,0.356328,0.932903


TrainOutput(global_step=2390, training_loss=1.1968463150527189)

In [None]:
pruning_trainer.save_model("models/prunebert-base-uncased-10-finetuned-clinc")